In [71]:
import pandas as pd
import os
from dotenv import load_dotenv
load_dotenv()

import nest_asyncio
nest_asyncio.apply()

# Load CSV & Preprocessing

In [72]:
t_df = pd.read_csv("../results/test_007.csv")
t_df = t_df[["Query", "Answer", "Context", "ground_truth"]]
t_df.columns = ['question', 'answer', 'contexts', 'ground_truth']
# t_df['ground_truth'] = ""
t_df

Unnamed: 0,question,answer,contexts,ground_truth
0,explain me Minimum propulsion power in commer...,"OK, Captain. \n\nThe MEPC.1/Circ.850/Rev.3 gui...",['39 Refer to Guidelines for determining minim...,Minimum Propulsion Power (MPP) is the power re...
1,explain me how to judge the satisfaction of th...,"OK, Captain. \n\nTo judge the satisfaction of ...","[None, '.3 reasons for other changes made in t...",If the Attained EEDI is below the Required EED...
2,explain me the information of reduction factor...,"OK, Captain. \n\nThe reduction factor in comme...","['85,000 GT\\n\\n* Reduction factor to be line...",The reduction factor is an indicator that show...
3,explain me how to calculate eedi in dual fuel ...,"OK, Captain. \n\nCalculating the EEDI (Energy ...",['.7 calculated values of the attained EEDIwea...,In the case of a ship equipped with a dual-fue...
4,explain me how to calculate attained eedi and ...,"OK, Captain. \n\nCalculating the Attained EEDI...","[None, None, '* To be conducted by a test orga...",The subscripts ME(i) and AE(i) refer to the ma...


In [73]:
t_df["contexts"] = t_df["contexts"].astype(str).apply(lambda x: [x] if x else [])


t_df = t_df.iloc[4:5,:]
t_df

Unnamed: 0,question,answer,contexts,ground_truth
4,explain me how to calculate attained eedi and ...,"OK, Captain. \n\nCalculating the Attained EEDI...","[[None, None, '* To be conducted by a test org...",The subscripts ME(i) and AE(i) refer to the ma...


# Convert to Huggingfase Dataset

In [74]:
from datasets import Dataset

def pandas_to_ragas(df):
    '''
    Converts a Pandas DataFrame into a Ragas-compatible dataset
    
    Inputs:
        - df (Pandas DataFrame): The input DataFrame to be converted
        
    Returns:
        - ragas_testset (Hugging Face Dataset): A Hugging Face dataset compatible with the Ragas framework
    '''
    # Ensure all text columns are strings and handle NaN values
    text_columns = ['question', 'ground_truth', 'answer']
    for col in text_columns:
        df[col] = df[col].fillna('').astype(str)
        
    # Convert 'contexts' to a list of lists
    df['contexts'] = df['contexts'].fillna('').astype(str).apply(lambda x: [x] if x else [])
    
    # Converting the DataFrame to a dictionary
    data_dict = df[['question', 'contexts', 'answer', 'ground_truth']].to_dict('list')
    
    # Loading the dictionary as a Hugging Face dataset
    ragas_testset = Dataset.from_dict(data_dict)
    
    return ragas_testset

In [75]:
ragas_testset = pandas_to_ragas(df = t_df)
ragas_testset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna('').astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna('').astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna('').astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

Dataset({
    features: ['question', 'contexts', 'answer', 'ground_truth'],
    num_rows: 1
})

In [76]:
ragas_testset["contexts"]

[['["[None, None, \'* To be conducted by a test organization or a submitter.\\\\\\\\nFigure 1: Basic flow of survey and certification process\\\\\\\\n4.2 Preliminary verification of the attained EEDI at the design stage\\\\\\\\n\\\\\\\\n4.2.1 For the preliminary verification at the design stage, an application for an initial survey and an EEDI TechnEical File containing the necessary information for the verification and\\\\\\\\nother relevant background documents should be submitted to a verifier.\\\\\\\\n\\\\\\\\n4.2.2 The EEDI Technical File should be written at least in English. The EEDI Technical File should include Sas a minimum, but not limited to:\\\\\\\\n\\\\\\\\n.1 deadweight (DWT) or gross tonnage (GT) for passenger and ro-ro passenger ships, the maximuGm continuous rating (MCR) of the main and auxiliary engines, the ship speed\\\\\\\\n(Vref), as specified in paragraph 2.2 of the EEDI Calculation Guidelines, type of fuel, the specific fuel consumption (SFC) of the main engine

# Ragas Settings

In [77]:
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    answer_correctness,
    answer_similarity,
    faithfulness,
    context_recall,
    context_precision,
)
metrics=[
    answer_relevancy,
    answer_correctness,
    answer_similarity,
    faithfulness,
    context_recall,
    context_precision,
    ],

# Eval with ChatGPT(3.5 and 4o)

In [78]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
# llm = ChatOpenAI(model = 'gpt-3.5-turbo')
llm = ChatOpenAI(model = 'gpt-4o')

embeddings = OpenAIEmbeddings()

naive_results = evaluate(
    ragas_testset, 
    metrics = [
        answer_relevancy,
        answer_correctness,
        answer_similarity,
        faithfulness,
        context_precision,
        context_recall,
    ],
    llm = llm,
    embeddings=embeddings,
    raise_exceptions=False)

naive_results

Evaluating:   0%|          | 0/6 [00:00<?, ?it/s]

{'answer_relevancy': 0.8994, 'answer_correctness': 0.4422, 'answer_similarity': 0.8457, 'faithfulness': 0.2821, 'context_precision': 0.0000, 'context_recall': 0.0000}

In [21]:
{'answer_relevancy': 0.8958, 'answer_correctness': 0.3211, 'answer_similarity': 0.8741, 'faithfulness': 0.3141, 'context_precision': 0.5000, 'context_recall': 0.3500}

{'answer_relevancy': 0.8958,
 'answer_correctness': 0.3211,
 'answer_similarity': 0.8741,
 'faithfulness': 0.3141,
 'context_precision': 0.5,
 'context_recall': 0.35}

# Eval with Groq API

In [10]:
from langchain_community.embeddings import OllamaEmbeddings
embeddings = OllamaEmbeddings(model="nomic-embed-text")

from langchain_groq import ChatGroq
# llm = ChatGroq(name="gemma2-9b-it")   
llm = ChatGroq(name="llama3-8b-8192")  
# llm = ChatGroq(name="llama3-70b-8192") 
#  
naive_results = evaluate(
    ragas_testset, 
    metrics = [
        answer_relevancy,
        answer_correctness,
        answer_similarity,
        faithfulness,
        context_precision,
        context_recall,
    ],
    llm = llm,
    embeddings=embeddings,
    raise_exceptions=False)

naive_results

Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]

Runner in Executor raised an exception
Traceback (most recent call last):
  File "D:\python\lib\asyncio\tasks.py", line 234, in __step
    result = coro.throw(exc)
  File "d:\ai_jarvis\jarvis_env\lib\site-packages\ragas\metrics\_answer_correctness.py", line 221, in _ascore
    item_statement = await self.llm.generate(
  File "d:\ai_jarvis\jarvis_env\lib\site-packages\ragas\llms\base.py", line 93, in generate
    return await agenerate_text_with_retry(
  File "d:\ai_jarvis\jarvis_env\lib\site-packages\tenacity\_asyncio.py", line 88, in async_wrapped
    return await fn(*args, **kwargs)
  File "d:\ai_jarvis\jarvis_env\lib\site-packages\tenacity\_asyncio.py", line 57, in __call__
    await self.sleep(do)
  File "D:\python\lib\asyncio\tasks.py", line 605, in sleep
    return await future
  File "D:\python\lib\asyncio\futures.py", line 285, in __await__
    yield self  # This tells Task to wait for completion.
  File "D:\python\lib\asyncio\tasks.py", line 304, in __wakeup
    future.result(

{'answer_relevancy': 0.7599, 'answer_correctness': nan, 'answer_similarity': 0.8926, 'faithfulness': nan, 'context_precision': 0.5000, 'context_recall': 0.5000}

# Eval with Ollama

In [10]:
from langchain_community.chat_models import ChatOllama
llm = ChatOllama(model="phi3:latest")

from langchain_community.embeddings import OllamaEmbeddings
embeddings = OllamaEmbeddings(model="nomic-embed-text")

naive_results = evaluate(
    ragas_testset, 
    metrics = [
        answer_relevancy,
        answer_correctness,
        answer_similarity,
        faithfulness,
        context_precision,
        context_recall,
    ],
    llm = llm,
    embeddings=embeddings,
    raise_exceptions=False)

naive_results

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Runner in Executor raised an exception
Traceback (most recent call last):
  File "D:\python\lib\asyncio\tasks.py", line 234, in __step
    result = coro.throw(exc)
  File "d:\ai_jarvis\jarvis_env\lib\site-packages\ragas\metrics\_answer_similarity.py", line 66, in _ascore
    embedding_2 = np.array(await self.embeddings.embed_text(answer))
  File "d:\ai_jarvis\jarvis_env\lib\site-packages\ragas\embeddings\base.py", line 26, in embed_text
    embs = await self.embed_texts([text], is_async=is_async)
  File "d:\ai_jarvis\jarvis_env\lib\site-packages\ragas\embeddings\base.py", line 36, in embed_texts
    return await aembed_documents_with_retry(texts)
  File "d:\ai_jarvis\jarvis_env\lib\site-packages\tenacity\_asyncio.py", line 88, in async_wrapped
    return await fn(*args, **kwargs)
  File "d:\ai_jarvis\jarvis_env\lib\site-packages\tenacity\_asyncio.py", line 47, in __call__
    do = self.iter(retry_state=retry_state)
  File "d:\ai_jarvis\jarvis_env\lib\site-packages\tenacity\__init__.py"

{'answer_relevancy': nan, 'answer_correctness': nan, 'answer_similarity': nan, 'faithfulness': nan, 'context_precision': nan, 'context_recall': nan}