In [21]:
import pandas as pd

import os
from dotenv import load_dotenv
load_dotenv()

import nest_asyncio
nest_asyncio.apply()

# Load CSV & Preprocessing

In [22]:
t_df = pd.read_csv("../results/test_003.csv")
t_df = t_df[["Query", "Answer", "Context"]]
t_df.columns = ['question', 'answer', 'contexts']
t_df['ground_truth'] = ""
t_df

Unnamed: 0,question,answer,contexts,ground_truth
0,what is the feature of FW generator?,"OK, Captain.\n\nThe FW generator is a high-tem...",['PACKAGE LIST\\nHULL NO. : 8250/8251 PAGE : 0...,
1,what is the feature of FW generator?,"OK, Captain. Based on the given package list ...",['PACKAGE LIST\\nHULL NO. : 8250/8251 PAGE : 0...,
2,what is the feature of FW generator?,"OK, Captain!\n\nAccording to the PACKAGE LIST ...",['PACKAGE LIST\\nHULL NO. : 8250/8251 PAGE : 0...,
3,what is the feature of FW generator?,"OK, Captain. \n\nThe FW generator is a low pre...",['PACKAGE LIST\\nHULL NO. : 8250/8251 PAGE : 0...,


In [23]:
t_df["contexts"] = t_df["contexts"].astype(str).apply(lambda x: [x] if x else [])
t_df

Unnamed: 0,question,answer,contexts,ground_truth
0,what is the feature of FW generator?,"OK, Captain.\n\nThe FW generator is a high-tem...",[['PACKAGE LIST\\nHULL NO. : 8250/8251 PAGE : ...,
1,what is the feature of FW generator?,"OK, Captain. Based on the given package list ...",[['PACKAGE LIST\\nHULL NO. : 8250/8251 PAGE : ...,
2,what is the feature of FW generator?,"OK, Captain!\n\nAccording to the PACKAGE LIST ...",[['PACKAGE LIST\\nHULL NO. : 8250/8251 PAGE : ...,
3,what is the feature of FW generator?,"OK, Captain. \n\nThe FW generator is a low pre...",[['PACKAGE LIST\\nHULL NO. : 8250/8251 PAGE : ...,


# Convert to Huggingfase Dataset

In [24]:
from datasets import Dataset

def pandas_to_ragas(df):
    '''
    Converts a Pandas DataFrame into a Ragas-compatible dataset
    
    Inputs:
        - df (Pandas DataFrame): The input DataFrame to be converted
        
    Returns:
        - ragas_testset (Hugging Face Dataset): A Hugging Face dataset compatible with the Ragas framework
    '''
    # Ensure all text columns are strings and handle NaN values
    text_columns = ['question', 'ground_truth', 'answer']
    for col in text_columns:
        df[col] = df[col].fillna('').astype(str)
        
    # Convert 'contexts' to a list of lists
    df['contexts'] = df['contexts'].fillna('').astype(str).apply(lambda x: [x] if x else [])
    
    # Converting the DataFrame to a dictionary
    data_dict = df[['question', 'contexts', 'answer', 'ground_truth']].to_dict('list')
    
    # Loading the dictionary as a Hugging Face dataset
    ragas_testset = Dataset.from_dict(data_dict)
    
    return ragas_testset

In [25]:
ragas_testset = pandas_to_ragas(df = t_df)
ragas_testset

Dataset({
    features: ['question', 'contexts', 'answer', 'ground_truth'],
    num_rows: 4
})

In [26]:
ragas_testset["contexts"]

[['["[\'PACKAGE LIST\\\\\\\\nHULL NO. : 8250/8251 PAGE : 0\\\\\\\\nPOR NO.\\\\\\\\nSER. SEQ. DESCRIPTION Q’TY REMARK\\\\\\\\nNO. NO.\\\\\\\\nM536 AA F.W. GENERATOR EVAPORATING TYPE 1 Separately\\\\\\\\n\\\\\\\\nM536 BB SPARE PARTS & TOOLS FOR F.W. GENERATOR 1\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n1. EACH UNIT WITHIN A PACKAGE OR SHIPPING CONTAINER SHALL BE\\\\\\\\nCLEARLY MARKED IN A MANNER AS MAY BE DESIGNATED BY THE BUYER\\\\\\\\nBY STAMPING, TAGGING OR OTHER SUITABLE MEANS WITH\\\\\\\\nIDENTIFICATION OF SUPPLY.\\\\\\\\nTHE OUTSIDE OF EACH PACKAGE AND OR PROTECTIVE DEVICES SHALL\\\\\\\\nBE CLEARLY MARKED, REFERING SHIPPING MARK.\\\\\\\\n2. ABOVE POR NO. (SER. NO. - SEQ. NO.) AND DESCRIPTION MUST BE\\\\\\\\nMARKED ON EACH PACKAGE AND PACKING LIST.\', None, \'the approval drawing.\\\\\\\\n20) Eye-plate to be suitably fitted for rotor of generator, steam turbines, electric motors,\\\\\\\\nheat exchanger’s cover/tube/bundles and heavy strainer’s cover/filter/element of about\\\\\\\\n40 kg &

# Ragas Settings

In [27]:
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    answer_correctness,
    answer_similarity,
    faithfulness,
    context_recall,
    context_precision,
)
metrics=[
    answer_relevancy,
    answer_correctness,
    answer_similarity,
    faithfulness,
    context_recall,
    context_precision,
    ],

# Eval with ChatGPT(3.5 and 4o)

In [28]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
# llm = ChatOpenAI(model = 'gpt-3.5-turbo')
llm = ChatOpenAI(model = 'gpt-4o')

embeddings = OpenAIEmbeddings()

naive_results = evaluate(
    ragas_testset, 
    metrics = [
        answer_relevancy,
        answer_correctness,
        answer_similarity,
        faithfulness,
        context_precision,
        context_recall,
    ],
    llm = llm,
    embeddings=embeddings,
    raise_exceptions=False)

naive_results

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

The LLM did not return a valid classification.
The LLM did not return a valid classification.
The LLM did not return a valid classification.
The LLM did not return a valid classification.
  value = np.nanmean(self.scores[cn])


{'answer_relevancy': 0.6577, 'answer_correctness': 0.1600, 'answer_similarity': 0.6399, 'faithfulness': 0.0839, 'context_precision': 0.0000, 'context_recall': nan}

# Eval with Groq API

In [29]:
from langchain_community.embeddings import OllamaEmbeddings
embeddings = OllamaEmbeddings(model="nomic-embed-text")

from langchain_groq import ChatGroq
# llm = ChatGroq(name="gemma2-9b-it")   
# llm = ChatGroq(name="llama3-8b-8192")  
llm = ChatGroq(name="llama3-70b-8192") 
#  
naive_results = evaluate(
    ragas_testset, 
    metrics = [
        answer_relevancy,
        answer_correctness,
        answer_similarity,
        faithfulness,
        context_precision,
        context_recall,
    ],
    llm = llm,
    embeddings=embeddings,
    raise_exceptions=False)

naive_results

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Runner in Executor raised an exception
Traceback (most recent call last):
  File "d:\ai_jarvis\jarvis_env\lib\site-packages\langchain_core\output_parsers\pydantic.py", line 35, in _parse_obj
    return self.pydantic_object.parse_obj(obj)
  File "d:\ai_jarvis\jarvis_env\lib\site-packages\pydantic\v1\main.py", line 526, in parse_obj
    return cls(**obj)
  File "d:\ai_jarvis\jarvis_env\lib\site-packages\pydantic\v1\main.py", line 341, in __init__
    raise validation_error
pydantic.v1.error_wrappers.ValidationError: 1 validation error for StatementsAnswers
__root__
  none is not an allowed value (type=type_error.none.not_allowed)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "d:\ai_jarvis\jarvis_env\lib\site-packages\ragas\llms\output_parser.py", line 61, in aparse
    output = super().parse(result)
  File "d:\ai_jarvis\jarvis_env\lib\site-packages\langchain_core\output_parsers\pydantic.py", line 64, in parse
    return su

{'answer_relevancy': 0.6733, 'answer_correctness': nan, 'answer_similarity': 0.6966, 'faithfulness': nan, 'context_precision': 0.0000, 'context_recall': 0.5556}

# Eval with Ollama

In [30]:
from langchain_community.chat_models import ChatOllama
llm = ChatOllama(model="phi3:latest")

from langchain_community.embeddings import OllamaEmbeddings
embeddings = OllamaEmbeddings(model="nomic-embed-text")

naive_results = evaluate(
    ragas_testset, 
    metrics = [
        answer_relevancy,
        answer_correctness,
        answer_similarity,
        faithfulness,
        context_precision,
        context_recall,
    ],
    llm = llm,
    embeddings=embeddings,
    raise_exceptions=False)

naive_results

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Runner in Executor raised an exception
Traceback (most recent call last):
  File "D:\python\lib\asyncio\tasks.py", line 234, in __step
    result = coro.throw(exc)
  File "d:\ai_jarvis\jarvis_env\lib\site-packages\ragas\metrics\_answer_similarity.py", line 66, in _ascore
    embedding_2 = np.array(await self.embeddings.embed_text(answer))
  File "d:\ai_jarvis\jarvis_env\lib\site-packages\ragas\embeddings\base.py", line 26, in embed_text
    embs = await self.embed_texts([text], is_async=is_async)
  File "d:\ai_jarvis\jarvis_env\lib\site-packages\ragas\embeddings\base.py", line 36, in embed_texts
    return await aembed_documents_with_retry(texts)
  File "d:\ai_jarvis\jarvis_env\lib\site-packages\tenacity\_asyncio.py", line 88, in async_wrapped
    return await fn(*args, **kwargs)
  File "d:\ai_jarvis\jarvis_env\lib\site-packages\tenacity\_asyncio.py", line 47, in __call__
    do = self.iter(retry_state=retry_state)
  File "d:\ai_jarvis\jarvis_env\lib\site-packages\tenacity\__init__.py"

{'answer_relevancy': nan, 'answer_correctness': nan, 'answer_similarity': nan, 'faithfulness': nan, 'context_precision': 0.0000, 'context_recall': nan}