In [1]:
import os

import pandas as pd
from langchain_community.document_loaders import DataFrameLoader

# Location of the knowledge item (KI) dataset. The KI_DATASET_PATH environment
# variable overrides the default so the notebook is not tied to one machine's drive layout.
KI_DATASET_PATH = os.environ.get('KI_DATASET_PATH', 'D:/ai_jarvis/results/test_003.csv')
KI_COLUMNS = ['Query', 'Answer', 'Context']

# Loading in our knowledge item (KI) dataset as a Pandas DataFrame,
# keeping only the columns used downstream (in this order)
df_kis = pd.read_csv(KI_DATASET_PATH)[KI_COLUMNS]

# Creating the document loader around our Pandas DataFrame;
# the 'Context' column supplies the page content of each document
ki_doc_loader = DataFrameLoader(data_frame = df_kis, page_content_column = 'Context')

# Loading the documents as LangChain documents using the dataframe document loader
docs = ki_doc_loader.load()
print(len(docs))
docs

4


[Document(page_content="['PACKAGE LIST\\\\nHULL NO. : 8250/8251 PAGE : 0\\\\nPOR NO.\\\\nSER. SEQ. DESCRIPTION Q’TY REMARK\\\\nNO. NO.\\\\nM536 AA F.W. GENERATOR EVAPORATING TYPE 1 Separately\\\\n\\\\nM536 BB SPARE PARTS & TOOLS FOR F.W. GENERATOR 1\\\\n\\\\n\\\\n\\\\n1. EACH UNIT WITHIN A PACKAGE OR SHIPPING CONTAINER SHALL BE\\\\nCLEARLY MARKED IN A MANNER AS MAY BE DESIGNATED BY THE BUYER\\\\nBY STAMPING, TAGGING OR OTHER SUITABLE MEANS WITH\\\\nIDENTIFICATION OF SUPPLY.\\\\nTHE OUTSIDE OF EACH PACKAGE AND OR PROTECTIVE DEVICES SHALL\\\\nBE CLEARLY MARKED, REFERING SHIPPING MARK.\\\\n2. ABOVE POR NO. (SER. NO. - SEQ. NO.) AND DESCRIPTION MUST BE\\\\nMARKED ON EACH PACKAGE AND PACKING LIST.', None, 'the approval drawing.\\\\n20) Eye-plate to be suitably fitted for rotor of generator, steam turbines, electric motors,\\\\nheat exchanger’s cover/tube/bundles and heavy strainer’s cover/filter/element of about\\\\n40 kg & above (And lifting eyes to have a min. diameter of 23 mm).\\\\n21) 

In [2]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment
# (presumably the API keys needed by the model clients used later — TODO confirm)
load_dotenv()

True

In [3]:
import nest_asyncio
# Patch asyncio so that event loops can be nested
# NOTE(review): presumably required because later cells run async library code
# inside the notebook's already-running event loop — confirm
nest_asyncio.apply()

In [4]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context, conditional

# A single chat model acts as both the question generator and its critic;
# the embeddings model is shared with the later evaluation step
chat_model = ChatOpenAI(model = 'gpt-3.5-turbo')
embeddings = OpenAIEmbeddings()

# Building the test set generator on top of the LangChain model objects
testset_generator = TestsetGenerator.from_langchain(
    generator_llm = chat_model, critic_llm = chat_model, embeddings = embeddings
)

# Share of each question evolution type in the generated test set (sums to 1.0)
evolution_mix = {simple: 0.5, reasoning: 0.2, multi_context: 0.2, conditional: 0.1}

# Generating the test set from the KI documents
testset = testset_generator.generate_with_langchain_docs(
    documents = docs,
    test_size = 3,
    distributions = evolution_mix
)

# Viewing the generated test set as a Pandas DataFrame
df_testset = testset.to_pandas()
df_testset

embedding nodes:   0%|          | 0/8 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What is included in the spare parts & tools pa...,[['PACKAGE LIST\\nHULL NO. : 8250/8251 PAGE : ...,The spare parts & tools package for the F.W. g...,simple,[{'Query': 'what is the feature of FW generato...,True
1,What information is typically included in a pa...,[['PACKAGE LIST\\nHULL NO. : 8250/8251 PAGE : ...,A package list for equipment like a F.W. gener...,simple,[{'Query': 'what is the feature of FW generato...,True
2,"What does ""M536 AA"" refer to in the package li...",[['PACKAGE LIST\\nHULL NO. : 8250/8251 PAGE : ...,The answer to given question is not present in...,reasoning,[{'Query': 'what is the feature of FW generato...,True
3,What's included in the spare parts & tools pac...,[['PACKAGE LIST\\nHULL NO. : 8250/8251 PAGE : ...,The answer to given question is not present in...,multi_context,[{'Query': 'what is the feature of FW generato...,True


In [5]:
# Preparing the DataFrame to generate the new ground truth.
# The frame is rebuilt from `testset` instead of being renamed in place, so
# re-running this cell cannot create a second 'answer' column.
df_testset = (
    testset.to_pandas()
    # The generated ground truth is treated as the answer to be evaluated
    .rename(columns = {'ground_truth': 'answer'})
    # Empty placeholder column, appended last, to be filled with the simulated ground truth
    .assign(ground_truth = '')
)
df_testset

Unnamed: 0,question,contexts,answer,evolution_type,metadata,episode_done,ground_truth
0,What is included in the spare parts & tools pa...,[['PACKAGE LIST\\nHULL NO. : 8250/8251 PAGE : ...,The spare parts & tools package for the F.W. g...,simple,[{'Query': 'what is the feature of FW generato...,True,
1,What information is typically included in a pa...,[['PACKAGE LIST\\nHULL NO. : 8250/8251 PAGE : ...,A package list for equipment like a F.W. gener...,simple,[{'Query': 'what is the feature of FW generato...,True,
2,"What does ""M536 AA"" refer to in the package li...",[['PACKAGE LIST\\nHULL NO. : 8250/8251 PAGE : ...,The answer to given question is not present in...,reasoning,[{'Query': 'what is the feature of FW generato...,True,
3,What's included in the spare parts & tools pac...,[['PACKAGE LIST\\nHULL NO. : 8250/8251 PAGE : ...,The answer to given question is not present in...,multi_context,[{'Query': 'what is the feature of FW generato...,True,


In [6]:
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate

# Creating the ground truth simulation prompt template.
# The `{question}` and `{context}` placeholders are filled in per test set row
# when the generation chain is invoked.
GT_SIMULATION_PROMPT = '''You are an expert evaluator for question-answering systems. Your task is to provide the ideal ground truth answer based on the given question and context. Please follow these guidelines:

1. Question: {question}

2. Context: {context}

3. Instructions:
   - Carefully analyze the question and the provided context.
   - Formulate a comprehensive and accurate answer based solely on the information given in the context.
   - Ensure your answer directly addresses the question.
   - Include all relevant information from the context, but do not add any external knowledge.
   - If the context doesn't contain enough information to fully answer the question, state this clearly and provide the best possible partial answer.
   - Use a formal, objective tone.

Remember, your goal is to provide the ideal answer that should be used as the benchmark for evaluating the AI's performance.'''


# Creating the prompt template used to generate the simulated ground truth
# (a single human message built from GT_SIMULATION_PROMPT)
gt_generation_prompt = ChatPromptTemplate.from_messages(messages = [
    HumanMessagePromptTemplate.from_template(template = GT_SIMULATION_PROMPT)
])

# Instantiating the OpenAI chat model (gpt-4o) that writes the simulated ground truth
llm = ChatOpenAI(model = 'gpt-4o')

# Creating the inference chain to generate the simulated ground truth:
# the filled-in prompt is piped into the chat model
gt_generation_chain = gt_generation_prompt | llm

def generate_ground_truth_text(row):
    '''
    Generates simulated ground truth text for the question and context of a given record

    A ground truth value that is already present is returned unchanged, so rows are
    not regenerated (and the model is not called) more than once. A value counts as
    missing when it is None, NaN, or an empty / whitespace-only string.

    Inputs:
        - row (Pandas DataFrame record): A single record providing the 'question',
          'contexts', and 'ground_truth' fields

    Returns:
        - gt_text (str): The ground truth text generated by the AI model per the record,
          or the existing ground truth if one is already in place
    '''

    existing = row['ground_truth']

    # Missing values can show up as None / NaN (e.g. after a CSV round trip),
    # not only as the empty-string placeholder
    is_missing = (
        existing is None
        or (isinstance(existing, float) and existing != existing)
        or (isinstance(existing, str) and existing.strip() == '')
    )

    # Returning what is already in place if a ground truth exists
    if not is_missing:
        return existing

    # Generating the ground truth text; the prompt's {context} placeholder
    # receives the record's 'contexts' value
    gt_text = gt_generation_chain.invoke(
        {
            'question': row['question'],
            'context': row['contexts']
        }
    ).content

    return gt_text

# Filling in the ground truth column one record at a time
simulated_ground_truth = df_testset.apply(generate_ground_truth_text, axis = 1)
df_testset['ground_truth'] = simulated_ground_truth
df_testset

Unnamed: 0,question,contexts,answer,evolution_type,metadata,episode_done,ground_truth
0,What is included in the spare parts & tools pa...,[['PACKAGE LIST\\nHULL NO. : 8250/8251 PAGE : ...,The spare parts & tools package for the F.W. g...,simple,[{'Query': 'what is the feature of FW generato...,True,The context provided does not contain detailed...
1,What information is typically included in a pa...,[['PACKAGE LIST\\nHULL NO. : 8250/8251 PAGE : ...,A package list for equipment like a F.W. gener...,simple,[{'Query': 'what is the feature of FW generato...,True,A typical package list for equipment such as a...
2,"What does ""M536 AA"" refer to in the package li...",[['PACKAGE LIST\\nHULL NO. : 8250/8251 PAGE : ...,The answer to given question is not present in...,reasoning,[{'Query': 'what is the feature of FW generato...,True,"""M536 AA"" in the package list refers to a ""F.W..."
3,What's included in the spare parts & tools pac...,[['PACKAGE LIST\\nHULL NO. : 8250/8251 PAGE : ...,The answer to given question is not present in...,multi_context,[{'Query': 'what is the feature of FW generato...,True,The spare parts and tools package for the F.W....


In [7]:
# Inspecting the contexts of the first test set record
df_testset.loc[0, "contexts"]

["['PACKAGE LIST\\\\nHULL NO. : 8250/8251 PAGE : 0\\\\nPOR NO.\\\\nSER. SEQ. DESCRIPTION Q’TY REMARK\\\\nNO. NO.\\\\nM536 AA F.W. GENERATOR EVAPORATING TYPE 1 Separately\\\\n\\\\nM536 BB SPARE PARTS & TOOLS FOR F.W. GENERATOR 1\\\\n\\\\n\\\\n\\\\n1. EACH UNIT WITHIN A PACKAGE OR SHIPPING CONTAINER SHALL BE\\\\nCLEARLY MARKED IN A MANNER AS MAY BE DESIGNATED BY THE BUYER\\\\nBY STAMPING, TAGGING OR OTHER SUITABLE MEANS WITH\\\\nIDENTIFICATION OF SUPPLY.\\\\nTHE OUTSIDE OF EACH PACKAGE AND OR PROTECTIVE DEVICES SHALL\\\\nBE CLEARLY MARKED, REFERING SHIPPING MARK.\\\\n2. ABOVE POR NO. (SER. NO. - SEQ. NO.) AND DESCRIPTION MUST BE\\\\nMARKED ON EACH PACKAGE AND PACKING LIST.', None, 'the approval drawing.\\\\n20) Eye-plate to be suitably fitted for rotor of generator, steam turbines, electric motors,\\\\nheat exchanger’s cover/tube/bundles and heavy strainer’s cover/filter/element of about\\\\n40 kg & above (And lifting eyes to have a min. diameter of 23 mm).\\\\n21) Test provision such as

In [8]:
from datasets import Dataset

def _contexts_as_list(value):
    '''
    Normalizes a single 'contexts' cell into a list of strings

    Inputs:
        - value: A list / tuple / array of context chunks, a single string, or a missing value

    Returns:
        - contexts (list of str): The context chunks as strings; empty for missing or empty input
    '''

    def _is_missing(item):
        # None or float NaN
        return item is None or (isinstance(item, float) and item != item)

    # Array-like cells expose tolist(); convert them to plain Python objects
    if not isinstance(value, (str, bytes)) and hasattr(value, 'tolist'):
        value = value.tolist()

    # Keeping existing lists as lists instead of collapsing them into one repr string
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value if not _is_missing(item)]

    if _is_missing(value):
        return []

    # A single non-empty string becomes a one-element list
    text = str(value)
    return [text] if text else []


def pandas_to_ragas(df):
    '''
    Converts a Pandas DataFrame into a Ragas-compatible dataset

    The input DataFrame is not modified, so calling this function repeatedly on the
    same frame always yields the same dataset.

    Inputs:
        - df (Pandas DataFrame): The input DataFrame to be converted; must provide the
          'question', 'contexts', 'answer', and 'ground_truth' columns

    Returns:
        - ragas_testset (Hugging Face Dataset): A Hugging Face dataset compatible with the Ragas framework
    '''

    def _text_column(name):
        # Ensure the text column holds strings, with missing values as ''
        return df[name].fillna('').astype(str).tolist()

    # Building the column dictionary (same column order as before)
    data_dict = {
        'question': _text_column('question'),
        # Each record's contexts become a list of strings
        'contexts': [_contexts_as_list(value) for value in df['contexts']],
        'answer': _text_column('answer'),
        'ground_truth': _text_column('ground_truth'),
    }

    # Loading the dictionary as a Hugging Face dataset
    ragas_testset = Dataset.from_dict(data_dict)

    return ragas_testset


# Building the Ragas-compatible Hugging Face dataset from the prepared test set
ragas_testset = pandas_to_ragas(df_testset)
ragas_testset

Dataset({
    features: ['question', 'contexts', 'answer', 'ground_truth'],
    num_rows: 4
})

In [9]:
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    answer_correctness,
    answer_similarity,
    faithfulness,
    context_recall,
    context_precision,
)
# Metrics used for the Ragas evaluation.
# NOTE: there must be no trailing comma after the closing bracket; `metrics = [...],`
# binds a one-element tuple wrapping the list instead of the list itself.
metrics = [
    answer_relevancy,
    answer_correctness,
    answer_similarity,
    faithfulness,
    context_recall,
    context_precision,
]

In [10]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.chat_models import ChatOllama
# llm = ChatOllama(model="phi3:latest")
# embeddings = OllamaEmbeddings(model="nomic-embed-text")


from langchain_groq import ChatGroq
# llm = ChatGroq(name="gemma2-9b-it")   
# llm = ChatGroq(name="llama3-8b-8192")  
# llm = ChatGroq(name="llama3-70b-8192")  

# Metrics scored in this evaluation run
naive_metrics = [
    answer_relevancy,
    answer_correctness,
    answer_similarity,
    faithfulness,
    context_precision,
    context_recall,
]

# Scoring the test set; `llm` and `embeddings` are the objects created in earlier cells
naive_results = evaluate(
    ragas_testset,
    metrics = naive_metrics,
    llm = llm,
    embeddings = embeddings,
    raise_exceptions = False,
)

naive_results

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

No statements were generated from the answer.
No statements were generated from the answer.


{'answer_relevancy': 0.4923, 'answer_correctness': 0.2591, 'answer_similarity': 0.8144, 'faithfulness': 1.0000, 'context_precision': 0.7500, 'context_recall': 0.9250}