In [20]:
import json
import os

import pandas as pd
from datasets import Dataset
from langchain_openai import ChatOpenAI
from ragas import SingleTurnSample, evaluate
from ragas.embeddings.base import (
    BaseRagasEmbeddings,
    LangchainEmbeddingsWrapper,
    embedding_factory,
)
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import (
    ResponseRelevancy,
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness,
)

In [21]:
# Number of leading rows of a results frame to evaluate (passed to DataFrame.head()).
num_queries = 10

In [24]:
folder_path = './answer_relevancy_qualitative_2/'

# Load every baseline result file into its own DataFrame, keyed by file name.
# os.listdir() returns entries in arbitrary order, and later cells select files
# by their position in dfs_dict, so the names are sorted to keep positions stable.
dfs_dict = {}
for file_name in sorted(os.listdir(folder_path)):
    if file_name.endswith('.json') and file_name.startswith('baseline'):
        file_path = os.path.join(folder_path, file_name)
        # The data contains non-ASCII text (e.g. curly quotes), so read it as
        # UTF-8 explicitly instead of relying on the platform default encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # The parsed data is already in memory; build the frame after closing the file.
        dfs_dict[file_name] = pd.DataFrame(data)

In [25]:
# List the names of the result files that were loaded into dfs_dict.
print("Files in dfs_dict:")
for file_name in dfs_dict:
    print(file_name)

Files in dfs_dict:
baseline_gpt4omini_k1.json
baseline_gpt4omini_k10.json
baseline_gpt4omini_k3.json
baseline_gpt4omini_k5.json


In [26]:

# Work on the first loaded results file, limited to its first num_queries rows.
file_name = list(dfs_dict)[0]
cur_df = dfs_dict[file_name].head(num_queries)
print(f"Evaluating file: {file_name}")

# LLM handed to the ragas metric as its evaluator.
# NOTE(review): temperature=0.5 presumably makes scores vary between runs; confirm this is intended.
evaluator_llm = LangchainLLMWrapper(
    ChatOpenAI(model="gpt-4o-mini", temperature=0.5)
)

Evaluating file: baseline_gpt4omini_k1.json


In [27]:
# Render the rows selected for evaluation so they can be checked before scoring.
display(cur_df)

Unnamed: 0,user_input,response,retrieved_contexts
0,Consider the Non-Disclosure Agreement between ...,The context provided does not explicitly state...,[4 Definition of Confidential Information\n“Co...
1,Consider the Non-Disclosure Agreement between ...,"No, the document does not state that Confident...",[4 Definition of Confidential Information\n“Co...
2,Consider the Non-Disclosure Agreement between ...,"Yes, the document mentions that obligations re...",[9 Term\nThis Agreement shall be effective as ...
3,Consider the Non-Disclosure Agreement between ...,The provided context does not specify whether ...,[Mentor shall notify Organiser immediately in ...
4,Consider the Non-Disclosure Agreement between ...,"No, the Non-Disclosure Agreement does not allo...",[Mentor shall not disclose any Confidential In...
5,Consider the Non-Disclosure Agreement between ...,I don't know.,[Mentor shall not disclose any Confidential In...
6,Consider the Non-Disclosure Agreement between ...,The context provided does not explicitly addre...,[4 Definition of Confidential Information\n“Co...
7,Consider the Non-Disclosure Agreement between ...,"No, the document does not allow the Receiving ...",[Mentor shall not disclose any Confidential In...
8,Consider the Non-Disclosure Agreement between ...,I don't know.,[NON-DISCLOSURE AGREEMENT AND TERMS OF PARTICI...
9,Consider DBT's Mutual Non-Disclosure Agreement...,"Yes, the document indicates that the Agreement...",[5. No Further Rights All Confidential Informa...


In [28]:
from ragas import SingleTurnSample 
from ragas.metrics import ResponseRelevancy

samples = []
for _, row in cur_df.iterrows():
    sample = SingleTurnSample(
        user_input=row['user_input'],
        response=row['response'],
        retrieved_contexts=row['retrieved_contexts']
    )
    samples.append(sample)

# evaluator_embeddings = 

response_relevancy_scorer = ResponseRelevancy(llm=evaluator_llm, embeddings=embedding_factory())

for sample in samples:
    with open('output.txt', 'a') as f:        
        print("{", file=f)
    with open('output.txt', 'a') as f:        
        print(f"\"user_input\" : \"{sample.user_input}\",", file=f)
    
    score = await response_relevancy_scorer.single_turn_ascore(sample)    
    # with open('output.txt', 'a') as f:        
        # print(score, file=f)

    with open('output.txt', 'a') as f:        
        print("},", file=f)

In [31]:
async def evaluate_file(num_queries: int, file_index: int, output_file_name: str = None) -> list:
    """
    Score one loaded results file with the ResponseRelevancy metric.

    For every evaluated row, a "{" line and a '"user_input" : "..."' line are
    appended to the output file before scoring, and a "}," line after it.

    Args:
        num_queries: Number of leading rows of the file to evaluate.
        file_index: Position of the file to evaluate among the keys of dfs_dict.
        output_file_name: Optional name of output file. If None, uses 'output.txt'

    Returns:
        The value returned by the scorer for each evaluated row, in row order.

    Raises:
        IndexError: If file_index is out of range for the loaded files.
    """
    # NOTE(review): temperature=0.5 presumably makes scores vary between runs; confirm this is intended.
    evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini", temperature=0.5))
    response_relevancy_scorer = ResponseRelevancy(llm=evaluator_llm, embeddings=embedding_factory())

    # Get the specified file
    file_name = list(dfs_dict.keys())[file_index]
    print(f"\nEvaluating file: {file_name}")

    # Set output file name
    output_file = output_file_name if output_file_name is not None else 'output.txt'

    # Restrict the evaluation to the first num_queries rows
    cur_df = dfs_dict[file_name].head(num_queries)

    # Create samples
    samples = [
        SingleTurnSample(
            user_input=row['user_input'],
            response=row['response'],
            retrieved_contexts=row['retrieved_contexts'],
        )
        for _, row in cur_df.iterrows()
    ]

    # Evaluate each sample, keeping its score so it can be returned to the caller
    scores = []
    for sample in samples:
        # Append the record opening and the question in one go; the file is closed
        # again before the scorer is awaited so no handle is held during that call.
        with open(output_file, 'a') as f:
            print("{", file=f)
            print(f"\"user_input\" : \"{sample.user_input}\",", file=f)

        score = await response_relevancy_scorer.single_turn_ascore(sample)
        scores.append(score)

        # Close the record once scoring for this sample has finished.
        with open(output_file, 'a') as f:
            print("},", file=f)

    return scores

# Example usage:
# await evaluate_file(num_queries=10, file_index=0)  # Evaluate first file
# await evaluate_file(num_queries=10, file_index=1, output_file_name="custom_output.txt")  # Evaluate second file with custom output


In [35]:
await evaluate_file(num_queries=194, file_index=3, output_file_name="output.txt")  # Evaluate the file at index 3 (the fourth key of dfs_dict), appending to output.txt



Evaluating file: baseline_gpt4omini_k5.json
