In [2]:
import pandas as pd
import os
import json

from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
from ragas import evaluate
from datasets import Dataset    
from ragas.llms import LangchainLLMWrapper
from langchain_openai import ChatOpenAI

import logging
import transformers
# Raise the log threshold to ERROR on three transformers submodule loggers so
# that lower-severity messages from them are not printed.
for _quiet_module in (
    transformers.tokenization_utils,
    transformers.configuration_utils,
    transformers.modeling_utils,
):
    _quiet_module.logger.setLevel(logging.ERROR)

from rouge_score import rouge_scorer
from bert_score import BERTScorer

from transformers import AutoTokenizer, AutoModelForPreTraining
# Load the Legal-BERT checkpoint.
# NOTE(review): `bert_model` is not referenced by any cell visible in this
# notebook; BERTScorer is given the checkpoint name directly and presumably
# loads its own copy - confirm before relying on or removing this line.
bert_model = AutoModelForPreTraining.from_pretrained("nlpaueb/legal-bert-base-uncased")

## Load all files as DataFrames

In [15]:
# Directory that holds the generated answers for one dataset; evaluate_file()
# also writes its evaluation_results.json into this directory.
folder_path = './data/final_generation/privacyqa/'

# Map each `query_answer*.json` file name to a DataFrame built from its JSON.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the file_index -> file mapping used by evaluate_file() reproducible.
dfs_dict = {}
for file_name in sorted(os.listdir(folder_path)):
    if file_name.endswith('.json') and file_name.startswith('query_answer'):
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r') as f:
            data = json.load(f)
        # The file handle is no longer needed once the JSON has been parsed.
        dfs_dict[file_name] = pd.DataFrame(data)


In [16]:
# Show a header followed by one loaded file name per line.
print("\n".join(["Files in dfs_dict:", *dfs_dict]))

Files in dfs_dict:
query_answer_human_tuned_privacyqa_gpt4omini_k10.json


In [16]:
# # Filter for only CoT files
# cot_dfs_dict = {k: v for k, v in dfs_dict.items() if 'cot' in k.lower()}

# print("Files in cot_dfs_dict:")
# for file_name in cot_dfs_dict.keys():
#     print(file_name)


Files in cot_dfs_dict:
query_answer_CoT_llama3_k1.json
query_answer_CoT_llama3_k10.json
query_answer_CoT_llama3_k3.json
query_answer_CoT_llama3_k5.json


In [17]:
# dfs_dict = cot_dfs_dict


## Evaluate using RAGAS + BERTScore F1 + ROUGE recall

In [17]:
def evaluate_file(file_index, num_queries = -1):
    """Score one loaded file with RAGAS, BERTScore F1 and ROUGE recall, then save the result.

    Reads the module-level ``dfs_dict`` (file name -> DataFrame) and
    ``folder_path``. The selected DataFrame must provide the ``response`` and
    ``retrieved_contexts`` columns; RAGAS presumably needs further columns
    (e.g. the query) - confirm against the generation output schema.

    The per-file summary is merged into ``evaluation_results.json`` inside
    ``folder_path`` under the file's name, replacing any previous entry for it.

    Args:
        file_index: Position of the file within ``dfs_dict`` (insertion order).
        num_queries: Number of leading rows to evaluate. ``-1`` (the default)
            evaluates every row of the file.

    Raises:
        ValueError: If the selected slice has no rows, since no average can be
            computed (previously this surfaced as a ZeroDivisionError after the
            RAGAS call had already run).
    """
    file_name = list(dfs_dict.keys())[file_index]
    full_df = dfs_dict[file_name]

    if num_queries == -1:
        num_queries = full_df.shape[0]

    cur_df = full_df.head(num_queries)  # Process only the specified number of queries
    print(f"Evaluating file: {file_name}")

    # Fail fast, before any LLM call is made, when there is nothing to score
    # or when the columns needed further down are missing (KeyError).
    if len(cur_df) == 0:
        raise ValueError(f"No rows to evaluate for {file_name} (num_queries={num_queries})")
    responses = cur_df['response'].tolist()
    contexts = cur_df['retrieved_contexts'].tolist()

    # RAGAS evaluation
    # NOTE(review): temperature=0.5 makes the judge LLM non-deterministic, so
    # RAGAS scores can differ between runs - confirm this is intended.
    evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini", temperature=0.5))
    dataset_ragas = Dataset.from_pandas(cur_df)
    ragas_result = evaluate(
        dataset_ragas,
        metrics=[
            faithfulness,
            answer_relevancy
        ],
        llm=evaluator_llm
    )

    # The RAGAS result is persisted as its string representation, not as a dict.
    ragas_summary = repr(ragas_result)

    # BERT and ROUGE evaluation: each response is compared against the
    # concatenation of its retrieved contexts.
    bert_scorer_obj = BERTScorer(lang="en", model_type="nlpaueb/legal-bert-base-uncased", num_layers=12)
    rouge_scorer_obj = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

    bert_f1_scores = []
    rouge_recall_scores = []

    for response, context_list in zip(responses, contexts):
        # Join all contexts into single reference
        if isinstance(context_list, str):
            single_ref = context_list
        else:
            single_ref = " ".join(context_list)

        # Calculate BERT score (only the F1 component is kept)
        _, _, f1_tensor = bert_scorer_obj.score([response], [single_ref])
        bert_f1_scores.append(f1_tensor.mean().item())

        # Calculate ROUGE score: mean recall over ROUGE-1, ROUGE-2 and ROUGE-L,
        # with the joined contexts as the target and the response as the prediction.
        rouge_scores = rouge_scorer_obj.score(single_ref, response)
        avg_recall = (rouge_scores["rouge1"].recall + rouge_scores["rouge2"].recall + rouge_scores["rougeL"].recall) / 3
        rouge_recall_scores.append(avg_recall)

    # Calculate average scores (both lists are non-empty thanks to the guard above)
    avg_bert_f1 = sum(bert_f1_scores) / len(bert_f1_scores)
    avg_rouge_recall = sum(rouge_recall_scores) / len(rouge_recall_scores)

    results = {
        'ragas_metrics': ragas_summary,
        'bert_f1': avg_bert_f1,
        'rouge_recall': avg_rouge_recall
    }

    results_path = os.path.join(folder_path, "evaluation_results.json")

    # Load existing results if the file exists
    if os.path.exists(results_path):
        with open(results_path, "r") as results_file:
            all_results = json.load(results_file)
    else:
        all_results = {}

    # Add the current result to the dictionary
    all_results[file_name] = results

    # Save the updated results back to the file
    with open(results_path, "w") as results_file:
        json.dump(all_results, results_file, indent=4)

    print(f"Results for {file_name}:")
    print(f"RAGAS Metrics: {ragas_summary}")
    print(f"Average BERT F1 Score: {avg_bert_f1}")
    print(f"Average ROUGE Recall Score: {avg_rouge_recall}")
    print(f"Results saved to {results_path}\n")


In [None]:
# Run the full evaluation (every query) for each loaded file, in dict order.
total_file_count = len(dfs_dict)
for file_index in range(total_file_count):
    evaluate_file(file_index=file_index, num_queries=-1)


Evaluating file: query_answer_human_tuned_privacyqa_gpt4omini_k10.json


Evaluating:   0%|          | 0/388 [00:00<?, ?it/s]

## Reorder evaluation results

In [6]:
import json
from collections import defaultdict

def reorder_evaluation_results(file_path):
    """Regroup a flat evaluation-results JSON by response type and k value.

    The input JSON maps result file names to their metrics. Each entry whose
    name contains a ``_k<digits>`` marker and one of the known type tags
    (``baseline``, ``CoT``, ``manually_written``; checked in that order) is
    stored under ``organized[type]['k<n>']``. When a name contains several
    ``_k<digits>`` markers the last one is used. Entries that cannot be
    classified are reported on stdout and left out, instead of crashing the
    run or disappearing silently.

    The regrouped structure is written next to the input, with a trailing
    ``.json`` replaced by ``_reorganized_by_type.json`` (the suffix is simply
    appended when the path does not end in ``.json``, so the input file is
    never overwritten).

    Args:
        file_path: Path (str) of the flat evaluation-results JSON file.

    Returns:
        dict: ``{'baseline': {...}, 'CoT': {...}, 'manually_written': {...}}``
        where every inner dict is ordered by ascending k.
    """
    import re  # local import: the surrounding cell only imports json/defaultdict

    # Read the JSON file
    with open(file_path, 'r') as f:
        data = json.load(f)

    known_types = ('baseline', 'CoT', 'manually_written')
    organized_results = {response_type: {} for response_type in known_types}
    skipped = []

    # Process each file result
    for filename, metrics in data.items():
        # A plain split on '_k' breaks on names such as '..._keyword...', so
        # only '_k' immediately followed by digits counts as a k marker.
        k_matches = re.findall(r'_k([0-9]+)', filename)
        response_type = next((tag for tag in known_types if tag in filename), None)
        if not k_matches or response_type is None:
            skipped.append(filename)
            continue

        k_key = f'k{int(k_matches[-1])}'
        bucket = organized_results[response_type]
        if k_key in bucket:
            # Two inputs map to the same slot (e.g. same type and k but a
            # different model); the later one wins, as before, but say so.
            print(f"Warning: {filename} overwrites an earlier {response_type}/{k_key} entry")
        bucket[k_key] = metrics

    if skipped:
        print(f"Skipped {len(skipped)} unclassified entries: {skipped}")

    # Sort k-values within each type
    for response_type in organized_results:
        organized_results[response_type] = dict(sorted(
            organized_results[response_type].items(),
            key=lambda item: int(item[0][1:])  # numeric order, so k10 follows k5
        ))

    # Build the output path from the trailing extension only; str.replace would
    # touch every '.json' in the path and return the input path unchanged when
    # there is none.
    suffix = '_reorganized_by_type.json'
    if file_path.endswith('.json'):
        output_path = file_path[:-len('.json')] + suffix
    else:
        output_path = file_path + suffix

    # Write the reorganized results
    with open(output_path, 'w') as f:
        json.dump(organized_results, f, indent=4)

    return organized_results

In [7]:
# Flat evaluation-results files, one per dataset (presumably each produced by
# running evaluate_file with folder_path pointed at that dataset - confirm).
file_path_contractnli = "data/final_generation/contractnli/evaluation_results.json"
file_path_cuad = "data/final_generation/cuad/evaluation_results.json"
file_path_maud = "data/final_generation/maud/evaluation_results.json"
file_path_privacyqa = "data/final_generation/privacyqa/evaluation_results.json"


# Each call writes a `*_reorganized_by_type.json` file next to its input.
# Only the last call's return value (privacyqa) is the cell's final expression.
reorder_evaluation_results(file_path_contractnli)
reorder_evaluation_results(file_path_cuad)
reorder_evaluation_results(file_path_maud)
reorder_evaluation_results(file_path_privacyqa)



{'baseline': {'k1': {'ragas_metrics': "{'faithfulness': 0.6575, 'answer_relevancy': 0.2998}",
   'bert_f1': 0.7057263875745007,
   'rouge_recall': 0.3545316826950978},
  'k3': {'ragas_metrics': "{'faithfulness': 0.6912, 'answer_relevancy': 0.3397}",
   'bert_f1': 0.7120289350907827,
   'rouge_recall': 0.20389324070405535},
  'k5': {'ragas_metrics': "{'faithfulness': 0.7091, 'answer_relevancy': 0.3539}",
   'bert_f1': 0.710056027493526,
   'rouge_recall': 0.15817795055722808},
  'k10': {'ragas_metrics': "{'faithfulness': 0.6748, 'answer_relevancy': 0.2473}",
   'bert_f1': 0.6874822971132613,
   'rouge_recall': 0.09626729537838494}},
 'CoT': {},
 'manually_written': {'k1': {'ragas_metrics': "{'faithfulness': 0.5012, 'answer_relevancy': 0.2816}",
   'bert_f1': 0.7036664089591232,
   'rouge_recall': 0.4897746614800545},
  'k3': {'ragas_metrics': "{'faithfulness': 0.5979, 'answer_relevancy': 0.3799}",
   'bert_f1': 0.7126831827089959,
   'rouge_recall': 0.2788349592289338},
  'k5': {'ragas_