In [1]:
import pandas as pd
import os
import json

from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
from ragas import evaluate
from datasets import Dataset    
from ragas.llms import LangchainLLMWrapper
from langchain_openai import ChatOpenAI

import logging
import transformers
# Raise the loggers of these three transformers modules to ERROR so that
# lower-severity messages (info/warnings) they emit are not shown in the notebook.
transformers.tokenization_utils.logger.setLevel(logging.ERROR)
transformers.configuration_utils.logger.setLevel(logging.ERROR)
transformers.modeling_utils.logger.setLevel(logging.ERROR)

from rouge_score import rouge_scorer
from bert_score import BERTScorer

from transformers import AutoTokenizer, AutoModelForPreTraining
# Load the pretrained "nlpaueb/legal-bert-base-uncased" checkpoint.
# NOTE(review): bert_model is not referenced by any cell visible here, and the
# BERTScorer call in evaluate_file is commented out -- confirm this load is still needed.
bert_model = AutoModelForPreTraining.from_pretrained("nlpaueb/legal-bert-base-uncased")

import numpy as np
import torch

## Load all result files as DataFrames

In [17]:
folder_path = './data/json_output/contractnli/llama3/'

# Map each query/answer result file name to a DataFrame built from its JSON content.
# The listing is sorted because evaluate_file() selects a file by its position in
# this dict, while os.listdir() returns entries in arbitrary order.
dfs_dict = {}
for file_name in sorted(os.listdir(folder_path)):
    if file_name.endswith('.json') and file_name.startswith('query_answer'):
        file_path = os.path.join(folder_path, file_name)
        # JSON is UTF-8 by specification; do not depend on the platform default encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # The file handle is no longer needed once the JSON has been parsed.
        dfs_dict[file_name] = pd.DataFrame(data)


In [18]:
# List the loaded result files; the keys of dfs_dict are the file names.
print("Files in dfs_dict:")
for file_name in dfs_dict:
    print(file_name)

Files in dfs_dict:
query_answer_baseline_contractnli_llama3_k1.json
query_answer_baseline_contractnli_llama3_k10.json
query_answer_baseline_contractnli_llama3_k3.json
query_answer_baseline_contractnli_llama3_k5.json
query_answer_manually_written_contractnli_llama3_k1.json
query_answer_manually_written_contractnli_llama3_k10.json
query_answer_manually_written_contractnli_llama3_k3.json
query_answer_manually_written_contractnli_llama3_k5.json


## Evaluate using ROUGE recall (RAGAS and BERT F1 are currently commented out)

In [19]:
def evaluate_file(file_index, num_queries = -1):
    """Score one loaded result file with ROUGE recall and persist the averages.

    The file is selected by position among the keys of the module-level
    ``dfs_dict``. For every evaluated row, the ``response`` text is scored
    against its ``retrieved_contexts`` (joined into a single reference string).
    The mean ROUGE-1/2/L recall over those rows is merged into
    ``evaluation_results.json`` inside ``folder_path``; metrics already stored
    for the file under other keys are kept.

    Args:
        file_index: Position of the file in ``list(dfs_dict.keys())``.
        num_queries: Number of leading rows to evaluate; ``-1`` (the default)
            evaluates every row.

    Returns:
        A dict with ``rouge1_recall``, ``rouge2_recall`` and ``rougeL_recall``
        as plain floats, or ``None`` when there are no rows to score (the
        results file is left untouched in that case).

    Raises:
        IndexError: If ``file_index`` is out of range for ``dfs_dict``.
    """
    file_name = list(dfs_dict.keys())[file_index]

    if num_queries == -1:
        num_queries = dfs_dict[file_name].shape[0]

    cur_df = dfs_dict[file_name].head(num_queries)  # Process only the specified number of queries
    print(f"Evaluating file: {file_name}")

    # Averaging over zero rows would yield NaN, which json.dump writes as the
    # non-standard token "NaN"; skip instead of corrupting the results file.
    if cur_df.empty:
        print(f"No rows to evaluate in {file_name}; skipping.\n")
        return None

    # # RAGAS evaluation
    # evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini", temperature=0.5))
    # dataset_ragas = Dataset.from_pandas(cur_df)
    # # display(cur_df)
    # ragas_result = evaluate(
    #     dataset_ragas,
    #     metrics=[
    #         faithfulness,
    #         answer_relevancy
    #     ],
    #     llm=evaluator_llm
    # )

    # ragas_result_dict = ragas_result.__repr__()
    # ragas_result_dict = {"faithfulness": 0, "answer_relevancy": 0}

    # ROUGE evaluation (the BERT scorer below is currently disabled)
    responses = cur_df['response'].tolist()
    contexts = cur_df['retrieved_contexts'].tolist()

    # bert_scorer_obj = BERTScorer(lang="en", model_type="nlpaueb/legal-bert-base-uncased", num_layers=12)
    rouge_scorer_obj = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

    rouge1_recall_scores = []
    rouge2_recall_scores = []
    rougeL_recall_scores = []

    for response, context_list in zip(responses, contexts):
        # A context may already be a single string; otherwise join the chunks
        # into one reference text.
        if isinstance(context_list, str):
            single_ref = context_list
        else:
            single_ref = " ".join(context_list)

        # score(target, prediction): the retrieved contexts are the target and
        # the model response is the prediction.
        rouge_scores = rouge_scorer_obj.score(single_ref, response)
        rouge1_recall_scores.append(rouge_scores["rouge1"].recall)
        rouge2_recall_scores.append(rouge_scores["rouge2"].recall)
        rougeL_recall_scores.append(rouge_scores["rougeL"].recall)

    # Convert numpy scalars to plain floats so the stored/returned values are
    # ordinary Python numbers.
    avg_rouge1_recall = float(np.mean(rouge1_recall_scores))
    avg_rouge2_recall = float(np.mean(rouge2_recall_scores))
    avg_rougeL_recall = float(np.mean(rougeL_recall_scores))

    results = {
        # 'ragas_metrics': ragas_result_dict,
        'rouge1_recall': avg_rouge1_recall,
        'rouge2_recall': avg_rouge2_recall,
        'rougeL_recall': avg_rougeL_recall
    }

    results_path = os.path.join(folder_path, "evaluation_results.json")

    # Load existing results if the file exists
    if os.path.exists(results_path):
        with open(results_path, "r", encoding="utf-8") as results_file:
            all_results = json.load(results_file)
    else:
        all_results = {}

    # Add or update metrics for the current file; update() keeps metrics
    # stored under keys that this run does not produce.
    all_results.setdefault(file_name, {}).update(results)

    # Save the updated results back to the file
    with open(results_path, "w", encoding="utf-8") as results_file:
        json.dump(all_results, results_file, indent=4)

    print(f"Results for {file_name}:")
    print(f"ROUGE-1 Recall: {avg_rouge1_recall:.3f}")
    print(f"ROUGE-2 Recall: {avg_rouge2_recall:.3f}")
    print(f"ROUGE-L Recall: {avg_rougeL_recall:.3f}")
    print(f"Results saved to {results_path}\n")

    return results


In [20]:
# Run the evaluation once per loaded file, scoring every query in each file.
total_file_count = len(dfs_dict)
for file_index in range(total_file_count):
    evaluate_file(file_index, num_queries=-1)


Evaluating file: query_answer_baseline_contractnli_llama3_k1.json
Results for query_answer_baseline_contractnli_llama3_k1.json:
ROUGE-1 Recall: 0.438
ROUGE-2 Recall: 0.236
ROUGE-L Recall: 0.335
Results saved to ./data/json_output/contractnli/llama3/evaluation_results.json

Evaluating file: query_answer_baseline_contractnli_llama3_k10.json
Results for query_answer_baseline_contractnli_llama3_k10.json:
ROUGE-1 Recall: 0.145
ROUGE-2 Recall: 0.061
ROUGE-L Recall: 0.092
Results saved to ./data/json_output/contractnli/llama3/evaluation_results.json

Evaluating file: query_answer_baseline_contractnli_llama3_k3.json
Results for query_answer_baseline_contractnli_llama3_k3.json:
ROUGE-1 Recall: 0.226
ROUGE-2 Recall: 0.114
ROUGE-L Recall: 0.160
Results saved to ./data/json_output/contractnli/llama3/evaluation_results.json

Evaluating file: query_answer_baseline_contractnli_llama3_k5.json
Results for query_answer_baseline_contractnli_llama3_k5.json:
ROUGE-1 Recall: 0.221
ROUGE-2 Recall: 0.095
ROUG

In [None]:
# from prince

def compute_rouge_recall(retrieved_chunks, model_responses):
    """Return one averaged ROUGE recall per (reference, response) pair.

    Args:
        retrieved_chunks: Iterable of references, one per response. Each
            reference is either a list of text chunks (joined with spaces) or
            a single string used as-is.
        model_responses: Iterable of response strings, aligned with
            ``retrieved_chunks``.

    Returns:
        A list with, for each pair, the mean of the ROUGE-1, ROUGE-2 and
        ROUGE-L recall values. Pairs are formed with ``zip``, so surplus items
        in the longer input are ignored.
    """
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    recall_scores = []

    for ref_chunks, pred in zip(retrieved_chunks, model_responses):
        # A bare string is already the full reference; joining it would insert
        # a space between every character and wreck the token overlap.
        if isinstance(ref_chunks, str):
            ref_text = ref_chunks
        else:
            ref_text = " ".join(ref_chunks)
        scores = scorer.score(ref_text, pred)

        # Compute the average recall score instead of using just ROUGE-1
        avg_recall = (scores["rouge1"].recall + scores["rouge2"].recall + scores["rougeL"].recall) / 3
        recall_scores.append(avg_recall)

    return recall_scores

In [1]:
import json
import re
from collections import defaultdict

def reorder_evaluation_results(file_path):
    """Regroup a flat evaluation-results JSON file by response type and k.

    The input JSON maps result file names to metric dicts. Each name that
    contains ``_k<digits>`` and one of the type markers ``baseline``, ``CoT``
    or ``manually_written`` (checked in that order) is stored under
    ``organized[type]['k<value>']``. Names without a numeric k or without a
    known type marker are ignored. The regrouped dict is written next to the
    input as ``<input without .json>_reorganized_by_type.json``.

    Args:
        file_path: Path to the flat evaluation-results JSON file.

    Returns:
        dict mapping each response type to a dict of ``'k<value>'`` -> metrics,
        ordered by ascending k.
    """
    # Read the JSON file
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Insertion order doubles as the matching priority used below.
    organized_results = {
        'baseline': {},
        'CoT': {},
        'manually_written': {}
    }

    # Process each file result
    for filename, metrics in data.items():
        # Use the last "_k<digits>" occurrence; names such as "notes_kept.json"
        # or "..._k5_v2.json" must not crash the integer conversion.
        k_matches = re.findall(r'_k(\d+)', filename)
        if not k_matches:
            continue
        k_key = f'k{int(k_matches[-1])}'

        # First matching type wins, mirroring an if/elif chain.
        for response_type in organized_results:
            if response_type in filename:
                organized_results[response_type][k_key] = metrics
                break

    # Sort k-values numerically within each type (so k10 follows k5, not k1)
    for response_type in organized_results:
        organized_results[response_type] = dict(sorted(
            organized_results[response_type].items(),
            key=lambda item: int(item[0][1:])
        ))

    # Replace only a trailing ".json": str.replace would rewrite every
    # occurrence in the path and, when there is none, would make the output
    # path equal to the input and overwrite it.
    suffix = '.json'
    stem = file_path[:-len(suffix)] if file_path.endswith(suffix) else file_path
    output_path = f'{stem}_reorganized_by_type{suffix}'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(organized_results, f, indent=4)

    return organized_results

In [None]:
# Other result files this cell has been pointed at (kept for reference):
# file_path_contractnli = "data/json_output/contractnli/gpt/evaluation_results.json"
# # file_path_cuad = "data/json_output/cuad/llama3/evaluation_results.json"
# file_path_maud = "data/json_output/maud/llama3/evaluation_results.json"
# file_path_privacy_qa = "data/json_output/privacyqa/llama3/evaluation_results.json"


# Results file to regroup in this run.
file_path_contractnli = "data/json_output/ZZZ_llama_cot/contractnli/gpt/evaluation_results.json"


# Writes "<input>_reorganized_by_type.json" alongside the input; the returned
# dict is the last expression of the cell and is therefore displayed as its output.
reorder_evaluation_results(file_path_contractnli)
# reorder_evaluation_results(file_path_cuad)
# reorder_evaluation_results(file_path_maud)
# reorder_evaluation_results(file_path_privacy_qa)



{'baseline': {'k1': {'ragas_metrics': "{'faithfulness': 0.6510, 'answer_relevancy': 0.3157}",
   'bert_f1': 0.6471516858671129,
   'rouge_recall': 0.1655380309211203,
   'rouge1_recall': 0.23262855923782638,
   'rouge2_recall': 0.09323326788258988,
   'rougeL_recall': 0.1707522656429445},
  'k3': {'ragas_metrics': "{'faithfulness': 0.8311, 'answer_relevancy': 0.6195}",
   'bert_f1': 0.7001845624643502,
   'rouge_recall': 0.127273956099688,
   'rouge1_recall': 0.18232571670484188,
   'rouge2_recall': 0.07312573913446306,
   'rougeL_recall': 0.12637041245975889},
  'k5': {'ragas_metrics': "{'faithfulness': 0.8299, 'answer_relevancy': 0.6980}",
   'bert_f1': 0.7004778944339949,
   'rouge_recall': 0.09338551698081571,
   'rouge1_recall': 0.131375877522047,
   'rouge2_recall': 0.055975005847801296,
   'rougeL_recall': 0.09280566757259875},
  'k10': {'ragas_metrics': "{'faithfulness': 0.8438, 'answer_relevancy': 0.7899}",
   'bert_f1': 0.7053153364314246,
   'rouge_recall': 0.059093593525550