In [3]:
import pandas as pd
import os
import json

from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
from ragas import evaluate
from datasets import Dataset    
from ragas.llms import LangchainLLMWrapper
from langchain_openai import ChatOpenAI

import logging
import transformers
# Raise the log threshold of the chattier transformers submodules so that only
# ERROR-level (and above) records are emitted while models are being loaded.
for _submodule_name in ("tokenization_utils", "configuration_utils", "modeling_utils"):
    getattr(transformers, _submodule_name).logger.setLevel(logging.ERROR)

from rouge_score import rouge_scorer
from bert_score import BERTScorer

from transformers import AutoTokenizer, AutoModelForPreTraining
# Load the "nlpaueb/legal-bert-base-uncased" checkpoint via from_pretrained.
# NOTE(review): neither bert_model nor AutoTokenizer is referenced by any other
# cell visible in this notebook -- confirm whether this load is still needed.
bert_model = AutoModelForPreTraining.from_pretrained("nlpaueb/legal-bert-base-uncased")

In [4]:
from dotenv import load_dotenv

# Populate the process environment via python-dotenv, then look up the OpenAI
# key; the lookup yields None when the variable is not set.
load_dotenv()
api_key_openai = os.getenv("OPENAI_API_KEY")

## Load all result files as DataFrames

In [5]:
folder_path = './data/json_output/contractnli/llama3/'

# Map every "query_answer*.json" file name found in folder_path to a DataFrame
# built from that file's parsed JSON content.
dfs_dict = {}
# os.listdir returns names in arbitrary order; sorting keeps the positional
# file_index used by evaluate_file() pointing at the same file on every run.
for file_name in sorted(os.listdir(folder_path)):
    if file_name.startswith('query_answer') and file_name.endswith('.json'):
        file_path = os.path.join(folder_path, file_name)
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # The file handle is no longer needed once the JSON has been parsed.
        dfs_dict[file_name] = pd.DataFrame(data)


In [6]:
# Emit the header followed by one loaded file name per line.
print("\n".join(["Files in dfs_dict:", *dfs_dict]))

Files in dfs_dict:
query_answer_baseline_contractnli_llama3_k1.json
query_answer_baseline_contractnli_llama3_k10.json
query_answer_baseline_contractnli_llama3_k3.json
query_answer_baseline_contractnli_llama3_k5.json
query_answer_manually_written_contractnli_llama3_k1.json
query_answer_manually_written_contractnli_llama3_k10.json
query_answer_manually_written_contractnli_llama3_k3.json
query_answer_manually_written_contractnli_llama3_k5.json


## Evaluate using RAGAS + BERT F1 + Rouge

In [7]:
from ragas.metrics import ResponseRelevancy

In [8]:
def evaluate_file(file_index, num_queries = -1):
    """Score one loaded result file with RAGAS response relevancy and save it.

    The file is selected by its position among the keys of the module-level
    ``dfs_dict``. The resulting score is printed and merged, under the file's
    name, into ``evaluation_ans_relevancy_5_results.json`` inside the
    module-level ``folder_path``; entries already stored there for other files
    are kept.

    Args:
        file_index: Position of the target file in ``dfs_dict``'s key order.
        num_queries: Number of leading rows to evaluate; ``-1`` (the default)
            evaluates every row.

    Returns:
        None. The outcome is reported through stdout and the results file.
    """
    file_name = list(dfs_dict)[file_index]
    source_df = dfs_dict[file_name]

    # -1 is the "use everything" sentinel; any other value caps the row count.
    row_limit = source_df.shape[0] if num_queries == -1 else num_queries
    cur_df = source_df.head(row_limit)
    print(f"Evaluating file: {file_name}")

    # RAGAS evaluation, with gpt-4o-mini acting as the judging LLM.
    evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini", temperature=0.5))
    dataset_ragas = Dataset.from_pandas(cur_df)
    relevancy_metric = ResponseRelevancy(strictness=5)

    ragas_result = evaluate(
        dataset_ragas,
        metrics=[relevancy_metric],
        llm=evaluator_llm,
    )

    # Only the textual repr of the result object is kept and persisted.
    ragas_summary = repr(ragas_result)

    results_path = os.path.join(folder_path, "evaluation_ans_relevancy_5_results.json")

    # Start from whatever earlier runs saved so their entries survive.
    all_results = {}
    if os.path.exists(results_path):
        with open(results_path, "r") as results_file:
            all_results = json.load(results_file)

    # Insert (or overwrite) this file's entry, then write everything back.
    all_results[file_name] = {'ragas_metrics': ragas_summary}
    with open(results_path, "w") as results_file:
        json.dump(all_results, results_file, indent=4)

    print(f"Results for {file_name}:")
    print(f"RAGAS Metrics: {ragas_summary}")
    print(f"Results saved to {results_path}\n")


In [None]:
# Run the full evaluation (every row) for each loaded file, in dfs_dict key order.
total_file_count = len(dfs_dict)
for file_index in range(total_file_count):
    evaluate_file(file_index=file_index, num_queries=-1)


Evaluating file: query_answer_baseline_contractnli_llama3_k1.json


Evaluating:   0%|          | 0/194 [00:00<?, ?it/s]

Results for query_answer_baseline_contractnli_llama3_k1.json:
RAGAS Metrics: {'answer_relevancy': 0.5608}
Results saved to ./data/json_output/contractnli/llama3/evaluation_ans_relevancy_5_results.json

Evaluating file: query_answer_baseline_contractnli_llama3_k10.json


Evaluating:   0%|          | 0/194 [00:00<?, ?it/s]

KeyboardInterrupt: 

: 

## Reorder evaluation results

In [1]:
import json
from collections import defaultdict

def reorder_evaluation_results(file_path):
    """Regroup a flat evaluation-results JSON file by prompt type and k value.

    The input file maps result file names to their metrics. Every name that
    carries a ``_k<int>`` suffix and contains one of the markers ``baseline``,
    ``CoT`` or ``manually_written`` is filed under
    ``organized[<type>]['k<int>']``; inside each type the entries are ordered
    by ascending k. The regrouped mapping is written next to the input as
    ``<file_path without a trailing .json>_reorganized_by_type.json``, so the
    input file itself is never overwritten.

    Args:
        file_path: Path (str) of the JSON file to read.

    Returns:
        dict: ``{'baseline': {...}, 'CoT': {...}, 'manually_written': {...}}``
        where each inner dict is keyed ``'k<int>'`` in ascending numeric
        order. Names without a parsable k value or without a recognised type
        marker are left out.
    """
    json_ext = '.json'
    output_suffix = '_reorganized_by_type.json'
    # Checked in this order; the first marker found in a name wins.
    response_types = ('baseline', 'CoT', 'manually_written')

    # Read the JSON file
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    organized_results = {response_type: {} for response_type in response_types}

    # Process each file result
    for filename, metrics in data.items():
        if '_k' not in filename:
            continue

        # Text after the last '_k', up to the first dot, should be the k value.
        k_text = filename.split('_k')[-1].split('.')[0]
        try:
            k_value = int(k_text)
        except ValueError:
            # '_k' belonged to something other than a k suffix; skip this name
            # like any other unrecognised one instead of aborting the run.
            continue

        for response_type in response_types:
            if response_type in filename:
                organized_results[response_type][f'k{k_value}'] = metrics
                break

    # Sort k-values numerically within each type (so k10 follows k5, not k1).
    for response_type in organized_results:
        organized_results[response_type] = dict(sorted(
            organized_results[response_type].items(),
            key=lambda item: int(item[0][1:])
        ))

    # Strip only a trailing '.json'. A blanket str.replace would also rewrite
    # '.json' inside directory names and, for a path without that extension,
    # would return the input path unchanged and clobber the source file.
    if file_path.endswith(json_ext):
        output_path = file_path[:-len(json_ext)] + output_suffix
    else:
        output_path = file_path + output_suffix

    # Write the reorganized results
    with open(output_path, 'w') as f:
        json.dump(organized_results, f, indent=4)

    return organized_results

In [2]:
# Results file to regroup. The commented-out assignments are alternative input
# paths kept for reference.
# NOTE(review): the active path points into the gpt/ folder, whereas the
# evaluation cell above writes its results under the llama3/ folder_path --
# confirm this is the intended input.
# file_path_contractnli = "data/json_output/contractnli/llama3/evaluation_results.json"
file_path_contractnli = "data/json_output/contractnli/gpt/evaluation_ans_relevancy_5_results.json"
# file_path_cuad = "data/json_output/cuad/llama3/evaluation_results.json"
# file_path_maud = "data/json_output/maud/llama3/evaluation_results.json"
# file_path_privacy_qa = "data/json_output/privacyqa/llama3/evaluation_results.json"

# The call is the cell's last expression, so its returned dict is displayed.
reorder_evaluation_results(file_path_contractnli)




{'baseline': {'k1': {'ragas_metrics': "{'answer_relevancy': 0.3204}"},
  'k3': {'ragas_metrics': "{'answer_relevancy': 0.6106}"},
  'k5': {'ragas_metrics': "{'answer_relevancy': 0.6933}"},
  'k10': {'ragas_metrics': "{'answer_relevancy': 0.7858}"}},
 'CoT': {'k1': {'ragas_metrics': "{'answer_relevancy': 0.3772}"},
  'k3': {'ragas_metrics': "{'answer_relevancy': 0.6954}"},
  'k5': {'ragas_metrics': "{'answer_relevancy': 0.7598}"},
  'k10': {'ragas_metrics': "{'answer_relevancy': 0.8428}"}},
 'manually_written': {'k1': {'ragas_metrics': "{'answer_relevancy': 0.4506}"},
  'k3': {'ragas_metrics': "{'answer_relevancy': 0.5991}"},
  'k5': {'ragas_metrics': "{'answer_relevancy': 0.6252}"},
  'k10': {'ragas_metrics': "{'answer_relevancy': 0.7382}"}}}