In [1]:
import json
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_correctness, 
    answer_similarity
)
import evaluate as h_evaluate
import pandas as pd
import sys
# Extend the module search path so the `src` package imported below can be found.
# NOTE(review): this is a hard-coded absolute path to one machine's checkout —
# adjust (or make configurable) before running the notebook elsewhere.
sys.path.append('d:/Local-RAG-Assistant-Chatbot/')    
from src.apikeys import open_API
import os
# Publish the key through the OPENAI_API_KEY environment variable
# (presumably what the ragas metrics' OpenAI client reads — TODO confirm).
os.environ["OPENAI_API_KEY"] = open_API

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import time
def _score_output_file(rouge, model_type: str, data_type: str, suffix: str) -> None:
    """Score one ``outputs_*.json`` file and write the matching ``metrics_*.csv``.

    Args:
        rouge: Metric object returned by ``h_evaluate.load('rouge')``.
        model_type: Model label used in the input/output file names.
        data_type: Dataset label used in the input/output file names.
        suffix: Trailing part of the file name that identifies the chunk
            (e.g. ``"0_9"`` or ``"3"``).

    The JSON file is read from the current working directory and must contain
    at least the ``"answer"`` and ``"ground_truth"`` columns; the ragas metrics
    presumably need further columns (question / contexts) — verify against the
    files that produce these outputs.
    """
    with open(f"outputs_{model_type}_{data_type}_{suffix}.json", 'r') as file:
        data = json.load(file)
    ds = Dataset.from_dict(data)

    # ROUGE is computed over the whole chunk at once.
    rouge_results = rouge.compute(
        predictions=ds["answer"],
        references=ds["ground_truth"],
    )
    result = evaluate(
        ds,
        metrics=[
            context_precision,
            faithfulness,
            answer_relevancy,
            context_recall,
            answer_correctness,
            answer_similarity,
        ],
    )
    time.sleep(3) # To avoid rate limit

    # The ROUGE frame has a single row, so after the column-wise concat its
    # values sit on index 0 only; any further ragas rows get NaN there.
    df_rouge = pd.DataFrame([rouge_results])
    df_ragas = result.to_pandas()
    df = pd.concat([df_ragas, df_rouge], axis=1)

    save_path = f"./metrics/{data_type}/{model_type}/"
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(save_path, exist_ok=True)
    df.to_csv(save_path + f"metrics_{model_type}_{data_type}_{suffix}.csv")


def get_metrics(model_type:str, data_type:str):
    """Compute ragas and ROUGE metrics for every output chunk of a run.

    For ``data_type == "hotpot"`` the chunks are the files suffixed
    ``0_9, 10_19, 20_29, 30_39, 40_49``; for any other ``data_type`` they are
    the files suffixed ``1`` through ``4``. Each chunk is read from
    ``outputs_{model_type}_{data_type}_{suffix}.json`` and its metrics are
    written to ``./metrics/{data_type}/{model_type}/metrics_{model_type}_{data_type}_{suffix}.csv``.

    Args:
        model_type: Model label embedded in the file names.
        data_type: Dataset label embedded in the file names; selects the
            chunk naming scheme described above.

    Returns:
        None. Results are persisted as CSV files.
    """
    rouge = h_evaluate.load('rouge')
    if data_type == "hotpot":
        suffixes = [f"{i}_{i+9}" for i in range(0, 41, 10)]
    else:
        suffixes = [f"{i}" for i in range(1, 5)]

    for suffix in suffixes:
        _score_output_file(rouge, model_type, data_type, suffix)



In [4]:
# Score model_type "eval" on data_type "squad" (non-hotpot branch: output files 1-4).
get_metrics("eval","squad")

Evaluating: 100%|██████████| 72/72 [00:17<00:00,  4.16it/s]
Evaluating: 100%|██████████| 72/72 [01:52<00:00,  1.56s/it]
Evaluating: 100%|██████████| 72/72 [00:18<00:00,  3.87it/s]
Evaluating: 100%|██████████| 72/72 [01:19<00:00,  1.10s/it]


In [5]:
# Score model_type "base" on data_type "squad" (non-hotpot branch: output files 1-4).
get_metrics("base","squad")

Evaluating: 100%|██████████| 72/72 [01:04<00:00,  1.11it/s]
Evaluating: 100%|██████████| 72/72 [01:21<00:00,  1.13s/it]
Evaluating: 100%|██████████| 72/72 [01:12<00:00,  1.00s/it]
Evaluating: 100%|██████████| 72/72 [00:36<00:00,  1.98it/s]


In [6]:
# Score model_type "eval" on data_type "hotpot" (hotpot branch: output files 0_9 through 40_49).
get_metrics("eval","hotpot")

Evaluating: 100%|██████████| 60/60 [01:17<00:00,  1.30s/it]
Evaluating: 100%|██████████| 60/60 [00:31<00:00,  1.88it/s]
Evaluating: 100%|██████████| 60/60 [01:12<00:00,  1.21s/it]
Evaluating: 100%|██████████| 60/60 [00:38<00:00,  1.56it/s]
Evaluating: 100%|██████████| 60/60 [01:42<00:00,  1.71s/it]


In [7]:
# Score model_type "base" on data_type "hotpot" (hotpot branch: output files 0_9 through 40_49).
get_metrics("base","hotpot")

Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.53it/s]
Evaluating: 100%|██████████| 60/60 [01:09<00:00,  1.16s/it]
Evaluating: 100%|██████████| 60/60 [00:42<00:00,  1.43it/s]
Evaluating: 100%|██████████| 60/60 [00:56<00:00,  1.06it/s]
Evaluating: 100%|██████████| 60/60 [01:01<00:00,  1.02s/it]
