In [8]:
# evaluate
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)
import json
from pathlib import Path
import yaml

import pandas as pd
from datasets import Dataset

from mtg.bot import mtg_chain

# Parse the project's YAML configuration file into `config`.
with Path("configs/config.yaml").open(mode="r") as config_stream:
    config = yaml.load(stream=config_stream, Loader=yaml.FullLoader)

# Model passed as `llm=` to the ragas `evaluate` call further down.
# NOTE(review): temperature=1.0 presumably enables sampling, which would make
# the LLM-judged scores vary from run to run — confirm this is intended.
llm = mtg_chain.create_llm(temperature=1.0, model_name="gpt-3.5-turbo-0125")

In [22]:
# Root of the ETL data tree, relative to the notebook's working directory.
DATA_PATH = Path("../data/etl/")

# Merge every evaluation file into a single mapping keyed by the top-level
# keys of each JSON document.
# Only regular files are read, so sub-directories (e.g. `.ipynb_checkpoints`)
# no longer break the load, and entries are visited in sorted order because
# `iterdir()` yields them in arbitrary order — otherwise the key order, and
# which file wins on a duplicate key, would depend on the filesystem.
evaluation_dir = DATA_PATH / "processed/evaluation"
evaluation_dataset = {}
for file in sorted(path for path in evaluation_dir.iterdir() if path.is_file()):
    with file.open("r", encoding="utf-8") as infile:
        evaluation_dataset.update(json.load(infile))

list(evaluation_dataset)

['expert_knowledge', 'level_0', 'level_1', 'level_2', 'stackexchange']

# Evaluate

In [27]:
# Score each evaluation set with ragas and keep one result per set name.
results = {}
for key, records in evaluation_dataset.items():
    # Records that carry a "context" entry get the same value under
    # "contexts" as well (presumably the column name ragas reads — verify
    # against the installed ragas version). Copies are built instead of
    # editing the loaded records, so `evaluation_dataset` stays exactly as it
    # was read from disk and the cell can be re-run safely.
    rows = [
        {**record, "contexts": record["context"]} if "context" in record else record
        for record in records
    ]

    # Distinct name so the `dataset` dict from the loading cell is not
    # rebound to an object of a different type.
    ragas_dataset = Dataset.from_pandas(pd.DataFrame(rows))
    result = evaluate(
        ragas_dataset,
        metrics=[
            answer_relevancy,
            faithfulness,
            context_recall,
            context_precision,
        ],
        llm=llm,
    )

    # Store the sample size next to the scores so it appears in the summary table.
    result["number_of_questions"] = len(records)
    results[key] = result

Evaluating: 100%|██████████| 20/20 [00:37<00:00,  1.87s/it]
Evaluating: 100%|██████████| 20/20 [01:40<00:00,  5.02s/it]
Evaluating: 100%|██████████| 20/20 [01:05<00:00,  3.28s/it]
Evaluating: 100%|██████████| 20/20 [01:16<00:00,  3.83s/it]
Evaluating: 100%|██████████| 20/20 [01:51<00:00,  5.58s/it]


# Results 

-  <b>Faithfulness / Groundedness</b>  
    This measures the factual consistency of the generated answer against the given context. It is calculated from the answer and the retrieved context. The score is scaled to the (0, 1) range; higher is better.

    The generated answer is regarded as faithful if all the claims made in the answer can be inferred from the given context. To calculate this, a set of claims is first identified from the generated answer. Then each of these claims is cross-checked against the given context to determine whether it can be inferred from that context.  

-  <b>Answer Relevancy</b>  
    The evaluation metric, Answer Relevancy, focuses on assessing how pertinent the generated answer is to the given prompt. A lower score is assigned to answers that are incomplete or contain redundant information and higher scores indicate better relevancy. This metric is computed using the question, the context and the answer.

-  <b>Context Recall</b>  
    Context recall measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. It is computed based on the ground truth and the retrieved context, and the values range between 0 and 1, with higher values indicating better performance.

    To estimate context recall from the ground truth answer, each sentence in the ground truth answer is analyzed to determine whether it can be attributed to the retrieved context or not. In an ideal scenario, all sentences in the ground truth answer should be attributable to the retrieved context.  
    
-  <b>Context Precision</b>   
    Context Precision is a metric that evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Ideally all the relevant chunks must appear at the top ranks. This metric is computed using the question, ground_truth and the contexts, with values ranging between 0 and 1, where higher scores indicate better precision.
    

In [35]:
# One row per evaluation set, ordered by faithfulness and then context recall,
# highest first.
results_df = pd.DataFrame.from_dict(results, orient="index").sort_values(
    by=["faithfulness", "context_recall"],
    ascending=False,
)
results_df

Unnamed: 0,answer_relevancy,faithfulness,context_recall,context_precision,number_of_questions
expert_knowledge,0.845598,1.0,1.0,0.883579,5
level_0,0.673718,1.0,1.0,0.754995,5
stackexchange,0.852136,1.0,1.0,0.928552,5
level_1,0.936661,1.0,0.9,0.889048,5
level_2,0.970543,1.0,0.8,0.947956,5


In [33]:
# Write the score table to an Excel workbook directly under DATA_PATH.
results_df.to_excel(excel_writer=DATA_PATH.joinpath("evaluation_results.xlsx"))