# Notebook for running evaluations

Imports

In [8]:
import sys
sys.path.append("..")
# from experiment.evaluations import evaluate_dispatching, evaluate_llm_only, evaluate_baseline_rag, evaluate_reranker_rag, evaluate_dense_rag
import nest_asyncio
nest_asyncio.apply()
import os
import pandas as pd

# Filesystem layout, anchored on the kernel's working directory: the root is
# the directory two levels above it, and the project checkout is looked up
# by name underneath that root.
ROOT_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
PROJECT_DIR = os.path.join(ROOT_DIR, "master-thesis-project")
EXPERIMENT_DIR = os.path.join(PROJECT_DIR, "experiment")
# Directory the evaluation CSV files are read from.
DATASET_DIR = os.path.join(PROJECT_DIR, "experiment", "data")

Run evaluation

In [None]:
dispatching = await evaluate_dispatching()
llm_only = await evaluate_llm_only()
baseline_rag = await evaluate_baseline_rag()
reranker_rag = await evaluate_reranker_rag()
dense_rag = await evaluate_dense_rag()

Read evaluation results

In [9]:
# Load the saved per-method evaluation results from DATASET_DIR, one CSV per
# method, in the same order as the names on the left-hand side.
llm_only, baseline_rag, reranker_rag, dense_rag = (
    pd.read_csv(os.path.join(DATASET_DIR, csv_name))
    for csv_name in (
        "evaluation_llm_only.csv",
        "evaluation_baseline_rag.csv",
        "evaluation_reranker_rag.csv",
        "evaluation_dense_rag.csv",
    )
)

In [15]:
# Evaluation results keyed by the label used in the comparison tables.
methods = {
    "LLM Only": llm_only,
    "Baseline RAG": baseline_rag,
    "Reranker RAG": reranker_rag,
    "Dense RAG": dense_rag,
}

# answer_relevancy is averaged for every method; the remaining metrics are
# averaged for the RAG methods only (everything except "LLM Only").
columns_to_compare = [
    "answer_relevancy",
    "llm_context_precision_without_reference",
    "faithfulness",
    "nv_context_relevance",
]
other_metrics = columns_to_compare[1:]

# Mean answer relevancy, one row per method.
answer_relevancy_means = {
    label: results["answer_relevancy"].mean() for label, results in methods.items()
}
answer_relevancy_df = pd.DataFrame.from_dict(
    answer_relevancy_means, orient="index", columns=["answer_relevancy_mean"]
)

# Per-metric means, one row per RAG method and one column per metric.
rag_methods = {
    label: results for label, results in methods.items() if label != "LLM Only"
}
other_metrics_means = {
    label: {metric: results[metric].mean() for metric in other_metrics}
    for label, results in rag_methods.items()
}
other_metrics_df = pd.DataFrame.from_dict(other_metrics_means, orient="index")

answer_relevancy_df, other_metrics_df


(              answer_relevancy_mean
 LLM Only                   0.801779
 Baseline RAG               0.861880
 Reranker RAG               0.842789
 Dense RAG                  0.814879,
               llm_context_precision_without_reference  faithfulness  \
 Baseline RAG                                 0.760344      0.937853   
 Reranker RAG                                 0.736470      0.915170   
 Dense RAG                                    0.768447      0.919765   
 
               nv_context_relevance  
 Baseline RAG              0.516990  
 Reranker RAG              0.521845  
 Dense RAG                 0.626214  )

In [18]:
import numpy as np


def _format_scores(scores, best):
    """Render scores as 4-decimal strings, marking the best ones in bold.

    Args:
        scores: pandas Series of numeric scores.
        best: Reference value; every entry for which
            ``np.isclose(entry, best)`` holds is wrapped in ``**``.

    Returns:
        A Series of strings with the same index as ``scores``.
    """
    return scores.map(
        lambda score: f"**{score:.4f}**" if np.isclose(score, best) else f"{score:.4f}"
    )


# Mean answer relevancy restricted to the methods listed in other_metrics_df.
rag_answer_relevancy = answer_relevancy_df.loc[
    other_metrics_df.index, "answer_relevancy_mean"
]

# Ranking on the other_metrics_df columns alone (not shown in the table).
overall_means = other_metrics_df.mean(axis=1)
best_overall_method = overall_means.idxmax()

# Overall ranking: row-wise mean over all four metrics. This is computed on
# the raw numeric values rather than by parsing the 4-decimal display strings
# back into floats, so rounding cannot influence which method wins, and the
# deprecated DataFrame.applymap call is no longer needed.
metrics_for_overall = [
    "llm_context_precision_without_reference",
    "faithfulness",
    "nv_context_relevance",
    "answer_relevancy_mean",
]
numeric_scores = other_metrics_df.assign(answer_relevancy_mean=rag_answer_relevancy)
overall_means_incl_relevancy = numeric_scores[metrics_for_overall].mean(axis=1)
best_overall_method_incl_relevancy = overall_means_incl_relevancy.idxmax()

# Display table: every metric column has its highest value in bold.
highlighted = other_metrics_df.apply(
    lambda column: _format_scores(column, column.max())
)

# Trophy marks the method with the highest overall mean.
highlighted["Best Overall"] = ""
highlighted.loc[best_overall_method_incl_relevancy, "Best Overall"] = "🏆"

# The best answer relevancy is taken over every row of answer_relevancy_df,
# so no row is bolded here when the best value belongs to a method that is
# not part of this table.
max_relevancy = answer_relevancy_df["answer_relevancy_mean"].max()
highlighted["answer_relevancy_mean"] = _format_scores(
    rag_answer_relevancy, max_relevancy
)

highlighted


  overall_means_incl_relevancy = highlighted[["llm_context_precision_without_reference", "faithfulness", "nv_context_relevance"]].applymap(lambda x: float(x.replace("**", ""))) \


Unnamed: 0,llm_context_precision_without_reference,faithfulness,nv_context_relevance,Best Overall,answer_relevancy_mean
Baseline RAG,0.7603,**0.9379**,0.5170,,**0.8619**
Reranker RAG,0.7365,0.9152,0.5218,,0.8428
Dense RAG,**0.7684**,0.9198,**0.6262**,🏆,0.8149
