In [41]:
import os
import pandas as pd
import numpy as np
import json
from datasets import Dataset
from tqdm import tqdm
import ast

import evaluate as hf_evaluate
from langchain_openai import ChatOpenAI
from langchain_community.embeddings import HuggingFaceEmbeddings
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall

In [43]:
GENERATION_FILE_PATH = "/Users/connie/Desktop/Fall 2025/LLM/Assignment2/data/evaluation/all_experiment_generations.csv"
RESULTS_DIR = os.path.join("..", "results")
os.makedirs(RESULTS_DIR, exist_ok=True)

In [45]:
print("Reading generated data from:", GENERATION_FILE_PATH)
if not os.path.exists(GENERATION_FILE_PATH):
    raise FileNotFoundError("ERROR: Generation file not found. Please run naive_rag.py first.")

qa_df = pd.read_csv(GENERATION_FILE_PATH)
qa_df["answer"] = qa_df["answer"].astype(str) 

Reading generated data from: /Users/connie/Desktop/Fall 2025/LLM/Assignment2/data/evaluation/all_experiment_generations.csv


In [47]:
qa_df.columns

Index(['question', 'answer', 'q_len', 'q_bin', 'gen_naive_k1_all-MiniLM-L6-v2',
       'gen_cot_k1_all-MiniLM-L6-v2', 'gen_persona_k1_all-MiniLM-L6-v2',
       'gen_naive_k3_all-MiniLM-L6-v2', 'gen_cot_k3_all-MiniLM-L6-v2',
       'gen_persona_k3_all-MiniLM-L6-v2', 'gen_naive_k5_all-MiniLM-L6-v2',
       'gen_cot_k5_all-MiniLM-L6-v2', 'gen_persona_k5_all-MiniLM-L6-v2',
       'gen_naive_k1_all-mpnet-base-v2', 'gen_cot_k1_all-mpnet-base-v2',
       'gen_persona_k1_all-mpnet-base-v2', 'gen_naive_k3_all-mpnet-base-v2',
       'gen_cot_k3_all-mpnet-base-v2', 'gen_persona_k3_all-mpnet-base-v2',
       'gen_naive_k5_all-mpnet-base-v2', 'gen_cot_k5_all-mpnet-base-v2',
       'gen_persona_k5_all-mpnet-base-v2', 'contexts'],
      dtype='object')

In [49]:
# Define embeddings model (used by RAGAS)
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

### Finding out the Basic QA Metrics (F1 score, EM score)

In [52]:
print("\n--- Running Basic QA Metrics (F1/EM) ---")

# Get columns to be evaluated (e.g., generated_naive_k1, generated_cot_k1)
generation_cols = [c for c in qa_df.columns if c.startswith('gen_')]
final_metrics_summary = []

for col in tqdm(generation_cols, desc="Calculating F1/EM"):
    
    # Extract strategy name (e.g., 'naive') and K value (e.g., 1) from column name
    parts = col.split('_')
    k_part = parts[-2] 
    retrieval_k = int(k_part.replace('k', ''))
    strategy_name = parts[1]
    embedding_dim = parts[-1] 
    
    metric_f1 = hf_evaluate.load("squad") 
    f1_scores, em_scores = [], []
    
    for pred_full, truth in zip(qa_df[col], qa_df["answer"]):
        
        # 1. Answer Extraction (Process CoT output format)
        pred = pred_full
        if strategy_name == "cot" and isinstance(pred_full, str) and "Final Answer:" in pred_full:
              # Extract text after the CoT marker
              pred = pred_full.split("Final Answer:")[1].strip()
        
        # 2. Calculate F1 and EM score
        result = metric_f1.compute(
            predictions=[{"prediction_text": str(pred), "id": "0"}],
            references=[{"answers": {"text": [str(truth)], "answer_start": [0]}, "id": "0"}]
        )
        f1_scores.append(result["f1"])
        em_scores.append(result["exact_match"])
        
    avg_f1, avg_em = np.mean(f1_scores), np.mean(em_scores)

    # Temporarily store F1/EM results
    final_metrics_summary.append({
        "Retrieval_K": retrieval_k,
        "Prompt_Strategy": strategy_name,
        "Embedding_Dim": embedding_dim,
        "Avg_F1": avg_f1,
        "Avg_EM": avg_em,
    })


--- Running Basic QA Metrics (F1/EM) ---


Calculating F1/EM: 100%|████████████████████████| 18/18 [00:16<00:00,  1.06it/s]


In [53]:
# Convert results into DataFrame
results_df = pd.DataFrame(final_metrics_summary)

# Save to CSV
csv_path = os.path.join(RESULTS_DIR, "evaluation_f1_em.csv")
results_df.to_csv(csv_path, index=False)

# Pretty print top results (sorted by F1)
print("\n=== Evaluation Results (Top 10 by F1) ===")
print(results_df.sort_values(by="Avg_F1", ascending=False).head(10).to_string(index=False))

print(f"\n✅ Results saved to {csv_path}")


=== Evaluation Results (Top 10 by F1) ===
 Retrieval_K Prompt_Strategy     Embedding_Dim    Avg_F1    Avg_EM
           5         persona all-mpnet-base-v2 49.857943 40.000000
           5         persona  all-MiniLM-L6-v2 49.002727 39.166667
           3         persona  all-MiniLM-L6-v2 46.555274 37.500000
           3         persona all-mpnet-base-v2 44.691283 35.833333
           5           naive all-mpnet-base-v2 41.772417 32.500000
           5           naive  all-MiniLM-L6-v2 40.442060 30.833333
           5             cot  all-MiniLM-L6-v2 38.771223 25.000000
           5             cot all-mpnet-base-v2 38.412203 26.666667
           1         persona  all-MiniLM-L6-v2 38.208599 30.833333
           3             cot  all-MiniLM-L6-v2 37.215111 25.833333

✅ Results saved to ../results/evaluation_f1_em.csv


### Advanced Evaluation using RAGAs

In [None]:
from dotenv import load_dotenv
load_dotenv()

llm_openai = ChatOpenAI(
    model="gpt-4o-mini",
    max_retries=5, # Increase retries to handle rate limits gracefully
)
hf_llm = llm_openai


In [10]:
# Parse contexts into clean lists of passage strings
def parse_context(ctx_str):
    try:
        ctx_obj = ast.literal_eval(ctx_str)  # safely parse string
        if isinstance(ctx_obj, dict) and "passage" in ctx_obj:
            return [ctx_obj["passage"]]
        elif isinstance(ctx_obj, list):
            return [c["passage"] for c in ctx_obj if isinstance(c, dict) and "passage" in c]
    except Exception:
        return []
    return []

qa_df["contexts_clean"] = qa_df["contexts"].apply(parse_context)

In [37]:
# ----------------------------------------------------
# STEP 1: Stratified sampling (done ONCE for all columns)
# ----------------------------------------------------
qa_df["q_len"] = qa_df["question"].apply(lambda x: len(str(x).split()))

# Define bins (short, medium, long) using percentiles
bins = np.percentile(qa_df["q_len"], [33, 66])
qa_df["q_bin"] = pd.cut(
    qa_df["q_len"],
    bins=[-1, bins[0], bins[1], float("inf")],
    labels=["short", "medium", "long"]
)

# Sample 40 from each bin (same random_state → deterministic)
subset_df = qa_df.groupby("q_bin", group_keys=False).apply(
    lambda x: x.sample(min(40, len(x)), random_state=42)
).reset_index(drop=True)

print(f"\n--- Using {len(subset_df)} stratified queries for ALL columns ---")
print(subset_df["q_bin"].value_counts())

# ----------------------------------------------------
# STEP 2: Run RAGAS per column, but on same 120 queries
# ----------------------------------------------------
chunk_size = 10  # row-level logging still works

for i, summary_data in enumerate(tqdm(final_metrics_summary, desc="Running RAGAS")):
    col = generation_cols[i]
    print(f"\nEvaluating column: {col}")

    metric_sums = {"faithfulness": 0, "answer_relevancy": 0,
                   "context_precision": 0, "context_recall": 0}
    total_rows = 0

    for start in range(0, len(subset_df), chunk_size):
        end = min(start + chunk_size, len(subset_df))
        print(f"{datetime.now().strftime('%H:%M:%S')} | Processing rows {start}–{end}...")

        for idx in range(start, end):
            row_data = Dataset.from_dict({
                "question": [str(subset_df["question"].iloc[idx])],
                "answer": [str(subset_df[col].iloc[idx])],
                "contexts": [subset_df["contexts_clean"].iloc[idx]],
                "ground_truths": [[str(subset_df["answer"].iloc[idx])]],
                "reference": [str(subset_df["answer"].iloc[idx])],
            })

            try:
                res = evaluate(
                    row_data,
                    metrics=[faithfulness, answer_relevancy,
                             context_precision, context_recall],
                    llm=hf_llm,
                    embeddings=hf_embeddings,
                    show_progress=False,
                    batch_size=1
                )

                for k in metric_sums.keys():
                    val = res[k]
                    if isinstance(val, list):  # ragas sometimes returns [score]
                        val = val[0]
                    metric_sums[k] += val
                total_rows += 1

                print(f"   Done row {idx+1}/{len(subset_df)} | {res}")

            except Exception as e:
                print(f"   Row {idx+1} failed: {e}")

    if total_rows > 0:
        for k in metric_sums.keys():
            metric_sums[k] /= total_rows
        summary_data.update({
            "Embedding_Dim": final_metrics_summary[i]['Embedding_Dim'],
            "Ragas_Faithfulness": metric_sums["faithfulness"],
            "Ragas_Answer_Relevancy": metric_sums["answer_relevancy"],
            "Ragas_Context_Precision": metric_sums["context_precision"],
            "Ragas_Context_Recall": metric_sums["context_recall"]
        })
        print(f"{datetime.now().strftime('%H:%M:%S')} | Completed {col} with averages: {metric_sums}")
    else:
        print(f"{datetime.now().strftime('%H:%M:%S')} | No valid rows processed for {col}")
        summary_data.update({
            "Ragas_Faithfulness": None,
            "Ragas_Answer_Relevancy": None,
            "Ragas_Context_Precision": None,
            "Ragas_Context_Recall": None
        })

  subset_df = qa_df.groupby("q_bin", group_keys=False).apply(
  subset_df = qa_df.groupby("q_bin", group_keys=False).apply(



--- Using 120 stratified queries for ALL columns ---
q_bin
short     40
medium    40
long      40
Name: count, dtype: int64


Running RAGAS:   0%|                                     | 0/18 [00:00<?, ?it/s]


Evaluating column: gen_naive_k1_all-MiniLM-L6-v2
00:45:13 | Processing rows 0–10...
   Done row 1/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.1109, 'context_precision': 0.0000, 'context_recall': 0.0000}
   Done row 2/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0638, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 3/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.1015, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 4/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0827, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 5/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0778, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 6/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.0655, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 7/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0310, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 8/120 |

Running RAGAS:   6%|█▏                    | 1/18 [1:24:29<23:56:20, 5069.45s/it]

   Done row 120/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0567, 'context_precision': 1.0000, 'context_recall': 1.0000}
02:09:42 | Completed gen_naive_k1_all-MiniLM-L6-v2 with averages: {'faithfulness': 0.041666666666666664, 'answer_relevancy': 0.0573861232211372, 'context_precision': 0.724999999927499, 'context_recall': 0.6166666666666667}

Evaluating column: gen_cot_k1_all-MiniLM-L6-v2
02:09:42 | Processing rows 0–10...
   Done row 1/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.7395, 'context_precision': 0.0000, 'context_recall': 0.0000}
   Done row 2/120 | {'faithfulness': 0.9091, 'answer_relevancy': 0.6597, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 3/120 | {'faithfulness': 0.6667, 'answer_relevancy': 0.5209, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 4/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.2886, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 5/120 | {'faithfulness': 0.6667, 'answer

Running RAGAS:  11%|██▍                   | 2/18 [2:50:52<22:49:38, 5136.13s/it]

   Done row 120/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.0000, 'context_precision': 1.0000, 'context_recall': 1.0000}
03:36:05 | Completed gen_cot_k1_all-MiniLM-L6-v2 with averages: {'faithfulness': 0.6324037999037999, 'answer_relevancy': 0.46637369879760854, 'context_precision': 0.7083333332624989, 'context_recall': 0.625}

Evaluating column: gen_persona_k1_all-MiniLM-L6-v2
03:36:05 | Processing rows 0–10...
   Done row 1/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0710, 'context_precision': 0.0000, 'context_recall': 0.0000}
   Done row 2/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.8618, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 3/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.5207, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 4/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.4170, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 5/120 | {'faithfulness': 1.0000, 'answer_relevancy'

Running RAGAS:  17%|███▋                  | 3/18 [4:17:13<21:29:13, 5156.88s/it]

   Done row 120/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.0000, 'context_precision': 1.0000, 'context_recall': 1.0000}
05:02:26 | Completed gen_persona_k1_all-MiniLM-L6-v2 with averages: {'faithfulness': 0.670138888888889, 'answer_relevancy': 0.38679829125468596, 'context_precision': 0.7333333332599989, 'context_recall': 0.6083333333333333}

Evaluating column: gen_naive_k3_all-MiniLM-L6-v2
05:02:26 | Processing rows 0–10...
   Done row 1/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.1096, 'context_precision': 0.0000, 'context_recall': 0.0000}
   Done row 2/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0616, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 3/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.1026, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 4/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0827, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 5/120 | {'faithfulness': 0.0000, 'ans

Running RAGAS:  22%|████▉                 | 4/18 [5:43:35<20:05:32, 5166.58s/it]

   Done row 120/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0631, 'context_precision': 1.0000, 'context_recall': 1.0000}
06:28:48 | Completed gen_naive_k3_all-MiniLM-L6-v2 with averages: {'faithfulness': 0.03333333333333333, 'answer_relevancy': 0.0566033218949069, 'context_precision': 0.716666666594999, 'context_recall': 0.6}

Evaluating column: gen_cot_k3_all-MiniLM-L6-v2
06:28:48 | Processing rows 0–10...
   Done row 1/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.6047, 'context_precision': 0.0000, 'context_recall': 0.0000}
   Done row 2/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.7208, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 3/120 | {'faithfulness': 0.6667, 'answer_relevancy': 0.5209, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 4/120 | {'faithfulness': 0.5000, 'answer_relevancy': 0.2861, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 5/120 | {'faithfulness': 0.6667, 'answer_relevancy': 0.9

Running RAGAS:  28%|██████                | 5/18 [7:10:05<18:41:18, 5175.28s/it]

   Done row 120/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.4917, 'context_precision': 1.0000, 'context_recall': 1.0000}
07:55:19 | Completed gen_cot_k3_all-MiniLM-L6-v2 with averages: {'faithfulness': 0.5338029100529098, 'answer_relevancy': 0.5156227230493109, 'context_precision': 0.7333333332599989, 'context_recall': 0.6}

Evaluating column: gen_persona_k3_all-MiniLM-L6-v2
07:55:19 | Processing rows 0–10...
   Done row 1/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.1091, 'context_precision': 0.0000, 'context_recall': 0.0000}
   Done row 2/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.8618, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 3/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.5209, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 4/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.2221, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 5/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0

Running RAGAS:  33%|███████▎              | 6/18 [8:36:19<17:14:54, 5174.55s/it]

   Done row 120/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.0000, 'context_precision': 1.0000, 'context_recall': 1.0000}
09:21:32 | Completed gen_persona_k3_all-MiniLM-L6-v2 with averages: {'faithfulness': 0.6409722222222223, 'answer_relevancy': 0.38840395045257087, 'context_precision': 0.724999999927499, 'context_recall': 0.6166666666666667}

Evaluating column: gen_naive_k5_all-MiniLM-L6-v2
09:21:32 | Processing rows 0–10...
   Done row 1/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.1101, 'context_precision': 0.0000, 'context_recall': 0.0000}
   Done row 2/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0638, 'context_precision': 1.0000, 'context_recall': 0.0000}
   Done row 3/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.1026, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 4/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0827, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 5/120 | {'faithfulness': 0.0000, 'ans

Running RAGAS:  39%|████████▏            | 7/18 [10:02:48<15:49:34, 5179.51s/it]

   Done row 120/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0567, 'context_precision': 1.0000, 'context_recall': 1.0000}
10:48:01 | Completed gen_naive_k5_all-MiniLM-L6-v2 with averages: {'faithfulness': 0.0375, 'answer_relevancy': 0.05545860349791817, 'context_precision': 0.7416666665924989, 'context_recall': 0.6}

Evaluating column: gen_cot_k5_all-MiniLM-L6-v2
10:48:01 | Processing rows 0–10...
   Done row 1/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.1342, 'context_precision': 0.0000, 'context_recall': 0.0000}
   Done row 2/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.6762, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 3/120 | {'faithfulness': 0.3333, 'answer_relevancy': 0.0000, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 4/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.2770, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 5/120 | {'faithfulness': 0.6667, 'answer_relevancy': 0.9580, 'conte

Running RAGAS:  44%|█████████▎           | 8/18 [11:29:15<14:23:39, 5181.90s/it]

   Done row 120/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.5051, 'context_precision': 1.0000, 'context_recall': 1.0000}
12:14:28 | Completed gen_cot_k5_all-MiniLM-L6-v2 with averages: {'faithfulness': 0.5282936507936508, 'answer_relevancy': 0.49385898392828, 'context_precision': 0.7083333332624989, 'context_recall': 0.6083333333333333}

Evaluating column: gen_persona_k5_all-MiniLM-L6-v2
12:14:28 | Processing rows 0–10...
   Done row 1/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.1342, 'context_precision': 0.0000, 'context_recall': 0.0000}
   Done row 2/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.8478, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 3/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.5209, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 4/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.2221, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 5/120 | {'faithfulness': 1.0000, 'answer_

Task exception was never retrieved
future: <Task finished name='Task-23282' coro=<as_completed.<locals>.sema_coro() done, defined at /opt/anaconda3/lib/python3.12/site-packages/ragas/async_utils.py:53> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/k3/vcbwsjp93rl2cww52jf518fc0000gn/T/ipykernel_15058/2173069320.py", line 30, in <module>
    res = evaluate(
          ^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/ragas/_analytics.py", line 277, in wrapper
    result = func(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/ragas/evaluation.py", line 317, in evaluate
    results = executor.results()
              ^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/ragas/executor.py", line 215, in result

   Done row 113/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.2507, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 114/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.3530, 'context_precision': 0.0000, 'context_recall': 0.0000}
   Done row 115/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.0564, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 116/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.9297, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 117/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0628, 'context_precision': 1.0000, 'context_recall': 0.0000}
   Done row 118/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.1028, 'context_precision': 1.0000, 'context_recall': 0.0000}
   Done row 119/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.8149, 'context_precision': 1.0000, 'context_recall': 1.0000}


Running RAGAS:  50%|██████████▌          | 9/18 [12:55:32<12:57:00, 5180.11s/it]

   Done row 120/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.0000, 'context_precision': 1.0000, 'context_recall': 1.0000}
13:40:45 | Completed gen_persona_k5_all-MiniLM-L6-v2 with averages: {'faithfulness': 0.6104166666666667, 'answer_relevancy': 0.4079979251596522, 'context_precision': 0.724999999927499, 'context_recall': 0.6166666666666667}

Evaluating column: gen_naive_k1_all-mpnet-base-v2
13:40:45 | Processing rows 0–10...
   Done row 1/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.1101, 'context_precision': 0.0000, 'context_recall': 0.0000}
   Done row 2/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0616, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 3/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.1015, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 4/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0827, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 5/120 | {'faithfulness': 0.0000, 'ans

Running RAGAS:  56%|███████████         | 10/18 [14:22:07<11:31:18, 5184.82s/it]

   Done row 120/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0531, 'context_precision': 1.0000, 'context_recall': 1.0000}
15:07:20 | Completed gen_naive_k1_all-mpnet-base-v2 with averages: {'faithfulness': 0.05555555555555555, 'answer_relevancy': 0.05525251443321559, 'context_precision': 0.724999999927499, 'context_recall': 0.6166666666666667}

Evaluating column: gen_cot_k1_all-mpnet-base-v2
15:07:20 | Processing rows 0–10...
   Done row 1/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.6725, 'context_precision': 0.0000, 'context_recall': 0.0000}
   Done row 2/120 | {'faithfulness': 0.9091, 'answer_relevancy': 0.7100, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 3/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.1309, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 4/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.4131, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 5/120 | {'faithfulness': 0.8000, 'answ

Running RAGAS:  61%|████████████▏       | 11/18 [15:48:29<10:04:47, 5183.96s/it]

   Done row 120/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.0000, 'context_precision': 1.0000, 'context_recall': 1.0000}
16:33:42 | Completed gen_cot_k1_all-mpnet-base-v2 with averages: {'faithfulness': 0.48417297979797974, 'answer_relevancy': 0.4534198580155991, 'context_precision': 0.724999999927499, 'context_recall': 0.5833333333333334}

Evaluating column: gen_persona_k1_all-mpnet-base-v2
16:33:42 | Processing rows 0–10...
   Done row 1/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.2763, 'context_precision': 0.0000, 'context_recall': 0.0000}
   Done row 2/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.8618, 'context_precision': 1.0000, 'context_recall': 0.0000}
   Done row 3/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.4972, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 4/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.3133, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 5/120 | {'faithfulness': 0.0000, 'ans

Running RAGAS:  67%|██████████████       | 12/18 [17:14:25<8:37:33, 5175.50s/it]

   Done row 120/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.0000, 'context_precision': 1.0000, 'context_recall': 1.0000}
17:59:38 | Completed gen_persona_k1_all-mpnet-base-v2 with averages: {'faithfulness': nan, 'answer_relevancy': 0.36857428254895963, 'context_precision': 0.716666666594999, 'context_recall': 0.6166666666666667}

Evaluating column: gen_naive_k3_all-mpnet-base-v2
17:59:38 | Processing rows 0–10...
   Done row 1/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.1101, 'context_precision': 0.0000, 'context_recall': 0.0000}
   Done row 2/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0616, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 3/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.1015, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 4/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0827, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 5/120 | {'faithfulness': 0.0000, 'answer_relevancy

Running RAGAS:  72%|███████████████▏     | 13/18 [18:36:16<7:04:36, 5095.37s/it]

   Done row 120/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0567, 'context_precision': 1.0000, 'context_recall': 1.0000}
19:21:29 | Completed gen_naive_k3_all-mpnet-base-v2 with averages: {'faithfulness': 0.029166666666666667, 'answer_relevancy': 0.056199251250112985, 'context_precision': 0.716666666594999, 'context_recall': 0.6}

Evaluating column: gen_cot_k3_all-mpnet-base-v2
19:21:29 | Processing rows 0–10...
   Done row 1/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.6745, 'context_precision': 0.0000, 'context_recall': 0.0000}
   Done row 2/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.8525, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 3/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.4943, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 4/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.3133, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 5/120 | {'faithfulness': 0.6667, 'answer_relevancy'

Running RAGAS:  78%|████████████████▎    | 14/18 [20:05:28<5:44:51, 5172.81s/it]

   Done row 120/120 | {'faithfulness': 0.3333, 'answer_relevancy': 0.0000, 'context_precision': 1.0000, 'context_recall': 1.0000}
20:50:41 | Completed gen_cot_k3_all-mpnet-base-v2 with averages: {'faithfulness': 0.4773015873015873, 'answer_relevancy': 0.5233506981561491, 'context_precision': 0.7333333332599989, 'context_recall': 0.6}

Evaluating column: gen_persona_k3_all-mpnet-base-v2
20:50:41 | Processing rows 0–10...
   Done row 1/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0000, 'context_precision': 0.0000, 'context_recall': 0.0000}
   Done row 2/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.8618, 'context_precision': 1.0000, 'context_recall': 0.0000}
   Done row 3/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.5209, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 4/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.2225, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 5/120 | {'faithfulness': 1.0000, 'answer_relevancy':

Running RAGAS:  83%|█████████████████▌   | 15/18 [21:33:49<4:20:34, 5211.36s/it]

   Done row 120/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.0000, 'context_precision': 1.0000, 'context_recall': 1.0000}
22:19:02 | Completed gen_persona_k3_all-mpnet-base-v2 with averages: {'faithfulness': 0.5659722222222221, 'answer_relevancy': 0.40135172656508666, 'context_precision': 0.716666666594999, 'context_recall': 0.6166666666666667}

Evaluating column: gen_naive_k5_all-mpnet-base-v2
22:19:02 | Processing rows 0–10...
   Done row 1/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.1101, 'context_precision': 0.0000, 'context_recall': 0.0000}
   Done row 2/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0638, 'context_precision': 1.0000, 'context_recall': 0.0000}
   Done row 3/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.1015, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 4/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0827, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 5/120 | {'faithfulness': 0.0000, 'a

Running RAGAS:  89%|██████████████████▋  | 16/18 [22:57:30<2:51:48, 5154.27s/it]

   Done row 120/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0631, 'context_precision': 1.0000, 'context_recall': 1.0000}
23:42:43 | Completed gen_naive_k5_all-mpnet-base-v2 with averages: {'faithfulness': 0.0375, 'answer_relevancy': 0.056595967357545726, 'context_precision': 0.716666666594999, 'context_recall': 0.6166666666666667}

Evaluating column: gen_cot_k5_all-mpnet-base-v2
23:42:43 | Processing rows 0–10...
   Done row 1/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.7123, 'context_precision': 0.0000, 'context_recall': 0.0000}
   Done row 2/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.8451, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 3/120 | {'faithfulness': 0.3333, 'answer_relevancy': 0.0000, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 4/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.3133, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 5/120 | {'faithfulness': 1.0000, 'answer_relevancy

Running RAGAS:  94%|███████████████████▊ | 17/18 [24:25:58<1:26:40, 5200.56s/it]

   Done row 120/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.7122, 'context_precision': 1.0000, 'context_recall': 1.0000}
01:11:12 | Completed gen_cot_k5_all-mpnet-base-v2 with averages: {'faithfulness': 0.5104166666666666, 'answer_relevancy': 0.5172491761386305, 'context_precision': 0.7333333332599989, 'context_recall': 0.6083333333333333}

Evaluating column: gen_persona_k5_all-mpnet-base-v2
01:11:12 | Processing rows 0–10...
   Done row 1/120 | {'faithfulness': 0.0000, 'answer_relevancy': 0.0000, 'context_precision': 0.0000, 'context_recall': 0.0000}
   Done row 2/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.8618, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 3/120 | {'faithfulness': 0.2500, 'answer_relevancy': 0.7570, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 4/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.2225, 'context_precision': 1.0000, 'context_recall': 1.0000}
   Done row 5/120 | {'faithfulness': 1.0000, 'ans

Running RAGAS: 100%|███████████████████████| 18/18 [25:52:51<00:00, 5176.17s/it]

   Done row 120/120 | {'faithfulness': 1.0000, 'answer_relevancy': 0.0000, 'context_precision': 1.0000, 'context_recall': 1.0000}
02:38:04 | Completed gen_persona_k5_all-mpnet-base-v2 with averages: {'faithfulness': 0.6125, 'answer_relevancy': 0.38534648149643247, 'context_precision': 0.716666666594999, 'context_recall': 0.625}





### Final Results Storage

In [38]:
# Define results directory explicitly (one level up from src/)
RESULTS_DIR = os.path.join("..", "results")
os.makedirs(RESULTS_DIR, exist_ok=True)

# Save the sampled subset queries (for reproducibility and appendix)
subset_path = os.path.join(RESULTS_DIR, "subset_queries.csv")
subset_df.to_csv(subset_path, index=False)
print(f"\nSampled queries saved to {subset_path}")

# Convert evaluation results into DataFrame
results_df = pd.DataFrame(final_metrics_summary)

# Save results to CSV
csv_path = os.path.join(RESULTS_DIR, "comparison_analysis.csv")
results_df.to_csv(csv_path, index=False)

print("\nEvaluation Finalized.")
print(f"Metrics saved to {json_path} and {csv_path}")


Sampled queries saved to ../results/subset_queries.csv

Evaluation Finalized.
Metrics saved to ../results/naive_results.json and ../results/comparison_analysis.csv
