In [11]:
import os
import pandas as pd
import numpy as np
import evaluate as hf_evaluate

# Input: per-question generations produced by the enhanced experiment.
ENHANCED_PATH = "../data/evaluation/enhanced_experiment_generations.csv"
# Output: results table that this notebook updates with F1/EM averages.
F1_EM_PATH = "../results/evaluation_f1_em.csv"

# Echo the resolved absolute locations so it is obvious which files this run
# reads and writes (both paths are relative to the notebook's working dir).
print(f"Enhanced file: {os.path.abspath(ENHANCED_PATH)}")
print(f"F1/EM results file: {os.path.abspath(F1_EM_PATH)}")

Enhanced file: /Users/connie/Desktop/Fall 2025/LLM/Assignment2/data/evaluation/enhanced_experiment_generations.csv
F1/EM results file: /Users/connie/Desktop/Fall 2025/LLM/Assignment2/results/evaluation_f1_em.csv


In [12]:
# Load the enhanced-experiment generations and extract the two parallel lists
# that get scored: reference answers and the model's predictions.
enhanced_df = pd.read_csv(ENHANCED_PATH)

pred_col = "gen_enhanced_k5_persona_mpnet"
for col in ["question", "answer", pred_col]:
    if col not in enhanced_df.columns:
        raise ValueError(f"Missing column in enhanced file: {col}")

# read_csv turns blank cells (and its default NA tokens) into NaN. Casting a
# NaN reference with astype(str) would yield the literal string "nan", which
# would then be scored as if it were a genuine gold answer. Fail fast instead
# of silently skewing the averages.
missing_answer_count = int(enhanced_df["answer"].isna().sum())
if missing_answer_count:
    raise ValueError(
        f"Missing reference answer in enhanced file for {missing_answer_count} row(s)"
    )

answers = enhanced_df["answer"].astype(str).tolist()
# A missing generation is still a model output (an empty one), so it is kept
# and scored as the empty string rather than rejected.
preds   = enhanced_df[pred_col].fillna("").astype(str).tolist()

In [13]:
# Score each prediction against its reference with the "squad" metric, one
# example per compute() call so the per-row F1 / exact-match values are kept
# in f1_scores / em_scores alongside the averages.
metric = hf_evaluate.load("squad")

f1_scores = []
em_scores = []
for prediction_text, reference_text in zip(preds, answers):
    # Each call is a one-example batch; the prediction and the reference are
    # paired through the shared placeholder id "0".
    prediction_entry = {"prediction_text": prediction_text, "id": "0"}
    reference_entry = {
        "answers": {"text": [reference_text], "answer_start": [0]},
        "id": "0",
    }
    scores = metric.compute(
        predictions=[prediction_entry],
        references=[reference_entry],
    )
    f1_scores.append(scores["f1"])
    em_scores.append(scores["exact_match"])

# Plain Python floats so the values serialise cleanly into the results table.
avg_f1 = float(np.mean(f1_scores))
avg_em = float(np.mean(em_scores))

# One results-table record describing this experiment configuration.
row = dict(
    Retrieval_K=5,
    Prompt_Strategy="enhanced-persona+qr+rerank",
    Embedding_Dim="all-mpnet-base-v2",
    Avg_F1=avg_f1,
    Avg_EM=avg_em,
)

In [14]:
# Upsert this run's scores into the shared results CSV: append `row`, keep
# only the newest entry per (Retrieval_K, Prompt_Strategy, Embedding_Dim)
# configuration, write the table back, and show its last rows.
result_columns = ["Retrieval_K", "Prompt_Strategy", "Embedding_Dim", "Avg_F1", "Avg_EM"]
key_columns = ["Retrieval_K", "Prompt_Strategy", "Embedding_Dim"]

new_row_df = pd.DataFrame([row])

# Read directly instead of checking os.path.exists first: this avoids the
# check-then-read race and also treats a zero-byte file (e.g. left behind by
# an interrupted write) as "no results yet" rather than crashing.
try:
    results_df = pd.read_csv(F1_EM_PATH)
except (FileNotFoundError, pd.errors.EmptyDataError):
    results_df = pd.DataFrame(columns=result_columns)

if results_df.empty:
    # Concatenating with an empty frame raises a FutureWarning in recent
    # pandas and forces object dtype; start from the new row instead while
    # keeping the existing column order (plus any columns only the row has).
    merged_columns = list(results_df.columns) + [
        c for c in new_row_df.columns if c not in results_df.columns
    ]
    results_df = new_row_df.reindex(columns=merged_columns)
else:
    results_df = pd.concat([results_df, new_row_df], ignore_index=True)

# keep="last" means a re-run of the same configuration replaces its old row.
results_df = results_df.drop_duplicates(subset=key_columns, keep="last")

# os.makedirs("") raises, so only create the directory when the path has one.
results_dir = os.path.dirname(F1_EM_PATH)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)
results_df.to_csv(F1_EM_PATH, index=False)

print(f"Updated results saved to {os.path.abspath(F1_EM_PATH)}")
print(results_df.tail(5).to_string(index=False))

Updated results saved to /Users/connie/Desktop/Fall 2025/LLM/Assignment2/results/evaluation_f1_em.csv
 Retrieval_K            Prompt_Strategy     Embedding_Dim    Avg_F1    Avg_EM
           3                    persona all-mpnet-base-v2 44.691283 35.833333
           5                      naive all-mpnet-base-v2 41.772417 32.500000
           5                        cot all-mpnet-base-v2 38.412203 26.666667
           5                    persona all-mpnet-base-v2 49.857943 40.000000
           5 enhanced-persona+qr+rerank all-mpnet-base-v2 48.315471 38.333333
