In [1]:
import json, re, string
from collections import Counter
from statistics import mean

def normalize(text):
    """Canonicalise an answer string so two answers can be compared fairly.

    Applied in this order:
      1. lowercase the text;
      2. replace the standalone words "a", "an" and "the" with a space;
      3. delete every character listed in ``string.punctuation``;
      4. collapse whitespace runs to single spaces and trim both ends.

    Returns the cleaned string (possibly empty).
    """
    lowered = text.lower()
    # Articles are removed before punctuation, so "the." still counts as an
    # article because the word boundary sits next to the full stop.
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', lowered)
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = without_articles.translate(punctuation_table)
    words = without_punctuation.split()
    return ' '.join(words)

def em(pred, truth):
    """Exact-match score.

    Returns 1 when ``pred`` and ``truth`` are identical after both have been
    passed through ``normalize``, otherwise 0.
    """
    is_match = normalize(pred) == normalize(truth)
    return 1 if is_match else 0

def f1(pred, truth):
    """Token-level F1 between a predicted and a reference answer.

    Both strings are passed through ``normalize`` and split on whitespace.
    The overlap is a *multiset* intersection, so a token that occurs twice in
    both answers counts twice. A plain ``set`` intersection would undercount
    repeated tokens and score identical answers such as
    "new york new york" below 1.0.

    Args:
        pred: predicted answer string.
        truth: reference answer string.

    Returns:
        A float in [0.0, 1.0]. If either side has no tokens left after
        normalisation, the score is 1.0 when both are empty and 0.0 otherwise.
    """
    pred_tokens = normalize(pred).split()
    truth_tokens = normalize(truth).split()

    # With no tokens on one side precision/recall are undefined; treat two
    # empty answers as a perfect match so F1 is never lower than exact match.
    if not pred_tokens or not truth_tokens:
        return float(pred_tokens == truth_tokens)

    # Counter & Counter keeps, for each token, the smaller of the two counts.
    shared = Counter(pred_tokens) & Counter(truth_tokens)
    overlap = sum(shared.values())
    if overlap == 0:
        return 0.0

    precision = overlap / len(pred_tokens)
    recall = overlap / len(truth_tokens)
    return 2 * precision * recall / (precision + recall)

# Score every record of the predictions file. Each non-blank line is parsed as
# a JSON object whose "pred" and "answer" fields are compared.
em_scores, f1_scores = [], []

with open("data/rag_preds.jsonl", encoding="utf-8") as fp:
    for line in fp:
        # A blank line (for example a trailing newline at the end of the file)
        # is not valid JSON and would make json.loads raise, so skip it.
        if not line.strip():
            continue
        ex = json.loads(line)
        em_scores.append(em(ex["pred"], ex["answer"]))
        f1_scores.append(f1(ex["pred"], ex["answer"]))

# mean() raises an opaque StatisticsError on an empty list; fail early with a
# message that names the actual problem instead.
if not em_scores:
    raise ValueError("No examples found in data/rag_preds.jsonl")

print(f"Exact Match: {mean(em_scores):.2%}")
print(f"Token-level F1: {mean(f1_scores):.2%}")


Exact Match: 36.00%
Token-level F1: 62.85%


In [2]:
import os, json, datetime

# Write the averaged EM / F1 scores, stamped with today's date, to a JSON file.
metrics_path = "data/rag_metrics_day02.json"
metrics_dir = os.path.dirname(metrics_path)
os.makedirs(metrics_dir, exist_ok=True)  # no error if the directory already exists

run_date = datetime.date.today().isoformat()
em_mean = round(mean(em_scores), 4)  # averages are stored to four decimal places
f1_mean = round(mean(f1_scores), 4)

metrics = {"date": run_date, "exact_match": em_mean, "f1": f1_mean}

with open(metrics_path, "w", encoding="utf-8") as f:
    f.write(json.dumps(metrics, indent=2))

print(f"Saved Day 2 baseline metrics to {metrics_path}")
# Bare expression so the notebook displays the saved dictionary as cell output.
metrics


Saved Day 2 baseline metrics to data/rag_metrics_day02.json


{'date': '2025-07-16', 'exact_match': 0.36, 'f1': 0.6285}