In [1]:
import json, re, string, itertools

def normalize(t):
    t = t.lower()
    t = re.sub(r'\b(a|an|the)\b', ' ', t)
    t = t.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(t.split())

def f1(pred, truth):
    pred_t  = normalize(pred).split()
    truth_t = normalize(truth).split()
    common  = set(pred_t) & set(truth_t)
    if not common:
        return 0.0
    prec = len(common) / len(pred_t)
    rec  = len(common) / len(truth_t)
    return 2 * prec * rec / (prec + rec)

# load predictions and keep their retrieval contexts
examples = []
with open("data/rag_preds.jsonl", encoding="utf-8") as fp:
    for line in fp:
        ex = json.loads(line)
        examples.append({
            "question": ex["question"],
            "answer":   ex["answer"],
            "pred":     ex["pred"],
            "contexts": ex["context"].split("\n\n---\n\n") if "---" in ex["context"] else [ex["context"]],
            "f1":       f1(ex["pred"], ex["answer"])
        })

# sort by ascending F1 and show the 5 worst
for ex in sorted(examples, key=lambda d: d["f1"])[:5]:
    print("="*80)
    print("Q:", ex["question"])
    print("Gold:", ex["answer"])
    print("Pred:", ex["pred"])
    print(f"F1={ex['f1']:.2f}")
    for i, ctx in enumerate(ex["contexts"], 1):
        print(f"\n-- Context {i} --\n{ctx[:400]}…")


Q: The Church of England uses what term that is held by two senior members of the College of Minor Canons of St. Pauls Catherdral?
Gold: tituli
Pred: Cardinal
F1=0.00

-- Context 1 --
The term cardinal at one time applied to any priest permanently assigned or incardinated to a church, or specifically to the senior priest of an important church, based on the Latin cardo (hinge), meaning "principal" or "chief". The term was applied in this sense as early as the ninth century to the priests of the tituli (parishes) of the diocese of Rome. The Church of England retains an instance …
Q: What novel concept was introduced at the end of the 20th century?
Gold: sustainability
Pred: The concept of Israel becoming the 51st state in the United States was introduced at the end of the 20th century.
F1=0.00

-- Context 1 --
In the late 20th century a new concept was added to those included in the compass of both structure and function, the consideration of sustainability, hence sustainable architectu

In [2]:
import json, re, string
from collections import Counter

def normalize(text):
    text = text.lower()
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(text.split())

def answer_in_context(answer, context):
    ans_tokens = set(normalize(answer).split())
    ctx_tokens = set(normalize(context).split())
    return ans_tokens.issubset(ctx_tokens)

stats = Counter()

with open("data/rag_preds.jsonl", encoding="utf-8") as fp:
    for line in fp:
        ex = json.loads(line)
        contexts = ex["context"].split("\n\n---\n\n") if "---" in ex["context"] else [ex["context"]]
        found = any(answer_in_context(ex["answer"], c) for c in contexts)
        if ex["pred"] == ex["answer"]:
            stats["correct"] += 1                # EM hit
        elif not found:
            stats["retriever_miss"] += 1
        else:
            stats["generator_miss"] += 1

print(stats)


Counter({'generator_miss': 74, 'correct': 25, 'retriever_miss': 1})
