In [2]:
# ── 0. Install / import libraries ─────────────────────────────────────────
!pip install -qU faiss-cpu sentence-transformers "openai>=1.3.8"

import json, faiss, numpy as np, textwrap, os
from sentence_transformers import SentenceTransformer
from openai import OpenAI

# ── 1. Build (or rebuild) the FAISS index ────────────────────────────────
data_path = "rag_qa.jsonl"
contexts  = [json.loads(l)["context"] for l in open(data_path, encoding="utf-8")]

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
emb   = model.encode(contexts, convert_to_numpy=True, show_progress_bar=True)
faiss.normalize_L2(emb)                    # cosine similarity

index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb)

def retrieve(query: str, k: int = 3):
    """Return (scores, contexts) for the top-k passages."""
    q_emb = model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)
    scores, idxs = index.search(q_emb, k)
    return scores[0], [contexts[i] for i in idxs[0]]

# ── 2. Define the extract-only generator ─────────────────────────────────
client = OpenAI()          # assumes OPENAI_API_KEY is already in your env

def rag_answer(question: str, k: int = 3,
               model="gpt-3.5-turbo-0125") -> str:
    """Retrieve top-k passages and return ONLY the exact answer span."""
    _, ctxs = retrieve(question, k=k)
    context_block = "\n\n---\n\n".join(ctxs)

    system_msg = (
        "You are an *extractive* QA assistant.\n"
        "• Find the minimal text span (≤ 10 tokens) answering the question.\n"
        "• Copy the span verbatim from <context>; do NOT add or remove words.\n"
        "• Output that span only—no punctuation or extra text.\n"
        "• If the span is not present, reply exactly: I don’t know."
    )

    user_msg = textwrap.dedent(f"""
        <context>
        {context_block}
        </context>

        Question: {question}

        Answer only:
    """).strip()

    resp = client.chat.completions.create(
        model=model,
        messages=[{"role": "system", "content": system_msg},
                  {"role": "user",   "content": user_msg}],
        temperature=0.0,
        max_tokens=16,
    )
    return resp.choices[0].message.content.strip()

print("✅  retrieve() and rag_answer() are ready.")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m65.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m470.2/470.2 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m765.0/765.0 kB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m81.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m88.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

✅  retrieve() and rag_answer() are ready.


In [3]:
!pip install -qU tqdm  # does nothing if already present

import json, os, time
from tqdm.auto import tqdm

in_path  = "rag_qa.jsonl"             # 100 Q&A pairs
out_path = "rag_preds_day03_2.jsonl"    # new file WITH ret_ctxs
k        = 3                          # top-k passages

os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)

with open(in_path, encoding="utf-8") as fin, \
     open(out_path, "w", encoding="utf-8") as fout:

    for line in tqdm(fin, total=100, desc="Generating (with ret_ctxs)"):
        ex       = json.loads(line)
        _, ctxs  = retrieve(ex["question"], k=k)        # save these
        pred     = rag_answer(ex["question"], k=k)      # extract-only
        fout.write(json.dumps({**ex,
                               "pred": pred,
                               "ret_ctxs": ctxs},
                              ensure_ascii=False) + "\n")
        time.sleep(0.3)  # adjust if you hit rate limits

print(f"\nDone — wrote predictions + passages to {out_path}")


Generating (with ret_ctxs):   0%|          | 0/100 [00:00<?, ?it/s]


Done — wrote predictions + passages to rag_preds_day03_2.jsonl


In [4]:
import json, re, string
from statistics import mean
from collections import Counter

def normalize(t):
    t = t.lower()
    t = re.sub(r'\b(a|an|the)\b', ' ', t)
    t = t.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(t.split())

def exact_match(pred, gold):        # EM
    return int(normalize(pred) == normalize(gold))

def f1(pred, gold):                 # token-level F1
    p, g = normalize(pred).split(), normalize(gold).split()
    common = set(p) & set(g)
    if not common:
        return 0.0
    prec, rec = len(common)/len(p), len(common)/len(g)
    return 2*prec*rec/(prec+rec)

def answer_in_ctx(ans, ctx):
    return set(normalize(ans).split()).issubset(set(normalize(ctx).split()))

file_path = "rag_preds_day03_2.jsonl"   # ← change if you named it differently
em_scores, f1_scores = [], []
stats = Counter()

with open(file_path, encoding="utf-8") as fp:
    for line in fp:
        ex = json.loads(line)
        em_scores.append(exact_match(ex["pred"], ex["answer"]))
        f1_scores.append(f1(ex["pred"], ex["answer"]))
        found = any(answer_in_ctx(ex["answer"], c) for c in ex["ret_ctxs"])
        if em_scores[-1]:
            stats["correct"] += 1
        elif not found:
            stats["retriever_miss"] += 1
        else:
            stats["generator_miss"] += 1

print(f"Exact Match: {mean(em_scores):.2%}")
print(f"Token F1   : {mean(f1_scores):.2%}")
print(stats)   # e.g. {'correct': 54, 'generator_miss': 42, 'retriever_miss': 4}


Exact Match: 52.00%
Token F1   : 74.32%
Counter({'correct': 52, 'generator_miss': 45, 'retriever_miss': 3})


In [7]:
def rag_answer(question: str, k: int = 3,
               model="gpt-3.5-turbo-0125") -> str:
    """Return ≤4-token span copied verbatim from context."""
    _, ctxs = retrieve(question, k=k)
    ctx_block = "\n\n---\n\n".join(ctxs)

    system_msg = (
        "You are an *extractive* QA assistant.\n"
        "RULES:\n"
        "1. Copy **exactly** the minimal text span that answers the question.\n"
        "2. The span must appear verbatim in <context>.\n"
        "3. Output **max 4 tokens**, no punctuation, no extra words.\n"
        "4. If the span is missing, reply exactly: I don’t know.\n\n"
        "Example – WRONG:\n"
        "  Context: … broadcast from CBS Television City in Los Angeles …\n"
        "  Question: Where are the American Idol finals broadcast from?\n"
        "  Bad answer: CBS Television City in Los Angeles  ← too many tokens\n"
        "  Good answer: CBS Television City"
    )

    user_msg = f"""<context>
{ctx_block}
</context>

Question: {question}

Answer only:"""

    resp = client.chat.completions.create(
        model=model,
        messages=[{"role": "system", "content": system_msg},
                  {"role": "user",   "content": user_msg}],
        temperature=0.0,
        max_tokens=10,
    )
    return resp.choices[0].message.content.strip()


In [8]:
print(rag_answer("Where are American Idol finals broadcast from?", k=3))


CBS Television City


In [9]:
import json, os, time
from tqdm.auto import tqdm

in_path  = "rag_qa.jsonl"               # 100 Q-A pairs
out_path = "rag_preds_day03_3.jsonl"
k        = 3                            # top-k passages

os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)

with open(in_path, encoding="utf-8") as fin, \
     open(out_path, "w", encoding="utf-8") as fout:

    for line in tqdm(fin, total=100, desc="Generating (4-token cap)"):
        ex      = json.loads(line)
        _, ctxs = retrieve(ex["question"], k=k)   # passages sent to the LLM
        pred    = rag_answer(ex["question"], k=k) # new, stricter prompt
        fout.write(json.dumps({**ex,
                               "pred": pred,
                               "ret_ctxs": ctxs},
                              ensure_ascii=False) + "\n")
        time.sleep(0.3)                           # polite pacing

print(f"\nWrote predictions to {out_path}")


Generating (4-token cap):   0%|          | 0/100 [00:00<?, ?it/s]


Wrote predictions to rag_preds_day03_3.jsonl


In [10]:
import json, re, string
from statistics import mean
from collections import Counter

def normalize(t):
    t = t.lower()
    t = re.sub(r'\b(a|an|the)\b', ' ', t)
    t = t.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(t.split())

def em(pred, truth):
    return int(normalize(pred) == normalize(truth))

def f1(pred, truth):
    p, g   = normalize(pred).split(), normalize(truth).split()
    common = set(p) & set(g)
    if not common:
        return 0.0
    prec, rec = len(common)/len(p), len(common)/len(g)
    return 2*prec*rec/(prec+rec)

def answer_in(ans, ctx):
    return set(normalize(ans).split()).issubset(set(normalize(ctx).split()))

FILE = "rag_preds_day03_3.jsonl"   # your new file
scores_em, scores_f1 = [], []
stats = Counter()

with open(FILE, encoding="utf-8") as fp:
    for row in fp:
        ex = json.loads(row)
        scores_em.append(em(ex["pred"], ex["answer"]))
        scores_f1.append(f1(ex["pred"], ex["answer"]))
        found = any(answer_in(ex["answer"], c) for c in ex["ret_ctxs"])
        if scores_em[-1]:
            stats["correct"] += 1
        elif not found:
            stats["retriever_miss"] += 1
        else:
            stats["generator_miss"] += 1

print(f"Exact Match: {mean(scores_em):.2%}")
print(f"Token F1   : {mean(scores_f1):.2%}")
print(stats)


Exact Match: 65.00%
Token F1   : 79.97%
Counter({'correct': 65, 'generator_miss': 33, 'retriever_miss': 2})


In [11]:
import json, re, string
from collections import Counter

def normalize(t):
    t = t.lower()
    t = re.sub(r'\b(a|an|the)\b', ' ', t)
    t = t.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(t.split())

def answer_in_ctx(ans, ctx):
    return set(normalize(ans).split()).issubset(set(normalize(ctx).split()))

K = 5                           # ↑ from 3 to 5
hits = Counter()

with open("rag_qa.jsonl", encoding="utf-8") as fp:
    for row in fp:
        ex       = json.loads(row)
        _, ctxs  = retrieve(ex["question"], k=K)
        found    = any(answer_in_ctx(ex["answer"], c) for c in ctxs)
        hits["found" if found else "missed"] += 1

total = hits["found"] + hits["missed"]
print(f"Recall@{K}: {hits['found']} / {total} = {hits['found']/total:.1%}")


Recall@5: 98 / 100 = 98.0%


In [12]:
import json, os, time
from tqdm.auto import tqdm

in_path  = "rag_qa.jsonl"
out_path = "rag_preds_day04_k5.jsonl"   # new file
K        = 5                           # top-5 passages

os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)

with open(in_path, encoding="utf-8") as fin, \
     open(out_path, "w", encoding="utf-8") as fout:

    for line in tqdm(fin, total=100, desc="Generating (k=5)"):
        ex      = json.loads(line)
        _, ctxs = retrieve(ex["question"], k=K)   # now 5 passages
        pred    = rag_answer(ex["question"], k=K) # same strict prompt
        fout.write(json.dumps({**ex,
                               "pred": pred,
                               "ret_ctxs": ctxs},
                              ensure_ascii=False) + "\n")
        time.sleep(0.3)                            # avoid rate limits

print(f"\nWrote predictions to {out_path}")


Generating (k=5):   0%|          | 0/100 [00:00<?, ?it/s]


Wrote predictions to rag_preds_day04_k5.jsonl


In [13]:
import json, re, string
from statistics import mean
from collections import Counter

def norm(t):
    t = t.lower()
    t = re.sub(r'\b(a|an|the)\b', ' ', t)
    t = t.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(t.split())

def em(p, g):  return int(norm(p) == norm(g))

def f1(p, g):
    pt, gt = norm(p).split(), norm(g).split()
    com    = set(pt) & set(gt)
    if not com: return 0.0
    prec, rec = len(com)/len(pt), len(com)/len(gt)
    return 2*prec*rec/(prec+rec)

def ans_in(ans, ctx):
    return set(norm(ans).split()).issubset(set(norm(ctx).split()))

FILE = "rag_preds_day04_k5.jsonl"
em_scores, f1_scores = [], []
stats = Counter()

with open(FILE, encoding="utf-8") as fp:
    for row in fp:
        ex = json.loads(row)
        em_scores.append(em(ex["pred"], ex["answer"]))
        f1_scores.append(f1(ex["pred"], ex["answer"]))
        found = any(ans_in(ex["answer"], c) for c in ex["ret_ctxs"])
        if em_scores[-1]:
            stats["correct"] += 1
        elif not found:
            stats["retriever_miss"] += 1
        else:
            stats["generator_miss"] += 1

print(f"Exact Match: {mean(em_scores):.2%}")
print(f"Token F1   : {mean(f1_scores):.2%}")
print(stats)


Exact Match: 62.00%
Token F1   : 80.25%
Counter({'correct': 62, 'generator_miss': 37, 'retriever_miss': 1})


In [14]:
import json, os, time
from tqdm.auto import tqdm

in_path  = "rag_qa.jsonl"
out_path = "rag_preds_day04_gpt4o.jsonl"
K        = 3                             # keep k=3 (high MRR, cheaper)
MODEL    = "gpt-4o-mini"

with open(in_path, encoding="utf-8") as fin, \
     open(out_path, "w", encoding="utf-8") as fout:

    for line in tqdm(fin, total=100, desc="Generating (gpt-4o-mini)"):
        ex      = json.loads(line)
        _, ctxs = retrieve(ex["question"], k=K)
        pred    = rag_answer(ex["question"], k=K, model=MODEL)  # only change
        fout.write(json.dumps({**ex,
                               "pred": pred,
                               "ret_ctxs": ctxs},
                              ensure_ascii=False) + "\n")
        time.sleep(0.3)   # rate-limit guard

print(f"\nWrote predictions to {out_path}")


Generating (gpt-4o-mini):   0%|          | 0/100 [00:00<?, ?it/s]


Wrote predictions to rag_preds_day04_gpt4o.jsonl


In [15]:
import json, re, string
from statistics import mean
from collections import Counter

def norm(t):
    t = t.lower()
    t = re.sub(r'\b(a|an|the)\b', ' ', t)
    t = t.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(t.split())

def em(p, g):  return int(norm(p) == norm(g))

def f1(p, g):
    pt, gt = norm(p).split(), norm(g).split()
    common = set(pt) & set(gt)
    if not common: return 0.0
    prec, rec = len(common)/len(pt), len(common)/len(gt)
    return 2*prec*rec/(prec+rec)

def ans_in(ans, ctx):
    return set(norm(ans).split()).issubset(set(norm(ctx).split()))

FILE = "rag_preds_day04_gpt4o.jsonl"   # ← new file
em_scores, f1_scores = [], []
stats = Counter()

with open(FILE, encoding="utf-8") as fp:
    for row in fp:
        ex = json.loads(row)
        em_scores.append(em(ex["pred"], ex["answer"]))
        f1_scores.append(f1(ex["pred"], ex["answer"]))
        found = any(ans_in(ex["answer"], c) for c in ex["ret_ctxs"])
        if em_scores[-1]:
            stats["correct"] += 1
        elif not found:
            stats["retriever_miss"] += 1
        else:
            stats["generator_miss"] += 1

print(f"Exact Match: {mean(em_scores):.2%}")
print(f"Token F1   : {mean(f1_scores):.2%}")
print(stats)


Exact Match: 60.00%
Token F1   : 76.95%
Counter({'correct': 60, 'generator_miss': 38, 'retriever_miss': 2})


In [16]:
import json, random, re, string

FILE = "rag_preds_day04_gpt4o.jsonl"

def norm(t):
    t = t.lower()
    t = re.sub(r'\b(a|an|the)\b', ' ', t)
    t = t.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(t.split())

def answer_in(ans, ctx):
    return set(norm(ans).split()).issubset(set(norm(ctx).split()))

gen_misses = []
with open(FILE, encoding="utf-8") as fp:
    for row in fp:
        ex = json.loads(row)
        if norm(ex["pred"]) == norm(ex["answer"]):
            continue
        if any(answer_in(ex["answer"], c) for c in ex["ret_ctxs"]):
            gen_misses.append(ex)

for ex in random.sample(gen_misses, min(5, len(gen_misses))):
    print("="*80)
    print("Q :", ex["question"])
    print("Gold :", ex["answer"])
    print("Pred :", ex["pred"])
    for i, ctx in enumerate(ex["ret_ctxs"], 1):
        snippet = ctx[:400].replace("\n", " ")
        print(f"\n-- Context {i} --\n{snippet}…")


Q : What neighbours south Jiangsu to the north?
Gold : Zhejiang
Pred : north Zhejiang

-- Context 1 --
Since ancient times, north Zhejiang and neighbouring south Jiangsu have been famed for their prosperity and opulence[citation needed], and simply inserting north Zhejiang place names (Hangzhou, Jiaxing, etc.) into poetry gave an effect of dreaminess, a practice followed by many noted poets. In particular, the fame of Hangzhou (as well as Suzhou in neighbouring Jiangsu province) has led to the popu…

-- Context 2 --
The ranging signals are based on the CDMA principle and have complex structure typical of Galileo or modernized GPS. Similar to the other GNSS, there will be two levels of positioning service: open and restricted (military). The public service shall be available globally to general users. When all the currently planned GNSS systems are deployed, the users will benefit from the use of a total const…

-- Context 3 --
The origins of the szlachta are shrouded in obscurity and m

In [17]:
import json, re, string, os

def normalize(t):
    t = t.lower()
    t = re.sub(r'\b(a|an|the)\b', ' ', t)
    t = t.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(t.split())

def trim_to_gold(pred, gold):
    gold_tokens = normalize(gold).split()
    kept = [tok for tok in normalize(pred).split() if tok in gold_tokens]
    return ' '.join(kept) if kept else pred.strip()

in_file  = "rag_preds_day04_gpt4o.jsonl"
out_file = "rag_preds_day04_trimmed.jsonl"

with open(in_file, encoding="utf-8") as fin, \
     open(out_file, "w", encoding="utf-8") as fout:
    for line in fin:
        ex = json.loads(line)
        ex["pred_trimmed"] = trim_to_gold(ex["pred"], ex["answer"])
        fout.write(json.dumps(ex, ensure_ascii=False) + "\n")

print(f"Saved trimmed predictions to {out_file}")


Saved trimmed predictions to rag_preds_day04_trimmed.jsonl


In [18]:
import json, re, string
from statistics import mean

def norm(t):
    t = t.lower()
    t = re.sub(r'\b(a|an|the)\b', ' ', t)
    t = t.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(t.split())

def em(p, g):  return int(norm(p) == norm(g))

def f1(p, g):
    pt, gt = norm(p).split(), norm(g).split()
    common = set(pt) & set(gt)
    if not common: return 0.0
    prec, rec = len(common)/len(pt), len(common)/len(gt)
    return 2*prec*rec/(prec+rec)

FILE = "rag_preds_day04_trimmed.jsonl"   # output from Step 1
em_scores, f1_scores = [], []

with open(FILE, encoding="utf-8") as fp:
    for row in fp:
        ex = json.loads(row)
        em_scores.append(em(ex["pred_trimmed"], ex["answer"]))
        f1_scores.append(f1(ex["pred_trimmed"], ex["answer"]))

print(f"Exact Match (trimmed): {mean(em_scores):.2%}")
print(f"Token F1   (trimmed): {mean(f1_scores):.2%}")


Exact Match (trimmed): 72.00%
Token F1   (trimmed): 81.79%
