In [1]:
# 1. Install libraries (quiet; -U upgrades them to the latest release if already present)
!pip install -qU faiss-cpu sentence-transformers

# 2. Load contexts from yesterday’s JSONL
import os, json, numpy as np, faiss
from sentence_transformers import SentenceTransformer

data_path = "rag_qa.jsonl"

# Read one JSON object per line and keep only its "context" field.
# The `with` block closes the file deterministically, and blank lines
# (e.g. a trailing newline at EOF) are skipped instead of crashing json.loads.
with open(data_path, encoding="utf-8") as f:
    contexts = [json.loads(line)["context"] for line in f if line.strip()]

# Fail early with a readable message; otherwise an empty corpus only surfaces
# later as an obscure shape error when the index is built.
if not contexts:
    raise ValueError(f"No passages found in {data_path!r}; nothing to index")

# 3. Embed with a lightweight sentence-embedding model
# NOTE(review): all-MiniLM-L6-v2 appears to be an English-focused checkpoint,
# not a multilingual one — confirm against the model card, and switch to a
# multilingual checkpoint if non-English passages are expected.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
emb = model.encode(contexts, convert_to_numpy=True, show_progress_bar=True)

# FAISS operates on C-contiguous float32 matrices; this is a no-op copy-wise
# when the encoder already returns that layout.
emb = np.ascontiguousarray(emb, dtype=np.float32)

# 4. Build a cosine-similarity FAISS index
faiss.normalize_L2(emb)           # in-place L2 normalisation: inner product == cosine
index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb)

print(f"Indexed {index.ntotal} passages of dimension {emb.shape[1]}")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m470.2/470.2 kB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m125.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Indexed 100 passages of dimension 384


In [2]:
def retrieve(query: str, k: int = 3):
    """Return ``(scores, contexts)`` for the top-k passages most similar to ``query``.

    The query is embedded with the module-level ``model``, L2-normalised, and
    searched against the module-level inner-product ``index``. Results are
    mapped back to text through the module-level ``contexts`` list.

    Args:
        query: Natural-language question to search for.
        k: Maximum number of passages to return. Values larger than the number
            of indexed passages are clamped; values <= 0 yield an empty result.

    Returns:
        A tuple ``(scores, passages)`` where ``scores`` is a 1-D numpy array of
        similarity scores and ``passages`` is the list of matching context
        strings, both in the order returned by the index search.
    """
    # Never ask FAISS for more neighbours than exist: it pads missing results
    # with label -1, and contexts[-1] would silently return the *last* passage
    # as if it were a genuine hit.
    k = min(k, index.ntotal)
    if k <= 0:
        return np.empty(0, dtype=np.float32), []

    q_emb = model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)  # in-place, so the inner product equals cosine
    scores, idxs = index.search(q_emb, k)

    # Defensive: drop any remaining -1 padding so scores and passages stay aligned.
    valid = idxs[0] >= 0
    return scores[0][valid], [contexts[i] for i in idxs[0][valid]]

# 🔍 quick sanity check on the first question in our dataset
import json, itertools
# Read only the first record; the context manager closes the handle instead of
# leaving it to the garbage collector.
with open("rag_qa.jsonl", encoding="utf-8") as f:
    first_q = json.loads(f.readline())["question"]
print("QUESTION:", first_q)

# Show the two best passages with their cosine score and a 200-char preview.
scores, hits = retrieve(first_q, k=2)
for rank, (s, ctx) in enumerate(zip(scores, hits), 1):
    print(f"\nRank {rank} | cosine={s:.3f}\n{ctx[:200]} …")


QUESTION: What percentage of Egyptians polled support death penalty for those leaving Islam?

Rank 1 | cosine=0.646
The Pew Forum on Religion & Public Life ranks Egypt as the fifth worst country in the world for religious freedom. The United States Commission on International Religious Freedom, a bipartisan indepen …

Rank 2 | cosine=0.319
For those that remained under the Ottoman Empire's millet system, religion was the defining characteristic of national groups (milletler), so the exonym "Greeks" (Rumlar from the name Rhomaioi) was ap …


In [3]:
import json, re
from collections import Counter

def contains_answer(context: str, answer: str) -> bool:
    """Case-insensitive, whitespace-insensitive substring match.

    Both strings are lower-cased, every run of whitespace is collapsed to a
    single space, and leading/trailing whitespace is stripped before testing
    whether ``answer`` occurs inside ``context``.

    Args:
        context: Passage text to search in.
        answer: Gold answer text to look for.

    Returns:
        True if the normalised answer is a substring of the normalised context.
        An empty or whitespace-only answer returns False: it would otherwise
        match every context and inflate recall.
    """
    norm_ctx = re.sub(r"\s+", " ", context.lower()).strip()
    norm_ans = re.sub(r"\s+", " ", answer.lower()).strip()
    if not norm_ans:
        return False
    return norm_ans in norm_ctx

# Recall@3 by substring match: an example counts as "found" when any of the
# top-3 retrieved passages contains the gold answer.
hits = Counter()
with open("rag_qa.jsonl", encoding="utf-8") as f:   # closed deterministically
    for line in f:
        if not line.strip():
            continue                                  # tolerate blank lines
        ex = json.loads(line)
        _scores, ctxs = retrieve(ex["question"], k=3)
        found = any(contains_answer(c, ex["answer"]) for c in ctxs)
        hits["found" if found else "missed"] += 1

total = sum(hits.values())
if total:
    print(f"Recall@3: {hits['found']} / {total} "
          f"= {hits['found']/total:.1%}")
else:
    # Avoid a ZeroDivisionError when the dataset file has no examples.
    print("Recall@3: no examples found in rag_qa.jsonl")


Recall@3: 99 / 100 = 99.0%


In [4]:
import json, re, string
from collections import Counter

def normalize(text):
    """Normalise text for token-level answer matching.

    Steps, in order: lower-case, replace the standalone articles
    ``a`` / ``an`` / ``the`` with a space, delete punctuation, and collapse
    all whitespace runs to single spaces.

    Punctuation means every character in ``string.punctuation`` plus any
    character whose Unicode general category starts with ``P`` (curly quotes,
    en/em dashes, guillemets, ...). Removing only ASCII punctuation would leave
    such marks glued to words, so ``“greeks”`` would never equal ``greeks``.

    Args:
        text: Raw string (context or answer).

    Returns:
        The normalised string with tokens separated by single spaces.
    """
    import unicodedata  # local import keeps this cell self-contained

    text = text.lower()
    text = re.sub(r'\b(a|an|the)\b', ' ', text)           # drop articles
    text = ''.join(
        ch for ch in text
        if ch not in string.punctuation
        and not unicodedata.category(ch).startswith('P')
    )
    return ' '.join(text.split())

def all_answer_tokens_present(context, answer):
    """Return True when every normalised answer token occurs in the context.

    Both strings are passed through ``normalize`` and split on whitespace;
    token order and multiplicity are ignored (set comparison).

    Args:
        context: Passage text to search in.
        answer: Gold answer text.

    Returns:
        True if the answer's token set is a non-empty subset of the context's
        token set. An answer that normalises to no tokens (empty, only
        punctuation, or only articles) returns False: the empty set is a
        subset of everything, which would count unverifiable answers as found.
    """
    ans_tokens = set(normalize(answer).split())
    if not ans_tokens:
        return False
    ctx_tokens = set(normalize(context).split())
    return ans_tokens.issubset(ctx_tokens)                # recall == 1.0

# Recall@3 by token match: an example counts as "found" when any of the top-3
# retrieved passages contains every normalised token of the gold answer.
hits = Counter()
with open("rag_qa.jsonl", encoding="utf-8") as f:   # closed deterministically
    for line in f:
        if not line.strip():
            continue                                  # tolerate blank lines
        ex = json.loads(line)
        _scores, ctxs = retrieve(ex["question"], k=3)
        found = any(all_answer_tokens_present(c, ex["answer"]) for c in ctxs)
        hits["found" if found else "missed"] += 1

total = sum(hits.values())
if total:
    print(f"Recall@3 (all answer tokens present): {hits['found']} / {total} "
          f"= {hits['found']/total:.1%}")
else:
    # Avoid a ZeroDivisionError when the dataset file has no examples.
    print("Recall@3 (all answer tokens present): no examples found in rag_qa.jsonl")


Recall@3 (all answer tokens present): 97 / 100 = 97.0%


In [16]:
# Install the up-to-date OpenAI client
!pip install -qU "openai>=1.3.8"

import os, textwrap, random, json
from openai import OpenAI

# Shared OpenAI client; rag_answer() below sends its chat requests through it.
# No key is passed to the constructor, so it must already be available in the
# environment before this line runs.
# Make sure your API key is exported, e.g.:
# os.environ["OPENAI_API_KEY"] = "sk-…"
# (Avoid committing a real key in the notebook; load it from a secret store
# or prompt for it instead.)
client = OpenAI()  # pulls key from env

# Prompt skeleton, dedented once *before* any passage text is inserted.
# Dedenting after interpolation does not work: the retrieved passages and the
# "---" separators start at column 0, so textwrap.dedent finds no common
# margin and every instruction line keeps its source-code indentation.
_RAG_PROMPT_TEMPLATE = textwrap.dedent("""
    You are an expert question-answering assistant. Use ONLY the information
    in <context> to answer the question as concisely as possible. If the
    answer is not contained in the context, say "I don’t know."

    <context>
    {context_block}
    </context>

    Question: {question}
    Answer:
""").strip()


def rag_answer(question: str, k: int = 3, model="gpt-3.5-turbo-0125") -> str:
    """Retrieve top-k passages, then have the LLM answer from them.

    Args:
        question: The user question; also used as the retrieval query.
        k: Number of passages to retrieve and place in the prompt.
        model: Name of the OpenAI chat model to call. Note this parameter
            shadows the module-level embedding ``model`` inside this function
            only; ``retrieve`` still uses the module-level one.

    Returns:
        The model's answer with surrounding whitespace removed, or an empty
        string if the API returns no text content.
    """
    _, ctxs = retrieve(question, k=k)
    context_block = "\n\n---\n\n".join(ctxs)

    # str.format only parses the template, so braces inside the passages or
    # the question are inserted verbatim.
    prompt = _RAG_PROMPT_TEMPLATE.format(
        context_block=context_block,
        question=question,
    )

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
        max_tokens=64,
    )
    # `content` may be None; guard so .strip() cannot raise AttributeError.
    content = response.choices[0].message.content
    return (content or "").strip()

# 🔍 quick demo on a random question
# Load the questions inside a context manager so the file handle is closed,
# skipping blank lines that would otherwise crash json.loads.
with open("rag_qa.jsonl", encoding="utf-8") as f:
    demo_questions = [json.loads(l)["question"] for l in f if l.strip()]

sample_q = random.choice(demo_questions)
print("QUESTION:", sample_q)
print("RAG ANSWER:", rag_answer(sample_q))


QUESTION: What is the largest athletic organisation in the country?
RAG ANSWER: The German Football Federation (Deutscher Fußballbund)


In [18]:
import random, json

# pick a random question from our 100-pair file
# The context manager closes the file handle; blank lines are skipped so they
# cannot crash json.loads.
with open("rag_qa.jsonl", encoding="utf-8") as f:
    candidate_questions = [json.loads(l)["question"] for l in f if l.strip()]

sample_q = random.choice(candidate_questions)

print("QUESTION:", sample_q)
print("\nRAG ANSWER:", rag_answer(sample_q))


QUESTION: What is one group that Bermuda's black population can link some of their ancestry to?

RAG ANSWER: Native Americans


In [21]:
!pip install -qU tqdm   # progress bar (does nothing if already present)

import json, os, time
from tqdm.auto import tqdm

in_path   = "rag_qa.jsonl"
out_path  = "rag_preds.jsonl"
k         = 3               # top-k passages to feed the LLM

# Count the non-blank records up front so the progress bar total is correct
# for any dataset size rather than a hard-coded 100.
with open(in_path, encoding="utf-8") as fin:
    n_examples = sum(1 for line in fin if line.strip())

with open(in_path,  encoding="utf-8") as fin,  \
     open(out_path, "w", encoding="utf-8") as fout:

    # Skip blank lines (e.g. a trailing newline) instead of crashing json.loads.
    records = (line for line in fin if line.strip())

    for line in tqdm(records, total=n_examples, desc="Generating"):
        ex          = json.loads(line)
        pred_answer = rag_answer(ex["question"], k=k)
        record      = {**ex, "pred": pred_answer}   # keep originals + prediction
        fout.write(json.dumps(record, ensure_ascii=False) + "\n")
        time.sleep(0.3)   # polite pacing; adjust if rate limits allow

print(f"\nAll done — wrote predictions to {out_path}")


Generating:   0%|          | 0/100 [00:00<?, ?it/s]


All done — wrote predictions to rag_preds.jsonl
