In [None]:
!pip install -qU datasets  # does nothing if already installed
from datasets import load_dataset
squad_ds = load_dataset("squad", split="train")
print(f"Loaded {len(squad_ds)} Q&A pairs")
squad_ds[:3]  # quick peek


In [None]:
import json
import os

# Tunable parameters for the export.
sample_size   = 100                    # number of Q&A rows requested
random_seed   = 42                     # fixes the shuffle so the sample is reproducible
out_path      = "data/rag_qa.jsonl"    # destination file, one JSON object per line

# Deterministic shuffle, then keep the first `sample_size` rows. The count is
# capped at the dataset length so select() is never asked for an index that
# does not exist.
n_rows = min(sample_size, len(squad_ds))
sampled = squad_ds.shuffle(seed=random_seed).select(range(n_rows))

# Create the parent directory only when there is one: os.path.dirname()
# returns "" for a bare filename and os.makedirs("") raises FileNotFoundError.
out_dir = os.path.dirname(out_path)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

n_written = 0
with open(out_path, "w", encoding="utf-8") as f:
    for ex in sampled:
        record = {
            "id":       ex["id"],
            "context":  ex["context"],
            "question": ex["question"],
            # take the first annotated answer text
            "answer":   ex["answers"]["text"][0]
        }
        # ensure_ascii=False keeps non-ASCII characters readable in the file
        # instead of escaping them as \uXXXX sequences.
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
        n_written += 1

# Report the number of records actually written, not the number requested.
print(f"Wrote {n_written} records to {out_path}")
sampled[:2]   # quick sanity-check
