In [1]:
!pip install datasets transformers evaluate langchain faiss-cpu langchain-community langchain_openai rouge_score

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.23-py3-none-any.whl.metadata (2.5 kB)
Collecting langchain_openai
  Downloading langchain_openai-0.3.16-py3-none-any.whl.metadata (2.3 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=20

In [3]:
import time
import numpy as np
import pandas as pd
import evaluate
from transformers import pipeline
from datasets import load_dataset
from datasets import concatenate_datasets
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA, LLMChain
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from google.api_core.exceptions import ResourceExhausted



In [15]:
full_val = load_dataset("squad")["validation"]
full_train = load_dataset("squad")["train"]

in_domain = full_val.shuffle(seed=0).select(range(10))
out_domain = full_train.shuffle(seed=1).select(range(10))
eval_ds = concatenate_datasets([in_domain, out_domain])


In [16]:
docs = [Document(page_content=ex["context"], metadata={"id": str(i)})
        for i, ex in enumerate(in_domain)]
emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(docs, emb)


In [17]:
# Extractive
ext_qa = pipeline("question-answering",
                  model="huggingface-course/bert-finetuned-squad",
                  tokenizer="huggingface-course/bert-finetuned-squad",
                  device=-1)

Device set to use cpu


In [18]:
from google.colab import userdata
api_key = userdata.get('OPENAI_API_KEY')

In [19]:
# Generative
gen_llm = ChatOpenAI(model="gpt-4o", max_tokens=64, temperature=0.0,api_key=api_key)
gen_chain = LLMChain(
    llm=gen_llm,
    prompt=PromptTemplate(
        input_variables=["question"],
        template="Answer the question: {question}"
    )
)




In [21]:
# RAG
rag_qa = RetrievalQA.from_chain_type(
    llm=gen_llm,
    chain_type="map_reduce",
    retriever=vectorstore.as_retriever(k=5),
)


In [22]:
squad_metric = evaluate.load("squad")
rouge      = evaluate.load("rouge")

In [23]:
def retrieval_stats(question, gold_idx):
    docs = vectorstore.as_retriever(k=5).get_relevant_documents(question)
    ids  = [int(d.metadata["id"]) for d in docs]
    # recall@5
    recall = 1.0 if gold_idx in ids else 0.0
    # mrr
    if gold_idx in ids:
        rank = ids.index(gold_idx) + 1
        mrr  = 1.0 / rank
    else:
        mrr = 0.0
    return recall, mrr



In [24]:
results = []
for approach, fn in [
    ("Extractive", lambda q, c, i: ext_qa(question=q, context=c)["answer"]),
    ("Generative", lambda q, c, i: gen_chain.invoke({"question": q})["text"].strip()),
    ("RAG",        lambda q, c, i: rag_qa.run(q).strip())
]:
    preds, refs = [], []
    recalls, mrrs = [], []
    times = []

    start_time = time.time()  # Start timing for the approach

    for i, ex in enumerate(eval_ds):
        q, ctx = ex["question"], ex["context"]
        golds   = ex["answers"]["text"]
        try:
            ans = fn(q, ctx, i if i < 10 else None)
        except ResourceExhausted:
            ans = fn(q, ctx, i if i < 10 else None)

        preds.append({"id": str(i), "prediction_text": ans})
        refs.append({"id": str(i), "answers": {"text": golds, "answer_start": ex["answers"]["answer_start"]}})

        # only compute retrieval metrics for in-domain (i < 10)
        if approach == "RAG" and i < 10:
            r, m = retrieval_stats(q, i)
        else:
            r, m = None, None
        recalls.append(r)
        mrrs.append(m)

    end_time = time.time()  # End timing for the approach
    elapsed_time = end_time - start_time  # Total time taken
    qps = len(eval_ds) / elapsed_time  # Calculate queries per second (QPS)

    # core QA metrics
    scores = squad_metric.compute(predictions=preds, references=refs)
    # rouge
    rouge_scores = rouge.compute(predictions=[p["prediction_text"] for p in preds],
                                 references=[r["answers"]["text"][0] for r in refs])

    results.append({
        "Approach": approach,
        "EM": scores["exact_match"],
        "F1": scores["f1"],
        "ROUGE-L": rouge_scores["rougeL"],
        "Recall@5 (in-d)": np.nanmean([r for r in recalls if r is not None]),
        "MRR@5 (in-d)":    np.nanmean([m for m in mrrs    if m is not None]),
        "Throughput (QPS)": round(qps, 2)
    })

  "Recall@5 (in-d)": np.nanmean([r for r in recalls if r is not None]),
  "MRR@5 (in-d)":    np.nanmean([m for m in mrrs    if m is not None]),
  "Recall@5 (in-d)": np.nanmean([r for r in recalls if r is not None]),
  "MRR@5 (in-d)":    np.nanmean([m for m in mrrs    if m is not None]),


In [25]:
df = pd.DataFrame(results)
df.set_index("Approach", inplace=True)
df


Unnamed: 0_level_0,EM,F1,ROUGE-L,Recall@5 (in-d),MRR@5 (in-d),Throughput (QPS)
Approach,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Extractive,90.0,96.285714,0.933333,,,3.16
Generative,0.0,6.027052,0.052831,,,0.5
RAG,0.0,15.626221,0.122424,1.0,1.0,0.23
