In [1]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl 

In [41]:
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Load SQuAD dataset (5000 samples)
squad = load_dataset("squad", split="train[:30000]")

# Extract all relevant fields for later evaluation
corpus = [item["context"] for item in squad]
questions = [item["question"] for item in squad]
references = [item["answers"]["text"][0] if item["answers"]["text"] else "" for item in squad]  # avoid empty answers
contexts = corpus  # just for clarity

# TF-IDF Sparse Retriever
class SparseRetriever:
    def __init__(self, docs):
        self.vectorizer = TfidfVectorizer().fit(docs)
        self.doc_vectors = self.vectorizer.transform(docs)
        self.docs = docs

    def retrieve(self, query, k=5):
        q_vec = self.vectorizer.transform([query])
        scores = np.dot(self.doc_vectors, q_vec.T).toarray().squeeze()
        top_k_idx = scores.argsort()[-k:][::-1]
        return [self.docs[i] for i in top_k_idx]

# SBERT Dense Retriever
class DenseRetriever:
    def __init__(self, docs):
        self.model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
        self.docs = docs
        self.doc_embeddings = self.model.encode(docs, convert_to_tensor=True)

    def retrieve(self, query, k=5):
        query_embedding = self.model.encode(query, convert_to_tensor=True)
        hits = util.semantic_search(query_embedding, self.doc_embeddings, top_k=k)[0]
        return [self.docs[hit['corpus_id']] for hit in hits]

In [42]:
# Combine documents into context
def augment_context(query, docs):
    return query + " " + " ".join(docs)

# Generative QA (T5 example)
class T5Generator:
    def __init__(self):
        self.tokenizer = T5Tokenizer.from_pretrained("t5-base")
        self.model = T5ForConditionalGeneration.from_pretrained("t5-base")

    def generate(self, input_text, max_len=64):
        inputs = self.tokenizer("question: " + input_text, return_tensors="pt", truncation=True)
        output = self.model.generate(**inputs, max_length=max_len)
        return self.tokenizer.decode(output[0], skip_special_tokens=True)

In [43]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Better extractive QA model
tokenizer_bidaf = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model_bidaf = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model_bidaf.eval()

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,)

In [44]:
def extract_answer_bidaf(question, context):
    inputs = tokenizer_bidaf.encode_plus(question, context, return_tensors="pt", truncation=True, max_length=512)
    input_ids = inputs["input_ids"]

    with torch.no_grad():
        outputs = model_bidaf(**inputs)
        start_scores = outputs.start_logits
        end_scores = outputs.end_logits

    start_idx = torch.argmax(start_scores)
    end_idx = torch.argmax(end_scores) + 1

    answer_tokens = input_ids[0][start_idx:end_idx]
    return tokenizer_bidaf.decode(answer_tokens, skip_special_tokens=True)

In [45]:
def rag_pipeline(query, retriever, answer_mode='generate'):
    docs = retriever.retrieve(query, k=5)
    context = augment_context(query, docs)

    if answer_mode == 'generate':
        return T5Generator().generate(context)
    elif answer_mode == 'extract':
        return extract_answer_bidaf(query, context)
    else:
        raise ValueError("Unsupported answer mode")

In [46]:
def evaluate(answers, ground_truths):
    from datasets import load_metric
    metric = load_metric("squad")
    return metric.compute(predictions=answers, references=ground_truths)

In [47]:
retriever = DenseRetriever(corpus)  # or SparseRetriever(corpus)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [48]:
def qa_pipeline(question, top_k=3):
    # Retrieve top-k documents
    retrieved_docs = retriever.retrieve(question, k=top_k)
    context = " ".join(retrieved_docs)
    input_text = f"question: {question} context: {context}"

    # Generate answer using T5 pipeline
    generated = rag_pipeline(input_text, max_length=64, truncation=True)[0]['generated_text']
    return generated

In [49]:
import logging
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

In [50]:
query = "What is the capital of Germany?"
answer = rag_pipeline(query, retriever, answer_mode='generate')
print("Answer:", answer)

Answer: Bern


In [51]:
query = "Who was the first president of the United States?"
answer = rag_pipeline(query, retriever, answer_mode='generate')
print("Answer:", answer)

Answer: George Washington


In [52]:
query = "What is the chemical symbol for hydrogen?"
answer = rag_pipeline(query, retriever, answer_mode='generate')
print("Answer:", answer)

Answer: H


In [53]:
query = "Who developed the theory of relativity?"
answer = rag_pipeline(query, retriever, answer_mode='generate')
print("Answer:", answer)

Answer: Whitehead


In [54]:
query = "Where is the Eiffel Tower located?"
answer = rag_pipeline(query, retriever, answer_mode='generate')
print("Answer:", answer)

Answer: Notre Dame cathedral


In [55]:
print("\n\n".join(retriever.retrieve("What is the capital of Germany?", k=5)))

The city of Bern or Berne (German: Bern, pronounced [bɛrn] ( listen); French: Berne [bɛʁn]; Italian: Berna [ˈbɛrna]; Romansh: Berna  [ˈbɛrnɐ] (help·info); Bernese German: Bärn [b̥æːrn]) is the de facto capital of Switzerland, referred to by the Swiss as their (e.g. in German) Bundesstadt, or "federal city".[note 1] With a population of 140,634 (November 2015), Bern is the fifth most populous city in Switzerland. The Bern agglomeration, which includes 36 municipalities, had a population of 406,900 in 2014. The metropolitan area had a population of 660,000 in 2000. Bern is also the capital of the Canton of Bern, the second most populous of Switzerland's cantons.

The city of Bern or Berne (German: Bern, pronounced [bɛrn] ( listen); French: Berne [bɛʁn]; Italian: Berna [ˈbɛrna]; Romansh: Berna  [ˈbɛrnɐ] (help·info); Bernese German: Bärn [b̥æːrn]) is the de facto capital of Switzerland, referred to by the Swiss as their (e.g. in German) Bundesstadt, or "federal city".[note 1] With a popula

In [56]:
predicted_answers = []
eval_size = 100

for i in range(eval_size):
    question = questions[i]
    answer = rag_pipeline(question, retriever, answer_mode="generate")  # no extra kwargs
    predicted_answers.append(answer)

In [57]:
eval_references = references[:eval_size]

In [59]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [60]:
import evaluate

# Load SQuAD-style evaluator
metric = evaluate.load("squad")

# Format predictions and references correctly
formatted_preds = [{"id": str(i), "prediction_text": pred} for i, pred in enumerate(predicted_answers)]
formatted_refs = [{"id": str(i), "answers": {"answer_start": [0], "text": [ref]}} for i, ref in enumerate(eval_references)]

# Compute EM + F1
results = metric.compute(predictions=formatted_preds, references=formatted_refs)

print("SQuAD Evaluation:")
print(f"Exact Match (EM): {results['exact_match']:.2f}")
print(f"F1 Score: {results['f1']:.2f}")

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

SQuAD Evaluation:
Exact Match (EM): 48.00
F1 Score: 54.13


In [61]:
def evaluate_retriever(retriever, questions, references, corpus, k=5):
    hits = 0
    total = len(questions)

    for q, true_answer in zip(questions, references):
        retrieved_docs = retriever.retrieve(q, k=k)
        if any(true_answer.lower() in doc.lower() for doc in retrieved_docs):
            hits += 1

    recall_at_k = 100 * hits / total
    print(f"Retriever Evaluation: Recall@{k}: {recall_at_k:.2f}%")

# Evaluate both retrievers
print("Sparse Retriever Evaluation:")
evaluate_retriever(SparseRetriever(corpus), questions, references, corpus, k=5)

print("Dense Retriever Evaluation:")
evaluate_retriever(DenseRetriever(corpus), questions, references, corpus, k=5)

Sparse Retriever Evaluation:
Retriever Evaluation: Recall@5: 62.71%
Dense Retriever Evaluation:
Retriever Evaluation: Recall@5: 70.41%
