In [1]:
!pip install rank_bm25 faiss-cpu -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from sklearn.metrics import precision_score, recall_score
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
import numpy as np
import faiss

In [3]:
# Sample corpus and queries
documents = [
    "The cat sat on the mat.",
    "Dogs are great pets.",
    "Cats and dogs are mortal enemies.",
    "I love to play football.",
    "Football is a popular sport worldwide.",
    "The economy is improving steadily.",
    "Stock markets are seeing a bullish trend.",
    "Quantum physics is a fascinating subject.",
    "The book on physics was enlightening.",
    "Reading books can expand your knowledge."
]

queries = [
    "Tell me something about dogs.",
    "What is football?",
    "Books on physics."
]

# Ground truth relevance per query (indexes of relevant documents)
ground_truth = {
    0: [1, 2],
    1: [3, 4],
    2: [7, 8]
}


In [4]:
# Load dense model
model = SentenceTransformer('all-MiniLM-L6-v2')
doc_embeddings = model.encode(documents, convert_to_numpy=True)
query_embeddings = model.encode(queries, convert_to_numpy=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# FAISS index
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(doc_embeddings)

In [6]:

# Sparse retrieval using BM25
tokenized_corpus = [doc.lower().split() for doc in documents]
bm25 = BM25Okapi(tokenized_corpus)

In [7]:
# Evaluation function
def evaluate_retrieval(retrieved_indices, ground_truth, k=3):
    precisions, recalls, mrrs, ndcgs = [], [], [], []
    for i, retrieved in enumerate(retrieved_indices):
        relevant = set(ground_truth[i])
        retrieved_k = retrieved[:k]
        relevant_k = [1 if idx in relevant else 0 for idx in retrieved_k]

        precision = sum(relevant_k) / k
        precisions.append(precision)

        recall = sum(relevant_k) / len(relevant)
        recalls.append(recall)

        mrr = 0
        for rank, is_relevant in enumerate(relevant_k, start=1):
            if is_relevant:
                mrr = 1 / rank
                break
        mrrs.append(mrr)

        dcg = sum(rel / np.log2(idx + 2) for idx, rel in enumerate(relevant_k))
        ideal_k = [1] * min(len(relevant), k) + [0] * (k - min(len(relevant), k))
        idcg = sum(rel / np.log2(idx + 2) for idx, rel in enumerate(ideal_k))
        ndcg = dcg / idcg if idcg > 0 else 0
        ndcgs.append(ndcg)

    return np.mean(precisions), np.mean(recalls), np.mean(mrrs), np.mean(ndcgs)


In [8]:
# Dense retrieval
_, dense_indices = index.search(query_embeddings, 5)
dense_metrics = evaluate_retrieval(dense_indices, ground_truth)

In [9]:
# Sparse retrieval
sparse_indices = [np.argsort(bm25.get_scores(query.lower().split()))[::-1][:5] for query in queries]
sparse_metrics = evaluate_retrieval(sparse_indices, ground_truth)

In [10]:
# Hybrid retrieval
def hybrid_retrieval(query_embeddings, queries, top_k=5):
    hybrid_indices = []
    for i, query_embedding in enumerate(query_embeddings):
        dense_scores = cosine_similarity([query_embedding], doc_embeddings)[0]
        dense_norm = (dense_scores - dense_scores.min()) / (dense_scores.max() - dense_scores.min())

        sparse_scores = bm25.get_scores(queries[i].lower().split())
        sparse_norm = (sparse_scores - np.min(sparse_scores)) / (np.max(sparse_scores) - np.min(sparse_scores))

        hybrid_scores = dense_norm + sparse_norm
        top_indices = np.argsort(hybrid_scores)[::-1][:top_k]
        hybrid_indices.append(top_indices)
    return hybrid_indices

hybrid_indices = hybrid_retrieval(query_embeddings, queries)
hybrid_metrics = evaluate_retrieval(hybrid_indices, ground_truth)

  sparse_norm = (sparse_scores - np.min(sparse_scores)) / (np.max(sparse_scores) - np.min(sparse_scores))


In [11]:
# Output
print("Dense Retrieval Metrics (Precision, Recall, MRR, nDCG):", dense_metrics)
print("Sparse Retrieval Metrics (Precision, Recall, MRR, nDCG):", sparse_metrics)
print("Hybrid Retrieval Metrics (Precision, Recall, MRR, nDCG):", hybrid_metrics)

Dense Retrieval Metrics (Precision, Recall, MRR, nDCG): (np.float64(0.6666666666666666), np.float64(1.0), np.float64(1.0), np.float64(1.0))
Sparse Retrieval Metrics (Precision, Recall, MRR, nDCG): (np.float64(0.2222222222222222), np.float64(0.3333333333333333), np.float64(0.27777777777777773), np.float64(0.23114213453909027))
Hybrid Retrieval Metrics (Precision, Recall, MRR, nDCG): (np.float64(0.3333333333333333), np.float64(0.5), np.float64(0.6666666666666666), np.float64(0.5109559939712153))


# Let's do it with Real world dataset

In [12]:
!pip install beir -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/77.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m288.0/288.0 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m81.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m80.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [14]:
import json
from beir import util
from beir.datasets.data_loader import GenericDataLoader

# Download and load
dataset="scifact"
url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"
data_path = util.download_and_unzip(url, "datasets/scifact")
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

# 3️⃣ Limit corpus to 500 random docs
all_doc_ids = list(corpus.keys())
np.random.seed(42)
selected_doc_ids = all_doc_ids[:500]  # first 500 (or use np.random.choice to sample)

# Map doc_id → text and build index mappings
documents = []
doc_id_to_index = {}
for i, did in enumerate(selected_doc_ids):
    doc = corpus[did]
    documents.append(doc.get("text", "") )
    doc_id_to_index[did] = i

# 4️⃣ Build queries list and ground truth mapping
queries_list = []
ground_truth = {}
for qi, qid in enumerate(queries):
    # Include only if at least one relevant doc in our 500
    rels = [doc_id_to_index[did] for did in qrels.get(qid, {}) if did in doc_id_to_index]
    if rels:
        ground_truth[len(queries_list)] = rels
        queries_list.append(queries[qid])


datasets/scifact/scifact.zip:   0%|          | 0.00/2.69M [00:00<?, ?iB/s]

  0%|          | 0/5183 [00:00<?, ?it/s]

In [15]:
queries_list

['5% of perinatal mortality is due to low birth weight.',
 'Albendazole is used to treat lymphatic filariasis.',
 'Autophagy declines in aged organisms.',
 'CCL19 is absent within dLNs.',
 'Cells undergoing methionine restriction may activate miRNAs.',
 'Chenodeosycholic acid treatment increases whole-body energy expenditure.',
 'Chenodeosycholic acid treatment reduces whole-body energy expenditure.',
 'Chronic aerobic exercise alters endothelial function, improving vasodilating mechanisms mediated by NO.',
 'Cold exposure increases BAT recruitment.',
 'Cold exposure reduces BAT recruitment.',
 'Deleting Raptor reduces G-CSF levels.',
 'Ethanol stress decreases the expression of IBP in bacteria.',
 'Exposure to fine particulate air pollution is relate to anxiety prevalence.',
 'Gene expression does not vary appreciably across genetically identical cells.',
 'IgA plasma cells that are specific for transglutaminase 2 accumulate in the duodenal mucosa on commencement of a gluten-free diet

In [16]:
ground_truth

{0: [291],
 1: [213],
 2: [196],
 3: [406],
 4: [416],
 5: [281],
 6: [281],
 7: [200],
 8: [175],
 9: [175],
 10: [370],
 11: [203],
 12: [145],
 13: [90],
 14: [267],
 15: [189],
 16: [472],
 17: [213],
 18: [213],
 19: [389],
 20: [332],
 21: [329],
 22: [433],
 23: [344],
 24: [344],
 25: [266],
 26: [365],
 27: [208],
 28: [299],
 29: [23],
 30: [393],
 31: [5],
 32: [446],
 33: [446],
 34: [52]}

In [17]:
import pandas as pd

# ✅ 1. Corpus DataFrame (500 docs)
corpus_df = pd.DataFrame({
    "doc_id": selected_doc_ids,
    "text": documents
})

# ✅ 2. Query-GroundTruth DataFrame
query_data = []
for local_qid, global_qid in enumerate([k for k in qrels if any(d in selected_doc_ids for d in qrels[k])]):
    relevant_docs = [doc_id for doc_id in qrels[global_qid] if doc_id in selected_doc_ids]
    if relevant_docs:
        query_data.append({
            "query_id": global_qid,
            "query": queries[global_qid],
            "relevant_doc_ids": relevant_docs
        })

queries_df = pd.DataFrame(query_data)

In [18]:
queries_df

Unnamed: 0,query_id,query,relevant_doc_ids
0,13,5% of perinatal mortality is due to low birth ...,[1606628]
1,94,Albendazole is used to treat lymphatic filaria...,[1215116]
2,148,Autophagy declines in aged organisms.,[1084345]
3,198,CCL19 is absent within dLNs.,[2177022]
4,238,Cells undergoing methionine restriction may ac...,[2251426]
5,248,Chenodeosycholic acid treatment increases whol...,[1568684]
6,249,Chenodeosycholic acid treatment reduces whole-...,[1568684]
7,261,Chronic aerobic exercise alters endothelial fu...,[1122279]
8,268,Cold exposure increases BAT recruitment.,[970012]
9,269,Cold exposure reduces BAT recruitment.,[970012]


In [19]:
# 5️⃣ Initialize model, embed docs & queries
model = SentenceTransformer('all-MiniLM-L6-v2')
doc_embeddings = model.encode(documents, convert_to_numpy=True)
query_embeddings = model.encode(queries_list, convert_to_numpy=True)

In [20]:
# 6️⃣ Build FAISS index (dense retrieval)
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(doc_embeddings)

In [21]:
# 7️⃣ Build BM25 sparse index
tokenized = [doc.lower().split() for doc in documents]
bm25 = BM25Okapi(tokenized)

In [22]:
# 8️⃣ Define evaluation function
from sklearn.metrics import precision_score, recall_score
def evaluate_retrieval(retrieved_indices, ground_truth, k=5):
    precisions, recalls, mrrs, ndcgs = [], [], [], []
    for i, retrieved in enumerate(retrieved_indices):
        rel = set(ground_truth[i])
        topk = retrieved[:k]
        hits = [1 if idx in rel else 0 for idx in topk]

        precisions.append(sum(hits)/k)
        recalls.append(sum(hits)/len(rel))

        # MRR
        mrr = next((1/(rank+1) for rank,isrel in enumerate(hits) if isrel), 0)
        mrrs.append(mrr)

        # nDCG
        dcg = sum(hit/np.log2(rank+2) for rank,hit in enumerate(hits))
        ideal = sorted([1]*len(rel) + [0]*(k-len(rel)), reverse=True)
        idcg = sum(i/np.log2(idx+2) for idx,i in enumerate(ideal))
        ndcgs.append(dcg/idcg if idcg>0 else 0)
    return (np.mean(precisions), np.mean(recalls), np.mean(mrrs), np.mean(ndcgs))

In [23]:

# 9️⃣ Run retrieval
_, dense_idx = index.search(query_embeddings, 5)
sparse_idx = [np.argsort(bm25.get_scores(q.lower().split()))[::-1][:5]
               for q in queries_list]

def hybrid_retrieve(q_embs, queries_list, top_k=5):
    results = []
    ds = cosine_similarity(q_embs, doc_embeddings)
    for i, scores in enumerate(ds):
        dn = (scores - scores.min()) / (scores.max() - scores.min())
        sp = bm25.get_scores(queries_list[i].lower().split())
        sn = (sp - sp.min())/(sp.max()-sp.min())
        hy = dn + sn
        results.append(np.argsort(hy)[::-1][:top_k])
    return results

hybrid_idx = hybrid_retrieve(query_embeddings, queries_list)

In [24]:
# 10️⃣ Evaluate models
dense_metrics = evaluate_retrieval(dense_idx, ground_truth)
sparse_metrics = evaluate_retrieval(sparse_idx, ground_truth)
hybrid_metrics = evaluate_retrieval(hybrid_idx, ground_truth)

In [25]:
print("Dense    (P, R, MRR, nDCG):", dense_metrics)
print("Sparse   (P, R, MRR, nDCG):", sparse_metrics)
print("Hybrid   (P, R, MRR, nDCG):", hybrid_metrics)

Dense    (P, R, MRR, nDCG): (np.float64(0.17142857142857146), np.float64(0.8571428571428571), np.float64(0.769047619047619), np.float64(0.7912153161490374))
Sparse   (P, R, MRR, nDCG): (np.float64(0.1485714285714286), np.float64(0.7428571428571429), np.float64(0.6190476190476191), np.float64(0.6503388430612261))
Hybrid   (P, R, MRR, nDCG): (np.float64(0.17142857142857146), np.float64(0.8571428571428571), np.float64(0.7485714285714286), np.float64(0.7754570052598772))
