# 1. Environment Setup

In [2]:
from google.colab import drive
drive.mount('/content/drive')
PROJECT_PATH = "/content/drive/MyDrive/CS6120_project"

# Environment Setup
!pip install sentence-transformers
!pip install faiss-cpu
!pip install rank-bm25
!pip install datasets
!pip install pytrec_eval
!pip install hnswlib

import os
import numpy as np
import nltk
import random
import pytrec_eval
import hnswlib
import faiss

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
from collections import Counter
from datasets import load_dataset
import pandas as pd

nltk.download('punkt_tab')
nltk.download('stopwords')

# Google Drive is already mounted at the top of this cell (see PROJECT_PATH),
# so no additional mount is needed here.

os.environ["WANDB_DISABLED"] = "true"  # Optional: Disable wandb logging


Mounted at /content/drive
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# 2. Prepare Data

In [14]:
def check_positive_counts(queries, qrels):
    """
    Print how many positive (relevant) passages each query has, as a histogram.

    Args:
        queries: Sized iterable of query ids (e.g. a {qid: text} dict).
        qrels: Mapping {qid: {doc_id: relevance}}. A query id that is absent
            from ``qrels`` is counted as having 0 positives.

    Prints one line per distinct positive count (ascending), followed by the
    total number of queries and the number/percentage of queries that have no
    positive at all. Returns None.
    """
    # Histogram: number of positives -> number of queries with that many.
    counter = Counter(
        len(qrels[qid]) if qid in qrels else 0
        for qid in queries
    )

    print("Distribution of positive examples (Number of positive examples: Number of queries):")
    for num_pos, num_queries in sorted(counter.items()):
        print(f"{num_pos} positive examples: {num_queries} queries")

    total_queries = len(queries)
    no_positive = counter.get(0, 0)
    # Guard the percentage so an empty query set reports 0.00% instead of
    # raising ZeroDivisionError.
    no_positive_pct = no_positive / total_queries * 100 if total_queries else 0.0
    print(f"\nTotal number of queries: {total_queries}")
    print(f"Queries with no positive examples: {no_positive} ({no_positive_pct:.2f}%)\n")

def load_msmarco_hf(sample_size=5000, seed=40):
    """
    Load a shuffled sample of the MS MARCO v1.1 validation split from Hugging Face.

    Args:
        sample_size: Number of validation examples (queries) to keep after
            shuffling. Values larger than the split are clamped to the split
            size; ``None`` keeps the whole split.
        seed: Seed passed to the dataset shuffle so the sample is reproducible.

    Returns:
        (corpus, queries, qrels) where
          - corpus is {doc_id: passage_text}; doc ids are "<qid>_<i>" with i the
            passage's position within its query's passage list, so the corpus
            only contains passages attached to the sampled queries,
          - queries is {qid: query_text} (qid is the stringified query_id),
          - qrels is {qid: {doc_id: 1}} for passages flagged is_selected == 1;
            queries without a selected passage have no entry in qrels.

    Also prints the positive-count distribution via check_positive_counts.
    """
    dataset = load_dataset("ms_marco", "v1.1")
    shuffled = dataset["validation"].shuffle(seed=seed)

    # Selecting indices beyond the split length fails, so never ask for more
    # rows than exist.
    available = len(shuffled)
    n_examples = available if sample_size is None else min(sample_size, available)
    dev_data = shuffled.select(range(n_examples))

    queries = {}
    corpus = {}
    qrels = {}

    for example in dev_data:
        qid = str(example["query_id"])
        queries[qid] = example["query"]

        passages_info = example["passages"]
        passage_texts = passages_info.get("passage_text", [])
        is_selecteds = passages_info.get("is_selected", [])

        for i, (text, is_sel) in enumerate(zip(passage_texts, is_selecteds)):
            doc_id = f"{qid}_{i}"
            corpus[doc_id] = text
            if is_sel == 1:
                # Create the per-query judgment dict lazily, on first positive.
                qrels.setdefault(qid, {})[doc_id] = 1

    check_positive_counts(queries, qrels)
    return corpus, queries, qrels

# ========== Load Data ==========
corpus, queries, qrels = load_msmarco_hf(sample_size=5000)
# doc_ids and documents are parallel lists in the corpus dict's insertion order:
# documents[i] is the text of doc_ids[i].
doc_ids = list(corpus)
documents = list(corpus.values())
print(f"Loaded {len(corpus)} documents, {len(queries)} queries, {len(qrels)} qrels.")

Distribution of positive examples (Number of positive examples: Number of queries):
0 positive examples: 186 queries
1 positive examples: 4346 queries
2 positive examples: 428 queries
3 positive examples: 32 queries
4 positive examples: 7 queries
5 positive examples: 1 queries

Total number of queries: 5000
Queries with no positive examples: 186 (3.72%)

Loaded 40997 documents, 5000 queries, 4814 qrels.


# 3. Build Indices

## 3.1 BM25 Index

In [15]:
def build_bm25(tokenized_corpus, k1=0.9, b=0.6):
    """
    Construct a BM25Okapi index over an already-tokenized corpus.

    Args:
        tokenized_corpus: List of documents, each given as a list of tokens.
        k1: BM25 k1 parameter forwarded to BM25Okapi.
        b: BM25 b parameter forwarded to BM25Okapi.

    Returns:
        The BM25Okapi index object.
    """
    return BM25Okapi(tokenized_corpus, k1=k1, b=b)

## 3.2 Build HNSW Index

In [16]:
def build_hnsw_index(doc_embeddings, ef_construction=200, M=16, ef_search=50):
    """
    Create an hnswlib HNSW index (space='cosine') holding every row of doc_embeddings.

    Args:
        doc_embeddings: 2-D array with one embedding per row (its ``.shape`` is
            unpacked as (number of vectors, dimension)).
        ef_construction: Passed to ``init_index``.
        M: Passed to ``init_index``.
        ef_search: Passed to ``set_ef`` once all items are added.

    Returns:
        The populated index. Item labels are the row positions 0..n-1, so a
        label returned by a query indexes directly into the embedding rows.
    """
    n_vectors, n_dims = doc_embeddings.shape

    graph = hnswlib.Index(space='cosine', dim=n_dims)
    graph.init_index(max_elements=n_vectors, ef_construction=ef_construction, M=M)

    # Label each vector with its row number.
    row_labels = np.arange(n_vectors)
    graph.add_items(doc_embeddings, ids=row_labels)

    graph.set_ef(ef_search)
    return graph

def build_sbert_hnsw(corpus, model_name_or_path):
    """
    Load an SBERT model, embed every corpus passage, and index the embeddings with HNSW.

    Args:
        corpus: Mapping {doc_id: text}. Texts are encoded in the dict's value
            order, so embedding row i corresponds to the i-th key of ``corpus``.
        model_name_or_path: Model name or path handed to SentenceTransformer.

    Returns:
        (model, hnsw_index, doc_embeddings)
    """
    encoder = SentenceTransformer(model_name_or_path)
    embeddings = encoder.encode(
        list(corpus.values()),
        convert_to_numpy=True,
        show_progress_bar=True,
    )
    return encoder, build_hnsw_index(embeddings), embeddings

# 4. Document Retrieval Methods

In [17]:
def bm25_retrieve(query, bm25_index, doc_ids, k=10):
    """
    Score every document with BM25 and return the k best.

    Args:
        query: Raw query string; it is lower-cased and tokenized with
            word_tokenize before scoring.
        bm25_index: Index exposing ``get_scores(tokens)``.
        doc_ids: Doc ids aligned position-by-position with the index's documents.
        k: Number of results to return.

    Returns:
        (ranked_doc_ids, scores): doc ids in descending score order, and the
        matching slice of the BM25 score array.
    """
    tokens = word_tokenize(query.lower())
    all_scores = bm25_index.get_scores(tokens)

    # Ascending argsort, reversed, then cut to the first k positions.
    best_positions = np.argsort(all_scores)[::-1][:k]

    return [doc_ids[pos] for pos in best_positions], all_scores[best_positions]

def hnsw_retrieve(query, model, hnsw_index, doc_ids, top_k=10):
    """
    Embed the query with the SBERT model and look up its nearest neighbours in the HNSW index.

    Args:
        query: Raw query string.
        model: Encoder exposing ``encode(texts, convert_to_numpy=True)``.
        hnsw_index: Index exposing ``knn_query(vectors, k=...)`` returning
            (labels, distances) with one row per query vector.
        doc_ids: Doc ids such that an index label i maps to doc_ids[i].
        top_k: Number of neighbours to request.

    Returns:
        (retrieved_ids, scores): doc ids in the order returned by the index and
        a list of similarity scores computed as ``1 - distance``.
    """
    query_vec = model.encode([query], convert_to_numpy=True)
    neighbor_rows, distance_rows = hnsw_index.knn_query(query_vec, k=top_k)

    # A single query was submitted, so only the first result row is relevant.
    nearest, dists = neighbor_rows[0], distance_rows[0]

    hit_ids = [doc_ids[label] for label in nearest]
    # The index reports cosine distance (= 1 - cosine similarity); flip it back.
    similarities = [1 - dist for dist in dists]
    return hit_ids, similarities

def normalize_scores(scores):
    """
    Min-max normalize scores to [0, 1] so different retrieval methods are comparable.

    Args:
        scores: Array-like of numeric scores.

    Returns:
        Array of the same shape with ``(score - min) / (max - min + 1e-8)``.
        The 1e-8 term avoids division by zero when all scores are equal (the
        result is then all zeros). An empty input yields an empty float array
        instead of raising from ``np.min``.
    """
    scores = np.asarray(scores)
    if scores.size == 0:
        # np.min / np.max raise ValueError on empty input; nothing to scale.
        return scores.astype(float)

    min_val = np.min(scores)
    max_val = np.max(scores)
    return (scores - min_val) / (max_val - min_val + 1e-8)

def hybrid_retrieve(query, bm25_index, hnsw_index, model, doc_ids, top_k=10, alpha=0.5):
    """
    Hybrid retrieval: fuse the scores from BM25 and HNSW (SBERT) using
    final_score = alpha * bm25_score_norm + (1 - alpha) * hnsw_score_norm

    Args:
        query: Raw query string.
        bm25_index: BM25 index passed through to bm25_retrieve.
        hnsw_index: HNSW index passed through to hnsw_retrieve.
        model: SBERT encoder passed through to hnsw_retrieve.
        doc_ids: Doc ids aligned with both indices.
        top_k: Number of fused results to return.
        alpha: Weight of the normalized BM25 score; (1 - alpha) weights the
            normalized HNSW score.

    Returns:
        (ranked_doc_ids, ranked_scores) sorted by fused score, descending.
        Documents with equal fused scores are ordered by doc id so the ranking
        is reproducible across runs.

    Each retriever contributes its top ``top_k * 5`` candidates; scores are
    min-max normalized per retriever over that candidate list. A document that
    only one retriever returned gets 0.0 for the other component.
    """
    candidate_k = top_k * 5  # A larger candidate set can be used for fusion

    # Perform BM25 and HNSW retrieval separately
    bm25_ids, bm25_scores = bm25_retrieve(query, bm25_index, doc_ids, k=candidate_k)
    hnsw_ids, hnsw_scores = hnsw_retrieve(query, model, hnsw_index, doc_ids, top_k=candidate_k)

    bm25_norm = normalize_scores(np.array(bm25_scores))
    hnsw_norm = normalize_scores(np.array(hnsw_scores))

    bm25_dict = dict(zip(bm25_ids, bm25_norm))
    hnsw_dict = dict(zip(hnsw_ids, hnsw_norm))

    combined_scores = {
        docid: alpha * bm25_dict.get(docid, 0.0) + (1 - alpha) * hnsw_dict.get(docid, 0.0)
        for docid in set(bm25_ids) | set(hnsw_ids)
    }

    # Sort by fused score (descending) and break ties on doc id. Without the
    # tie-break, tied documents would keep the iteration order of the set
    # above, which is not stable across interpreter sessions.
    ranked_candidates = sorted(
        combined_scores.items(),
        key=lambda item: (-item[1], item[0]),
    )[:top_k]
    ranked_doc_ids = [docid for docid, _ in ranked_candidates]
    ranked_scores = [score for _, score in ranked_candidates]
    return ranked_doc_ids, ranked_scores

# 5. Evaluation

## 5.1 Build Run

In [18]:
def build_run_bm25(queries, bm25_index, doc_ids, top_k=100):
    """
    Run BM25 retrieval for every query and collect the results as a run.

    Args:
        queries: Mapping {qid: query_text}.
        bm25_index: BM25 index handed to bm25_retrieve.
        doc_ids: Doc ids aligned with the index.
        top_k: Number of documents kept per query.

    Returns:
        run: {qid: {docid: score}} with scores converted to Python floats.
    """
    def _score_map(query_text):
        # One query -> {docid: float(score)} for its top_k hits.
        hit_ids, hit_scores = bm25_retrieve(query_text, bm25_index, doc_ids, k=top_k)
        return {docid: float(value) for docid, value in zip(hit_ids, hit_scores)}

    return {qid: _score_map(query_text) for qid, query_text in queries.items()}

def build_run_hnsw(queries, model, hnsw_index, doc_ids, top_k=100):
    """
    Run HNSW+SBERT retrieval for every query and collect the results as a run.

    Args:
        queries: Mapping {qid: query_text}.
        model: SBERT encoder handed to hnsw_retrieve.
        hnsw_index: HNSW index handed to hnsw_retrieve.
        doc_ids: Doc ids aligned with the index labels.
        top_k: Number of documents kept per query.

    Returns:
        run: {qid: {docid: score}} with scores converted to Python floats.
    """
    def _score_map(query_text):
        # One query -> {docid: float(score)} for its top_k neighbours.
        hit_ids, hit_scores = hnsw_retrieve(query_text, model, hnsw_index, doc_ids, top_k=top_k)
        return {docid: float(value) for docid, value in zip(hit_ids, hit_scores)}

    return {qid: _score_map(query_text) for qid, query_text in queries.items()}

def build_run_hybrid(queries, bm25_index, hnsw_index, model, doc_ids, top_k=100, alpha=0.5):
    """
    Run hybrid (BM25 + HNSW/SBERT) retrieval for every query and collect the results as a run.

    Args:
        queries: Mapping {qid: query_text}.
        bm25_index: BM25 index handed to hybrid_retrieve.
        hnsw_index: HNSW index handed to hybrid_retrieve.
        model: SBERT encoder handed to hybrid_retrieve.
        doc_ids: Doc ids aligned with both indices.
        top_k: Number of fused documents kept per query.
        alpha: Fusion weight forwarded to hybrid_retrieve.

    Returns:
        run: {qid: {docid: score}} with scores converted to Python floats.
    """
    def _score_map(query_text):
        # One query -> {docid: float(fused score)} for its top_k hits.
        hit_ids, hit_scores = hybrid_retrieve(
            query_text, bm25_index, hnsw_index, model, doc_ids, top_k=top_k, alpha=alpha
        )
        return {docid: float(value) for docid, value in zip(hit_ids, hit_scores)}

    return {qid: _score_map(query_text) for qid, query_text in queries.items()}


## 5.2 Evaluate Runs

In [19]:
def compute_mrr_at_k(run, qrels, k=100):
    """
    Mean Reciprocal Rank over the top-k results of each query.

    Args:
        run: {qid: {docid: score}}
        qrels: {qid: {docid: relevance}}; relevance > 0 counts as relevant.
        k: Rank cutoff.

    Returns:
        The mean of 1/rank of the first relevant document within the top k
        (0 for a query with no relevant hit in the top k). Only queries present
        in both ``qrels`` and ``run`` enter the average; queries in ``qrels``
        but missing from ``run`` are skipped. Returns 0.0 when no query
        qualifies.
    """
    rr_sum = 0.0
    n_scored = 0

    for qid, judged in qrels.items():
        if qid in run:
            # Rank by descending score and cut at k.
            top_hits = sorted(run[qid].items(), key=lambda pair: pair[1], reverse=True)[:k]

            # 1-based position of the first relevant document, if any.
            first_hit = next(
                (
                    position
                    for position, (doc_id, _) in enumerate(top_hits, start=1)
                    if doc_id in judged and judged[doc_id] > 0
                ),
                None,
            )

            rr_sum += 1.0 / first_hit if first_hit is not None else 0.0
            n_scored += 1

    if not n_scored:
        return 0.0
    return rr_sum / n_scored

def compute_recall_at_k(run, qrels, k=100):
    """
    Mean Recall@k over queries that have at least one relevant document.

    Args:
        run: {qid: {docid: score}}
        qrels: {qid: {docid: relevance}}; relevance > 0 counts as relevant.
        k: Rank cutoff.

    Returns:
        The average, over queries with at least one relevant document, of the
        fraction of relevant documents found in the top k. Queries whose qrels
        contain no relevant document are ignored; queries with relevant
        documents but no entry in ``run`` count as recall 0. Returns 0.0 when
        no query qualifies.
    """
    recall_sum = 0.0
    n_judged = 0

    for qid, judgments in qrels.items():
        positives = {doc_id for doc_id, rel in judgments.items() if rel > 0}
        if positives:
            # Every query with positives is part of the denominator, even if
            # the run has nothing for it.
            n_judged += 1
            if qid in run:
                top_hits = sorted(run[qid].items(), key=lambda pair: pair[1], reverse=True)[:k]
                retrieved = {doc_id for doc_id, _ in top_hits}
                recall_sum += len(positives & retrieved) / len(positives)

    if not n_judged:
        return 0.0
    return recall_sum / n_judged

def evaluate_run(run, qrels):
    """
    Evaluate a run and return various metrics.

    Args:
        run: {qid: {docid: score}}
        qrels: {qid: {docid: relevance}}

    Returns:
        Dict with keys 'ndcg', 'map', 'mrr@100' and 'recall@100'. NDCG and MAP
        are the means of the per-query values returned by pytrec_eval; MRR@100
        and Recall@100 come from compute_mrr_at_k / compute_recall_at_k.
        If pytrec_eval returns no per-query results, 'ndcg' and 'map' are
        reported as 0.0 rather than raising ZeroDivisionError, matching how
        the MRR/Recall helpers treat an empty evaluation set.
    """
    # Use pytrec_eval to evaluate NDCG and MAP
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'ndcg', 'map'})
    per_query = evaluator.evaluate(run)

    n_evaluated = len(per_query)
    if n_evaluated:
        avg_ndcg = sum(d.get('ndcg', 0) for d in per_query.values()) / n_evaluated
        avg_map = sum(d.get('map', 0) for d in per_query.values()) / n_evaluated
    else:
        # Nothing was evaluated; avoid dividing by zero.
        avg_ndcg = 0.0
        avg_map = 0.0

    mrr_100 = compute_mrr_at_k(run, qrels, k=100)
    recall_100 = compute_recall_at_k(run, qrels, k=100)

    return {
        'ndcg': avg_ndcg,
        'map': avg_map,
        'mrr@100': mrr_100,
        'recall@100': recall_100
    }

# 6. Results

In [20]:
# The fine-tuned SBERT checkpoints live under PROJECT_PATH/model. Deriving the
# paths from PROJECT_PATH (set in the setup cell) keeps a single place to edit
# when the project folder moves.

# ========== 1) stsb_finetuned_sbert_model + HNSW ==========
model_path_1 = os.path.join(PROJECT_PATH, "model", "stsb_finetuned_model")
model_1, hnsw_1, doc_emb_1 = build_sbert_hnsw(corpus, model_path_1)
run_1 = build_run_hnsw(queries, model_1, hnsw_1, doc_ids, top_k=100)
results_1 = evaluate_run(run_1, qrels)

# ========== 2) msmarco_stsb_finetuned_sbert_model + HNSW ==========
model_path_2 = os.path.join(PROJECT_PATH, "model", "msmarco_stsb_finetuned_model")
model_2, hnsw_2, doc_emb_2 = build_sbert_hnsw(corpus, model_path_2)
run_2 = build_run_hnsw(queries, model_2, hnsw_2, doc_ids, top_k=100)
results_2 = evaluate_run(run_2, qrels)

# ========== 3) BM25 (k1=0.9, b=0.6) ==========
# Tokenize the corpus once (lower-cased, same scheme bm25_retrieve applies to
# queries); both BM25 configurations below share it.
tokenized_corpus = [word_tokenize(doc.lower()) for doc in documents]
bm25_1 = build_bm25(tokenized_corpus, k1=0.9, b=0.6)
run_3 = build_run_bm25(queries, bm25_1, doc_ids, top_k=100)
results_3 = evaluate_run(run_3, qrels)

# ========== 4) BM25 (k1=1.2, b=0.75) ==========
bm25_2 = build_bm25(tokenized_corpus, k1=1.2, b=0.75)
run_4 = build_run_bm25(queries, bm25_2, doc_ids, top_k=100)
results_4 = evaluate_run(run_4, qrels)

# ========== 5) msmarco_stsb_finetuned_sbert_model + Hybrid ==========
# Here we fuse BM25 (1.2, 0.75) with msmarco_stsb_finetuned_sbert_model
run_5 = build_run_hybrid(queries, bm25_2, hnsw_2, model_2, doc_ids, top_k=100, alpha=0.5)
results_5 = evaluate_run(run_5, qrels)

Batches:   0%|          | 0/1282 [00:00<?, ?it/s]

Batches:   0%|          | 0/1282 [00:00<?, ?it/s]

In [21]:
# Collect each method's metric dict under a human-readable row label.
results_all = {
    "stsb_finetuned_sbert_model + HNSW": results_1,
    "msmarco_stsb_finetuned_sbert_model + HNSW": results_2,
    "BM25 (0.9, 0.6)": results_3,
    "BM25 (1.2, 0.75)": results_4,
    "msmarco_stsb_finetuned + Hybrid": results_5
}

# Transpose so that each row is a method and each column is a metric.
df = pd.DataFrame(results_all).transpose()

# Limit the displayed precision to four decimal places for readability.
df_rounded = df.round(decimals=4)
print("\n=== Final Results (rounded to 4 decimals) ===")
print(df_rounded)



=== Final Results (rounded to 4 decimals) ===
                                             ndcg     map  mrr@100  recall@100
stsb_finetuned_sbert_model + HNSW          0.3390  0.2308   0.2389      0.7379
msmarco_stsb_finetuned_sbert_model + HNSW  0.5588  0.4309   0.4384      0.9749
BM25 (0.9, 0.6)                            0.5079  0.3842   0.3917      0.9225
BM25 (1.2, 0.75)                           0.5063  0.3819   0.3893      0.9231
msmarco_stsb_finetuned + Hybrid            0.5767  0.4488   0.4559      0.9891
