In [1]:
!pip install -q datasets sentence-transformers faiss-cpu

from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import torch
import os
import json

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
[?25hPyTorch version: 2.9.0+cu126
CUDA available: True


In [2]:
dataset = load_dataset("izhx/COMP5423-25Fall-HQ-small")

train_ds = dataset["train"]
val_ds = dataset["validation"]
test_ds = dataset["test"]
coll_ds = dataset["collection"]

collection_ids = list(coll_ds["id"])      # <- force plain list
collection_texts = list(coll_ds["text"])

print("Collection size:", len(collection_ids))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

validation.jsonl: 0.00B [00:00, ?B/s]

collection.jsonl:   0%|          | 0.00/87.7M [00:00<?, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

train.jsonl: 0.00B [00:00, ?B/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating collection split:   0%|          | 0/144718 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1052 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/12000 [00:00<?, ? examples/s]

Collection size: 144718


In [3]:
SAVE_DIR = "dense_index_gte"  # change name if using another model

from google.colab import drive
drive.mount('/content/drive')
SAVE_DIR = "/content/drive/MyDrive/COMP5423/dense_index_gte"

os.makedirs(SAVE_DIR, exist_ok=True)

EMB_PATH = os.path.join(SAVE_DIR, "doc_embs.npy")
INDEX_PATH = os.path.join(SAVE_DIR, "faiss_index.bin")
META_PATH = os.path.join(SAVE_DIR, "meta.json")
IDS_PATH = os.path.join(SAVE_DIR, "doc_ids.json")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
class DenseRetriever:
    def __init__(self,
                 doc_ids,
                 doc_texts=None,
                 model_name="Alibaba-NLP/gte-modernbert-base",
                 batch_size=64,
                 load_existing=False):
        """
        If load_existing=True, will try to load embeddings + index from disk.
        If False, will encode doc_texts and then save them.
        """
        self.doc_ids = doc_ids
        self.model_name = model_name
        self.batch_size = batch_size
        self.model = SentenceTransformer(model_name)

        if load_existing and self._check_files_exist():
            print("Loading existing embeddings and FAISS index from disk...")
            self._load_from_disk()
        else:
            if doc_texts is None:
                raise ValueError("doc_texts must be provided when building index from scratch.")
            print("No existing index found or load_existing=False. Building new index...")
            self._build_and_save(doc_texts)

    def _check_files_exist(self):
        return all(os.path.exists(p) for p in [EMB_PATH, INDEX_PATH, META_PATH, IDS_PATH])

    def _build_and_save(self, doc_texts):
        # Encode corpus
        print("Encoding collection documents...")
        doc_embs = self.model.encode(
            doc_texts,
            batch_size=self.batch_size,
            show_progress_bar=True,
            convert_to_numpy=True
        )
        # Normalize
        norms = np.linalg.norm(doc_embs, axis=1, keepdims=True)
        doc_embs = doc_embs / np.clip(norms, 1e-12, None)

        self.doc_embs = doc_embs

        dim = doc_embs.shape[1]
        self.index = faiss.IndexFlatIP(dim)
        self.index.add(doc_embs)

        print("Index built. Number of vectors:", self.index.ntotal)

        # Save to disk
        print("Saving embeddings and index to disk...")
        np.save(EMB_PATH, doc_embs)
        faiss.write_index(self.index, INDEX_PATH)
        with open(META_PATH, "w", encoding="utf-8") as f:
            json.dump({"model_name": self.model_name, "dim": dim}, f)
        with open(IDS_PATH, "w", encoding="utf-8") as f:
            json.dump(self.doc_ids, f, ensure_ascii=False)
        print("Saved to:", SAVE_DIR)

    def _load_from_disk(self):
        # Load embeddings (optional; mostly for sanity / extra use)
        self.doc_embs = np.load(EMB_PATH)
        with open(META_PATH, "r", encoding="utf-8") as f:
            meta = json.load(f)

        # You could assert meta["model_name"] == self.model_name if you want
        self.index = faiss.read_index(INDEX_PATH)

        with open(IDS_PATH, "r", encoding="utf-8") as f:
            saved_ids = json.load(f)

        # Consistency check
        if len(saved_ids) != len(self.doc_ids):
            print("Warning: loaded doc_ids length does not match current doc_ids length.")
        else:
            self.doc_ids = saved_ids

        print("Loaded index with", self.index.ntotal, "vectors.")
        print("Model (for queries):", self.model_name)

    def retrieve(self, query, k=10):
        q_emb = self.model.encode([query], convert_to_numpy=True)
        q_emb = q_emb / np.clip(np.linalg.norm(q_emb, axis=1, keepdims=True), 1e-12, None)
        scores, idx = self.index.search(q_emb, k)
        scores = scores[0]
        idx = idx[0]
        return [(self.doc_ids[i], float(scores[j])) for j, i in enumerate(idx)]


In [None]:
dense_retriever = DenseRetriever(
    doc_ids=collection_ids,
    doc_texts=collection_texts,
    model_name="Alibaba-NLP/gte-modernbert-base",
    batch_size=64,
    load_existing=False  # build from scratch and save
)


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/298M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

No existing index found or load_existing=False. Building new index...
Encoding collection documents...


Batches:   0%|          | 0/2262 [00:00<?, ?it/s]

  return torch._C._get_cublas_allow_tf32()
W1126 08:04:00.627000 1830 torch/_inductor/utils.py:1558] [1/0_1] Not enough SMs to use max_autotune_gemm mode


Index built. Number of vectors: 144718
Saving embeddings and index to disk...
Saved to: /content/drive/MyDrive/COMP5423/dense_index_gte


In [5]:
dense_retriever = DenseRetriever(
    doc_ids=collection_ids,   # or load from IDS_PATH, but this is fine
    doc_texts=None,
    model_name="Alibaba-NLP/gte-modernbert-base",
    batch_size=64,
    load_existing=True  # will load from disk if files exist
)


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/298M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

Loading existing embeddings and FAISS index from disk...
Loaded index with 144718 vectors.
Model (for queries): Alibaba-NLP/gte-modernbert-base


In [6]:
def retrieve_for_question(question, k=10):
    results = dense_retriever.retrieve(question, k=k)
    return [[doc_id, score] for (doc_id, score) in results]

q = "Where was Barack Obama born?"
print("Query:", q)
print("Top-5 docs:", retrieve_for_question(q, k=5))


Query: Where was Barack Obama born?


  return torch._C._get_cublas_allow_tf32()
W1128 18:18:25.944000 1001 torch/_inductor/utils.py:1558] [1/0_1] Not enough SMs to use max_autotune_gemm mode


Top-5 docs: [['doc-82816', 0.7561941146850586], ['doc-56118', 0.7561941146850586], ['doc-130209', 0.732140302658081], ['doc-102029', 0.7316992878913879], ['doc-2778', 0.7316992878913879]]


In [7]:
!pip install pytrec_eval pandas

Collecting pytrec_eval
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytrec_eval
  Building wheel for pytrec_eval (setup.py) ... [?25l[?25hdone
  Created wheel for pytrec_eval: filename=pytrec_eval-0.5-cp312-cp312-linux_x86_64.whl size=309353 sha256=a36c537450ad9c32879cb9134bdca4e5ebdb75a81c8f9a90bc456272f96f15a9
  Stored in directory: /root/.cache/pip/wheels/c6/4a/9e/e17f9ea004e1c221bd0ff384732285211c4917b790d598ea51
Successfully built pytrec_eval
Installing collected packages: pytrec_eval
Successfully installed pytrec_eval-0.5


In [14]:
import json
from tqdm import tqdm

OUTPUT_PATH = "/content/drive/MyDrive/COMP5423/data/dense_pred.jsonl"

with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    for item in tqdm(val_ds):
        qid = item["id"]        # question id
        query = item["text"]    # question text

        # get dense top-10
        top_docs = dense_retriever.retrieve(query, k=10)
        pred_ids = [doc_id for doc_id, score in top_docs]

        row = {
            "id": qid,
            "retrieved_docs": pred_ids
        }
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print("Saved:", OUTPUT_PATH)


100%|██████████| 1500/1500 [01:42<00:00, 14.63it/s]

Saved: /content/drive/MyDrive/COMP5423/data/dense_pred.jsonl





In [17]:
import json
import time
import pytrec_eval
import pandas as pd


def read_jsonl(file_path):
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            data.append(json.loads(line))
    print(f'[{time.asctime()}] Read {len(data)} from {file_path}')
    return data


def compute_metrics(qrels, results, k_values=[2, 5, 10]):
    map_string = "map_cut." + ",".join(str(k) for k in k_values)
    ndcg_string = "ndcg_cut." + ",".join(str(k) for k in k_values)
    recall_string = "recall." + ",".join(str(k) for k in k_values)
    precision_string = "P." + ",".join(str(k) for k in k_values)

    evaluator = pytrec_eval.RelevanceEvaluator(
        qrels, {map_string, ndcg_string, recall_string, precision_string}
    )
    scores_by_query = evaluator.evaluate(results)
    scores = pd.DataFrame.from_dict(scores_by_query.values()).mean()

    metrics = {}
    for prefix in ('map_cut', 'ndcg_cut', 'recall', 'P'):
        name = 'precision' if prefix == 'P' else prefix.split('_')[0]
        for k in k_values:
            metrics[f'{name}_at_{k}'] = scores[f'{prefix}_{k}']
    return metrics


def evaluate_retrieval(gold_path: str, pred_path: str):
    gold_data = read_jsonl(gold_path)
    pred_data = read_jsonl(pred_path)

    # qrels: from validation.jsonl
    # {qid: {docid: relevance}}
    qrels = {
        str(item["id"]): {str(docid): 1 for docid in item["supporting_ids"]}
        for item in gold_data
    }
    gold_ids = set(qrels.keys())

    # results: from val_retrieval_pred.jsonl
    # {qid: {docid: score}}
    results = {}
    for item in pred_data:
        qid = str(item["id"])
        if qid not in gold_ids:
            # skip queries that are not in gold
            continue

        doc_ids = item["retrieved_docs"]
        n = len(doc_ids)
        # assign scores so earlier docs have higher scores
        doc_scores = {
            str(docid): float(n - rank)   # or just float(n - rank)
            for rank, docid in enumerate(doc_ids)
        }
        results[qid] = doc_scores

    print(f"Queries in gold: {len(gold_ids)}, queries with predictions used: {len(results)}")

    metrics = compute_metrics(qrels, results)
    print(json.dumps(metrics, indent=2))
    return metrics


In [18]:
gold_path = "/content/drive/MyDrive/COMP5423/data/validation.jsonl"
pred_path = "/content/drive/MyDrive/COMP5423/data/dense_pred.jsonl"

metrics = evaluate_retrieval(gold_path, pred_path)
metrics

[Fri Nov 28 18:35:34 2025] Read 1500 from /content/drive/MyDrive/COMP5423/data/validation.jsonl
[Fri Nov 28 18:35:34 2025] Read 1500 from /content/drive/MyDrive/COMP5423/data/dense_pred.jsonl
Queries in gold: 1500, queries with predictions used: 1500
{
  "map_at_2": 0.544,
  "map_at_5": 0.6437888888888887,
  "map_at_10": 0.6604689153439153,
  "ndcg_at_2": 0.62026605161229,
  "ndcg_at_5": 0.7285464999731569,
  "ndcg_at_10": 0.7545148483406677,
  "recall_at_2": 0.5766666666666667,
  "recall_at_5": 0.769,
  "recall_at_10": 0.834,
  "precision_at_2": 0.5766666666666667,
  "precision_at_5": 0.3076,
  "precision_at_10": 0.1668
}


{'map_at_2': np.float64(0.544),
 'map_at_5': np.float64(0.6437888888888887),
 'map_at_10': np.float64(0.6604689153439153),
 'ndcg_at_2': np.float64(0.62026605161229),
 'ndcg_at_5': np.float64(0.7285464999731569),
 'ndcg_at_10': np.float64(0.7545148483406677),
 'recall_at_2': np.float64(0.5766666666666667),
 'recall_at_5': np.float64(0.769),
 'recall_at_10': np.float64(0.834),
 'precision_at_2': np.float64(0.5766666666666667),
 'precision_at_5': np.float64(0.3076),
 'precision_at_10': np.float64(0.1668)}