In [1]:
!pip install -q "datasets" "sentence-transformers" "faiss-cpu" "pytrec_eval" "tqdm" "transformers" "accelerate" "einops"

from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import torch
import os
import json
from tqdm import tqdm
import pytrec_eval

# Report the installed PyTorch build and whether a CUDA device is usable.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


  from .autonotebook import tqdm as notebook_tqdm


PyTorch version: 2.9.1
CUDA available: False


In [2]:
# Download (or load from cache) the dataset and pull out the three query splits
# plus the passage collection that retrieval runs over.
dataset = load_dataset("izhx/COMP5423-25Fall-HQ-small")

train_ds, val_ds, test_ds, coll_ds = (
    dataset[split_name]
    for split_name in ("train", "validation", "test", "collection")
)

# Materialise the id and text columns as plain Python lists, kept in the same order.
collection_ids = list(coll_ds["id"])
collection_texts = list(coll_ds["text"])

print("Collection size:", len(collection_ids))
print("Train/Val/Test sizes:", len(train_ds), len(val_ds), len(test_ds))


Collection size: 144718
Train/Val/Test sizes: 12000 1500 1052


In [3]:
# === Choose which instruction-aware retriever to test ===
#   "e5-mistral" -> intfloat/e5-mistral-7b-instruct
#   "qwen3-0.6b" -> Qwen/Qwen3-Embedding-0.6B
MODEL_CHOICE = "qwen3-0.6b"  # or "e5-mistral"

# Per-model settings: (model id, query prompt name from the HF model card, encode batch size).
_MODEL_SETTINGS = {
    # 7B model, keep batch small
    "e5-mistral": ("intfloat/e5-mistral-7b-instruct", "web_search_query", 4),
    "qwen3-0.6b": ("Qwen/Qwen3-Embedding-0.6B", "query", 16),
}

if MODEL_CHOICE not in _MODEL_SETTINGS:
    raise ValueError("Unknown MODEL_CHOICE")

MODEL_NAME, QUERY_PROMPT_NAME, DEFAULT_BATCH_SIZE = _MODEL_SETTINGS[MODEL_CHOICE]

# Task description used when queries are formatted manually as 'Instruct: ... Query: ...'.
TASK_DESCRIPTION = (
    "Given a multi-hop question about Wikipedia, retrieve the most relevant passages "
    "that help answer the question."
)

print("Using model:", MODEL_NAME)
print("Query prompt_name:", QUERY_PROMPT_NAME)


Using model: Qwen/Qwen3-Embedding-0.6B
Query prompt_name: query


In [4]:
# All retrieval artifacts live under BASE_DIR; each model choice gets its own sub-directory.
BASE_DIR = "retrieval_model"
SAVE_DIR = os.path.join(BASE_DIR, "dense_instruction_" + MODEL_CHOICE)
os.makedirs(SAVE_DIR, exist_ok=True)

# Files written and read by the retriever: embeddings, FAISS index, metadata, document ids.
EMB_PATH, INDEX_PATH, META_PATH, IDS_PATH = (
    os.path.join(SAVE_DIR, file_name)
    for file_name in ("doc_embs.npy", "faiss_index.bin", "meta.json", "doc_ids.json")
)

print("Saving to:", SAVE_DIR)


Saving to: retrieval_model/dense_instruction_qwen3-0.6b


In [6]:
class DenseInstructionRetriever:
    """Dense passage retriever backed by a SentenceTransformer encoder and FAISS.

    Documents are encoded without a prompt, L2-normalised and stored in an exact
    inner-product index, so search scores are cosine similarities. Queries are
    encoded with an instruction prompt and normalised the same way.

    Embeddings, index, metadata and document ids are persisted at the
    module-level paths EMB_PATH, INDEX_PATH, META_PATH and IDS_PATH.
    """

    def __init__(
        self,
        doc_ids,
        doc_texts=None,
        model_name=MODEL_NAME,
        batch_size=DEFAULT_BATCH_SIZE,
        load_existing=False,
        query_prompt_name=QUERY_PROMPT_NAME,
        task_description=TASK_DESCRIPTION,
    ):
        """
        If load_existing=True and all artifacts exist on disk, loads the doc
        embeddings + FAISS index from disk. Otherwise encodes doc_texts, builds
        the index and saves everything to disk.

        Args:
            doc_ids: document identifiers, positionally aligned with doc_texts.
            doc_texts: document texts; required whenever the index must be built.
            model_name: SentenceTransformer model id used for docs and queries.
            batch_size: batch size passed to ``model.encode``.
            load_existing: reuse saved artifacts when they are all present.
            query_prompt_name: name of a prompt registered on the model; used
                for queries when the model actually defines it.
            task_description: instruction text for the manual
                'Instruct: ...\\nQuery: ...' fallback.

        Raises:
            ValueError: if the index has to be built but doc_texts is missing or
                its length differs from doc_ids, or if the saved artifacts are
                inconsistent with each other.
        """
        self.doc_ids = list(doc_ids)
        self.model_name = model_name
        self.batch_size = batch_size
        self.query_prompt_name = query_prompt_name
        self.task_description = task_description

        # Load ST model
        self.model = SentenceTransformer(model_name)
        print("Loaded SentenceTransformer model:", model_name)

        if load_existing and self._can_load_from_disk():
            self._load_from_disk()
        else:
            if doc_texts is None:
                raise ValueError("doc_texts must be provided if load_existing=False.")
            self._build_index(doc_texts)

    # ---------- internal helpers ----------
    @staticmethod
    def _l2_normalize(embs):
        """Return a C-contiguous float32 copy of ``embs`` with unit-length rows."""
        embs = np.asarray(embs, dtype="float32")
        norms = np.linalg.norm(embs, axis=1, keepdims=True)
        # Clip so an all-zero row does not divide by zero.
        return np.ascontiguousarray(embs / np.clip(norms, 1e-12, None), dtype="float32")

    def _can_load_from_disk(self):
        """True only when every persisted artifact exists."""
        return all(
            os.path.exists(path)
            for path in (EMB_PATH, INDEX_PATH, META_PATH, IDS_PATH)
        )

    def _build_index(self, doc_texts):
        """Encode the collection, build the inner-product index and persist both."""
        doc_texts = list(doc_texts)
        if len(doc_texts) != len(self.doc_ids):
            # Index row i is reported as doc_ids[i]; unequal lengths would mislabel hits.
            raise ValueError(
                "doc_ids and doc_texts must have the same length "
                f"(got {len(self.doc_ids)} ids and {len(doc_texts)} texts)."
            )

        print("Encoding collection documents...")
        # Documents do NOT need explicit instructions
        doc_embs = self.model.encode(
            doc_texts,
            batch_size=self.batch_size,
            convert_to_numpy=True,
            show_progress_bar=True,
        )

        # L2-normalize to use IP as cosine similarity
        doc_embs = self._l2_normalize(doc_embs)

        self.doc_embs = doc_embs
        dim = doc_embs.shape[1]

        print("Building FAISS index (dim = {})...".format(dim))
        index = faiss.IndexFlatIP(dim)
        index.add(doc_embs)
        self.index = index

        print("Index built. Number of vectors:", self.index.ntotal)
        self._save_to_disk()

    def _save_to_disk(self):
        """Write embeddings, index, metadata and doc ids to their module-level paths."""
        print("Saving embeddings and index to disk...")
        np.save(EMB_PATH, self.doc_embs)
        faiss.write_index(self.index, INDEX_PATH)

        meta = {
            "model_name": self.model_name,
            "query_prompt_name": self.query_prompt_name,
            "task_description": self.task_description,
        }
        with open(META_PATH, "w", encoding="utf-8") as f:
            json.dump(meta, f, ensure_ascii=False, indent=2)

        with open(IDS_PATH, "w", encoding="utf-8") as f:
            json.dump(self.doc_ids, f, ensure_ascii=False)

        print("Saved to", SAVE_DIR)

    def _load_from_disk(self):
        """Load persisted artifacts and make sure the ids line up with the index."""
        print("Loading embeddings and index from disk...")
        self.doc_embs = np.load(EMB_PATH)
        self.index = faiss.read_index(INDEX_PATH)

        with open(META_PATH, "r", encoding="utf-8") as f:
            meta = json.load(f)
        with open(IDS_PATH, "r", encoding="utf-8") as f:
            saved_ids = json.load(f)

        # The id list must have exactly one entry per index row. The ids saved
        # next to the index are the authoritative ones when they fit.
        n_vectors = self.index.ntotal
        if len(saved_ids) == n_vectors:
            if saved_ids != self.doc_ids:
                print(
                    "Warning: doc_ids passed to the constructor differ from the ids "
                    "saved with the index; using the saved ids."
                )
            self.doc_ids = saved_ids
        elif len(self.doc_ids) == n_vectors:
            print(
                "Warning: loaded doc_ids length != index size; "
                "keeping the doc_ids passed to the constructor."
            )
        else:
            raise ValueError(
                f"Index holds {n_vectors} vectors but neither the saved ids "
                f"({len(saved_ids)}) nor the provided doc_ids ({len(self.doc_ids)}) match it. "
                "Rebuild the index with load_existing=False."
            )

        # self.model was loaded from the constructor's model_name, so that stays
        # the reported name; only warn when the index came from another model.
        saved_model_name = meta.get("model_name", self.model_name)
        if saved_model_name != self.model_name:
            print(
                f"Warning: index was built with '{saved_model_name}' but queries will be "
                f"encoded with '{self.model_name}'; scores are not meaningful across models."
            )

        self.query_prompt_name = meta.get("query_prompt_name", self.query_prompt_name)
        self.task_description = meta.get("task_description", self.task_description)

        print("Loaded index with", self.index.ntotal, "vectors.")
        print("Model used:", self.model_name)
        print("Query prompt_name:", self.query_prompt_name)

    def _encode_queries(self, questions):
        """
        Encode queries with *instructions*.

        Priority:
        1) Use Sentence-Transformers built-in prompts via prompt_name, when the
           loaded model actually defines that prompt (recommended)
        2) Fallback: explicit 'Instruct: ...\\nQuery: ...' formatting
        3) No task description either: encode the raw questions
        """
        encode_kwargs = dict(
            batch_size=self.batch_size,
            convert_to_numpy=True,
            show_progress_bar=False,
        )

        if self.query_prompt_name is not None:
            available_prompts = getattr(self.model, "prompts", None) or {}
            if self.query_prompt_name in available_prompts:
                return self.model.encode(
                    questions,
                    prompt_name=self.query_prompt_name,
                    **encode_kwargs,
                )
            # Unknown prompt name: take the documented fallback instead of failing
            # inside encode(). Warn once so the substitution is visible.
            if not getattr(self, "_warned_missing_prompt", False):
                print(
                    f"Warning: prompt '{self.query_prompt_name}' is not defined by the model; "
                    "falling back to manual instruction formatting."
                )
                self._warned_missing_prompt = True

        if self.task_description:
            texts = [
                f"Instruct: {self.task_description}\nQuery: {q}"
                for q in questions
            ]
        else:
            texts = questions

        return self.model.encode(texts, **encode_kwargs)

    # ---------- public API ----------
    def retrieve(self, query, k=10):
        """Return up to ``k`` ``(doc_id, cosine_score)`` pairs for ``query``, best first.

        Fewer than ``k`` pairs are returned when the index holds fewer vectors;
        an empty list is returned for a non-positive ``k`` or an empty index.
        """
        k = min(int(k), self.index.ntotal)
        if k <= 0:
            return []

        # Normalize query embedding to match doc normalization
        q_emb = self._l2_normalize(self._encode_queries([query]))

        scores, idx = self.index.search(q_emb, k)

        # FAISS pads missing neighbours with label -1; drop them so they are
        # never mistaken for doc_ids[-1].
        return [
            (self.doc_ids[i], float(score))
            for i, score in zip(idx[0], scores[0])
            if i >= 0
        ]


In [7]:
# Create the retriever over the full passage collection.
instruction_retriever = DenseInstructionRetriever(
    doc_ids=collection_ids,
    doc_texts=collection_texts,
    model_name=MODEL_NAME,
    batch_size=DEFAULT_BATCH_SIZE,
    load_existing=True,   # reuse saved artifacts when all exist on disk; otherwise the index is built from doc_texts
    query_prompt_name=QUERY_PROMPT_NAME,
    task_description=TASK_DESCRIPTION,
)


Loaded SentenceTransformer model: Qwen/Qwen3-Embedding-0.6B
Loading embeddings and index from disk...
Loaded index with 144718 vectors.
Model used: Qwen/Qwen3-Embedding-0.6B
Query prompt_name: query


In [8]:
def debug_retrieve(example_idx=0, k=5):
    """Print one validation example and the retriever's top-k hits for it.

    Shows the question id, text, gold answer and gold supporting ids, followed
    by each retrieved doc id with its score. Hits that are gold supporting
    passages are flagged with a check mark.
    """
    example = val_ds[example_idx]
    qid, question, answer, supporting_ids = (
        example[field] for field in ("id", "text", "answer", "supporting_ids")
    )

    for label, value in (
        ("QID:", qid),
        ("Question:", question),
        ("Gold answer:", answer),
        ("Gold supporting_ids:", supporting_ids),
    ):
        print(label, value)

    ranked = instruction_retriever.retrieve(question, k=k)
    print("\nTop-{} retrieved doc ids & scores:".format(k))
    for doc_id, score in ranked:
        mark = "✔" if doc_id in supporting_ids else " "
        print("{} doc_id={} | score={:.4f}".format(mark, doc_id, score))

debug_retrieve(example_idx=0, k=10)


QID: 5ae3f9c45542995ad6573cfe
Question: The second place finisher of the 2011 Gran Premio Santander d'Italia drove for who when he won the 2009 FIA Formula One World Championship?
Gold answer: Brawn GP
Gold supporting_ids: ['doc-23954', 'doc-109746']

Top-10 retrieved doc ids & scores:
✔ doc_id=doc-23954 | score=0.7095
  doc_id=doc-42270 | score=0.6327
  doc_id=doc-45341 | score=0.6293
  doc_id=doc-45965 | score=0.6202
  doc_id=doc-109 | score=0.6186
  doc_id=doc-52698 | score=0.5990
  doc_id=doc-115582 | score=0.5914
  doc_id=doc-20719 | score=0.5896
  doc_id=doc-10144 | score=0.5889
  doc_id=doc-43480 | score=0.5849


In [9]:
# Where the validation predictions go, and how many passages to keep per question.
VAL_OUTPUT_PATH = os.path.join(
    BASE_DIR, "validation_dense_instruction_{}.jsonl".format(MODEL_CHOICE)
)
TOP_K = 10

def write_predictions_for_split(split_ds, output_path, top_k=10):
    """
    Retrieve passages for every question in split_ds and save them as JSONL.

    Each output line is one JSON object with the keys:
      "id"             - question id
      "question"       - question text
      "answer"         - gold answer, or '' when the item has no answer key
      "retrieved_docs" - list of [doc_id, score] pairs as returned by the retriever
    """
    with open(output_path, "w", encoding="utf-8") as out_file:
        for example in tqdm(split_ds, desc="Retrieving for split"):
            qid = example["id"]
            question = example["text"]
            answer = example.get("answer", "")  # test split has no answer key

            ranked = instruction_retriever.retrieve(question, k=top_k)

            record = {
                "id": qid,
                "question": question,
                "answer": answer,
                "retrieved_docs": [list(hit) for hit in ranked],
            }
            out_file.write(json.dumps(record, ensure_ascii=False))
            out_file.write("\n")

    print("Saved predictions to:", output_path)

write_predictions_for_split(val_ds, VAL_OUTPUT_PATH, top_k=TOP_K)


Retrieving for split: 100%|██████████| 1500/1500 [02:17<00:00, 10.92it/s]

Saved predictions to: retrieval_model/validation_dense_instruction_qwen3-0.6b.jsonl





In [10]:
# Retrieve TOP_K passages for every test question and write them as JSONL under BASE_DIR.
# NOTE(review): the recorded output of this cell shows a /content/drive/... path, so it was
# produced with a different BASE_DIR than the one defined in this notebook; re-run to refresh it.
TEST_OUTPUT_PATH = os.path.join(
    BASE_DIR, f"test_prediction_dense_instruction_{MODEL_CHOICE}.jsonl"
)
write_predictions_for_split(test_ds, TEST_OUTPUT_PATH, top_k=TOP_K)


Retrieving for split: 100%|██████████| 1052/1052 [01:59<00:00,  8.82it/s]

Saved predictions to: /content/drive/MyDrive/COMP5423/test_prediction_dense_instruction_qwen3-0.6b.jsonl





In [10]:
!pip install pytrec_eval pandas

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Mount Google Drive only when running on Colab, so that Restart & Run All
# does not fail on this cell in other environments.
try:
    from google.colab import drive
except ImportError:
    print("google.colab not available; skipping Google Drive mount.")
else:
    drive.mount("/content/drive")

Mounted at /content/drive


In [11]:
import json
import time
import pytrec_eval
import pandas as pd


def read_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    The file is decoded as UTF-8, matching how this notebook writes its
    prediction files (UTF-8 with ensure_ascii=False), instead of relying on the
    platform default encoding. Blank lines are skipped.

    Args:
        file_path: path of the .jsonl file.

    Returns:
        A list with one parsed JSON value per non-empty line, in file order.

    Raises:
        json.JSONDecodeError: if a non-empty line is not valid JSON.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate a trailing newline or blank separator lines.
                continue
            data.append(json.loads(line))
    print(f'[{time.asctime()}] Read {len(data)} from {file_path}')
    return data


def compute_metrics(qrels, results, k_values=[2, 5, 10]):
    """Average MAP, nDCG, recall and precision at each cutoff over all evaluated queries.

    Args:
        qrels: {qid: {docid: relevance}} gold judgements.
        results: {qid: {docid: score}} retrieval run.
        k_values: cutoffs to report.

    Returns:
        Dict with keys such as 'map_at_5', 'ndcg_at_10', 'recall_at_2',
        'precision_at_5', each the mean of the per-query score.
    """
    measure_prefixes = ('map_cut', 'ndcg_cut', 'recall', 'P')
    cutoffs = ",".join(map(str, k_values))
    # trec_eval measure strings look like "ndcg_cut.2,5,10".
    measures = {f"{prefix}.{cutoffs}" for prefix in measure_prefixes}

    evaluator = pytrec_eval.RelevanceEvaluator(qrels, measures)
    per_query = evaluator.evaluate(results)
    mean_scores = pd.DataFrame.from_dict(per_query.values()).mean()

    metrics = {}
    for prefix in measure_prefixes:
        # 'P' is reported as 'precision'; '*_cut' measures drop the suffix.
        label = 'precision' if prefix == 'P' else prefix.split('_')[0]
        for k in k_values:
            metrics[f'{label}_at_{k}'] = mean_scores[f'{prefix}_{k}']
    return metrics


def evaluate_retrieval(gold_path: str, pred_path: str):
    """Score a JSONL prediction file against a JSONL gold file and print the metrics.

    Args:
        gold_path: JSONL file whose records have "id" and "supporting_ids".
        pred_path: JSONL file whose records have "id" and "retrieved_docs".
            Each retrieved entry is either a [doc_id, score] pair, or a bare
            doc_id. Bare ids get a score derived from their list position, so
            earlier entries rank higher.

    Returns:
        The metrics dict produced by compute_metrics.
    """
    gold_data = read_jsonl(gold_path)
    pred_data = read_jsonl(pred_path)

    # qrels: {qid: {docid: relevance}}, every supporting passage has relevance 1
    qrels = {
        str(item["id"]): {str(docid): 1 for docid in item["supporting_ids"]}
        for item in gold_data
    }
    gold_ids = set(qrels)

    # results: {qid: {docid: score}}
    results = {}
    for item in pred_data:
        qid = str(item["id"])
        if qid not in gold_ids:
            # skip queries that are not in gold
            continue

        retrieved = item["retrieved_docs"]
        n_retrieved = len(retrieved)
        doc_scores = {}
        for position, entry in enumerate(retrieved):
            if isinstance(entry, (list, tuple)):
                docid, score = entry
            else:
                # Bare doc id: synthesise a score that decreases with position.
                docid, score = entry, n_retrieved - position
            doc_scores[str(docid)] = float(score)
        results[qid] = doc_scores

    print(f"Queries in gold: {len(gold_ids)}, queries with predictions used: {len(results)}")
    n_missing = len(gold_ids) - len(results)
    if n_missing:
        print(f"Warning: {n_missing} gold queries have no predictions.")

    metrics = compute_metrics(qrels, results)
    print(json.dumps(metrics, indent=2))
    return metrics


In [15]:
# Score the validation predictions against the gold supporting passages.
# NOTE(review): pred_path spells out the qwen3-0.6b file name literally, so it must be
# edited by hand if MODEL_CHOICE changes.
# NOTE(review): data/validation.jsonl is not written anywhere in this notebook; presumably
# a local export of the validation split. Confirm where it comes from.
gold_path = "data/validation.jsonl"
pred_path = "retrieval_model/validation_dense_instruction_qwen3-0.6b.jsonl"

metrics = evaluate_retrieval(gold_path, pred_path)
metrics

[Sun Nov 30 06:55:16 2025] Read 1500 from data/validation.jsonl
[Sun Nov 30 06:55:16 2025] Read 1500 from retrieval_model/validation_dense_instruction_qwen3-0.6b.jsonl
Queries in gold: 1500, queries with predictions used: 1500
{
  "map_at_2": 0.515,
  "map_at_5": 0.6133555555555555,
  "map_at_10": 0.6331997354497354,
  "ndcg_at_2": 0.5934240890420839,
  "ndcg_at_5": 0.7018507794007087,
  "ndcg_at_10": 0.7323016521261576,
  "recall_at_2": 0.5513333333333333,
  "recall_at_5": 0.7436666666666667,
  "recall_at_10": 0.819,
  "precision_at_2": 0.5513333333333333,
  "precision_at_5": 0.2974666666666667,
  "precision_at_10": 0.1638
}


{'map_at_2': 0.515,
 'map_at_5': 0.6133555555555555,
 'map_at_10': 0.6331997354497354,
 'ndcg_at_2': 0.5934240890420839,
 'ndcg_at_5': 0.7018507794007087,
 'ndcg_at_10': 0.7323016521261576,
 'recall_at_2': 0.5513333333333333,
 'recall_at_5': 0.7436666666666667,
 'recall_at_10': 0.819,
 'precision_at_2': 0.5513333333333333,
 'precision_at_5': 0.2974666666666667,
 'precision_at_10': 0.1638}