In [1]:
!pip install -q datasets faiss-cpu huggingface_hub pytrec_eval tqdm



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import os
import json
import numpy as np
from tqdm import tqdm

import faiss
from datasets import load_dataset
from huggingface_hub import InferenceClient
import pytrec_eval

import torch

# Print the installed PyTorch version so the run environment is visible in the cell output.
print("PyTorch version:", torch.__version__)


PyTorch version: 2.9.1


In [4]:
# Load the HQ-small dataset and keep each split under its own name.
dataset = load_dataset("izhx/COMP5423-25Fall-HQ-small")

train_ds, val_ds, test_ds, coll_ds = (
    dataset[split_name]
    for split_name in ("train", "validation", "test", "collection")
)

# Report how many rows each split holds.
for split_label, split_rows in (
    ("Train:", train_ds),
    ("Validation:", val_ds),
    ("Test:", test_ds),
    ("Collection:", coll_ds),
):
    print(split_label, len(split_rows))


Train: 12000
Validation: 1500
Test: 1052
Collection: 144718


In [5]:
# Root folder for the retrieval artifacts; change it to your own folder (on Colab or local).
BASE_DIR = "retrieval_model"  # or any path you used
MODEL_TAG = "dense_instruction_e5-mistral"  # just a folder name to group stuff

# Everything for this model lives in BASE_DIR/MODEL_TAG; create it if it is not there yet.
SAVE_DIR = os.path.join(BASE_DIR, MODEL_TAG)
os.makedirs(SAVE_DIR, exist_ok=True)

# Files that later cells expect to find inside SAVE_DIR.
EMB_PATH, INDEX_PATH, IDS_PATH = (
    os.path.join(SAVE_DIR, artifact_name)
    for artifact_name in ("doc_embs.npy", "faiss_index.bin", "doc_ids.json")
)

print("SAVE_DIR:", SAVE_DIR)
print("Expecting files:")
for artifact_path in (EMB_PATH, INDEX_PATH, IDS_PATH):
    print(" ", artifact_path)


SAVE_DIR: retrieval_model/dense_instruction_e5-mistral
Expecting files:
  retrieval_model/dense_instruction_e5-mistral/doc_embs.npy
  retrieval_model/dense_instruction_e5-mistral/faiss_index.bin
  retrieval_model/dense_instruction_e5-mistral/doc_ids.json


In [6]:
# Load precomputed doc embeddings + FAISS index + doc_ids

# List exactly which artifacts are absent so a wrong SAVE_DIR is easy to diagnose.
missing_paths = [
    path for path in (EMB_PATH, INDEX_PATH, IDS_PATH) if not os.path.exists(path)
]
if missing_paths:
    raise FileNotFoundError(
        "Could not find one of the required files: doc_embs.npy, faiss_index.bin, doc_ids.json.\n"
        "Make sure you set SAVE_DIR correctly and that you already built the index.\n"
        "Missing: " + ", ".join(missing_paths)
    )

doc_embs = np.load(EMB_PATH)
index = faiss.read_index(INDEX_PATH)
with open(IDS_PATH, "r", encoding="utf-8") as f:
    doc_ids = json.load(f)

print("doc_embs shape:", doc_embs.shape)
print("FAISS index vectors:", index.ntotal)
print("Number of doc_ids:", len(doc_ids))

# FAISS row numbers are turned into document ids by position in doc_ids, so the
# three artifacts must describe the same number of documents. A mismatch means
# they come from different builds and results would be mislabeled.
if not (doc_embs.shape[0] == index.ntotal == len(doc_ids)):
    raise ValueError(
        "Artifact size mismatch: "
        f"doc_embs has {doc_embs.shape[0]} rows, "
        f"the FAISS index has {index.ntotal} vectors, "
        f"doc_ids has {len(doc_ids)} entries. Rebuild the index."
    )


doc_embs shape: (144718, 4096)
FAISS index vectors: 144718
Number of doc_ids: 144718


In [None]:
# Read the Hugging Face token from the environment so that no secret is ever
# stored in (or committed with) the notebook. Set it before launching Jupyter,
# e.g. `export HF_TOKEN=hf_...`.
HF_TOKEN = (os.environ.get("HF_TOKEN") or "").strip()

# Both an unset variable and the "hf_XXXX..." template value are unusable;
# fail here rather than on the first API call.
if HF_TOKEN == "" or HF_TOKEN.startswith("hf_XXXX"):
    raise ValueError("HF_TOKEN is not set. Please set your Hugging Face token.")

client = InferenceClient(
    provider="auto",
    api_key=HF_TOKEN,
)

print("HF InferenceClient initialized.")


HF InferenceClient initialized.


In [13]:
def hf_e5_mistral_embed(texts):
    """
    Embed text with intfloat/e5-mistral-7b-instruct via the module-level `client`.

    texts: a single str, or a list[str]
    returns: float32 np.ndarray; a lone 1-D vector coming back from the API is
             promoted to shape (1, dim) so callers always get a batch axis
    """
    batch = [texts] if isinstance(texts, str) else texts

    # The batch goes in positionally (there is no 'inputs=' keyword).
    raw_vectors = client.feature_extraction(
        batch,
        model="intfloat/e5-mistral-7b-instruct",
    )

    vectors = np.array(raw_vectors, dtype="float32")
    return vectors[None, :] if vectors.ndim == 1 else vectors


In [14]:
# Task instruction placed after "Instruct: " in every query prompt.
TASK_DESCRIPTION = "".join(
    (
        "Given a multi-hop question about Wikipedia, retrieve the most relevant passages ",
        "that help answer the question.",
    )
)

def format_e5_instruction_query(q: str) -> str:
    """Build the two-line prompt: 'Instruct: <TASK_DESCRIPTION>' then 'Query: <q>'."""
    prompt = f"Instruct: {TASK_DESCRIPTION}\nQuery: {q}"
    return prompt

def encode_queries_via_hf_api(questions):
    """
    Embed questions with the instruction-formatted prompt and L2-normalize each row.

    questions: list[str], or a single str (treated as a batch of one, the same
               way hf_e5_mistral_embed treats a bare string)
    returns: np.ndarray (B, dim), float32, L2-normalized
    raises: ValueError if the embedding call does not return a 2-D array
    """
    # Without this, a bare string would be iterated character by character below.
    if isinstance(questions, str):
        questions = [questions]

    texts = [format_e5_instruction_query(q) for q in questions]
    embs = hf_e5_mistral_embed(texts)

    # Normalizing along axis=1 is only meaningful for (B, dim); anything else
    # (e.g. per-token vectors) would be normalized along the wrong axis.
    if embs.ndim != 2:
        raise ValueError(
            f"Expected embeddings of shape (B, dim), got array with shape {embs.shape}."
        )

    # L2 norm; the clip keeps an all-zero row from dividing by zero.
    norms = np.linalg.norm(embs, axis=1, keepdims=True)
    embs = embs / np.clip(norms, 1e-12, None)

    return embs.astype("float32")


In [15]:
# Smoke test: embed one question through the API and compare its width with the stored doc embeddings.
probe_question = "Who is the president of the United States?"
test_emb = encode_queries_via_hf_api([probe_question])

query_dim, doc_dim = test_emb.shape[1], doc_embs.shape[1]
print("Query embedding shape:", test_emb.shape)
print("Matches doc_emb dim:", query_dim == doc_dim)


Query embedding shape: (1, 4096)
Matches doc_emb dim: True


In [16]:
def retrieve(question: str, k: int = 10):
    """
    Runs retrieval for a single question using:
      - HF API to embed query
      - local FAISS index built on your doc_embs

    question: the raw question text
    k: number of neighbours requested from the index
    returns: list of (doc_id, score) tuples in the order FAISS returned them;
             it can be shorter than k when the index yields fewer neighbours
    """
    q_emb = encode_queries_via_hf_api([question])  # (1, dim)
    scores, idx = index.search(q_emb, k)          # FAISS search

    # FAISS fills unused result slots with label -1. Indexing doc_ids[-1] would
    # silently report the LAST document as a hit, so those slots are dropped.
    return [
        (doc_ids[int(row)], float(score))
        for row, score in zip(idx[0], scores[0])
        if row >= 0
    ]


In [17]:
# Spot-check retrieval on the first validation question and flag the supporting documents.
example_item = val_ds[0]
print("Question:", example_item["text"])
print("Supporting IDs:", example_item["supporting_ids"])

hits = retrieve(example_item["text"], k=10)
print("\nTop-10 retrieved:")
gold_ids = example_item["supporting_ids"]
for doc_id, score in hits:
    # A check mark flags a retrieved document that is one of the supporting ids.
    if doc_id in gold_ids:
        mark = "✔"
    else:
        mark = " "
    print(f"{mark} {doc_id} | {score:.4f}")


Question: The second place finisher of the 2011 Gran Premio Santander d'Italia drove for who when he won the 2009 FIA Formula One World Championship?
Supporting IDs: ['doc-23954', 'doc-109746']

Top-10 retrieved:
✔ doc-23954 | 0.8087
  doc-59626 | 0.7761
  doc-45965 | 0.7748
  doc-43480 | 0.7693
✔ doc-109746 | 0.7670
  doc-100598 | 0.7670
  doc-52698 | 0.7648
  doc-43038 | 0.7594
  doc-45341 | 0.7473
  doc-121143 | 0.7426


In [18]:
def _load_finished_records(path):
    """Return {record id: raw JSON line} for every parseable line already in `path`."""
    finished = {}
    if not os.path.exists(path):
        return finished
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError:
                # An interrupted run can leave a truncated line; that question is redone.
                continue
            if isinstance(rec, dict) and "id" in rec:
                finished[rec["id"]] = line
    return finished


def write_predictions_for_split(split_ds, output_path, top_k=10, resume=False):
    """
    Retrieve documents for every question of a split and write them as JSONL.

    split_ds: HF dataset (validation or test); each item needs "id" and "text"
    output_path: JSONL file to write
    top_k: number of documents requested per question
    resume: when True, records already present in output_path are kept and
            their questions are not sent to the API again, so a run that died
            half-way (e.g. on an API quota error) can be continued. Kept
            records are reused as-is, whatever top_k they were produced with.
            The default (False) starts from an empty file.

    Writes JSONL with fields:
      - id
      - question
      - answer ('' for test)
      - retrieved_docs: [[doc_id, score], ...]
    """
    finished = _load_finished_records(output_path) if resume else {}

    with open(output_path, "w", encoding="utf-8") as f:
        # Put the kept records back first so they survive even if the very
        # next API call fails.
        for line in finished.values():
            f.write(line + "\n")
        f.flush()

        for item in tqdm(split_ds, desc="Retrieving via HF API"):
            qid = item["id"]
            if finished and qid in finished:
                continue

            question = item["text"]
            answer = item.get("answer", "")  # test doesn't have answer

            hits = retrieve(question, k=top_k)

            rec = {
                "id": qid,
                "question": question,
                "answer": answer,
                "retrieved_docs": [
                    [doc_id, float(score)] for doc_id, score in hits
                ],
            }
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
            # Flush per record: every API call costs quota, so finished work
            # should reach the file before the next call can raise.
            f.flush()

    print("Saved predictions to:", output_path)


In [19]:
# Validation predictions are stored next to the other artifacts in SAVE_DIR.
VAL_OUTPUT_PATH = os.path.join(SAVE_DIR, "validation_dense_instruction_e5_mistral.jsonl")

# Retrieve the top 10 documents for every validation question and save them as JSONL.
write_predictions_for_split(split_ds=val_ds, output_path=VAL_OUTPUT_PATH, top_k=10)


Retrieving via HF API:   2%|▏         | 33/1500 [00:33<24:32,  1.00s/it]


HfHubHTTPError: 402 Client Error: Payment Required for url: https://router.huggingface.co/nebius/v1/embeddings (Request ID: Root=1-692bc70c-20d402062e47ca3d384c5c05;11c64671-85ff-41b8-830d-afd0d65a9a18)

You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits.