# Query processing pipeline.

#### 1. Setup: load model, tokenizer, FAISS index, and metadata

In [1]:
import os, numpy as np, pandas as pd, torch, faiss
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModel

# --- Paths to artifacts created by earlier steps ---
# Both paths are placeholders ("replace"): point them at your own metadata table and FAISS index.
META_PATH   = "data/embeddings_fast/paraphrase-multilingual-MiniLM-L12-v2_passages_meta.parquet"   # replace
FAISS_PATH  = "data/faiss_index/passages_flatip.faiss"                # replace

# Choose model (speed tiers):
# FAST:    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" (384d)
# BALANCE: "intfloat/multilingual-e5-base" (768d)
# QUALITY: "intfloat/multilingual-e5-large" (1024d)
# NOTE(review): presumably this must be the same model that produced the vectors stored in FAISS_PATH — confirm.
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# --- Load meta and FAISS ---
# results_df() looks rows up with meta.iloc[<FAISS index>], so row i of `meta` must describe vector i of `index`.
meta  = pd.read_parquet(META_PATH)          # must include global_chunk_id, doc_id, chunk_id, title, preview, site, lang, ...
index = faiss.read_index(FAISS_PATH)

In [2]:
# --- Model / tokenizer ---
# Use the GPU when one is visible to torch, otherwise stay on CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Half precision on GPU, full precision on CPU. This is the single source of
# truth for the weight dtype; the model load below reuses it instead of
# repeating the conditional.
DTYPE  = torch.float16 if DEVICE.type == "cuda" else torch.float32

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# Load weights in DTYPE, move them to DEVICE and switch to eval mode (inference only).
model     = AutoModel.from_pretrained(MODEL_NAME, torch_dtype=DTYPE).to(DEVICE).eval()

In [3]:
def model_token_budget(tok, headroom=16, cap_default=512):
    """Return how many tokens per sequence may be used with tokenizer *tok*.

    The tokenizer's ``model_max_length`` is used when present and plausible;
    a missing value or one above 100,000,000 (presumably an "unlimited"
    sentinel) is replaced by *cap_default*. *headroom* tokens are then
    reserved, and the result never drops below 32.
    """
    limit = getattr(tok, "model_max_length", None)
    unbounded = limit is None or limit > 100_000_000
    effective = cap_default if unbounded else limit
    return max(32, int(effective - headroom))
# Token limit per query; embed_queries() passes it to the tokenizer as max_length.
TOKEN_BUDGET = model_token_budget(tokenizer)

# E5 uses "query: " prefix. Others don't.
def add_query_prefix(texts, model_name=None):
    """Prepend ``"query: "`` to every text when the model is a multilingual E5 model.

    Args:
        texts: Sequence of query strings.
        model_name: Model identifier to inspect. Defaults to the notebook-level
            ``MODEL_NAME`` so existing one-argument calls behave as before.

    Returns:
        A new list of prefixed strings for E5 models (matched
        case-insensitively on ``"intfloat/multilingual-e5"``); otherwise the
        *texts* object itself, unchanged.
    """
    name = MODEL_NAME if model_name is None else model_name
    if "intfloat/multilingual-e5" in name.lower():
        return [f"query: {t}" for t in texts]
    return texts

@torch.no_grad()
def mean_pool(last_hidden_state, attention_mask):
    """Average *last_hidden_state* over dim 1, counting only positions where the mask is set.

    The mask gets a trailing axis and is cast to the hidden-state dtype so it
    can weight every feature. The divisor is clamped to 1e-6, so a row whose
    mask is all zeros yields zeros instead of dividing by zero.
    """
    weights = attention_mask.unsqueeze(-1).type_as(last_hidden_state)
    weighted_sum = torch.sum(last_hidden_state * weights, dim=1)
    token_count = torch.clamp(weights.sum(dim=1), min=1e-6)
    return weighted_sum / token_count

def l2_normalize_np(x: np.ndarray) -> np.ndarray:
    """Scale each row of the 2-D array *x* to unit Euclidean length.

    1e-12 is added to every row norm so an all-zero row stays zero rather
    than producing a division by zero. The input is not modified.
    """
    row_norms = np.linalg.norm(x, axis=1, keepdims=True)
    return np.divide(x, row_norms + 1e-12)

### 1. Batch embed queries (fast, safe)

In [4]:
def embed_queries(queries: list[str], batch_size: int = 128) -> np.ndarray:
    """Return (Q, d) L2-normalized float32 embeddings for queries.

    Args:
        queries: Query texts. Every item is passed through ``str()`` and, for
            E5 models, prefixed by ``add_query_prefix``.
        batch_size: Number of queries tokenized and encoded per forward pass.

    Returns:
        Array of shape ``(len(queries), d)`` and dtype float32. An empty input
        returns a ``(0, d)`` array with ``d = model.config.hidden_size``
        instead of failing inside ``np.vstack``.
    """
    texts = add_query_prefix([str(q) for q in queries])
    if not texts:
        # np.vstack([]) raises ValueError; hand back a correctly shaped empty matrix.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    on_gpu = DEVICE.type == "cuda"
    vecs = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Embedding queries", unit="batch"):
        batch = texts[start:start + batch_size]
        enc = tokenizer(batch, padding=True, truncation=True, max_length=TOKEN_BUDGET, return_tensors="pt")
        enc = {k: v.to(DEVICE, non_blocking=True) for k, v in enc.items()}
        # Mixed precision only on GPU; on CPU the second context is a plain no_grad.
        amp_ctx = torch.autocast(device_type=DEVICE.type, dtype=torch.float16) if on_gpu else torch.no_grad()
        with torch.inference_mode(), amp_ctx:
            out    = model(**enc)
            pooled = mean_pool(out.last_hidden_state, enc["attention_mask"])
            # Normalize in float32: in fp16 the 1e-12 epsilon used by normalize()
            # underflows to 0 and the result is unit-length only to fp16 precision.
            pooled = torch.nn.functional.normalize(pooled.to(torch.float32), p=2, dim=1)   # cosine → IP
        vecs.append(pooled.cpu().numpy())
    return np.vstack(vecs)


### 2. Dense search (FAISS) for a batch of queries

In [5]:
def faiss_search_batch(q_vecs: np.ndarray, top_k: int = 10) -> tuple[np.ndarray, np.ndarray]:
    """
    Search the module-level FAISS ``index`` for a batch of query vectors.

    q_vecs: (Q, d) array of L2-normalized query vectors. It is converted to a
            C-contiguous float32 array (the layout FAISS expects), so float64
            or non-contiguous input is accepted.
    top_k:  number of neighbours to return per query (must be >= 1).
    returns (scores, indices) each (Q, top_k) with cosine scores (via inner product).
            FAISS fills missing neighbours with index -1 when it finds fewer
            than top_k results; callers must skip those entries.
    raises  ValueError if q_vecs is not 2-D, its width differs from ``index.d``,
            or top_k < 1.
    """
    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    q_vecs = np.ascontiguousarray(q_vecs, dtype=np.float32)
    if q_vecs.ndim != 2:
        raise ValueError(f"q_vecs must be 2-D (Q, d); got shape {q_vecs.shape}")
    if q_vecs.shape[1] != index.d:
        raise ValueError(f"q_vecs has dimension {q_vecs.shape[1]} but the index expects {index.d}")
    if top_k < 1:
        raise ValueError(f"top_k must be >= 1; got {top_k}")
    D, I = index.search(q_vecs, top_k)
    return D, I


### 3. Assemble results into a tidy DataFrame

In [6]:
def results_df(queries: list[str], D: np.ndarray, I: np.ndarray, meta: pd.DataFrame) -> pd.DataFrame:
    """
    Build a long DataFrame with one row per (query, hit).
    Columns: all columns of `meta` plus query_id, query, rank, dense_score
    (and global_chunk_id, synthesized as "<doc_id>:<chunk_id>" when `meta` lacks it).

    queries: query texts; queries[i] belongs to row i of D and I.
    D, I:    (Q, top_k) score and index matrices as returned by a FAISS search.
             I holds row positions into `meta`; entries < 0 (FAISS padding for
             "no neighbour") are skipped instead of being looked up, because
             meta.iloc[-1] would silently return the last metadata row.
    meta:    metadata table aligned by position with the FAISS index.

    `rank` keeps the 1-based position of the hit inside its original result list.
    With no queries, or no valid hits at all, an empty frame with the same
    columns is returned.
    """
    frames = []
    for qid, (scores, idxs) in enumerate(zip(D, I)):
        idxs = np.asarray(idxs)
        scores = np.asarray(scores)
        valid = idxs >= 0
        if not valid.any():
            continue
        # .assign() returns a new frame, so `meta` is never mutated.
        frames.append(
            meta.iloc[idxs[valid]].assign(
                query_id=qid,
                query=queries[qid],
                rank=np.arange(1, len(idxs) + 1)[valid],
                dense_score=scores[valid],
            )
        )

    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        # pd.concat([]) raises; build an empty frame with the expected columns instead.
        df = meta.iloc[:0].assign(
            query_id=np.empty(0, dtype=np.int64),
            query=np.empty(0, dtype=object),
            rank=np.empty(0, dtype=np.int64),
            dense_score=np.empty(0, dtype=np.float32),
        ).reset_index(drop=True)

    # Ensure IDs exist
    if "global_chunk_id" not in df.columns:
        df["global_chunk_id"] = df["doc_id"].astype(str) + ":" + df["chunk_id"].astype(int).astype(str)
    return df


### 4. Optional: roll up chunk → document (max score per doc)

In [7]:
def doc_level(df_hits: pd.DataFrame, top_k: int = 10) -> pd.DataFrame:
    """
    Keep the best chunk per document for each query; then take top_k docs.

    df_hits: chunk-level hits with columns query_id, doc_id, dense_score (other
             columns are carried along from the winning chunk's row).
    top_k:   documents kept per query; None keeps all of them.
    returns  one row per (query_id, doc_id), with `rank` recomputed per query
             (1 = highest dense_score), sorted by query_id and rank.

    The winning row is taken whole via drop_duplicates. GroupBy.first() is
    avoided because it picks the first non-null value per column, which can
    stitch fields of different chunks together when the best chunk has a
    missing value.
    """
    keys = ["query_id", "doc_id"]
    # best chunk per (query_id, doc_id); rows without a key are dropped, as groupby would do
    best = (df_hits.dropna(subset=keys)
                   .sort_values(keys + ["dense_score"], ascending=[True, True, False])
                   .drop_duplicates(subset=keys, keep="first"))
    # keep the key columns in front, as the groupby-based version did
    best = best[keys + [c for c in best.columns if c not in keys]].reset_index(drop=True)
    # rerank per query by score
    best["rank"] = best.groupby("query_id")["dense_score"].rank(ascending=False, method="first").astype(int)
    best = best.sort_values(["query_id", "rank"])
    if top_k is not None:
        best = best.groupby("query_id").head(top_k)
    return best.reset_index(drop=True)


### 5. One call: run a list of queries end-to-end

In [8]:
def run_dense_pipeline(queries: list[str], top_k_chunks: int = 10, top_k_docs: int | None = None):
    """Embed *queries*, search the FAISS index and return the hits as DataFrames.

    Returns a pair ``(chunk_hits, doc_hits)``: the chunk-level hits built by
    ``results_df`` against the module-level ``meta``, and their document-level
    roll-up from ``doc_level``. The roll-up is ``None`` when *top_k_docs* is
    falsy (``None`` or 0).
    """
    # Embed; the cast and re-normalization repeat what embed_queries already does, kept as a safety net.
    query_vectors = l2_normalize_np(embed_queries(queries).astype("float32"))

    # Nearest-neighbour search over chunks.
    scores, neighbours = faiss_search_batch(query_vectors, top_k=top_k_chunks)

    # One row per (query, chunk) hit, joined with metadata.
    chunk_hits = results_df(queries, scores, neighbours, meta)

    # Optional chunk -> document roll-up.
    doc_hits = None
    if top_k_docs:
        doc_hits = doc_level(chunk_hits, top_k=top_k_docs)
    return chunk_hits, doc_hits

In [9]:
# Example usage: one query per language to exercise cross-lingual retrieval.
queries = [
    "child immunization reduces mortality",      # en
    "política de vacunación infantil",          # es: "childhood vaccination policy"
    "टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण",                    # hi: "five main reasons why vaccination is necessary"
    "儿童免疫接种 对 死亡率 的 影响"                     # zh: "impact of childhood immunization on mortality"
]

In [10]:
# Retrieve the 20 best chunks per query, then roll them up to at most 10 documents per query.
df_hits, df_docs = run_dense_pipeline(queries, top_k_chunks=20, top_k_docs=10)

Embedding queries:   0%|          | 0/1 [00:00<?, ?batch/s]

In [11]:
# Inspect: first 10 chunk-level hits for the Hindi query, restricted to the columns worth reading.
df_hits[df_hits['query'] == 'टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण'].head(10)[["query_id","query","rank","dense_score","doc_id","chunk_id","title","lang","preview"]]

Unnamed: 0,query_id,query,rank,dense_score,doc_id,chunk_id,title,lang,preview
40,2,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,1,0.862851,15900,0,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,hi,चिकित्सा के क्षेत्र में टीकाकरण का अहम योगदान ...
41,2,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,2,0.797363,14211,5,اليونيسف ومنظمة الصحة العالمية تحذِّران من اجت...,ar,لشرح أهمية اللقاحات؛ سد الفجوات في تغطية التحص...
42,2,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,3,0.776559,18018,4,Зміцнення довіри до продовження планової імуні...,uk,"ключових осіб, які приймають рішення, пов'язан..."
43,2,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,4,0.767746,14644,3,Dix-huit millions de doses du tout premier vac...,fr,"ès et traiter la maladie depuis longtemps, mai..."
44,2,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,5,0.763638,13699,5,خدمات التحصين تبدأ بالتعافي ببطء من التعطيلات ...,ar,اث والتطوير في مجال اللقاحات، وضمان استمرارية ...
45,2,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,6,0.744338,8669,2,"123,000 children in Europe and Central Asia ar...",en,combination of the number of vaccines in a cou...
46,2,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,7,0.743962,20165,2,ЮНІСЕФ доставив в Україну 340 тисяч доз оральн...,uk,"допомогою щеплень. Окрім того, аби забезпечити..."
47,2,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,8,0.735053,4424,2,告诉你疫苗那些事儿,zh,传播出去)。 新浪国际:接种疫苗对孩子有什么好处? 朱徐: 有些
48,2,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,9,0.72854,19340,6,ВООЗ та ЮНІСЕФ застерігають щодо зниження рівн...,uk,"та профілактики захворювань в США (CDC), Інсти..."
49,2,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,10,0.727539,13216,8,80 millions d'enfants de moins d’un an risquen...,fr,"i, l'Alliance du Vaccin. Son objectif est d'ér..."


*****

## <b>Query Evaluation</b>

### 1. Helpers: roll up to doc-level (optional) and normalize inputs

In [None]:
import numpy as np
import pandas as pd
from typing import Iterable, Dict, List

def to_doc_level(df_hits: pd.DataFrame, top_k: int | None = None) -> pd.DataFrame:
    """
    Collapse chunk-level results to doc-level by keeping the best chunk per (query_id, doc_id),
    then re-ranking by score. Assumes columns: query_id, doc_id, dense_score (or score), rank.
    """
    score_col = "dense_score" if "dense_score" in df_hits.columns else "score"
    if score_col not in df_hits.columns:
        raise ValueError("df_hits must contain 'dense_score' or 'score' column.")
    # keep best chunk per (query, doc)
    best = (df_hits
            .sort_values(["query_id","doc_id",score_col], ascending=[True, True, False])
            .groupby(["query_id","doc_id"], as_index=False)
            .first())
    # rerank within query
    best["rank"] = best.groupby("query_id")[score_col].rank(ascending=False, method="first").astype(int)
    best = best.sort_values(["query_id","rank"])
    if top_k:
        best = best.groupby("query_id").head(top_k).reset_index(drop=True)
    return best

def load_ground_truth(gt_path_or_df) -> pd.DataFrame:
    """
    Load relevance judgments and keep only the relevant rows.

    Accepts a DataFrame (used as-is, never modified) or anything pd.read_csv
    can read. Required columns: query_id, doc_id, relevant (0/1 or graded).
    query_id is cast to int and doc_id to str; rows with relevant <= 0 are
    dropped while graded values are left untouched for nDCG.
    Raises ValueError when a required column is absent.
    """
    if isinstance(gt_path_or_df, pd.DataFrame):
        frame = gt_path_or_df
    else:
        frame = pd.read_csv(gt_path_or_df)

    # Schema check before any coercion
    absent = {"query_id","doc_id","relevant"} - set(frame.columns)
    if absent:
        raise ValueError(f"Ground truth missing columns: {absent}")

    # assign() builds a new frame, so the caller's DataFrame stays untouched
    typed = frame.assign(
        query_id=frame["query_id"].astype(int),
        doc_id=frame["doc_id"].astype(str),
    )
    is_relevant = typed["relevant"] > 0
    return typed[is_relevant].reset_index(drop=True)


### 2. Metrics (P@k, Recall@k, MRR, MAP, nDCG) — binary or graded

In [None]:
def precision_at_k(ranked_ids: List[str], relevant_set: set[str], k: int) -> float:
    """Fraction of the first *k* ranked ids that are relevant; 0.0 when k == 0.

    The divisor is always *k*, even if fewer than *k* ids were retrieved.
    """
    if k == 0:
        return 0.0
    matched = [rid for rid in ranked_ids[:k] if rid in relevant_set]
    return len(matched) / k

def recall_at_k(ranked_ids: List[str], relevant_set: set[str], k: int) -> float:
    """Share of *relevant_set* found among the first *k* ranked ids; 0.0 if nothing is relevant."""
    if not relevant_set:
        return 0.0
    matched = [rid for rid in ranked_ids[:k] if rid in relevant_set]
    return len(matched) / len(relevant_set)

def average_precision(ranked_ids: List[str], relevant_set: set[str], k: int | None = None) -> float:
    """AP (binary)."""
    if not relevant_set: return 0.0
    if k is None: k = len(ranked_ids)
    ap, hits = 0.0, 0
    for i, rid in enumerate(ranked_ids[:k], start=1):
        if rid in relevant_set:
            hits += 1
            ap += hits / i
    return ap / max(1, len(relevant_set))

def mrr_at_k(ranked_ids: List[str], relevant_set: set[str], k: int) -> float:
    """Reciprocal rank (1-based) of the first relevant id within the top *k*; 0.0 if there is none."""
    reciprocal_ranks = (
        1.0 / position
        for position, rid in enumerate(ranked_ids[:k], start=1)
        if rid in relevant_set
    )
    return next(reciprocal_ranks, 0.0)

def dcg_at_k(gains: List[float], k: int) -> float:
    """Discounted cumulative gain of the first *k* gains; the gain at 0-based position i is divided by log2(i + 2)."""
    total = 0
    for position, gain in enumerate(gains[:k]):
        total += gain / np.log2(position + 2)
    return total

def ndcg_at_k(ranked_ids: List[str],
              rel_dict: Dict[str, float],  # doc_id -> gain (e.g., 1 or graded)
              k: int,
              gain_scheme: str = "exp2") -> float:
    """Normalized DCG at *k*: DCG of the ranking divided by the DCG of the ideal ordering.

    With gain_scheme == 'exp2' a relevance ``rel`` contributes ``2^rel - 1``;
    any other value uses ``rel`` directly. Ids missing from *rel_dict* count
    as relevance 0.0. Returns 0.0 when the ideal DCG is not positive.
    """
    exponential = gain_scheme == "exp2"

    def to_gain(rel):
        return (2**rel - 1) if exponential else rel

    observed = [to_gain(rel_dict.get(rid, 0.0)) for rid in ranked_ids]
    # ideal ordering: every judged gain, largest first
    ideal = sorted((to_gain(rel) for rel in rel_dict.values()), reverse=True)

    actual_dcg = dcg_at_k(observed, k)
    best_dcg = dcg_at_k(ideal, k) if ideal else 0.0
    if best_dcg > 0:
        return actual_dcg / best_dcg
    return 0.0


### 3. Runner: compute metrics per query and aggregate

In [None]:
def evaluate_retrieval(
    df_hits: pd.DataFrame,
    gt_path_or_df,
    id_level: str = "doc",       # "doc" or "chunk"
    k_list: Iterable[int] = (1,3,5,10),
    score_col: str | None = None,
    gain_scheme: str = "exp2"    # for nDCG with graded relevance
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Score retrieval results against ground truth.

    Returns (per_query_df, summary_df):
      - per_query_df: one row per query_id present in df_hits with num_relevant,
        retrieved, P@k / R@k / nDCG@k for every k, MRR@max(k) and MAP (cut at max(k)).
      - summary_df: a single row labelled "mean" with the macro-average of each metric.

    df_hits must contain:
      - query_id (int), doc_id (str) and either a score column or `rank`.
      - score_col defaults to 'dense_score', then 'score'; if that column is
        absent it is synthesized as -rank.
    id_level:
      - "doc": each (query_id, doc_id) is kept once, at its best-scoring row.
        The roll-up honors score_col.
      - any other value (e.g. "chunk"): rows are ranked as-is. Hits are still
        matched on doc_id, so a document retrieved through several chunks is
        counted once per chunk.
    k_list may be any iterable of cutoffs; it is materialized once because it is
    traversed for every query.
    Ground truth must contain columns: query_id, doc_id, relevant (>0), see load_ground_truth.

    Queries that appear only in the ground truth (no hits) are not reported.
    Raises ValueError for missing columns or an empty k_list.
    """
    if "query_id" not in df_hits.columns or "doc_id" not in df_hits.columns:
        raise ValueError("df_hits must have columns: query_id, doc_id (chunk_id/global_chunk_id optional).")

    # A generator passed as k_list would be exhausted after the first query.
    ks = tuple(k_list)
    if not ks:
        raise ValueError("k_list must contain at least one cutoff.")
    max_k = max(ks)

    # Normalize df_hits (work on a copy; the caller's frame is untouched)
    hits = df_hits.copy()
    if score_col is None:
        score_col = "dense_score" if "dense_score" in hits.columns else "score"
    if score_col not in hits.columns:
        # if rank exists but no score, synthesize descending score from rank
        if "rank" in hits.columns:
            hits[score_col] = -hits["rank"].astype(float)
        else:
            raise ValueError("df_hits must have a numeric score column or rank.")

    if id_level == "doc":
        # The former groupby-based roll-up discarded rows without a key; keep doing so.
        hits = hits.dropna(subset=["query_id", "doc_id"])
    hits = hits.assign(
        query_id=hits["query_id"].astype(int),
        doc_id=hits["doc_id"].astype(str),
    )
    hits = hits.sort_values(["query_id", score_col], ascending=[True, False])
    if id_level == "doc":
        # Rows are already best-first within a query, so the first occurrence of
        # a document is its best chunk under score_col.
        hits = hits.drop_duplicates(subset=["query_id", "doc_id"], keep="first")

    # For each query, the ranked list of doc ids (best first)
    ranked_lists = (hits.groupby("query_id")["doc_id"]
                         .apply(list)
                         .to_dict())

    # Ground truth: load_ground_truth already casts query_id -> int and doc_id -> str
    gt = load_ground_truth(gt_path_or_df)

    # Per-query relevance maps (for nDCG) and relevant-id sets (for binary metrics)
    rel_dicts: Dict[int, Dict[str, float]] = {}
    for qid, doc_id, relevance in zip(gt["query_id"], gt["doc_id"], gt["relevant"]):
        rel_dicts.setdefault(int(qid), {})[doc_id] = float(relevance)
    rel_sets: Dict[int, set[str]] = {qid: set(rel) for qid, rel in rel_dicts.items()}

    # Compute per-query metrics
    per_rows = []
    for qid, ranked in ranked_lists.items():
        rset = rel_sets.get(qid, set())
        rmap = rel_dicts.get(qid, {})
        row = {"query_id": qid, "num_relevant": len(rset), "retrieved": len(ranked)}
        for k in ks:
            row[f"P@{k}"]   = precision_at_k(ranked, rset, k)
            row[f"R@{k}"]   = recall_at_k(ranked, rset, k)
            row[f"nDCG@{k}"] = ndcg_at_k(ranked, rmap, k, gain_scheme=gain_scheme)
        row[f"MRR@{max_k}"] = mrr_at_k(ranked, rset, k=max_k)
        row["MAP"] = average_precision(ranked, rset, k=max_k)
        per_rows.append(row)

    # Explicit column list so an empty result still has the expected schema.
    metric_cols = [f"{name}@{k}" for k in ks for name in ("P", "R", "nDCG")]
    metric_cols += [f"MRR@{max_k}", "MAP"]
    metric_cols = list(dict.fromkeys(metric_cols))   # drop repeats caused by duplicate cutoffs
    columns = ["query_id", "num_relevant", "retrieved"] + metric_cols
    per_query = (pd.DataFrame(per_rows, columns=columns)
                   .sort_values("query_id")
                   .reset_index(drop=True))

    # Summary (macro-average over queries)
    if per_query.empty:
        summary = pd.DataFrame({col: [float("nan")] for col in metric_cols}, index=["mean"])
    else:
        summary = per_query.agg({col: "mean" for col in metric_cols}).to_frame(name="mean").T

    return per_query, summary


### 4. (Optional) Slice metrics by language/site

In [None]:
def evaluate_by_slice(df_hits: pd.DataFrame, gt, slice_col: str, **kwargs):
    """Run evaluate_retrieval separately for every distinct value of *slice_col*.

    Args:
        df_hits: Hits table; must contain *slice_col*.
        gt: Ground truth, forwarded unchanged to evaluate_retrieval.
        slice_col: Column of df_hits to group by (e.g. "lang" or "site").
        **kwargs: Extra keyword arguments forwarded to evaluate_retrieval.

    Returns:
        One summary row per slice value, with the slice value in the first
        column and the number of distinct queries in that slice in the second
        ("queries"), followed by the metric columns. When df_hits yields no
        slices, an empty frame with just those two columns is returned
        instead of failing inside pd.concat.

    Raises:
        ValueError: if *slice_col* is not a column of df_hits.
    """
    if slice_col not in df_hits.columns:
        raise ValueError(f"{slice_col} not found in df_hits")
    summaries = []
    for value, subset in df_hits.groupby(slice_col):
        per_q, summ = evaluate_retrieval(subset, gt, **kwargs)
        summ.insert(0, slice_col, value)
        summ.insert(1, "queries", per_q["query_id"].nunique())
        summaries.append(summ)
    if not summaries:
        # pd.concat([]) raises "No objects to concatenate"
        return pd.DataFrame(columns=[slice_col, "queries"])
    return pd.concat(summaries, ignore_index=True)


### 5. Example usage

In [17]:
# Suppose you already have df_hits from your query pipeline (chunk-level).
# Run the cells that define evaluate_retrieval and its helpers first;
# otherwise this cell fails with a NameError.
# 1) Evaluate at doc-level (recommended for document search)
gt_path = "data/ground/ground_truth_sample.csv"  # columns: query_id, doc_id, relevant
per_q, summary = evaluate_retrieval(
    df_hits=df_hits,
    gt_path_or_df=gt_path,
    id_level="doc",          # or "chunk"
    k_list=(1,3,5,10),
    gain_scheme="exp2"       # use graded relevance if available
)
display(per_q.head())
display(summary)

# 2) Optional: add lang/site to df_hits before slicing
# Only needed when df_hits lacks these columns; hits built by results_df already carry every column of meta.
# For example, join with meta on (doc_id, chunk_id) to bring in 'lang' and 'site':
# df_hits = df_hits.merge(meta[["doc_id","chunk_id","lang","site","title","preview"]],
#                         on=["doc_id","chunk_id"], how="left")

# 3) Metrics by language (macro-averaged)
# by_lang = evaluate_by_slice(df_hits, gt_path, slice_col="lang", id_level="doc", k_list=(1,3,5,10))
# Sort by a metric column: the result has no "mean" column ("mean" is only the summary's row label).
# display(by_lang.sort_values("nDCG@10", ascending=False))


NameError: name 'evaluate_retrieval' is not defined

### 6. (Optional) Save results for later analysis & plots

In [16]:
# df_sparse_chunks.to_parquet("results/sparse_chunks.parquet", index=False)
# NOTE(review): the file names say "sparse", but df_hits / per_q / summary in this notebook
# come from the dense (FAISS) pipeline, and df_hits is chunk-level rather than doc-level.
# Confirm the intended names before relying on these files.
# NOTE(review): assumes the "results/" directory already exists — TODO confirm.
df_hits.to_parquet("results/sparse_docs.parquet", index=False)
per_q.to_parquet("results/metrics_sparse_per_query.parquet", index=False)
summary.to_csv("results/metrics_sparse_summary.csv", index=False)


NameError: name 'per_q' is not defined

*****

## <b>Keyword based search</b>

In [14]:
import os
from getpass import getpass

from elasticsearch import Elasticsearch

# Connection settings come from the environment so that no credential lives in
# the notebook. ES_URL and ES_USER fall back to the local defaults used before;
# the password is read from ES_PASSWORD, or prompted for when that is unset.
# NOTE: a password was previously committed in this cell; rotate it.
es = Elasticsearch(
    os.environ.get("ES_URL", "http://localhost:9200"),
    basic_auth=(
        os.environ.get("ES_USER", "elastic"),
        os.environ.get("ES_PASSWORD") or getpass("Elasticsearch password: "),
    ),
)


#### Prepare data & bulk-ingest passages