# <b>Query processing pipeline using the FAISS index and metadata</b>

#### 1. Setup: load model, tokenizer, FAISS index, and metadata

In [2]:
import os, numpy as np, pandas as pd, torch, faiss
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModel
from pathlib import Path

from elasticsearch import Elasticsearch, helpers

In [3]:
# Choose model (speed tiers):
# FAST:    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" (384d)
# BALANCE: "intfloat/multilingual-e5-base" (768d)
# QUALITY: "intfloat/multilingual-e5-large" (1024d)
# Hugging Face model id used to load the tokenizer and encoder for query embedding.
# NOTE(review): the model chosen here presumably has to be the one the FAISS
# index was built with (same embedding dimension) — confirm against the
# indexing notebook before switching tiers.
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

In [4]:
# Derive the artifact tag from MODEL_NAME (the part after the organisation
# prefix) instead of hard-coding it, so that switching MODEL_NAME cannot leave
# these paths pointing at another model's metadata and index.
# NOTE(review): assumes artifacts for every model follow the
# "<model-basename>__meta.parquet" / "<model-basename>__indexes.faiss" naming.
MODEL_TAG = MODEL_NAME.rsplit("/", 1)[-1]
METADATA_PATH = f"../shared-data-library/metadata/{MODEL_TAG}__meta.parquet"
FAISS_INDEX_PATH  = f"../shared-data-library/indexes/faiss/{MODEL_TAG}__indexes.faiss"

In [5]:
# --- Load meta and FAISS Indexes ---
# Chunk-level metadata table. Search results are joined to it by row position
# (results_df uses meta.iloc with the ids returned by FAISS), so the row order
# presumably mirrors the order in which vectors were added to the index —
# TODO confirm against the indexing notebook.
metadata  = pd.read_parquet(METADATA_PATH)
# Prebuilt FAISS index read from disk.
index = faiss.read_index(FAISS_INDEX_PATH)

In [6]:
# --- Model / tokenizer ---
# Run on the GPU when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
# Half precision on GPU, single precision on CPU.
DTYPE = torch.float32 if DEVICE.type != "cuda" else torch.float16

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# On CPU no explicit dtype is requested (torch_dtype=None).
model = AutoModel.from_pretrained(
    MODEL_NAME,
    torch_dtype=(torch.float16 if DEVICE.type == "cuda" else None),
)
# Move the weights to the target device and switch to inference behaviour.
model = model.to(DEVICE).eval()

## <b>Functions.</b>

In [None]:
def model_token_budget(tok, headroom=16, cap_default=512):
    """Return how many tokens a single input may use for the given tokenizer.

    Reads ``tok.model_max_length``; when the attribute is missing, ``None`` or
    implausibly large (above 100,000,000), ``cap_default`` is used instead.
    ``headroom`` tokens are subtracted from that limit and the result is never
    smaller than 32.
    """
    limit = getattr(tok, "model_max_length", None)
    has_real_limit = limit is not None and not (limit > 100_000_000)
    if not has_real_limit:
        limit = cap_default
    budget = int(limit - headroom)
    # Enforce the floor of 32 tokens.
    return budget if budget > 32 else 32
# Per-input token limit for the loaded tokenizer; embed_queries passes it to
# the tokenizer as max_length.
TOKEN_BUDGET = model_token_budget(tokenizer)

In [None]:
# E5 models take a "query: " prefix on search queries; other models get the raw text.
def add_query_prefix(texts):
    """Prefix each text with "query: " when MODEL_NAME is a multilingual E5 model.

    For any other model the input list is returned unchanged (same object).
    """
    if "intfloat/multilingual-e5" not in MODEL_NAME.lower():
        return texts
    return [f"query: {t}" for t in texts]

In [None]:
@torch.no_grad()
def mean_pool(last_hidden_state, attention_mask):
    """Average token vectors over the sequence axis, ignoring masked positions.

    Expects ``last_hidden_state`` shaped (batch, seq, dim) and ``attention_mask``
    shaped (batch, seq) with non-zero entries marking real tokens. The token
    count is clamped to at least 1e-6 so a fully masked row yields zeros
    instead of a division by zero.
    """
    weights = attention_mask.unsqueeze(-1).type_as(last_hidden_state)
    summed = (last_hidden_state * weights).sum(dim=1)
    token_counts = weights.sum(dim=1).clamp(min=1e-6)
    return summed / token_counts

In [None]:
def l2_normalize_np(x: np.ndarray) -> np.ndarray:
    """Scale every row of a 2-D array to unit Euclidean length.

    A small epsilon (1e-12) is added to each row norm so all-zero rows stay
    zero instead of producing NaN.
    """
    row_norms = np.linalg.norm(x, axis=1, keepdims=True)
    return x / (row_norms + 1e-12)

*****

### 1. Batch embed queries (fast, safe)

In [None]:
def embed_queries(queries: list[str], batch_size: int = 128) -> np.ndarray:
    """Embed queries with the loaded model and return a (Q, d) float32 matrix.

    Each query is converted to ``str``, given the model-specific prefix (see
    ``add_query_prefix``), tokenized with truncation at ``TOKEN_BUDGET``,
    encoded, mean-pooled over real tokens and L2-normalized.

    Args:
        queries: Query texts; items are coerced with ``str()``.
        batch_size: Number of queries encoded per forward pass.

    Returns:
        Array of shape (len(queries), d) with unit-length float32 rows. For an
        empty ``queries`` an empty (0, d) array is returned, where d is the
        model's ``config.hidden_size``.
    """
    queries = add_query_prefix([str(q) for q in queries])
    if not queries:
        # np.vstack([]) raises ValueError; hand back a well-shaped empty matrix.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    vecs = []
    for i in tqdm(range(0, len(queries), batch_size), desc="Embedding queries", unit="batch"):
        batch = queries[i:i+batch_size]
        enc = tokenizer(batch, padding=True, truncation=True, max_length=TOKEN_BUDGET, return_tensors="pt")
        enc = {k: v.to(DEVICE, non_blocking=True) for k, v in enc.items()}
        # Mixed precision is only enabled on CUDA; on CPU the second context
        # manager is a plain no-grad guard.
        with torch.inference_mode(), (
            torch.autocast(device_type=DEVICE.type, dtype=torch.float16) if DEVICE.type=="cuda" else torch.no_grad()
        ):
            out    = model(**enc)
            pooled = mean_pool(out.last_hidden_state, enc["attention_mask"])
            # Unit-length rows make inner product equal to cosine similarity.
            pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
        # Always hand float32 to NumPy, even if the model ran in half precision.
        vecs.append(pooled.to(torch.float32).cpu().numpy())
    return np.vstack(vecs)

### 2. Dense search (FAISS) for a batch of queries

In [None]:
def faiss_search_batch(q_vecs: np.ndarray, top_k: int = 10) -> tuple[np.ndarray, np.ndarray]:
    """Search the module-level FAISS ``index`` for a batch of query vectors.

    Args:
        q_vecs: (Q, d) query matrix. It is converted to a C-contiguous float32
            array before the search.
        top_k: Number of neighbours requested per query.

    Returns:
        ``(scores, indices)``, each of shape (Q, top_k), exactly as returned by
        ``index.search``. FAISS fills slots it cannot populate with id ``-1``.
        NOTE(review): the scores are cosine similarities only if the index is
        an inner-product index over L2-normalized vectors — not verifiable
        from this notebook.

    Raises:
        ValueError: If ``q_vecs`` is not 2-D or its width differs from the
            index dimension.
    """
    # Explicit conversion instead of `assert`: asserts vanish under `python -O`.
    q_vecs = np.ascontiguousarray(q_vecs, dtype=np.float32)
    if q_vecs.ndim != 2:
        raise ValueError(f"q_vecs must be 2-D (Q, d), got shape {q_vecs.shape}")
    if q_vecs.shape[1] != index.d:
        raise ValueError(
            f"q_vecs has dimension {q_vecs.shape[1]} but the index expects {index.d}"
        )
    D, I = index.search(q_vecs, top_k)
    return D, I

### 3. Assemble results into a tidy DataFrame

In [None]:
def results_df(queries: list[str], D: np.ndarray, I: np.ndarray, meta: pd.DataFrame) -> pd.DataFrame:
    """Build a long DataFrame with one row per (query, hit).

    Args:
        queries: Query texts, aligned with the rows of ``D`` and ``I``.
        D: (Q, k) scores, one row per query.
        I: (Q, k) positional row ids into ``meta``. Negative ids (FAISS uses
            ``-1`` when it has fewer than k results) are dropped.
        meta: Metadata table addressed by position.

    Returns:
        All columns of ``meta`` plus ``query_id``, ``query``, ``rank``
        (1-based, in the order the hits were given) and ``dense_score``.
        ``global_chunk_id`` is built as "<doc_id>:<chunk_id>" when ``meta``
        does not already provide it. Queries without any valid hit contribute
        no rows; with no hits at all an empty frame with the same columns is
        returned.
    """
    frames = []
    for qid, (scores, idxs) in enumerate(zip(D, I)):
        scores = np.asarray(scores)
        idxs = np.asarray(idxs)
        # meta.iloc[-1] would silently pick the LAST metadata row, so padding
        # ids must be removed before the positional lookup.
        valid = idxs >= 0
        if not valid.any():
            continue
        idxs, scores = idxs[valid], scores[valid]
        frames.append(
            meta.iloc[idxs].assign(
                query_id=qid,
                query=queries[qid],
                rank=np.arange(1, len(idxs) + 1),
                dense_score=scores,
            )
        )

    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        # pd.concat([]) raises; keep the schema stable for downstream code.
        df = meta.iloc[:0].assign(
            query_id=np.empty(0, dtype=np.int64),
            query=np.empty(0, dtype=object),
            rank=np.empty(0, dtype=np.int64),
            dense_score=np.empty(0, dtype=np.float32),
        )

    # Ensure IDs exist
    if "global_chunk_id" not in df.columns:
        df["global_chunk_id"] = df["doc_id"].astype(str) + ":" + df["chunk_id"].astype(int).astype(str)
    return df

### 4. Optional: roll up chunk → document (max score per doc)

In [None]:
def doc_level(df_hits: pd.DataFrame, top_k: int = 10) -> pd.DataFrame:
    """Collapse chunk hits to documents: best chunk per document, top_k per query.

    Args:
        df_hits: Chunk-level hits with at least ``query_id``, ``doc_id`` and
            ``dense_score`` columns.
        top_k: Maximum number of documents kept per query.

    Returns:
        One row per (query_id, doc_id) — the complete row of that document's
        highest-scoring chunk — with ``rank`` recomputed per query by
        descending ``dense_score`` (ties broken by row order) and limited to
        the first ``top_k`` ranks. ``query_id`` and ``doc_id`` are the leading
        columns.
    """
    keys = ["query_id", "doc_id"]
    # Rows without a key cannot be grouped; drop them up front.
    ordered = df_hits.dropna(subset=keys).sort_values(
        keys + ["dense_score"], ascending=[True, True, False]
    )
    # Keep the whole top row per document. groupby().first() would instead take
    # the first NON-NULL value column by column and could stitch together
    # fields from different chunks.
    best = ordered.drop_duplicates(subset=keys, keep="first")
    other_cols = [c for c in best.columns if c not in keys]
    best = best[keys + other_cols].reset_index(drop=True)

    # rerank per query by score
    best["rank"] = (
        best.groupby("query_id")["dense_score"]
        .rank(ascending=False, method="first")
        .astype(int)
    )
    best = (
        best.sort_values(["query_id", "rank"])
        .groupby("query_id")
        .head(top_k)
        .reset_index(drop=True)
    )
    return best

### 5. One call: run a list of queries end-to-end

In [None]:
def run_dense_pipeline(queries: list[str], top_k_chunks: int = 10, top_k_docs: int | None = None):
    """Run embed -> FAISS search -> result assembly for a list of queries.

    Args:
        queries: Query texts.
        top_k_chunks: Number of chunk hits requested per query.
        top_k_docs: When truthy, also produce a document-level roll-up keeping
            this many documents per query.

    Returns:
        ``(df_chunks, df_docs)`` where ``df_docs`` is ``None`` if no roll-up
        was requested.
    """
    # Embed, then force float32 and re-normalize (the embeddings are already
    # unit length; the extra pass is a safety net).
    embeddings = l2_normalize_np(embed_queries(queries).astype("float32"))

    # Nearest-neighbour search over the chunk index.
    scores, ids = faiss_search_batch(embeddings, top_k=top_k_chunks)

    # Join the hits with the chunk metadata.
    df_chunks = results_df(queries, scores, ids, metadata)

    if not top_k_docs:
        return df_chunks, None
    return df_chunks, doc_level(df_chunks, top_k=top_k_docs)

*****

#### Example: run a batch of queries.

In [11]:
# Load the sample query set from the shared data library.
queries_df = pd.read_csv("../shared-data-library/ground/queries_sample.csv")

In [12]:
# Plain Python list of the query strings from the query_text column.
queries = queries_df['query_text'].tolist()
# Bare expression: displays the list as the cell output.
queries

['child immunization reduces mortality',
 'política de vacunación infantil',
 'बाल टीकाकरण और मृत्यु दर',
 '儿童免疫接种 对 死亡率 的 影响',
 'impact of clean water access on diarrhea']

In [None]:
# Dense retrieval for all sample queries: 10 chunk hits per query, rolled up
# to at most 5 documents per query.
df_hits, df_docs = run_dense_pipeline(queries, top_k_chunks=10, top_k_docs=5)

In [None]:
# Display the document-level results produced by the dense pipeline.
df_docs

### Save Query results.

In [None]:
# Keep only the columns that are written to the results file.
df_processed_queries = df_docs[["query_id","query","rank","dense_score","doc_id","chunk_id","title","lang","preview"]]

In [None]:
# processed file
OUT_PARQ  = Path(f"../shared-data-library/queries/{MODEL_TAG}__dense-query-results.parquet")
# to_parquet does not create missing directories; make sure the target exists.
OUT_PARQ.parent.mkdir(parents=True, exist_ok=True)
df_processed_queries.to_parquet(OUT_PARQ, index=False, compression="zstd")

*****

#### <b>Keyword-based query processing.</b>

In [7]:
from getpass import getpass

# Connection settings are read from the environment so that no credential is
# stored in the notebook: ES_URL (defaults to the local node), ES_USER
# (defaults to "elastic") and ES_PASSWORD. When ES_PASSWORD is not set the
# password is requested interactively.
# SECURITY: a password was previously committed in this cell; it should be
# rotated on the Elasticsearch side.
es = Elasticsearch(
    os.environ.get("ES_URL", "http://localhost:9200"),
    basic_auth=(
        os.environ.get("ES_USER", "elastic"),
        os.environ.get("ES_PASSWORD") or getpass("Elasticsearch password: "),
    ),
)

In [8]:
def bm25_search(query_str: str, k: int = 50, index_name: str = "passages_bm25") -> pd.DataFrame:
    """Run one keyword query against Elasticsearch and return the hits as rows.

    The query is a ``multi_match`` over ``title`` (boosted x2) and
    ``chunk_text``.

    Args:
        query_str: Query text.
        k: Maximum number of hits requested.
        index_name: Elasticsearch index to search.

    Returns:
        One row per hit, in the order returned by Elasticsearch, with columns
        ``query``, ``rank`` (1-based), ``sparse_score``, ``global_chunk_id``,
        ``doc_id``, ``chunk_id``, ``title``, ``site``, ``lang`` and
        ``preview``. With no hits the frame is empty but still carries these
        columns.

    Raises:
        KeyError: If a hit's ``_source`` lacks ``global_chunk_id``, ``doc_id``
            or ``chunk_id``.
    """
    columns = [
        "query", "rank", "sparse_score", "global_chunk_id", "doc_id",
        "chunk_id", "title", "site", "lang", "preview",
    ]
    # Top-level keyword arguments replace the deprecated `body=` parameter.
    res = es.search(
        index=index_name,
        size=k,
        query={
            "multi_match": {
                "query": query_str,
                "fields": ["title^2", "chunk_text"],
            }
        },
    )
    rows = []
    for rank, hit in enumerate(res["hits"]["hits"], start=1):
        src = hit["_source"]
        rows.append({
            "query": query_str,
            "rank": rank,
            "sparse_score": hit["_score"],
            # Identifier fields are mandatory; descriptive fields fall back to "".
            "global_chunk_id": src["global_chunk_id"],
            "doc_id": src["doc_id"],
            "chunk_id": src["chunk_id"],
            "title": src.get("title", ""),
            "site": src.get("site", ""),
            "lang": src.get("lang", ""),
            "preview": src.get("preview", ""),
        })
    return pd.DataFrame(rows, columns=columns)

In [9]:
def bm25_search_batch(queries: list[str], k: int = 50) -> pd.DataFrame:
    """Run ``bm25_search`` for every query and stack the hits into one frame.

    Each query's rows get a leading ``query_id`` column holding the query's
    position in ``queries``. Queries without hits are skipped; if no query
    produced hits an empty DataFrame is returned.
    """
    frames = []
    for qid, text in enumerate(queries):
        hits = bm25_search(text, k=k)
        if not hits.empty:
            hits.insert(0, "query_id", qid)
            frames.append(hits)
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


#### <b>4. Convert chunk results → doc results (best chunk per doc) </b>

In [10]:
def bm25_doc_level(df_hits: pd.DataFrame, top_k_docs: int = 10) -> pd.DataFrame:
    """Collapse BM25 chunk hits to documents: best chunk per document, top docs per query.

    Args:
        df_hits: Chunk-level hits with at least ``query_id``, ``doc_id`` and
            ``sparse_score`` columns. An empty frame (even one without
            columns) is accepted.
        top_k_docs: Maximum number of documents kept per query.

    Returns:
        One row per (query_id, doc_id) — the complete row of that document's
        highest-scoring chunk — with ``rank`` recomputed per query by
        descending ``sparse_score`` (ties broken by row order) and limited to
        the first ``top_k_docs`` ranks. An empty input is returned as an empty
        frame.
    """
    if df_hits.empty:
        # Nothing to roll up; also avoids KeyError on a columnless frame.
        return df_hits.reset_index(drop=True)

    keys = ["query_id", "doc_id"]
    # Rows without a key cannot be grouped; drop them up front.
    ordered = df_hits.dropna(subset=keys).sort_values(
        keys + ["sparse_score"], ascending=[True, True, False]
    )
    # Keep the whole top row per document. groupby().first() would instead take
    # the first NON-NULL value column by column and could stitch together
    # fields from different chunks.
    sub = ordered.drop_duplicates(subset=keys, keep="first")
    other_cols = [c for c in sub.columns if c not in keys]
    sub = sub[keys + other_cols].reset_index(drop=True)

    # rerank docs per query
    sub["rank"] = (
        sub.groupby("query_id")["sparse_score"]
        .rank(ascending=False, method="first")
        .astype(int)
    )
    sub = (
        sub.sort_values(["query_id", "rank"])
        .groupby("query_id")
        .head(top_k_docs)
        .reset_index(drop=True)
    )
    return sub


### <b> 5. Run the baseline for a list of queries.</b>

In [13]:
# Option B (alternative input): reload the queries from a CSV that maps
# query_id -> query_text instead of reusing the `queries` list already in memory.
# queries_df = pd.read_csv("data/queries.csv")
# queries = queries_df["query_text"].tolist()
# Fetch up to 5 BM25 chunk hits per query, then keep the best chunk per document.
# NOTE(review): with k=5 there can be at most 5 documents per query, so
# top_k_docs=10 never truncates — raise k if 10 documents are really wanted.
df_sparse_chunks = bm25_search_batch(queries, k=5)
df_sparse_docs   = bm25_doc_level(df_sparse_chunks, top_k_docs=10)

### Save query result.

In [14]:
# Keep only the columns that are written to the sparse results file.
df_processed_sparse_queries = df_sparse_docs[["query_id","query","rank","sparse_score","doc_id","chunk_id","title","lang","preview"]]

In [15]:
# processed file
OUT_PARQ  = Path(f"../shared-data-library/queries/{MODEL_TAG}__sparse-query-results.parquet")
# to_parquet does not create missing directories; make sure the target exists.
OUT_PARQ.parent.mkdir(parents=True, exist_ok=True)
df_processed_sparse_queries.to_parquet(OUT_PARQ, index=False, compression="zstd")