# <b>Evaluation of Query results</b>

### 1. Helpers: roll up to doc-level (optional) and normalize inputs

In [1]:
import numpy as np
import pandas as pd
from typing import Iterable, Dict, List

In [2]:
def to_doc_level(df_hits: pd.DataFrame, top_k: int | None = None) -> pd.DataFrame:
    """
    Collapse chunk-level results to doc-level by keeping the best chunk per (query_id, doc_id),
    then re-ranking by score. Assumes columns: query_id, doc_id, dense_score (or score), rank.
    """
    score_col = "dense_score" if "dense_score" in df_hits.columns else "score"
    if score_col not in df_hits.columns:
        raise ValueError("df_hits must contain 'dense_score' or 'score' column.")
    # keep best chunk per (query, doc)
    best = (df_hits
            .sort_values(["query_id","doc_id",score_col], ascending=[True, True, False])
            .groupby(["query_id","doc_id"], as_index=False)
            .first())
    # rerank within query
    best["rank"] = best.groupby("query_id")[score_col].rank(ascending=False, method="first").astype(int)
    best = best.sort_values(["query_id","rank"])
    if top_k:
        best = best.groupby("query_id").head(top_k).reset_index(drop=True)
    return best


In [3]:
def load_ground_truth(gt_path_or_df) -> pd.DataFrame:
    """
    Load relevance judgments and return only the relevant (query, doc) pairs.

    ``gt_path_or_df`` is either a DataFrame or something ``pd.read_csv`` can
    read. The table must provide ``query_id``, ``doc_id`` and ``relevant``
    (binary or graded); a ValueError is raised when one of them is absent.
    ``query_id`` is cast to int and ``doc_id`` to str. Rows whose ``relevant``
    is not greater than zero are dropped, the remaining values are left as
    they are (so graded judgments survive), and the index is renumbered.
    The input DataFrame is not modified.
    """
    if isinstance(gt_path_or_df, pd.DataFrame):
        table = gt_path_or_df
    else:
        table = pd.read_csv(gt_path_or_df)

    # Fail early when the schema is incomplete.
    missing = {"query_id","doc_id","relevant"} - set(table.columns)
    if missing:
        raise ValueError(f"Ground truth missing columns: {missing}")

    # assign() works on a copy, so the caller's frame stays untouched.
    table = table.assign(
        query_id=table["query_id"].astype(int),
        doc_id=table["doc_id"].astype(str),
    )
    is_relevant = table["relevant"] > 0
    return table[is_relevant].reset_index(drop=True)

### 2. Evaluation metrics

In [4]:
def precision_at_k(ranked_ids: List[str], relevant_set: set[str], k: int) -> float:
    """
    Precision@k: share of the first ``k`` ranked IDs that are relevant.

    The denominator is always ``k``, so a list shorter than ``k`` is penalised
    for the missing positions. A non-positive ``k`` yields 0.0; without that
    guard a negative ``k`` would slice from the end of the list and produce a
    negative score.
    """
    if k <= 0:
        return 0.0
    hits = sum(1 for doc_id in ranked_ids[:k] if doc_id in relevant_set)
    return hits / k

In [5]:
def recall_at_k(ranked_ids: List[str], relevant_set: set[str], k: int) -> float:
    """
    Recall@k: share of the relevant IDs that appear among the first ``k`` results.

    Each relevant ID is counted at most once, so a ranked list that repeats an
    ID (e.g. several chunks of the same document) can never push recall above
    1.0. Returns 0.0 when there are no relevant IDs or ``k`` is not positive.
    """
    if not relevant_set or k <= 0:
        return 0.0
    found = {doc_id for doc_id in ranked_ids[:k] if doc_id in relevant_set}
    return len(found) / len(relevant_set)

In [6]:
def average_precision(ranked_ids: List[str], relevant_set: set[str], k: int | None = None) -> float:
    """AP (binary)."""
    if not relevant_set: return 0.0
    if k is None: k = len(ranked_ids)
    ap, hits = 0.0, 0
    for i, rid in enumerate(ranked_ids[:k], start=1):
        if rid in relevant_set:
            hits += 1
            ap += hits / i
    return ap / max(1, len(relevant_set))

In [7]:
def mrr_at_k(ranked_ids: List[str], relevant_set: set[str], k: int) -> float:
    """
    Reciprocal rank of the first relevant ID within the top ``k`` results.

    Returns 1/position (1-based) of the earliest relevant hit, or 0.0 when
    none of the first ``k`` IDs is relevant.
    """
    reciprocal_ranks = (
        1.0 / position
        for position, doc_id in enumerate(ranked_ids[:k], start=1)
        if doc_id in relevant_set
    )
    # Only the first match matters; the generator is not consumed beyond it.
    return next(reciprocal_ranks, 0.0)

In [8]:
def dcg_at_k(gains: List[float], k: int) -> float:
    """
    Discounted cumulative gain of the first ``k`` gains.

    The gain at 1-based position p is divided by log2(p + 1), so the first
    position is undiscounted. An empty input sums to 0.
    """
    head = gains[:k]
    return sum(gain / np.log2(position + 1) for position, gain in enumerate(head, start=1))

In [11]:
def ndcg_at_k(ranked_ids: List[str],
              rel_dict: Dict[str, float],  # doc_id -> gain (e.g., 1 or graded)
              k: int,
              gain_scheme: str = "exp2") -> float:
    """
    Normalised DCG@k of a ranked list against graded relevance judgments.

    Each ranked ID is mapped to its relevance in ``rel_dict`` (0.0 when absent)
    and turned into a gain: ``2**rel - 1`` when ``gain_scheme == "exp2"``,
    otherwise the relevance itself. The DCG of the list is divided by the DCG
    of the judged gains sorted in descending order. Returns 0.0 when the
    ideal DCG is not positive (e.g. no judgments).
    """
    exponential = gain_scheme == "exp2"

    def _gain(rel):
        return (2**rel - 1) if exponential else rel

    observed = [_gain(rel_dict.get(doc_id, 0.0)) for doc_id in ranked_ids]
    actual_dcg = dcg_at_k(observed, k)

    # Best achievable ordering: all judged gains, largest first.
    ideal = sorted(_gain(rel) for rel in rel_dict.values())
    ideal.reverse()
    ideal_dcg = dcg_at_k(ideal, k) if ideal else 0.0

    if ideal_dcg > 0:
        return actual_dcg / ideal_dcg
    return 0.0

### 3. Runner: compute metrics per query and aggregate

In [12]:
def evaluate_retrieval(
    df_hits: pd.DataFrame,
    gt_path_or_df,
    id_level: str = "doc",       # "doc" or "chunk"
    k_list: Iterable[int] = (1,3,5,10),
    score_col: str | None = None,
    gain_scheme: str = "exp2"    # for nDCG with graded relevance
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Compute ranking metrics per query and their macro-average.

    Parameters
    ----------
    df_hits : retrieval results with ``query_id``, ``doc_id`` and either a
        score column or a ``rank`` column (1 = best). The frame is not modified.
    gt_path_or_df : ground truth accepted by ``load_ground_truth`` (columns
        ``query_id``, ``doc_id``, ``relevant``; only ``relevant > 0`` is used).
    id_level : ``"doc"`` first collapses the hits to the best row per
        (query_id, doc_id). Any other value skips that step; the ranked IDs
        are still taken from ``doc_id``, so a document retrieved through
        several chunks then appears several times in the ranked list.
    k_list : cutoffs for P@k, R@k and nDCG@k. Any iterable is accepted;
        duplicates are ignored.
    score_col : column to rank by (higher is better). Defaults to
        ``dense_score`` when present, otherwise ``score``. When the column is
        absent it is synthesised as ``-rank``.
    gain_scheme : passed to ``ndcg_at_k``.

    Returns
    -------
    (per_query, summary)
        ``per_query`` has one row per query found in ``df_hits`` with
        ``num_relevant``, ``retrieved``, the @k metrics, ``MRR@<max k>`` and
        ``MAP`` (average precision cut at the largest k). ``summary`` is a
        one-row frame (index ``"mean"``) with the mean of every metric column.
        Queries that occur only in the ground truth produce no row and
        therefore do not lower the averages.

    Raises
    ------
    ValueError if required columns are missing, no score/rank is available,
    or ``k_list`` is empty.
    """
    # Materialise the cutoffs once: they are iterated for every query and also
    # fed to max(), which a one-shot iterator would not survive.
    k_list = tuple(dict.fromkeys(k_list))
    if not k_list:
        raise ValueError("k_list must contain at least one cutoff.")
    max_k = max(k_list)

    # Normalize df_hits
    hits = df_hits.copy()
    if "query_id" not in hits.columns or "doc_id" not in hits.columns:
        raise ValueError("df_hits must have columns: query_id, doc_id (chunk_id/global_chunk_id optional).")
    if score_col is None:
        score_col = "dense_score" if "dense_score" in hits.columns else "score"
    if score_col not in hits.columns:
        # if rank exists but no score, synthesize descending score from rank
        if "rank" in hits.columns:
            hits[score_col] = -hits["rank"].astype(float)
        else:
            raise ValueError("df_hits must have a numeric score column or rank.")

    # If evaluating at doc-level but df_hits is chunk-level, roll up
    if id_level == "doc":
        # to_doc_level() only recognises 'dense_score' (preferred) or 'score'.
        # Present the chosen column as the sole candidate so the best chunk is
        # selected by score_col, then restore its name.
        other_scores = [c for c in ("dense_score", "score")
                        if c != score_col and c in hits.columns]
        rolled = to_doc_level(hits.drop(columns=other_scores)
                                  .rename(columns={score_col: "score"}))
        hits = rolled.rename(columns={"score": score_col})

    # For each query, build ranked list of IDs (best score first)
    hits["query_id"] = hits["query_id"].astype(int)
    hits["doc_id"]   = hits["doc_id"].astype(str)
    hits = hits.sort_values(["query_id", score_col], ascending=[True, False])
    ranked_lists = (hits.groupby("query_id")["doc_id"]
                         .apply(list)
                         .to_dict())

    # Ground truth arrives with int query_id / str doc_id and relevant > 0 only.
    gt = load_ground_truth(gt_path_or_df)

    # Per-query relevance maps (graded, for nDCG) and the matching relevant-ID sets
    rel_dicts: Dict[int, Dict[str, float]] = {
        qid: dict(zip(g["doc_id"], g["relevant"].astype(float)))
        for qid, g in gt.groupby("query_id")
    }
    rel_sets: Dict[int, set[str]] = {qid: set(rmap) for qid, rmap in rel_dicts.items()}

    metric_cols = [f"{name}@{k}" for k in k_list for name in ("P", "R", "nDCG")]
    metric_cols += [f"MRR@{max_k}", "MAP"]
    columns = ["query_id", "num_relevant", "retrieved", *metric_cols]

    # Compute per-query metrics
    per_rows = []
    for qid, ranked in ranked_lists.items():
        rset = rel_sets.get(qid, set())
        rmap = rel_dicts.get(qid, {})
        row = {"query_id": qid, "num_relevant": len(rset), "retrieved": len(ranked)}
        for k in k_list:
            row[f"P@{k}"]    = precision_at_k(ranked, rset, k)
            row[f"R@{k}"]    = recall_at_k(ranked, rset, k)
            row[f"nDCG@{k}"] = ndcg_at_k(ranked, rmap, k, gain_scheme=gain_scheme)
        row[f"MRR@{max_k}"] = mrr_at_k(ranked, rset, k=max_k)
        row["MAP"] = average_precision(ranked, rset, k=max_k)
        per_rows.append(row)

    if not per_rows:
        # No queries retrieved anything: return correctly shaped, empty results
        # instead of failing on the sort below.
        per_query = pd.DataFrame(columns=columns)
        summary = pd.DataFrame([dict.fromkeys(metric_cols, float("nan"))], index=["mean"])
        return per_query, summary

    per_query = (pd.DataFrame(per_rows, columns=columns)
                 .sort_values("query_id")
                 .reset_index(drop=True))

    # Summary (macro-average over queries)
    summary = per_query[metric_cols].mean().to_frame(name="mean").T

    return per_query, summary


### 4. (Optional) Slice metrics by language/site

In [15]:
def evaluate_by_slice(df_hits: pd.DataFrame, gt, slice_col: str, **kwargs):
    """
    Run ``evaluate_retrieval`` separately for every value of ``slice_col``.

    ``df_hits`` is grouped by ``slice_col``; each group is evaluated against
    ``gt`` with ``kwargs`` forwarded unchanged. The one-row summaries are
    stacked into a single frame whose first two columns are the slice value
    and ``queries`` (number of distinct query_ids evaluated in that slice).
    Raises ValueError when ``slice_col`` is not a column of ``df_hits``.
    """
    if slice_col not in df_hits.columns:
        raise ValueError(f"{slice_col} not found in df_hits")

    def _summarise(value, subset):
        # Evaluate one slice and label its summary row.
        per_query, summary_row = evaluate_retrieval(subset, gt, **kwargs)
        summary_row.insert(0, slice_col, value)
        summary_row.insert(1, "queries", per_query["query_id"].nunique())
        return summary_row

    rows = [_summarise(value, subset) for value, subset in df_hits.groupby(slice_col)]
    return pd.concat(rows, ignore_index=True)

*****

#### Load query results.

In [33]:
# Tag used as the file-name prefix of the stored query results.
MODEL_TAG = "paraphrase-multilingual-MiniLM-L12-v2"
# Dense and sparse query results, read from parquet files named after MODEL_TAG.
dense_query_results = pd.read_parquet(f"../shared-data-library/queries/{MODEL_TAG}__dense-query-results.parquet")
sparse_query_results = pd.read_parquet(f"../shared-data-library/queries/{MODEL_TAG}__sparse-query-results.parquet")

#### <b>Run the experiment and execute the evaluation</b>

In [49]:
# The query results loaded above serve as df_hits (presumably chunk-level — confirm against the query pipeline).
# They are evaluated at doc-level below (recommended for document search).
# Path of the ground-truth CSV handed to evaluate_retrieval().
gt_path = "../shared-data-library/ground/ground_truth_sample.csv"  # columns: query_id, doc_id, relevant

In [50]:

# Evaluate the dense query results at doc-level; per_q holds one row per query,
# summary the mean of every metric over those queries.
per_q, summary = evaluate_retrieval(
    df_hits=dense_query_results,
    gt_path_or_df=gt_path,
    id_level="doc",          # or "chunk"
    k_list=(1,3,5,10),
    gain_scheme="exp2"       # use graded relevance if available
)



In [51]:
# Preview the first per-query metric rows of the dense run.
display(per_q.head())

Unnamed: 0,query_id,num_relevant,retrieved,P@1,R@1,nDCG@1,P@3,R@3,nDCG@3,P@5,R@5,nDCG@5,P@10,R@10,nDCG@10,MRR@10,MAP
0,0,3,5,1.0,0.333333,1.0,0.666667,0.666667,0.878962,0.6,1.0,0.983218,0.3,1.0,0.983218,1.0,0.916667
1,1,3,5,1.0,0.333333,1.0,0.666667,0.666667,0.847267,0.6,1.0,0.951523,0.3,1.0,0.951523,1.0,0.805556
2,2,3,5,0.0,0.0,0.0,0.333333,0.333333,0.121038,0.6,1.0,0.506238,0.3,1.0,0.506238,0.333333,0.477778
3,3,3,5,1.0,0.333333,1.0,0.666667,0.666667,0.878962,0.6,1.0,0.97261,0.3,1.0,0.97261,1.0,0.866667
4,4,3,5,1.0,0.333333,1.0,1.0,1.0,1.0,0.6,1.0,1.0,0.3,1.0,1.0,1.0,1.0


In [52]:
# Show the macro-averaged metrics of the dense run.
display(summary)

Unnamed: 0,P@1,R@1,nDCG@1,P@3,R@3,nDCG@3,P@5,R@5,nDCG@5,P@10,R@10,nDCG@10,MRR@10,MAP
mean,0.8,0.266667,0.8,0.666667,0.666667,0.745246,0.6,1.0,0.882718,0.3,1.0,0.882718,0.866667,0.813333


In [53]:
# Persist the dense metrics. index=False means the summary CSV is written
# without its "mean" row label.
per_q.to_parquet("../shared-data-library/evaluation/metrics_dense_per_query.parquet", index=False)
summary.to_csv("../shared-data-library/evaluation/metrics_dense_summary.csv", index=False)

#### Baseline: sparse query results.

In [44]:
# Reuse evaluate_retrieval() for the sparse baseline. Note that per_q and summary
# are overwritten here, so the dense results must already have been saved.
# NOTE(review): every metric in the output below is 0.0 although documents were
# retrieved — possibly a doc_id mismatch between the sparse results and the
# ground truth; confirm before using these numbers as a baseline.
per_q, summary = evaluate_retrieval(
    df_hits=sparse_query_results.rename(columns={"sparse_score":"score"}),  # evaluator expects 'score' or 'dense_score'
    gt_path_or_df=gt_path,
    id_level="doc",       # evaluate documents
    k_list=(1,3,5,10),
    gain_scheme="exp2"
)


In [45]:
# Preview the per-query rows and the macro-averaged metrics of the sparse run.
display(per_q.head())
display(summary)

Unnamed: 0,query_id,num_relevant,retrieved,P@1,R@1,nDCG@1,P@3,R@3,nDCG@3,P@5,R@5,nDCG@5,P@10,R@10,nDCG@10,MRR@10,MAP
0,0,3,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,3,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,P@1,R@1,nDCG@1,P@3,R@3,nDCG@3,P@5,R@5,nDCG@5,P@10,R@10,nDCG@10,MRR@10,MAP
mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Save results for later analysis and plots.

In [48]:
# Persist the sparse metrics (per_q/summary currently hold the sparse run).
# index=False means the summary CSV is written without its "mean" row label.
per_q.to_parquet("../shared-data-library/evaluation/metrics_sparse_per_query.parquet", index=False)
summary.to_csv("../shared-data-library/evaluation/metrics_sparse_summary.csv", index=False)