In [34]:
from ranx import Qrels, Run

qrels = Qrels.from_file("qrels.rag24.raggy-dev.txt")

In [35]:
run_splade = Run.from_file("raggy-dev/raggy-dev_splade.txt")

In [36]:
run_bge = Run.from_file("raggy-dev/raggy-bge_results.txt")

In [37]:
run_gemini = Run.from_file("raggy-dev/gemini_raggydev_rerank.txt")

In [51]:
run_jina = Run.from_file("raggy-dev/raggy-jina_results.txt")

In [6]:
def modify_run_docids(run):
    return {
        query_id: {
            doc_id.split('#')[0]: score
            for doc_id, score in query_results.items()
        }
        for query_id, query_results in run.to_dict().items()
    }

In [38]:
def modify_run_docids(run):
    modified_run = {}
    for query_id, query_results in run.to_dict().items():
        modified_query_results = {}
        for doc_id, score in query_results.items():
            base_doc_id = doc_id.split('#')[0]
            if base_doc_id not in modified_query_results or score > modified_query_results[base_doc_id]:
                modified_query_results[base_doc_id] = score
        modified_run[query_id] = modified_query_results
    return modified_run

In [39]:
run_splade = Run.from_dict(modify_run_docids(run_splade))

In [40]:
run_bge = Run.from_dict(modify_run_docids(run_bge))

In [41]:
run_gemini = Run.from_dict(modify_run_docids(run_gemini))

In [52]:
run_jina = Run.from_dict(modify_run_docids(run_jina))

In [53]:
from ranx import evaluate

In [43]:
evaluate(qrels, run_splade, ["ndcg@10", "map", "mrr", "precision@10", "recall@100"])

{'ndcg@10': np.float64(0.5426624354049229),
 'map': np.float64(0.22604420645827805),
 'mrr': np.float64(0.8679861111111111),
 'precision@10': np.float64(0.625),
 'recall@100': np.float64(0.35705680734407924)}

In [44]:
evaluate(qrels, run_bge, ["ndcg@10", "map", "mrr", "precision@10", "recall@100"])

{'ndcg@10': np.float64(0.23530983618429158),
 'map': np.float64(0.14789757286853275),
 'mrr': np.float64(0.4856343089526709),
 'precision@10': np.float64(0.3358333333333333),
 'recall@100': np.float64(0.3445041555180156)}

In [45]:
evaluate(qrels, run_gemini, ["ndcg@10", "map", "mrr", "precision@10", "recall@100"])

{'ndcg@10': np.float64(0.567565964807014),
 'map': np.float64(0.19183424551258313),
 'mrr': np.float64(0.8817272347535504),
 'precision@10': np.float64(0.6491666666666667),
 'recall@100': np.float64(0.28179310296757626)}

In [54]:
evaluate(qrels, run_jina, ["ndcg@10", "map", "mrr", "precision@10", "recall@100"])

{'ndcg@10': np.float64(0.23367470173791194),
 'map': np.float64(0.14402895958499135),
 'mrr': np.float64(0.4928165351299738),
 'precision@10': np.float64(0.33249999999999996),
 'recall@100': np.float64(0.34220160424356255)}

In [46]:
from ranx import compare

In [63]:
report = compare(
    qrels=qrels,
    runs=[run_splade, run_jina, run_bge, run_gemini],
    metrics=["ndcg@10", "map", "mrr", "precision@10", "recall@100"],
    rounding_digits=4,
    show_percentages=True,
)

In [59]:
report.model_names = ["SPLADEv3", "JINA-v2", "BAAI/BGE", "Gemini"]

In [64]:
print(report.to_latex())

% Add in preamble
\usepackage{graphicx}
\usepackage{booktabs}


% To change the table size, act on the resizebox argument `0.8`.
\begin{table*}[ht]
\centering
\caption{
Overall effectiveness of the models.
The best results are highlighted in boldface.
Superscripts denote significant differences in paired Student's t-test with $p \le 0.01$.
}
\resizebox{0.8\textwidth}{!}{
\begin{tabular}{c|l|c|c|c|c|c}
\toprule
\textbf{\#}
& \textbf{Model}
& \textbf{NDCG@10}
& \textbf{MAP}
& \textbf{MRR}
& \textbf{P@10}
& \textbf{Recall@100} \\ 
\midrule
a &
run\_1 &
54.27$^{bc}$\hphantom{$^{d}$} &
\textbf{22.60}$^{bcd}$\hphantom{} &
86.80$^{bc}$\hphantom{$^{d}$} &
62.50$^{bc}$\hphantom{$^{d}$} &
\textbf{35.71}$^{bcd}$\hphantom{} \\
b &
run\_2 &
23.37\hphantom{$^{acd}$} &
14.40\hphantom{$^{acd}$} &
49.28\hphantom{$^{acd}$} &
33.25\hphantom{$^{acd}$} &
34.22$^{d}$\hphantom{$^{ac}$} \\
c &
run\_3 &
23.53\hphantom{$^{abd}$} &
14.79\hphantom{$^{abd}$} &
48.56\hphantom{$^{abd}$} &
33.58\hphantom{$^{abd}$} &


In [18]:
run.mean_scores

{}