In [1]:
import pandas as pd  # type: ignore
from src.exp_logger import logger  # type: ignore
from src.load_index import setup_system, get_train_splits
import numpy as np
import pyterrier as pt  # type: ignore
import yaml  # type: ignore
import os

import matplotlib.pyplot as plt

with open("../settings.yml", "r") as yamlfile:
    config = yaml.load(yamlfile, Loader=yaml.FullLoader)

PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


# Train

In [2]:
_, topics, qrels = setup_system("WT", train=True)

Loaded index with  1570734 documents.


In [3]:
def get_runs(results_path, subcollection=None):
    runs=[]
    run_files =  os.listdir(results_path)
    if subcollection:
        run_files = [run for run in run_files if run.endswith(subcollection)]
    else:
        run_files = [run for run in run_files if run.endswith("WT")]
    for run_name in run_files:
        runs.append(pt.io.read_results(os.path.join(results_path, run_name)))
    return runs, run_files

In [4]:
results_path = "../results/train/"
runs, names = get_runs(results_path)

In [5]:
results = pt.Experiment(
    runs,
    topics,
    qrels,
    eval_metrics=["map", "bpref", "recip_rank", "P_20",  "ndcg", "ndcg_cut_20"],
    names=names,
    verbose=True,
    round=4,
    baseline=0,
    correction='bonferroni'
)
results[["name", "map", "bpref", "recip_rank", "P_20", "ndcg", "ndcg_cut_20", "map reject", "bpref reject", "recip_rank reject", "P_20 reject", "ndcg reject", "ndcg_cut_20 reject"]].sort_values(by="name")

pt.Experiment: 100%|██████████| 17/17 [00:49<00:00,  2.91s/system]


Unnamed: 0,name,map,bpref,recip_rank,P_20,ndcg,ndcg_cut_20,map reject,bpref reject,recip_rank reject,P_20 reject,ndcg reject,ndcg_cut_20 reject
7,IRC_BM25+Bo1.WT,0.147,0.3341,0.2534,0.0661,0.2922,0.2075,False,False,False,False,True,False
15,IRC_BM25+RM3.WT,0.1426,0.3295,0.2408,0.0658,0.2867,0.2035,False,True,False,False,True,False
16,IRC_BM25+colBERT.WT,0.1682,0.3447,0.3046,0.0692,0.3082,0.231,True,False,True,False,False,False
6,IRC_BM25+monoT5.WT,0.1809,0.3494,0.3216,0.0768,0.3208,0.249,True,False,True,True,True,True
8,IRC_BM25+passages+monoT5.WT,0.154,0.3369,0.2743,0.0708,0.2969,0.2196,False,False,False,False,False,False
4,IRC_BM25.WT,0.1452,0.3245,0.2604,0.0654,0.2884,0.2087,False,True,False,False,True,False
1,IRC_E5_base.WT,0.1545,0.3483,0.2826,0.0634,0.291,0.2128,False,False,False,False,False,False
11,IRC_E5_small.WT,0.1437,0.3265,0.2705,0.0619,0.2762,0.2039,False,False,False,False,True,False
13,IRC_PL2.WT,0.1408,0.3352,0.2572,0.065,0.2884,0.2064,False,False,False,False,True,False
5,IRC_RRF(BM25+Bo1-XSqrA_M-PL2).WT,0.1511,0.3466,0.2686,0.0673,0.304,0.2156,False,False,False,False,False,False


# Test WT

In [15]:
_, topics, qrels = setup_system("WT", train=False)
results_path = "../results/test/"
runs, names = get_runs(results_path, "WT")

Loaded index with  1570734 documents.


In [16]:
results_WT = pt.Experiment(
    runs,
    topics,
    qrels,
    eval_metrics=["map", "bpref", "recip_rank", "P_20",  "ndcg", "ndcg_cut_20"],
    names=names,
    verbose=True,
    round=4,
    baseline=1,
    correction='bonferroni'
)
results_WT[["name", "map", "bpref", "recip_rank", "P_20", "ndcg", "ndcg_cut_20", "map reject", "bpref reject", "recip_rank reject", "P_20 reject", "ndcg reject", "ndcg_cut_20 reject"]]

pt.Experiment:   0%|          | 0/6 [00:00<?, ?system/s]

pt.Experiment: 100%|██████████| 6/6 [00:03<00:00,  1.82system/s]


Unnamed: 0,name,map,bpref,recip_rank,P_20,ndcg,ndcg_cut_20,map reject,bpref reject,recip_rank reject,P_20 reject,ndcg reject,ndcg_cut_20 reject
0,IRC_E5_base.WT,0.1629,0.327,0.2923,0.0673,0.2891,0.2214,False,False,False,False,False,False
1,IRC_BM25.WT,0.1328,0.2924,0.2479,0.0648,0.2697,0.1945,False,False,False,False,False,False
2,IRC_RRF(BM25+Bo1-XSqrA_M-PL2).WT,0.1355,0.3122,0.2487,0.0658,0.2842,0.1984,False,False,False,False,True,False
3,IRC_BM25+monoT5.WT,0.1642,0.3093,0.2934,0.0781,0.3034,0.2406,False,False,False,True,False,False
4,IRC_d2q+BM25.WT,0.1347,0.3109,0.2257,0.0684,0.2746,0.1963,False,False,False,False,False,False
5,IRC_BM25+colBERT.WT,0.1551,0.3246,0.2572,0.0704,0.2883,0.2189,False,False,False,False,False,False


# Test ST

In [17]:
_, topics, qrels = setup_system("ST", train=False)
results_path = "../results/test/"
runs, names = get_runs(results_path, "ST")

18:31:46.524 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 3,4 GiB of memory would be required.
Loaded index with  1593376 documents.


In [18]:
results_ST = pt.Experiment(
    runs,
    topics,
    qrels,
    eval_metrics=["map", "bpref", "recip_rank", "P_20",  "ndcg", "ndcg_cut_20"],
    names=names,
    verbose=True,
    round=4,
    baseline=1,
    correction='bonferroni'
)
results_ST[["name", "map", "bpref", "recip_rank", "P_20", "ndcg", "ndcg_cut_20", "map reject", "bpref reject", "recip_rank reject", "P_20 reject", "ndcg reject", "ndcg_cut_20 reject"]]

pt.Experiment:   0%|          | 0/6 [00:00<?, ?system/s]

pt.Experiment: 100%|██████████| 6/6 [00:22<00:00,  3.67s/system]


Unnamed: 0,name,map,bpref,recip_rank,P_20,ndcg,ndcg_cut_20,map reject,bpref reject,recip_rank reject,P_20 reject,ndcg reject,ndcg_cut_20 reject
0,IRC_d2q+BM25.ST,0.168,0.3353,0.2789,0.0705,0.3072,0.2316,True,False,True,True,True,True
1,IRC_BM25+monoT5.ST,0.19,0.3485,0.3271,0.0768,0.3256,0.2602,False,False,False,False,False,False
2,IRC_RRF(BM25+Bo1-XSqrA_M-PL2).ST,0.1516,0.3264,0.2557,0.0657,0.2939,0.2135,True,True,True,True,True,True
3,IRC_BM25.ST,0.151,0.3154,0.2567,0.0658,0.2871,0.2127,True,True,True,True,True,True
4,IRC_BM25+colBERT.ST,0.1769,0.3445,0.31,0.0705,0.3132,0.2419,True,False,False,True,True,True
5,IRC_E5_base.ST,0.1599,0.3519,0.2915,0.0652,0.297,0.2227,True,False,True,True,True,True


# Test LT

In [19]:
_, topics, qrels = setup_system("LT", train=False)
results_path = "../results/test/"
runs, names = get_runs(results_path, "LT")

18:32:23.937 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 2,3 GiB of memory would be required.
Loaded index with  1081334 documents.


In [20]:
results_LT = pt.Experiment(
    runs,
    topics,
    qrels,
    eval_metrics=["map", "bpref", "recip_rank", "P_20",  "ndcg", "ndcg_cut_20"],
    names=names,
    verbose=True,
    round=4,
    baseline=1,
    correction='bonferroni'
)
results_LT[["name", "map", "bpref", "recip_rank", "P_20", "ndcg", "ndcg_cut_20", "map reject", "bpref reject", "recip_rank reject", "P_20 reject", "ndcg reject", "ndcg_cut_20 reject"]]

pt.Experiment: 100%|██████████| 6/6 [00:23<00:00,  4.00s/system]


Unnamed: 0,name,map,bpref,recip_rank,P_20,ndcg,ndcg_cut_20,map reject,bpref reject,recip_rank reject,P_20 reject,ndcg reject,ndcg_cut_20 reject
0,IRC_BM25.LT,0.1534,0.3171,0.2645,0.0722,0.2989,0.2168,True,False,True,True,True,True
1,IRC_BM25+colBERT.LT,0.1736,0.3288,0.3188,0.0775,0.3209,0.2439,False,False,False,False,False,False
2,IRC_d2q+BM25.LT,0.1736,0.3337,0.2937,0.0793,0.3211,0.2397,False,False,False,False,False,False
3,IRC_BM25+monoT5.LT,0.1895,0.3429,0.3359,0.0856,0.3376,0.2662,True,False,False,True,True,True
4,IRC_RRF(BM25+Bo1-XSqrA_M-PL2).LT,0.1557,0.322,0.2716,0.0738,0.3068,0.2202,True,False,True,False,True,True
5,IRC_E5_base.LT,0.1661,0.3554,0.305,0.0726,0.3131,0.2325,False,True,False,True,False,False


# Delta

In [21]:
results_WT["name"] = results_WT["name"].str.replace(".WT", "")
results_WT["name"] = results_WT["name"].str.replace(".ST", "")
results_ST["name"] = results_ST["name"].str.replace(".ST", "")
results_LT["name"] = results_LT["name"].str.replace(".WT", "")
results_LT["name"] = results_LT["name"].str.replace(".LT", "")

  results_WT["name"] = results_WT["name"].str.replace(".WT", "")
  results_WT["name"] = results_WT["name"].str.replace(".ST", "")
  results_ST["name"] = results_ST["name"].str.replace(".ST", "")
  results_LT["name"] = results_LT["name"].str.replace(".WT", "")
  results_LT["name"] = results_LT["name"].str.replace(".LT", "")


In [23]:
rall = results_WT.merge(results_ST, left_on="name", right_on="name", suffixes=["_WT", "_ST"]).merge(results_LT, left_on="name", right_on="name", suffixes=["", "_LT"])

In [24]:
rall["ndcg_WT_ST"] = round(rall["ndcg_WT"] - rall["ndcg_ST"],4)
rall["ndcg_WT_LT"] = round(rall["ndcg_WT"] - rall["ndcg"],4)
rall["ndcg_ST_LT"] =round( rall["ndcg_ST"] - rall["ndcg"],4)

rall["P_20_WT_ST"] = round(rall["P_20_WT"] - rall["P_20_ST"],4)
rall["P_20_WT_LT"] = round(rall["P_20_WT"] - rall["P_20"],4)
rall["P_20_ST_LT"] = round(rall["P_20_ST"] - rall["P_20"],4)

rall["map_WT_ST"] = round(rall["map_WT"] - rall["map_ST"],4)
rall["map_WT_LT"] = round(rall["map_WT"] - rall["map"],4)
rall["map_ST_LT"] = round(rall["map_ST"] - rall["map"],4)

rall["ndcg_cut_20_WT_ST"] = round(rall["ndcg_cut_20_WT"] - rall["ndcg_cut_20_ST"],4)
rall["ndcg_cut_20_WT_LT"] = round(rall["ndcg_cut_20_WT"] - rall["ndcg_cut_20"],4)
rall["ndcg_cut_20_ST_LT"] = round(rall["ndcg_cut_20_ST"] - rall["ndcg_cut_20"],4)

rall["recip_rank_WT_ST"] = round(rall["recip_rank_WT"] - rall["recip_rank_ST"],4)
rall["recip_rank_WT_LT"] = round(rall["recip_rank_WT"] - rall["recip_rank"],4)
rall["recip_rank_ST_LT"] = round(rall["recip_rank_ST"] - rall["recip_rank"],4)
rall["bpref_WT_ST"] = round(rall["bpref_WT"] - rall["bpref_ST"],4)
rall["bpref_WT_LT"] = round(rall["bpref_WT"] - rall["bpref"],4)
rall["bpref_ST_LT"] = round(rall["bpref_ST"] - rall["bpref"],4)


In [25]:
rall

Unnamed: 0,name,map_WT,bpref_WT,recip_rank_WT,P_20_WT,ndcg_WT,ndcg_cut_20_WT,map +_WT,map -_WT,map p-value_WT,...,map_ST_LT,ndcg_cut_20_WT_ST,ndcg_cut_20_WT_LT,ndcg_cut_20_ST_LT,recip_rank_WT_ST,recip_rank_WT_LT,recip_rank_ST_LT,bpref_WT_ST,bpref_WT_LT,bpref_ST_LT
0,IRC_E5_base,0.1629,0.327,0.2923,0.0673,0.2891,0.2214,38.0,46.0,0.123987,...,-0.0062,-0.0013,-0.0111,-0.0098,0.0008,-0.0127,-0.0135,-0.0249,-0.0284,-0.0035
1,IRC_BM25,0.1328,0.2924,0.2479,0.0648,0.2697,0.1945,,,,...,-0.0024,-0.0182,-0.0223,-0.0041,-0.0088,-0.0166,-0.0078,-0.023,-0.0247,-0.0017
2,IRC_RRF(BM25+Bo1-XSqrA_M-PL2),0.1355,0.3122,0.2487,0.0658,0.2842,0.1984,47.0,38.0,0.542216,...,-0.0041,-0.0151,-0.0218,-0.0067,-0.007,-0.0229,-0.0159,-0.0142,-0.0098,0.0044
3,IRC_BM25+monoT5,0.1642,0.3093,0.2934,0.0781,0.3034,0.2406,49.0,33.0,0.028947,...,0.0005,-0.0196,-0.0256,-0.006,-0.0337,-0.0425,-0.0088,-0.0392,-0.0336,0.0056
4,IRC_d2q+BM25,0.1347,0.3109,0.2257,0.0684,0.2746,0.1963,46.0,38.0,0.758876,...,-0.0056,-0.0353,-0.0434,-0.0081,-0.0532,-0.068,-0.0148,-0.0244,-0.0228,0.0016
5,IRC_BM25+colBERT,0.1551,0.3246,0.2572,0.0704,0.2883,0.2189,49.0,34.0,0.201482,...,0.0033,-0.023,-0.025,-0.002,-0.0528,-0.0616,-0.0088,-0.0199,-0.0042,0.0157
