In [1]:
import pytrec_eval
import pandas as pd
import os
import numpy as np
import ir_datasets
import json

In [2]:
def load_runs_metadata_table():
    table = []
    for run in os.listdir(f"../data/run"):
        parts = run.split("-")
        fields = {
            "dataset": "-".join(parts[1:-4]),
            "subcollection": parts[-4],
            "queries": parts[-3],
            "method": parts[-2],
            "implementation": parts[-1],
            "filename": run,
        }
        table.append(fields)
    runs = pd.DataFrame(table)
    runs = runs[
        ~((runs["subcollection"] == "WT") & (runs["queries"] != "queries"))
    ]  # longeval WT test only
    runs = runs[
        runs["method"].isin(
            [
                "bm25",
                "bm25+colbert",
                "bm25+monot5",
                "rrf(xsqram__bm25_bo1__pl2)",
                "bm25_d2q10",
            ]
        )
    ]
    return runs

# Trec-Covid
### qrels

In [3]:
def load_ir_dataset_qrels(dataset_name):
    dataset = ir_datasets.load(dataset_name)
    dataset_name_clean = dataset_name.replace("/", "-").replace("cord19-", "")
    with open(f"../data/qrels/{dataset_name_clean}.qrels", "w") as file_out:
        for qrel in dataset.qrels_iter():
            file_out.write(f"{qrel.query_id} 0 {qrel.doc_id} {qrel.relevance}\n")

In [4]:
def limit_to_core_qrels(qrels_path):
    qrels = pd.read_csv(qrels_path, sep=" ", header=None, names=["qid", "0", "docid", "relevance"])
    qrels = qrels[qrels["qid"]<=30]
    return qrels

In [177]:
load_ir_dataset_qrels("cord19/trec-covid/round1")
load_ir_dataset_qrels("cord19/trec-covid/round2")
load_ir_dataset_qrels("cord19/trec-covid/round3")
load_ir_dataset_qrels("cord19/trec-covid/round4")
load_ir_dataset_qrels("cord19/trec-covid/round5")

In [156]:
limit_to_core_qrels("../data/qrels/cord19-trec-covid-round1.qrels").to_csv("../data/qrels/trec-covid-round1.qrels-core_queries", sep=" ", header=None, index=False)
limit_to_core_qrels("../data/qrels/cord19-trec-covid-round2.qrels").to_csv("../data/qrels/trec-covid-round2.qrels-core_queries", sep=" ", header=None, index=False)
limit_to_core_qrels("../data/qrels/cord19-trec-covid-round3.qrels").to_csv("../data/qrels/trec-covid-round3.qrels-core_queries", sep=" ", header=None, index=False)
limit_to_core_qrels("../data/qrels/cord19-trec-covid-round4.qrels").to_csv("../data/qrels/trec-covid-round4.qrels-core_queries", sep=" ", header=None, index=False)
limit_to_core_qrels("../data/qrels/cord19-trec-covid-round5.qrels").to_csv("../data/qrels/trec-covid-round5.qrels-core_queries", sep=" ", header=None, index=False)

### runs

In [5]:
runs_table = load_runs_metadata_table()

In [146]:
for _, row in runs_table[runs_table["dataset"] =="trec-covid"].iterrows():
    run = pd.read_csv("../data/run/"+row.filename, sep=" ", header=None, names=["qid", "Q0", "docid", "rank", "score", "method"])
    run = run[run["qid"]<=30]
    run.to_csv("../data/run-core_queries/"+row.filename, sep=" ", header=None, index=False)

# LongEval
### qrels

In [6]:
qrel_paths = {
    "longeval-WT": "../data/dataset/LongEval/longeval-relevance-judgements/heldout-test.txt",
    "longeval-WT-train": "../data/dataset/LongEval/publish/French/Qrels/train.txt",
    "longeval-ST": "../data/dataset/LongEval/longeval-relevance-judgements/a-short-july.txt",
    "longeval-LT": "../data/dataset/LongEval/longeval-relevance-judgements/b-long-september.txt",
}

In [174]:
# move normal qrels
pd.read_csv(qrel_paths["longeval-ST"], sep=" ", header=None, names=["qid", "0", "doc_id", "relevance"]).to_csv("../data/qrels/longeval-ST.qrels-test", sep=" ", header=None, index=False)
pd.read_csv(qrel_paths["longeval-LT"], sep=" ", header=None, names=["qid", "0", "doc_id", "relevance"]).to_csv("../data/qrels/longeval-LT.qrels-test", sep=" ", header=None, index=False)

# move train qrels
test = pd.read_csv(qrel_paths["longeval-WT"], sep=" ", header=None, names=["qid", "0", "doc_id", "relevance"])
train = pd.read_csv(qrel_paths["longeval-WT-train"], sep=" ", header=None, names=["qid", "0", "doc_id", "relevance"])
merged = pd.concat([test, train]).to_csv("../data/qrels/longeval-WT.qrels", sep=" ", header=None, index=False)

In [7]:
def filter_qrels(path, subcollection, core_docs=False):
    # load qrels
    qrels = pd.read_csv(path, sep=" ", header=None, names=["qid", "0", "docid", "relevance"])

    # prepare patches
    longeval_core_docs = pd.read_csv("../evaluation/core_docs.tsv", sep="\t", index_col=0)
    patch_doc = longeval_core_docs[f"docno_{subcollection}"].reset_index()[[f"docno_{subcollection}", "index"]].set_index(f"docno_{subcollection}")["index"].astype(str).to_dict()

    longeval_core_queries = pd.read_csv("../evaluation/core_queries.tsv", sep="\t", index_col=0)
    patch_queries = longeval_core_queries[f"qid_{subcollection}"].reset_index()[[f"qid_{subcollection}", "index"]].set_index(f"qid_{subcollection}")["index"].astype(str).to_dict()

    if core_docs:
        qrels["docid"] = qrels["docid"].apply(lambda x: patch_doc.get(x, None))
    else:
        qrels["docid"] = qrels["docid"].apply(lambda x: patch_doc.get(x, x))
    qrels["qid"] = qrels["qid"].apply(lambda x: patch_queries.get(x, None))

    qrels = qrels.dropna()

    return qrels

In [147]:
filter_qrels(qrel_paths["longeval-ST"], "ST").to_csv("../data/qrels/longeval-ST.qrels-test-core_queries", sep=" ", header=None, index=False)
filter_qrels(qrel_paths["longeval-LT"], "LT").to_csv("../data/qrels/longeval-LT.qrels-test-core_queries", sep=" ", header=None, index=False)

In [107]:
test = filter_qrels(qrel_paths["longeval-WT"], "WT")
train = filter_qrels(qrel_paths["longeval-WT-train"], "WT")
merged = pd.concat([test, train]).to_csv("../data/qrels/longeval-WT.qrels-core_queries", sep=" ", header=None, index=False)

### Runs

In [114]:
def filter_run(row, core_docs=False):
    run = pd.read_csv("../data/run/"+row.filename, sep=" ", header=None, names=["qid", "Q0", "docid", "rank", "score", "method"])
    
    # prepare patches
    longeval_core_docs = pd.read_csv("../evaluation/core_docs.tsv", sep="\t", index_col=0)
    patch_doc = longeval_core_docs[f"docno_{row.subcollection}"].reset_index()[[f"docno_{row.subcollection}", "index"]].set_index(f"docno_{row.subcollection}")["index"].astype(str).to_dict()

    longeval_core_queries = pd.read_csv("../evaluation/core_queries.tsv", sep="\t", index_col=0)
    patch_queries = longeval_core_queries[f"qid_{row.subcollection}"].reset_index()[[f"qid_{row.subcollection}", "index"]].set_index(f"qid_{row.subcollection}")["index"].astype(str).to_dict()

    if core_docs:
        run["docid"] = run["docid"].apply(lambda x: patch_doc.get(x, None))
    else:
        run["docid"] = run["docid"].apply(lambda x: patch_doc.get(x, x))
    run["qid"] = run["qid"].apply(lambda x: patch_queries.get(x, None))

    return run.dropna()

In [117]:
runs_table = load_runs_metadata_table()
for _, row in runs_table[runs_table["dataset"] =="longeval"].iterrows():
    run = filter_run(row)
    run.to_csv("../data/run-core_queries/"+row.filename, sep=" ", header=None, index=False)

### topics

In [13]:
longeval_core_queries = pd.read_csv("../evaluation/core_queries.tsv", sep="\t", index_col=0)
patch_queries = longeval_core_queries[f"qid_WT"].reset_index()[[f"qid_WT", "index"]].set_index(f"qid_WT")["index"].astype(str).to_dict()

In [10]:
import pyterrier as pt
if not pt.started():
    pt.init()

PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [11]:
topics_path = "../data/index/index-longeval-WT-pyterrier/queries/full.trec"
topics = pt.io.read_topics(topics_path)

In [15]:
topics["qid"] = topics["qid"].apply(lambda x: patch_queries.get(x, None))

In [19]:
topics.dropna().to_csv("longeval_topics_core_queries_unified.tsv", sep=" ", header=None, index=False)

In [21]:
topics = pt.io.read_topics("longeval_topics_core_queries_unified.tsv", format="singleline")

In [22]:
topics

Unnamed: 0,qid,query
0,147,solar hot water
1,154,the lord of the rings
2,108,loa car
3,125,one way car rental
4,16,meteorological service of canada
...,...,...
119,148,sophie marceau
120,149,sosh customer space
121,151,sylvie retailleau
122,105,land of france


### TripClick

In [158]:
with open("../scripts/tripclick-subcollections.json", "r") as file:
    tripclick_subcollections = json.load(file)

In [159]:
tripclick_subcollections = pd.DataFrame(tripclick_subcollections.items(), columns=["subcollection", "doc_id"])

In [160]:
tripklick_t1_docids = tripclick_subcollections[tripclick_subcollections["doc_id"].isin(["t1", np.nan])]["subcollection"].to_list()  # some subcollections are nan because of missing metadata
tripklick_t2_docids = tripclick_subcollections[tripclick_subcollections["doc_id"].isin(["t1", "t2", np.nan])]["subcollection"].to_list()

In [161]:
len(tripklick_t1_docids), len(tripklick_t2_docids)

(565452, 1084487)

In [162]:
# limit qrels to core queries
def limit_qrels_by_docs(qrels_file_path, doc_ids, dataset_name):
    df = pd.read_csv(qrels_file_path, sep=" ", header=None, names=["qid", "Q0", "docid", "relevance"])
    df = df[df["docid"].astype(str).isin(doc_ids)]
    df.to_csv(f"../data/qrels/{dataset_name}", sep=" ", header=None, index=False)

In [163]:
limit_qrels_by_docs("../data/dataset/TripClick/benchmark/qrels/qrels.dctr.head.test.txt", tripklick_t1_docids, "tripclick-test-head-t1.qrels-test-head-dctr")

In [164]:
limit_qrels_by_docs("../data/dataset/TripClick/benchmark/qrels/qrels.dctr.head.test.txt", tripklick_t2_docids, "tripclick-test-head-t2.qrels-test-head-dctr")

In [165]:
# move full qrels for last subcollection
!cp ../data/dataset/TripClick/benchmark/qrels/qrels.dctr.head.test.txt ../data/qrels/tripclick-test-head-t3.qrels-test-head-dctr

In [166]:
# Move normal qrels
!cp ../data/qrels/tripclick-test-head-t3.qrels-test-head-dctr ../data/qrels/tripclick-test-head-t3.qrels-test-head-dctr-core_queries

In [167]:
!cp ../data/qrels/tripclick-test-head-t2.qrels-test-head-dctr ../data/qrels/tripclick-test-head-t2.qrels-test-head-dctr-core_queries

In [168]:
!cp ../data/qrels/tripclick-test-head-t1.qrels-test-head-dctr ../data/qrels/tripclick-test-head-t1.qrels-test-head-dctr-core_queries

#### runs
the runs can be moved, all qrels are used at any time

In [173]:
for _, row in runs_table[runs_table["dataset"] =="tripclick-test-head"].iterrows():
    run = pd.read_csv("../data/run/"+row.filename, sep=" ", header=None, names=["qid", "Q0", "docid", "rank", "score", "method"])
    run.to_csv("../data/run-core_queries/"+row.filename, sep=" ", header=None, index=False)

# Merge LongEval WT Runs

In [4]:
table = []
for run in os.listdir("../data/run"):
    parts = run.split("-")
    fields = {"dataset":  "-".join(parts[1:-4]),
    "subcollection":  parts[-4],
    "queries": parts[-3],
    "method": parts[-2],
    "implementation": parts[-1],
    "filename": run}
    table.append(fields)

In [5]:
df = pd.DataFrame(table)

In [36]:
for method in df[df["dataset"]=="longeval"][df["subcollection"]=="WT"]["method"].unique():
    runs = df[df["dataset"]=="longeval"][df["subcollection"]=="WT"][df["method"]==method]["filename"].to_list()
    implementation = runs[0].split("-")[-1]
    full_run = pd.concat([pd.read_csv(base_path+runs[0], sep=" ", header=None, names=["qid", "Q0", "docid", "rank", "score", "method"]), pd.read_csv(base_path+runs[1], sep=" ", header=None, names=["qid", "Q0", "docid", "rank", "score", "method"])])
    full_run.to_csv(f"../data/run/run-longeval-WT-queries-{method}-{implementation}", sep=" ", header=None, index=False)

  for method in df[df["dataset"]=="longeval"][df["subcollection"]=="WT"]["method"].unique():
  runs = df[df["dataset"]=="longeval"][df["subcollection"]=="WT"][df["method"]==method]["filename"].to_list()
  runs = df[df["dataset"]=="longeval"][df["subcollection"]=="WT"][df["method"]==method]["filename"].to_list()
  runs = df[df["dataset"]=="longeval"][df["subcollection"]=="WT"][df["method"]==method]["filename"].to_list()
  runs = df[df["dataset"]=="longeval"][df["subcollection"]=="WT"][df["method"]==method]["filename"].to_list()
  runs = df[df["dataset"]=="longeval"][df["subcollection"]=="WT"][df["method"]==method]["filename"].to_list()
  runs = df[df["dataset"]=="longeval"][df["subcollection"]=="WT"][df["method"]==method]["filename"].to_list()
  runs = df[df["dataset"]=="longeval"][df["subcollection"]=="WT"][df["method"]==method]["filename"].to_list()
  runs = df[df["dataset"]=="longeval"][df["subcollection"]=="WT"][df["method"]==method]["filename"].to_list()
