In [1]:
import pandas as pd
import json
import os

# LongEval
- filter runs for core queries
- filter runs for core docs
- filter qrels
- harmonize ids across the WT, ST and LT sub-collections

In [15]:
def filter_qrels(subcollection, mode=""):
    """Restrict a LongEval qrels file to the core queries and rename their qids.

    Parameters
    ----------
    subcollection : str
        Sub-collection key (``"WT"``, ``"ST"`` or ``"LT"``). It selects the
        ``qid_<subcollection>`` column of ``../evaluation/core_queries.tsv``.
    mode : str, optional
        Suffix appended to ``subcollection`` to pick the qrels file, e.g.
        ``"-train"`` for the WT training qrels. Defaults to ``""``.

    Returns
    -------
    pandas.DataFrame
        Columns ``qid``, ``Q0``, ``docid``, ``relevance``. ``qid`` is replaced
        by the row label of the matching core query (as ``str``); rows whose
        qid is not listed as a core query are dropped.

    Raises
    ------
    KeyError
        If ``subcollection + mode`` has no qrels path registered, or the core
        query table has no ``qid_<subcollection>`` column.
    """
    qrel_paths = {
        "WT": "../data/dataset/LongEval/longeval-relevance-judgements/heldout-test.txt",
        "WT-train": "../data/dataset/LongEval/publish/French/Qrels/train.txt",
        "ST": "../data/dataset/LongEval/longeval-relevance-judgements/a-short-july.txt",
        "LT": "../data/dataset/LongEval/longeval-relevance-judgements/b-long-september.txt",
    }

    # load core queries
    longeval_core_queries = pd.read_csv("../evaluation/core_queries.tsv", sep="\t", index_col=0)
    # load qrels
    qrels = pd.read_csv(qrel_paths[subcollection + mode], sep=" ", header=None, names=["qid", "Q0", "docid", "relevance"])

    # Map each sub-collection specific qid to the row label of its core query.
    # Pairing the column with the index directly works whether or not the
    # index column of the TSV carries a header name (reset_index() only
    # produces a column called "index" when the index is unnamed).
    patch = dict(
        zip(
            longeval_core_queries[f"qid_{subcollection}"],
            longeval_core_queries.index.astype(str),
        )
    )

    # rename qrels: qids without a core query become missing values
    qrels["qid"] = qrels["qid"].map(patch.get)

    # filter: drops the non-core rows (and, as before, any row that has a
    # missing value in another column)
    return qrels.dropna()

In [16]:
# WT core-query qrels are spread over two files (held-out test and train);
# both are filtered and written out as one combined qrels file.
wt_train = filter_qrels("WT", mode="-train")
wt = pd.concat([filter_qrels("WT"), wt_train])
wt.to_csv(
    "../data/qrels/longeval-WT.qrels-test-core_queries",
    sep=" ",
    index=False,
    header=None,
)

In [17]:
# ST qrels restricted to the core queries, written space-separated without
# header or index.
st = filter_qrels(subcollection="ST")
st.to_csv(
    "../data/qrels/longeval-ST.qrels-test-core_queries",
    sep=" ",
    index=False,
    header=None,
)

In [18]:
# LT qrels restricted to the core queries. Stored under its own name so the
# ST frame from the previous cell is not silently overwritten.
lt = filter_qrels("LT")
lt.to_csv("../data/qrels/longeval-LT.qrels-test-core_queries", sep=" ", header=None, index=False)

### Filter Runs

In [8]:
def load_runs_metadata_table():
    """Build a metadata table from the file names in ``../data/run``.

    Each file name is split on ``"-"``. The last four fields are read as
    subcollection, queries, method and implementation; the fields between the
    first one and those four are re-joined with ``"-"`` to form the dataset
    name. The first field is ignored.

    Entries with fewer than four ``"-"``-separated fields (for example
    ``.DS_Store``) cannot carry that metadata and are skipped.

    Returns
    -------
    pandas.DataFrame
        Columns ``dataset``, ``subcollection``, ``queries``, ``method``,
        ``implementation``, ``filename``, in file-name order. Rows with
        subcollection ``"WT"`` whose queries field is not ``"queries"`` are
        removed, and only the methods listed below are kept. The frame is
        empty, but still has these columns, when nothing matches.
    """
    columns = ["dataset", "subcollection", "queries", "method", "implementation", "filename"]
    table = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the table
    # reproducible between runs and machines.
    for run in sorted(os.listdir("../data/run")):
        parts = run.split("-")
        if len(parts) < 4:
            # not a run file following the naming scheme
            continue
        fields = {
            "dataset": "-".join(parts[1:-4]),
            "subcollection": parts[-4],
            "queries": parts[-3],
            "method": parts[-2],
            "implementation": parts[-1],
            "filename": run,
        }
        table.append(fields)
    # Explicit columns keep the filters below valid for an empty directory.
    runs = pd.DataFrame(table, columns=columns)
    runs = runs[
        ~((runs["subcollection"] == "WT") & (runs["queries"] != "queries"))
    ]  # longeval WT test only
    runs = runs[
        runs["method"].isin(
            [
                "bm25",
                "bm25+colbert",
                "bm25+monot5",
                "rrf(xsqram__bm25_bo1__pl2)",
                "bm25_d2q10",
            ]
        )
    ]
    return runs

In [20]:
# Load the core-document table in this cell so the display does not depend on
# a variable left over from an earlier kernel session.
longeval_core_docs = pd.read_csv("../evaluation/core_docs.tsv", sep="\t", index_col=0)
longeval_core_docs

Unnamed: 0,docno_WT,docno_ST,docno_LT
0,doc062210000001,doc072204608334,doc092205609019
1,doc062210000002,doc072204608335,doc092205609020
2,doc062210000003,doc072204608336,doc092205609021
5,doc062210000006,doc072204608339,doc092205609023
7,doc062210000008,doc072204608341,doc092205609024
...,...,...,...
1570723,doc062200909296,doc072211909464,doc092208702718
1570724,doc062200909297,doc072211909465,doc092208702719
1570725,doc062200909298,doc072211909466,doc092208702720
1570729,doc062200909302,doc072211909470,doc092208702721


In [9]:
def filter_run(row, queries=True):
    """Load one run file and, optionally, restrict it to the core queries.

    NOTE: a later cell defines another function with the same name
    ``filter_run`` (core-doc renaming); once that cell has run, this
    definition is shadowed.

    Parameters
    ----------
    row : object
        Run metadata with the attributes ``filename`` (file inside
        ``../data/run/``) and ``subcollection`` (selects the
        ``qid_<subcollection>`` column of the core-query table).
    queries : bool, optional
        When true (default), qids are replaced by the row label of the
        matching core query (as ``str``) and rows of non-core queries are
        dropped. When false, the qids are left untouched.

    Returns
    -------
    pandas.DataFrame
        Columns ``qid``, ``Q0``, ``docid``, ``rank``, ``score``, ``method``,
        without any row that contains a missing value.
    """
    run = pd.read_csv("../data/run/" + row.filename, sep=" ", header=None, names=["qid", "Q0", "docid", "rank", "score", "method"])

    if queries:
        longeval_core_queries = pd.read_csv("../evaluation/core_queries.tsv", sep="\t", index_col=0)

        # Pair the sub-collection qids with the core-query row labels
        # directly; this does not rely on reset_index() naming the former
        # index column "index", so a named index column works too.
        patch = dict(
            zip(
                longeval_core_queries[f"qid_{row.subcollection}"],
                longeval_core_queries.index.astype(str),
            )
        )

        # qids without a core query become missing and are dropped below
        run["qid"] = run["qid"].map(patch.get)
    return run.dropna()

In [10]:
# Build the run metadata table from the file names in ../data/run; the cells
# below select runs from it by dataset.
runs_table = load_runs_metadata_table()

In [11]:
# Write a core-query-only copy of every LongEval run under the same file name.
longeval_runs = runs_table.loc[runs_table["dataset"].eq("longeval")]
for _, row in longeval_runs.iterrows():
    run = filter_run(row)
    run.to_csv(
        f"../data/run-core_queries/{row.filename}",
        sep=" ",
        index=False,
        header=None,
    )

In [27]:
# docno -> core-doc-id mappings, keyed by subcollection. core_docs.tsv is
# large, so it is parsed once per subcollection instead of once per run file.
# Re-running this cell resets the cache.
_CORE_DOC_PATCHES = {}


def _load_core_doc_patch(subcollection):
    """Return the (cached) ``docno -> core doc id`` dict for ``subcollection``."""
    if subcollection not in _CORE_DOC_PATCHES:
        longeval_core_docs = pd.read_csv("../evaluation/core_docs.tsv", sep="\t", index_col=0)
        # Pair the docnos with the row labels directly; unlike a
        # reset_index() round-trip this also works for a named index column.
        _CORE_DOC_PATCHES[subcollection] = dict(
            zip(
                longeval_core_docs[f"docno_{subcollection}"],
                longeval_core_docs.index.astype(str),
            )
        )
    return _CORE_DOC_PATCHES[subcollection]


def filter_run(row):
    """Rename the core documents of a core-query run to their shared ids.

    NOTE: this deliberately reuses the name of the earlier ``filter_run``
    (core-query filtering) and replaces it from this cell on.

    Parameters
    ----------
    row : object
        Run metadata with the attributes ``filename`` (file inside
        ``../data/run-core_queries/``) and ``subcollection`` (selects the
        ``docno_<subcollection>`` column of ``../evaluation/core_docs.tsv``).

    Returns
    -------
    pandas.DataFrame
        Columns ``qid``, ``Q0``, ``docid``, ``rank``, ``score``, ``method``.
        A ``docid`` listed in the core-doc table is replaced by that table's
        row label (as ``str``); every other ``docid`` is kept unchanged. No
        rows are removed.
    """
    run = pd.read_csv("../data/run-core_queries/" + row.filename, sep=" ", header=None, names=["qid", "Q0", "docid", "rank", "score", "method"])

    patch = _load_core_doc_patch(row.subcollection)

    # fall back to the original docno for documents outside the core set
    run["docid"] = run["docid"].map(lambda docno: patch.get(docno, docno))
    return run

In [28]:
# Write every LongEval run again with core documents renamed, keeping the
# file name but targeting the run-core_docs directory.
longeval_runs = runs_table.loc[runs_table["dataset"].eq("longeval")]
for _, row in longeval_runs.iterrows():
    run = filter_run(row)
    run.to_csv(
        f"../data/run-core_docs/{row.filename}",
        sep=" ",
        index=False,
        header=None,
    )

# TREC-COVID

In [12]:
# Rebuild the run metadata table from ../data/run before selecting the runs
# of the remaining datasets.
runs_table = load_runs_metadata_table()

In [13]:
# Copy each trec-covid run to run-core_queries, keeping only rows whose qid
# is at most 30.
trec_covid_runs = runs_table.loc[runs_table["dataset"].eq("trec-covid")]
for _, row in trec_covid_runs.iterrows():
    run = pd.read_csv(
        f"../data/run/{row.filename}",
        sep=" ",
        header=None,
        names=["qid", "Q0", "docid", "rank", "score", "method"],
    )
    run = run.loc[run["qid"].le(30)]
    run.to_csv(
        f"../data/run-core_queries/{row.filename}",
        sep=" ",
        index=False,
        header=None,
    )

# TripClick

In [14]:
# tripclick-test-head runs are not filtered: each one is read and written
# as-is into run-core_queries.
tripclick_runs = runs_table.loc[runs_table["dataset"].eq("tripclick-test-head")]
for _, row in tripclick_runs.iterrows():
    run = pd.read_csv(
        f"../data/run/{row.filename}",
        sep=" ",
        header=None,
        names=["qid", "Q0", "docid", "rank", "score", "method"],
    )
    run.to_csv(
        f"../data/run-core_queries/{row.filename}",
        sep=" ",
        index=False,
        header=None,
    )