In [1]:
from src.exp_logger import logger

from src.load_index import setup_system, tag
import yaml
import os
import pandas as pd
import pyterrier as pt  # type: ignore

# Load the project settings once; a later cell reads `config["results_path"]`.
# NOTE(review): FullLoader accepts more than plain scalars/lists/mappings —
# consider yaml.safe_load if settings.yml needs nothing beyond those.
with open("../settings.yml") as settings_file:
    config = yaml.load(settings_file, Loader=yaml.FullLoader)

PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [15]:
# Trim every listed training run to its best-ranked documents per topic and
# write the shortened copy into the current working directory under the same
# file name.
MAX_DOCS_PER_TOPIC = 1000

runs = [
    "IRC_RRF(BM25-XSqrA_M-PL2).WT",
]
for run in runs:
    # The run file has no header row, so columns are addressed by position.
    # Column 0 is used as the per-topic grouping key and column 3 as the sort
    # key (they look like topic id and rank in the frame displayed further
    # down — confirm against the run format).
    r = pd.read_csv(f"../results/train/{run}", sep=" ", header=None)

    # Sort once by (topic, rank) and keep the first rows of every topic. This
    # yields the same topic-then-rank row order as a per-group apply, without
    # calling a Python lambda per topic and without relying on
    # groupby.apply operating on the grouping column.
    r_short = (
        r.sort_values([0, 3], ascending=True)
        .groupby(0, sort=False)
        .head(MAX_DOCS_PER_TOPIC)
        .reset_index(drop=True)
    )

    # header=False is the documented way to suppress the header row.
    r_short.to_csv(run, sep=" ", header=False, index=False)

In [16]:
# Evaluate the full training run next to a second copy of it read from ../src
# (presumably the truncated file written by the trimming cell — confirm).
index, topics, qrels = setup_system(index_name="WT", train=True)

original = pt.io.read_results("../results/train/IRC_RRF(BM25-XSqrA_M-PL2).WT")
short = pt.io.read_results("../src/IRC_RRF(BM25-XSqrA_M-PL2).WT")

# Keep names and result frames paired so they cannot drift apart.
systems = {"original": original, "short": short}
metrics = ["ndcg", "P_20", "map", "ndcg_cut_20", "recip_rank", "bpref"]

results = pt.Experiment(
    list(systems.values()),
    topics,
    qrels,
    eval_metrics=metrics,
    names=list(systems.keys()),
    filter_by_qrels=True,
    round=4,
)

Loaded index with  1570734 documents.


In [17]:
# Show the metric table returned by pt.Experiment for the compared runs.
results

Unnamed: 0,name,ndcg,P_20,map,ndcg_cut_20,recip_rank,bpref
0,original,0.2967,0.0656,0.1462,0.2101,0.2646,0.338
1,short,0.2929,0.0656,0.1461,0.2101,0.2645,0.3339


In [8]:
# Inspect the run frame currently bound to `r`.
# NOTE(review): the saved output below carries an `.LT` run tag and an older
# execution count, while the cell that assigns `r` reads a `.WT` training run —
# the output looks stale; re-run on a fresh kernel to confirm.
r

Unnamed: 0,0,1,2,3,4,5
0,q092210070,Q0,doc092206607761,1,0.049180,IRC_RRF(BM25+Bo1-XSqrA_M-PL2).LT
1,q092210070,Q0,doc092203705546,2,0.048131,IRC_RRF(BM25+Bo1-XSqrA_M-PL2).LT
2,q092210070,Q0,doc092204900140,3,0.047875,IRC_RRF(BM25+Bo1-XSqrA_M-PL2).LT
3,q092210070,Q0,doc092200801916,4,0.046635,IRC_RRF(BM25+Bo1-XSqrA_M-PL2).LT
4,q092210070,Q0,doc092207701870,5,0.045228,IRC_RRF(BM25+Bo1-XSqrA_M-PL2).LT
...,...,...,...,...,...,...
1316113,q0922999,Q0,doc092209504483,1058,0.000948,IRC_RRF(BM25+Bo1-XSqrA_M-PL2).LT
1316114,q0922999,Q0,doc092204407058,1059,0.000947,IRC_RRF(BM25+Bo1-XSqrA_M-PL2).LT
1316115,q0922999,Q0,doc092200708356,1060,0.000946,IRC_RRF(BM25+Bo1-XSqrA_M-PL2).LT
1316116,q0922999,Q0,doc092201108044,1061,0.000945,IRC_RRF(BM25+Bo1-XSqrA_M-PL2).LT


In [2]:
# Build the results directory from the configured path (relative to the parent
# folder), then swap every "trec" substring in it for "submission".
_configured_results_dir = os.path.join("..", config["results_path"])
results_path = _configured_results_dir.replace("trec", "submission")

In [19]:
# List the files found under `results_path` (shell escape; the Python variable is interpolated).
!ls {results_path}

 IRC_BM25+colBERT.LT	 IRC_d2q+BM25.WT
 IRC_BM25+colBERT.meta	 IRC_d2q+BM25.zip
 IRC_BM25+colBERT.ST	 IRC_E5_base.LT
 IRC_BM25+colBERT.WT	 IRC_E5_base.meta
 IRC_BM25+colBERT.zip	 IRC_E5_base.ST
 IRC_BM25+monoT5.LT	 IRC_E5_base.WT
 IRC_BM25+monoT5.meta	 IRC_E5_base.zip
 IRC_BM25+monoT5.ST	'IRC_RRF(BM25+Bo1-XSqrA_M-PL2).LT'
 IRC_BM25+monoT5.WT	'IRC_RRF(BM25+Bo1-XSqrA_M-PL2).meta'
 IRC_BM25+monoT5.zip	'IRC_RRF(BM25+Bo1-XSqrA_M-PL2).ST'
 IRC_d2q+BM25.LT	'IRC_RRF(BM25+Bo1-XSqrA_M-PL2).WT'
 IRC_d2q+BM25.meta	'IRC_RRF(BM25+Bo1-XSqrA_M-PL2).zip'
 IRC_d2q+BM25.ST


In [3]:
# Run files to pass through `check_run`. Only the three RRF fusion runs are
# active. The remaining run files are listed here so they can be re-enabled by
# appending them to `runs`:
#   'IRC_BM25+colBERT.WT', 'IRC_BM25+colBERT.ST', 'IRC_BM25+colBERT.LT',
#   'IRC_BM25+monoT5.WT',  'IRC_BM25+monoT5.ST',  'IRC_BM25+monoT5.LT',
#   'IRC_d2q+BM25.WT',     'IRC_d2q+BM25.ST',     'IRC_d2q+BM25.LT',
#   'IRC_E5_base.WT',      'IRC_E5_base.ST',      'IRC_E5_base.LT',
_RRF_RUN_STEM = "IRC_RRF(BM25+Bo1-XSqrA_M-PL2)"
runs = [f"{_RRF_RUN_STEM}.{collection}" for collection in ("WT", "ST", "LT")]

In [4]:
def _query_text(topics, qid):
    """Return the query string of topic ``qid``, or a placeholder if ``topics`` has no such topic."""
    matches = topics.loc[topics["qid"] == qid, "query"]
    return matches.iloc[0] if len(matches) else "<topic not in collection>"


def check_run(run_tag, max_docs=1000):
    """Sanity-check one run file and print a report.

    Checks, in order:
    - every topic in the run exists in the collection's topics
    - every collection topic appears in the run (missing ones are searched
      with BM25 to show how many documents a plain retrieval finds)
    - topics with fewer than ``max_docs`` documents
    - topics with more than ``max_docs`` documents
    - every doc id in the run exists in the index
    - for non-WT runs: evaluation against ``../data/qrels_<collection>.txt``
      (the projected qrels)

    Args:
        run_tag (str): File name of the run inside ``results_path``. Its last
            two characters select the collection passed to ``setup_system``.
        max_docs (int): Expected number of documents per topic. Defaults to 1000.

    Returns:
        None. All findings are printed.
    """
    collection = run_tag[-2:]
    run_file = os.path.join(results_path, run_tag)

    # load data
    index, topics, qrels = setup_system(index_name=collection, train=False)
    collection_topics = set(topics["qid"].unique())

    # Read ids as strings so they compare correctly against the topic ids and
    # the index docnos even if an id happens to look numeric.
    run = pd.read_csv(
        run_file,
        sep=" ",
        header=None,
        names=["topic", "Q0", "docid", "rank", "score", "tag"],
        dtype={"topic": str, "docid": str},
    )
    run_topics = set(run["topic"].unique())

    print(f"----------- Check run `{run_tag}` and collection `{collection}` -----------")

    # check if all topics are in the collection
    not_in_collection = run_topics - collection_topics  # topics in run but not in collection
    if not_in_collection:
        print(f"- Found {len(not_in_collection)} topics in run but not in collection: {not_in_collection}")
    else:
        print("All topics in run are in collection")
    print("\n")

    # check if all topics are in the run
    not_in_run = collection_topics - run_topics  # topics in collection but not in run
    if not_in_run:
        print(f"Found {len(not_in_run)} topics in collection but not in run: {not_in_run}")
        # The retriever is only needed to diagnose missing topics, so build it here.
        BM25 = pt.BatchRetrieve(index, wmodel="BM25")
        for topic in not_in_run:
            query = _query_text(topics, topic)
            hits = BM25.search(query)

            print(f"- Topic `{topic}`: `{query}`. BM25 found {len(hits)} docs")
    else:
        print("All topics in collection are in run")
    print("\n")

    # Count documents per topic once; both size checks below reuse it.
    docs_per_topic = run.groupby("topic")["docid"].count()

    # check if all topics have max_docs docs
    under_limit = docs_per_topic[docs_per_topic < max_docs]
    if len(under_limit):
        print(f"Found {len(under_limit)} topics with less than {max_docs} docs")
        for topic, n_docs in under_limit.items():
            # A run topic may be absent from the collection (reported above);
            # _query_text then yields a placeholder instead of raising.
            print(f"Topic `{topic}`: `{_query_text(topics, topic)}` has only {n_docs} docs")
    else:
        print(f"All topics have {max_docs} docs")
    print("\n")

    # check if any topic has more than max_docs docs
    over_limit = docs_per_topic[docs_per_topic > max_docs]
    if len(over_limit):
        print(f"Found {len(over_limit)} topics with more than {max_docs} docs")
        for topic, n_docs in over_limit.items():
            print(f"Topic `{topic}`: `{_query_text(topics, topic)}` has {n_docs} docs")
    else:
        print(f"No topics have more than {max_docs} docs")
    print("\n")

    # check if all doc ids are valid
    # Walk the meta index over the known number of documents instead of probing
    # until an exception: a bare `except` would also hide real failures and
    # swallow KeyboardInterrupt during this long loop.
    meta = index.getMetaIndex()
    num_docs = index.getCollectionStatistics().getNumberOfDocuments()
    docs_in_index = {meta.getItem("docno", docid) for docid in range(num_docs)}
    docs_in_run = set(run["docid"].unique())
    not_in_index = docs_in_run - docs_in_index
    if not_in_index:
        print(f"Found {len(not_in_index)} docs in run but not in index")
        for doc in not_in_index:
            print(f"- Doc `{doc}`")
    else:
        print("All docs in run are in index")
    print("\n")

    # evaluate on projected qrels (skipped for the WT collection)
    if collection != "WT":
        qrels = pt.io.read_qrels(f"../data/qrels_{collection}.txt")
        run_results = pt.io.read_results(run_file)
        num_topics = len(qrels["qid"].unique())
        print(f"Evaluate on {num_topics} projected topics")

        results = pt.Experiment(
            [run_results],
            topics,
            qrels,
            eval_metrics=["ndcg", "P_20", "map", "ndcg_cut_20", "recip_rank", "bpref"],
            names=[run_tag],
            filter_by_qrels=True,
            round=4,
        )
        print(results)

In [5]:
# Validate every selected run; blank lines separate the individual reports.
for run in runs:
    check_run(run)
    print("\n" * 2)

Loaded index with  1570734 documents.
----------- Check run `IRC_RRF(BM25+Bo1-XSqrA_M-PL2).WT` and collection `WT` -----------
All topics in run are in collection


All topics in collection are in run


All topics have 1000 docs


No topics have more than 1000 docs


All docs in run are in index





13:30:31.311 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 3,4 GiB of memory would be required.
Loaded index with  1593376 documents.
----------- Check run `IRC_RRF(BM25+Bo1-XSqrA_M-PL2).ST` and collection `ST` -----------
All topics in run are in collection


Found 4 topics in collection but not in run: {'q072212314', 'q072214697', 'q072224942', 'q072222604'}
- Topic `q072212314`: `the`. BM25 found 0 docs
- Topic `q072214697`: `to`. BM25 found 0 docs
- Topic `q072224942`: `the`. BM25 found 0 docs
- Topic `q072222604`: `a`. BM25 found 0 docs


All 

# Content changes, relevance stays

In [375]:
# Load the WT training inputs. None of the files has a header row, so column
# names are supplied here.
# Note: the first qrels column is named 'query', yet the following cell joins
# it against the topics' 'qid' column.
qrels_WT = pd.read_csv(
    '../data/publish/French/Qrels/train.txt',
    sep=' ',
    names=['query', '0', 'docid', 'label'],
)
topics_WT = pd.read_csv(
    '../data/publish/English/Queries/train.tsv',
    sep='\t',
    names=['qid', 'query'],
)
urls_WT = pd.read_csv(
    "../data/publish/French/urls.txt",
    sep="\t",
    names=["docno", "url"],
)

In [376]:
# Attach the query text to every WT judgement. Both frames own a 'query'
# column, so the merge suffixes them: 'query_x' (the qrels key) is dropped and
# 'query_y' (the text from the topics) is kept under the name 'query'.
qrels = (
    qrels_WT
    .merge(topics_WT, how="left", left_on="query", right_on="qid")
    .loc[:, ["qid", "0", "docid", "label", "query_y"]]
    .rename(columns={"qid": "qid_WT", "query_y": "query"})
)

In [377]:
# Look up the url of every judged WT document; the helper 'docno' column from
# the url table is not carried over.
_qrels_with_urls = qrels.merge(urls_WT, how="left", left_on="docid", right_on="docno")
qrels = _qrels_with_urls[["qid_WT", "0", "docid", "label", "query", "url"]]

In [378]:
# Topics of the ST test collection (headerless TSV: topic id, query text);
# used to add ST topic ids to the judgements.
topics_ST = pd.read_csv(
    '../data/test-collection/A-Short-July/English/Queries/test07.tsv',
    sep='\t',
    names=['qid', 'query'],
)

In [379]:
# Topics of the LT test collection (headerless TSV: topic id, query text).
topics_LT = pd.read_csv(
    '../data/test-collection/B-Long-September/English/Queries/test09.tsv',
    sep='\t',
    names=['qid', 'query'],
)


In [380]:
# Add the ST topic id of every judgement whose query text also occurs in the
# ST topics; rows without a textual match get NaN in `qid_ST` (left join).
# NOTE(review): a query text that occurs more than once in topics_ST repeats
# the matching judgement rows once per ST topic id.
qrels = (
    qrels
    .merge(topics_ST, how="left", on="query")
    .rename(columns={"qid": "qid_ST"})
)

In [381]:
# Add the LT topic id of every judgement whose query text also occurs in the
# LT topics; rows without a textual match get NaN in `qid_LT` (left join).
# NOTE(review): a query text that occurs more than once in topics_LT repeats
# the matching judgement rows once per LT topic id.
qrels = (
    qrels
    .merge(topics_LT, how="left", on="query")
    .rename(columns={"qid": "qid_LT"})
)

In [382]:
# docno/url tables of the ST and LT document collections (headerless TSV).
urls_ST = pd.read_csv(
    "../data/test-collection/A-Short-July/French/Documents/urls.txt",
    sep="\t",
    names=["docno", "url"],
)
urls_LT = pd.read_csv(
    "../data/test-collection/B-Long-September/French/Documents/urls.txt",
    sep="\t",
    names=["docno", "url"],
)

In [383]:
# Map every judged url to its ST docno; urls that are absent from the ST url
# table get NaN in `docid_ST` (left join).
qrels = (
    qrels
    .merge(urls_ST, how="left", on="url")
    .rename(columns={"docno": "docid_ST"})
)

In [384]:
# Map every judged url to its LT docno; urls that are absent from the LT url
# table get NaN in `docid_LT` (left join).
qrels = (
    qrels
    .merge(urls_LT, how="left", on="url")
    .rename(columns={"docno": "docid_LT"})
)


In [274]:
# Write the ST qrels: keep only judgements that have both an ST topic id and
# an ST docno. The chained left merges can repeat a judgement row when a query
# text or url matches more than once, so identical lines are dropped before
# writing.
(
    qrels[["qid_ST", "0", "docid_ST", "label"]]
    .dropna()
    .drop_duplicates()
    .to_csv("../data/qrels_ST.txt", sep=" ", header=False, index=False)
)

In [275]:
# Write the LT qrels: keep only judgements that have both an LT topic id and
# an LT docno. The chained left merges can repeat a judgement row when a query
# text or url matches more than once, so identical lines are dropped before
# writing.
(
    qrels[["qid_LT", "0", "docid_LT", "label"]]
    .dropna()
    .drop_duplicates()
    .to_csv("../data/qrels_LT.txt", sep=" ", header=False, index=False)
)

In [None]:
# NOTE(review): this unexecuted cell repeats the LT export of the cell before
# it and writes the same file; one of the two can probably be removed.
# Identical lines (possible after the chained left merges) are dropped before
# writing.
(
    qrels[["qid_LT", "0", "docid_LT", "label"]]
    .dropna()
    .drop_duplicates()
    .to_csv("../data/qrels_LT.txt", sep=" ", header=False, index=False)
)


In [386]:
# Write the WT qrels from the merged frame. The ST/LT topic and url merges are
# left joins that can repeat a WT judgement once per extra match, so identical
# lines are dropped to keep one line per original judgement.
(
    qrels[["qid_WT", "0", "docid", "label"]]
    .dropna()
    .drop_duplicates()
    .to_csv("../data/qrels_WT.txt", sep=" ", header=False, index=False)
)
