In [1]:
from src.exp_logger import logger

import pyterrier as pt  # type: ignore
import json
from src.load_index import setup_system
from tqdm import tqdm


# Set the experiment logger's threshold to INFO for the setup cells.
logger.setLevel("INFO")

PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [2]:
# Load the index plus the topics and relevance judgements used by the cells below.
# NOTE(review): "WT" is presumably a collection identifier understood by
# setup_system — confirm against src.load_index.
index, topics, qrels = setup_system("WT")

Loaded index with  1570734 documents.


In [3]:
# Build docid <-> docno lookup tables from the index's meta index.
meta = index.getMetaIndex()

# Iterate over the known number of documents instead of probing ids until a
# lookup fails: terminating on a bare ``except`` also swallows
# KeyboardInterrupt and turns any genuine lookup error into a silently
# truncated mapping.
num_docs = index.getCollectionStatistics().getNumberOfDocuments()
id_no = {docid: meta.getItem("docno", docid) for docid in range(num_docs)}
print("Done")

# Reverse mapping: docno -> internal docid.
no_id = {docno: docid for docid, docno in id_no.items()}


def make_passages(doc, max_words=65, min_words=3):
    """Split a document into passages of at most ``max_words`` words.

    The document is split on newlines. Lines with fewer than ``min_words``
    whitespace-separated words are discarded. The remaining lines are
    concatenated (joined by a single space) into chunks, and a new chunk is
    started whenever adding the next line would push the current chunk past
    ``max_words``.

    A single line that is longer than ``max_words`` is never split; it
    becomes a passage of its own.

    Args:
        doc: The document text.
        max_words: Upper bound on the number of words per passage.
        min_words: Lines with fewer words than this are dropped.

    Returns:
        A list of non-empty passage strings, in document order. Empty if no
        line of the document has at least ``min_words`` words.
    """
    chunks = []
    buffer = []  # lines collected for the chunk currently being built
    buffer_words = 0

    for line in doc.split("\n"):
        line = line.strip()
        n_words = len(line.split())

        # remove short passages
        if n_words < min_words:
            continue

        # Flush only a non-empty buffer, so an over-long first line can
        # never produce an empty passage.
        if buffer and buffer_words + n_words > max_words:
            chunks.append(" ".join(buffer))
            buffer, buffer_words = [], 0

        buffer.append(line)
        buffer_words += n_words

    # Emit whatever is still buffered; otherwise the tail of every document
    # (and the whole of a short document) would be lost.
    if buffer:
        chunks.append(" ".join(buffer))

    return chunks


def clean(row):
    """Remove the row's own docno from its text and split the rest into passages.

    ``row`` is indexed by key and must provide ``"doc"`` (the text) and
    ``"docno"`` (the identifier removed from that text). Returns whatever
    ``make_passages`` returns for the stripped text.
    """
    text_without_docno = row["doc"].replace(row["docno"], "")
    return make_passages(text_without_docno)

def find_max_list(list):
    """Return the length of the longest element of ``list``.

    Raises ``ValueError`` when ``list`` is empty, because there is no
    maximum to report.
    """
    # The parameter name shadows the built-in ``list``; it is kept so that
    # keyword callers continue to work.
    lengths = []
    for element in list:
        lengths.append(len(element))
    return max(lengths)

Done


In [4]:
def relevant_passages(output_path="data/passages/t5/WT-relevant-passages.jsonl"):
    """Append the passages of every relevant document to a JSON-lines file.

    For each topic, the documents with a ``label`` of at least 1 in ``qrels``
    are looked up in the index, their stored ``"text"`` is split with
    ``make_passages`` and one ``{"qid", "docno", "passage"}`` record per
    passage is written to ``output_path``. Topics without any relevant
    document are skipped with a warning, as are judged documents whose docno
    is not present in the index.

    Args:
        output_path: File receiving the records. It is opened in append mode,
            so calling the function again adds duplicate records.
    """
    rels = topics.merge(qrels[qrels["label"] >= 1], on="qid")
    meta = index.getMetaIndex()  # loop-invariant, fetch once

    # One file handle for the whole run instead of reopening per document.
    with open(output_path, "a", encoding="utf-8") as f:
        for _, topic in tqdm(topics.iterrows(), total=len(topics)):
            qid = topic["qid"]
            query = topic["query"]

            ##### Get relevant passages #####
            rel_docs = rels[rels["qid"] == qid]["docno"].tolist()  # get relevant docs
            if not rel_docs:
                logger.warning(f"Skipping {qid}: `{query}`, no relevant docs found")
                continue

            for docno in rel_docs:
                docid = no_id.get(docno)
                if docid is None:
                    # Judged document that the index does not contain.
                    logger.warning(f"Skipping {docno} for {qid}: docno not found in index")
                    continue

                doc = meta.getItem("text", docid)
                # The stored text contains the docno itself; blank it out.
                cleaned_doc = doc.replace(docno, " ").strip()
                doc_pass = make_passages(cleaned_doc)
                logger.info(f"Found {len(doc_pass)} passages for {docno}")

                for passage in doc_pass:
                    json.dump({"qid": qid, "docno": docno, "passage": passage}, f)
                    f.write("\n")

In [5]:
def not_relevant_passages(output_path="data/passages/t5/WT-not-relevant-passages.jsonl"):
    """Append the passages of retrieved, judged-not-relevant documents to a JSON-lines file.

    For each topic, a BM25 run is joined with ``qrels`` on ``qid`` and
    ``docno``; the documents whose ``label`` is 0 are looked up in the index,
    their stored ``"text"`` is split with ``make_passages`` and one
    ``{"qid", "docno", "passage"}`` record per passage is written to
    ``output_path``.

    Args:
        output_path: File receiving the records. It is opened in append mode,
            so calling the function again adds duplicate records.
    """
    bm25 = pt.BatchRetrieve(index, wmodel="BM25")
    meta = index.getMetaIndex()  # loop-invariant, fetch once

    # One file handle for the whole run instead of reopening per document.
    with open(output_path, "a", encoding="utf-8") as f:
        for _, topic in tqdm(topics.iterrows(), total=len(topics)):
            qid = topic["qid"]
            base = bm25(topics[topics["qid"] == qid])  # baseline
            graded = base.merge(qrels, on=["qid", "docno"])  # add grading
            not_relevant_docs = graded[graded["label"] == 0]["docno"].tolist()  # get not rel docs
            logger.info(f"Found {len(not_relevant_docs)} graded and not relevant docs for {qid}")

            for docno in not_relevant_docs:
                docid = no_id[docno]
                doc = meta.getItem("text", docid)
                # The stored text contains the docno itself; blank it out.
                cleaned_doc = doc.replace(docno, " ").strip()
                doc_pass = make_passages(cleaned_doc)
                logger.info(f"Found {len(doc_pass)} passages for {docno}")

                for passage in doc_pass:
                    json.dump({"qid": qid, "docno": docno, "passage": passage}, f)
                    f.write("\n")

In [7]:
# Raise the threshold to WARNING so the per-document INFO messages logged by
# the two export functions stay out of the cell output.
logger.setLevel("WARNING")

# Export passages of documents judged relevant (label >= 1).
relevant_passages()

# Export passages of BM25-retrieved documents judged not relevant (label == 0).
# Both functions open their output file in append mode, so re-running this
# cell adds duplicate records to the existing files.
not_relevant_passages()

100%|██████████| 672/672 [00:05<00:00, 120.43it/s]
100%|██████████| 672/672 [16:07<00:00,  1.44s/it]
