In [1]:
from src.exp_logger import logger

import pyterrier as pt  # type: ignore
import json
from src.load_index import setup_system

logger.setLevel("WARNING")

PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [2]:
index, topics, qrels = setup_system("WT")


20:18:14.060 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 3,4 GiB of memory would be required.
Loaded index with  1570734 documents.


In [3]:
meta = index.getMetaIndex()

In [4]:
id_no = {}

while True:
    try:
        id_no[len(id_no)] = meta.getItem("docno", len(id_no))
    except:
        print("Done")
        break

Done


In [5]:
no_id = {v: k for k, v in id_no.items()}

In [6]:
def make_passages(doc):
    passages = doc.split("\n")

    # remove short passages
    passages = [passage for passage in passages if len(passage.strip().split(" "))>=3]

    # split into chuncs of 50 words
    result = []
    gathered_passages = ""

    for passage in passages:
        # print("passage:", len(passage.split(" ")), "gathered Passage:", len(gathered_passages.split(" ")))

        if len(gathered_passages.split(" ")) + len(passage.split(" ")) <= 65:
            gathered_passages = gathered_passages  + " " + passage
            # print("Gathering further")
        elif len(gathered_passages.split(" ")) + len(passage.split(" ")) > 65:
            if len(gathered_passages.split(" ")) == 0:
                result.append(passage.strip())
            else:
                result.append(gathered_passages.strip())
                # print(len(gathered_passages.split(" ")))

                gathered_passages = passage
    return result


def clean(row):
    doc = row["doc"]
    doc = doc.replace(row["docno"], "")
    result = make_passages(doc)
    return result

def find_max_list(list):
    list_len = [len(i) for i in list]
    return max(list_len)

In [16]:
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

try:
    with open("done.txt", "r") as fin:
        done = fin.read().splitlines()
except:
    done = []

from tqdm import tqdm

# get relevant docs
rels = topics.merge(qrels[qrels["label"]>=1], on="qid")


for _, query in tqdm(topics.iterrows(),total=len(topics)):
    qid = query["qid"]
    query_text = query["query"]

    if str(qid) in done:
        continue

    ##### Get relevant passages #####
    passages = []
    rel_docs = rels[rels["qid"] == qid]["docno"].tolist()  # get relevant docs
    if not rel_docs:
        logger.warning(f"Skipping {qid}, not relevant docs found")
        continue


    for docno in rel_docs:
        docid = no_id[docno]
        doc = index.getMetaIndex().getItem("text", docid)
        cleaned_doc = doc.replace(docno, " ").strip()
        doc_pass = make_passages(cleaned_doc)
        passages.append(doc_pass)
        logger.info(f"Found {len(doc_pass)} passages for {docno}")
    



    ##### Get not relevant but challenging passages #####

    # sample graded not rel but retrieved docs
    base = bm25(topics[topics["qid"]==qid])  # baseline

    if len(base) == 0:
        logger.warning(f"Skipping {qid}, no docs found")
        continue

    logger.info(f"Retrieved {len(base)} docs for {qid}")

    gradet = base.merge(qrels, on=["qid", "docno"])  # add grading    
    gradet = gradet[gradet["label"]==0]["docno"].tolist()  # get not rel docs
    logger.info(f"Found {len(gradet)} graded and not relevant docs for {qid}")


    not_rel_passages = []
    for docno in gradet:
        docid = no_id[docno]
        doc = index.getMetaIndex().getItem("text", docid)
        cleaned_doc = doc.replace(docno, " ").replace("|", "").strip()
        doc_pass = make_passages(cleaned_doc)
        not_rel_passages.append(doc_pass)

    
    tot_passages = sum([len(i) for i in passages])
    tot_not_rel_passages = sum([len(i) for i in not_rel_passages])
    logger.info(f"Found {tot_passages} relevant passages and {tot_not_rel_passages} not relevant passages")
    if tot_passages > tot_not_rel_passages:
        print("Not enough not relevant passages")
        # sample from not gradet not rel but retrieved docs
        # additional
        not_rel_docs_all = base["docno"].to_list()
        set(not_rel_docs_all)-set(gradet)-set(rel_docs)

        i = 0
        while tot_not_rel_passages < tot_passages or i == len(not_rel_docs_all)-1 :
            docid = no_id[not_rel_docs_all[i]]
            doc = index.getMetaIndex().getItem("text", docid)
            cleaned_doc = doc.replace(docno, " ").replace("|", "").strip()
            doc_pass = make_passages(cleaned_doc)
            not_rel_passages.append(doc_pass)    
            i+=1
            tot_not_rel_passages += len(doc_pass)
            if i == len(not_rel_docs_all)-1:
                
                continue




    
    ##### Assamble triplets #####
    non_rel_passage_pool = []
    for i in range(0, find_max_list(not_rel_passages)):
        for list_d in not_rel_passages:
            try:
                item = list_d[i]
                if not item:
                    continue  # TODO fix empty non rel passages
                non_rel_passage_pool.append(item)
            except IndexError:
                pass
        
    
    triplets = []
    c = 0
    for rel_doc in passages:
        for rel_passage in rel_doc:
            non_rel_passage = non_rel_passage_pool[c]
            triplets.append([query_text, rel_passage, non_rel_passage])
            c+=1

    with open("passages.jsonl", "+a") as fin:
        for triplet in triplets:
            fin.write(json.dumps(triplet))
            fin.write("\n")

    with open("done.txt", "+a") as fin:
        fin.write(str(qid))
        fin.write("\n")
    
    print("Done with", qid)




Not enough not relevant passages
Done with q062219519


 83%|████████▎ | 558/672 [00:00<00:00, 609.65it/s]

Done with q062219521
Done with q062219533
Done with q062219608
Done with q062219780
Not enough not relevant passages
Done with q062219826
Done with q062219889
Done with q062219914
Not enough not relevant passages
Done with q062219963
Done with q062219965
Not enough not relevant passages
Done with q062220046
Not enough not relevant passages
Done with q062220131
Not enough not relevant passages
Done with q062220278
Done with q062220336
Done with q062220412
Done with q062220442
Not enough not relevant passages
Done with q062220476
Done with q062220574
Not enough not relevant passages
Done with q062220605
Done with q062220619
Done with q062220753
Not enough not relevant passages
Done with q062220762
Done with q062220773


 86%|████████▋ | 580/672 [00:18<00:04, 22.77it/s] 

Done with q062220776




Not enough not relevant passages
Done with q062220779
Not enough not relevant passages
Done with q062220782
Not enough not relevant passages
Done with q062220787
Done with q062220793
Not enough not relevant passages
Done with q062220819
Not enough not relevant passages
Done with q062221016
Not enough not relevant passages
Done with q062224109
Not enough not relevant passages
Done with q062221161
Done with q062221217
Done with q062221255
Done with q062221293
Done with q062221417
Done with q062221495
Done with q062221499
Done with q062221554
Done with q062224735
Done with q062221661
Not enough not relevant passages
Done with q062221706
Done with q062221744
Done with q062221798
Not enough not relevant passages
Done with q062221837
Not enough not relevant passages
Done with q062221847
Done with q062221852
Done with q062221854


 90%|█████████ | 606/672 [00:38<00:08,  7.96it/s]

Done with q062221860


 90%|█████████ | 607/672 [00:39<00:08,  7.71it/s]

Done with q062225017
Done with q062221997
Done with q062222119
Done with q062222120




Done with q062222128
Done with q062222144
Not enough not relevant passages
Done with q062222154
Not enough not relevant passages
Done with q062222203
Done with q062222431
Not enough not relevant passages
Done with q062222593
Not enough not relevant passages
Done with q062222626
Done with q062222757
Not enough not relevant passages
Done with q062222854


 92%|█████████▏| 621/672 [00:49<00:10,  5.07it/s]

Done with q062222895


 93%|█████████▎| 622/672 [00:50<00:10,  4.90it/s]

Done with q062222896
Done with q062222903
Done with q062222955
Done with q062222971
Done with q062223061
Done with q062223111
Not enough not relevant passages
Done with q062223182
Done with q062223204
Done with q062223216


 94%|█████████▍| 631/672 [00:57<00:11,  3.59it/s]

Not enough not relevant passages
Done with q062223259




Done with q062223362
Not enough not relevant passages
Done with q062223524




Not enough not relevant passages
Done with q062223539


 95%|█████████▍| 637/672 [01:00<00:10,  3.27it/s]

Not enough not relevant passages
Done with q062223882
Done with q062223886
Done with q062223892
Done with q062223898


 95%|█████████▌| 641/672 [01:03<00:10,  2.84it/s]

Not enough not relevant passages
Done with q062223900
Not enough not relevant passages
Done with q062223902
Not enough not relevant passages
Done with q062223910


 96%|█████████▌| 644/672 [01:06<00:10,  2.56it/s]

Not enough not relevant passages
Done with q062223911
Done with q062223916


 96%|█████████▌| 646/672 [01:07<00:10,  2.37it/s]

Not enough not relevant passages
Done with q062223934
Not enough not relevant passages
Done with q062224051


 96%|█████████▋| 648/672 [01:09<00:10,  2.22it/s]

Not enough not relevant passages
Done with q062224086


 97%|█████████▋| 649/672 [01:10<00:10,  2.10it/s]

Done with q062224117


 97%|█████████▋| 650/672 [01:10<00:11,  1.99it/s]

Done with q062224226


 97%|█████████▋| 651/672 [01:11<00:11,  1.83it/s]

Done with q062224291


 97%|█████████▋| 652/672 [01:12<00:11,  1.70it/s]

Done with q062224292


 97%|█████████▋| 653/672 [01:13<00:12,  1.57it/s]

Done with q062224315


 97%|█████████▋| 654/672 [01:14<00:12,  1.49it/s]

Done with q062224326


 97%|█████████▋| 655/672 [01:15<00:11,  1.42it/s]

Not enough not relevant passages
Done with q062224382


 98%|█████████▊| 656/672 [01:15<00:11,  1.44it/s]

Done with q062224463


 98%|█████████▊| 657/672 [01:16<00:10,  1.38it/s]

Not enough not relevant passages
Done with q062224514


 98%|█████████▊| 658/672 [01:17<00:10,  1.32it/s]

Done with q062224541


 98%|█████████▊| 659/672 [01:18<00:09,  1.32it/s]

Done with q062224551


 98%|█████████▊| 660/672 [01:18<00:09,  1.30it/s]

Not enough not relevant passages
Done with q062224560


 98%|█████████▊| 661/672 [01:19<00:08,  1.31it/s]

Not enough not relevant passages
Done with q062224589


 99%|█████████▊| 662/672 [01:20<00:07,  1.28it/s]

Not enough not relevant passages
Done with q062224711


 99%|█████████▊| 663/672 [01:21<00:07,  1.27it/s]

Not enough not relevant passages
Done with q062224751


 99%|█████████▉| 664/672 [01:22<00:06,  1.23it/s]

Done with q062224794


 99%|█████████▉| 665/672 [01:23<00:05,  1.26it/s]

Not enough not relevant passages
Done with q062224848


 99%|█████████▉| 666/672 [01:23<00:04,  1.24it/s]

Not enough not relevant passages
Done with q062224851


 99%|█████████▉| 667/672 [01:24<00:03,  1.25it/s]

Done with q062224875


 99%|█████████▉| 668/672 [01:25<00:03,  1.25it/s]

Done with q062224914


100%|█████████▉| 669/672 [01:26<00:02,  1.24it/s]

Done with q062224961


100%|█████████▉| 670/672 [01:27<00:01,  1.21it/s]

Done with q062225030


100%|█████████▉| 671/672 [01:27<00:00,  1.25it/s]

Done with q062225194


100%|██████████| 672/672 [01:28<00:00,  7.59it/s]

Not enough not relevant passages
Done with q062225197





In [14]:
not_rel_docs_all

['doc062209200074']