In [11]:
from pyserini.search.lucene import LuceneSearcher
import re
import json

In [23]:
exp_queries = "covid"

if exp_queries == "51-100":
    qrel_path = "src/main/resources/topics-and-qrels/qrels.adhoc.51-100.txt"
    query_path = "src/main/resources/topics-and-qrels/topics.adhoc.51-100.txt"
    index_path = "indexes/lucene-index.disk12"
elif exp_queries == "covid":
    qrel_path = "src/main/resources/topics-and-qrels/qrels.covid-round1.txt"
    query_path = "src/main/resources/topics-and-qrels/topics.covid-round1.xml"
    index_path = "indexes/lucene-index-cord19-abstract-2020-07-16/"
    
searcher = LuceneSearcher(index_path)
    

In [29]:
# query_id : raw query words
all_query_words = {}
# query_id : [relevant document ids]
all_qrels = {}

if exp_queries == "51-100":
    with open(query_path, "r") as f:
        current_id = None
        for line in f:
            if "Number:" in line:
                current_id = str(int(line.split()[2]))
                all_query_words[current_id] = []
            if "Topic:" in line:
                query = line.split()[2:]
                all_query_words[current_id] = " ".join(query)

elif exp_queries == "covid":
    with open(query_path, "r") as f:
        for line in f:
            if "<topic number=" in line:
                current_id = re.sub("[^0-9]", "", line)
                all_query_words[current_id] = []
            if "<query>" in line:
                line = re.sub("<query>", "", line)
                line = re.sub("</query>", "", line)
                line = re.sub("\n", "", line)
                all_query_words[current_id] = line

with open(qrel_path, "r") as f:
    for line in f:
        query_id, _, doc_id, _ = line.split()
        if query_id not in all_qrels:
            all_qrels[query_id] = []
        all_qrels[query_id].append(doc_id)

In [31]:
# perform search, collect all document ids that ARE NOT returned by the query
# query_id : [relevant document ids not returned by search]

missing_docids = {}
topk=1000
for query_id in all_query_words:
    missing_docids[query_id] = []
    query = all_query_words[query_id]
    rel_docs = set(all_qrels[query_id])
    hits = searcher.search(query, k=topk)
    hits_ids = set([hit.docid for hit in hits])
    missing_docids[query_id] = list(rel_docs.difference(hits_ids))


In [35]:
# collect doc text for all missing docs
# for all docs, make broad word index (map word to doc id occurrences)
# doc_id : doc text

all_needed = {}
inverted_index = {}

for x in missing_docids:
    for docid in missing_docids[x]:
        all_needed[docid] = None
print(len(all_needed))

for i,docid in enumerate(list(all_needed.keys())):
    doc = searcher.doc(docid)
    if doc == None:
        del all_needed[docid]
    else:
        doc_text = doc.raw()
        doc_text = doc_text.lower()
        doc_text = re.sub("[^a-z ]", " ", doc_text)
        doc_words = [x for x in doc_text.split() if len(x) > 3]
        all_needed[docid] = doc_words

        for word in doc_words:
            if word not in inverted_index:
                inverted_index[word] = []
            inverted_index[word].append(docid)

    if i % 10000 == 0: print(i)
print(len(all_needed))

4025
0
3818


In [37]:
query_expansion_scores = {}
for query_id in missing_docids:
    rel_ids = {x: None for x in missing_docids[query_id] if x in all_needed}
    # start with words in relevant documents
    all_rel_words = {}
    for doc_id in rel_ids:
        for word in all_needed[doc_id]:
            all_rel_words[word] = {"rel_score": 0, "else_score": 0}

    for word in all_rel_words:
        rel_score = 0
        else_score = 0
        for occ_id in inverted_index[word]:
            if occ_id in rel_ids:
                all_rel_words[word]["rel_score"] += 1
            else:
                all_rel_words[word]["else_score"] += 1
    query_expansion_scores[query_id] = all_rel_words
    print(query_id)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30


In [39]:
# now, for each query, find the words with the highest ratio
expanded_queries = {}
for query_id in query_expansion_scores:
    # collect scores for each expansion word
    sorted_words = []
    for word in query_expansion_scores[query_id]:
        score = query_expansion_scores[query_id][word]["rel_score"] / (query_expansion_scores[query_id][word]["else_score"]+1)
        if query_expansion_scores[query_id][word]["rel_score"] > 100:
            sorted_words.append((word, score))
    sorted_words = sorted(sorted_words, reverse=True, key=lambda x: x[1])[:10]
    expanded_queries[query_id] = {"query": all_query_words[query_id], "expansion_terms": [x[0] for x in sorted_words]}
json.dump(expanded_queries, open("expanded_queries/optimal." + exp_queries + ".json", "w"))

In [43]:
# make various expansion datasets FOR PASSING DIRECTLY TO BM25
# weight 0, weight 1, weight 5, weight 10
expanded_queries = json.load(open("expanded_queries/optimal." + exp_queries + ".json", "r"))

weights = [0,1,5,10]

for weight in weights:
    lines = []
    with open(query_path, "r") as f:
        for line in f:
            lines.append(line)

    if exp_queries == "51-100":
        current_id = None
        for i,line in enumerate(lines):
            if "Number:" in line:
                current_id = str(int(line.split()[2]))
            if "Topic:" in line:
                line = line[:-1] # remove new line
                split_line = line.split()
                query = split_line[2:]
                meta_data = " ".join(split_line[0:2])
                line  = meta_data
                for j in range(weight):
                    line += " " + " ".join(query)
                query_expansion_terms = expanded_queries[current_id]["expansion_terms"]
                line += " " + " ".join(query_expansion_terms)
                lines[i] = line + "\n"
        with open("expanded_queries/optimal.disk12.bm25." + str(weight) + ".51-100.txt", "w") as f:
            for line in lines:
                f.write(line)

    elif exp_queries == "covid":
        for i,line in enumerate(lines):
            if "<topic number=" in line:
                current_id = re.sub("[^0-9]", "", line)
            if "<query>" in line:
                query = re.sub("<query>", "", line)
                query = re.sub("</query>\n", "", query)
                new_line = ""
                for j in range(weight):
                    new_line += query + " "
                query_expansion_terms = expanded_queries[current_id]["expansion_terms"]
                new_line += " " + " ".join(query_expansion_terms)
                lines[i] = "\t<query>" + new_line + "</query>\n"
        with open("expanded_queries/optimal.covid.bm25." + str(weight) + ".xml", "w") as f:
            for line in lines:
                f.write(line)
