In [1]:
from document_preprocessor import *
from indexing import *
from ranker import *
from l2r import *
from relevance import *
from vector_ranker import *
from network_features import *
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
from sentence_transformers import SentenceTransformer
from tqdm import tqdm


In [2]:
# Doc2Query augmenter used further down to generate queries from document text.
doc_to_query = Doc2QueryAugmenter(
    doc2query_model_name="google/flan-t5-large",
)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [16]:
import json  # explicit import so this cell does not depend on a star-import exposing `json`

# Build one record per (title, text) entry found in symptoms.json.
# Every non-blank line of the file is parsed as a JSON object whose keys become
# document titles and whose values become document texts. Doc ids are assigned
# sequentially starting at 1 and keep counting across lines, so the file may
# hold a single large object or several objects on separate lines.
new_data = []
with open("symptoms.json") as file:
    for line in file:
        if not line.strip():
            # A blank line (e.g. a trailing newline) is not valid JSON; skip it.
            continue
        data = json.loads(line)
        # Iterate the pairs directly instead of rebuilding list(data.keys()) /
        # list(data.values()) for every entry, which was quadratic.
        for title, text in data.items():
            new_data.append({
                "docid": len(new_data) + 1,
                "title": title,
                # Flatten embedded newlines so each document is a single line of text.
                "text": text.replace("\n", " "),
            })

print(len(new_data))


1066


In [18]:
# Persist the records as JSON Lines: one serialized record per line.
with open("final_data.json", "w") as jsonfile:
    jsonfile.writelines(json.dumps(record) + "\n" for record in new_data)


In [19]:
# Generate queries for every document text with the doc2query model.
# This loop is slow (the saved run of this cell was interrupted), so it is
# wrapped in tqdm to show progress. The collected queries are unchanged.
query_prompt = "Generate a query for the following text:"  # same prompt for every document

queries = []
for data in tqdm(new_data, desc="doc2query"):
    queries.extend(doc_to_query.get_queries(data["text"], prefix_prompt=query_prompt))


KeyboardInterrupt: 

In [30]:
import random

# Number of query batches to draw.
n = 3

# Number of queries in each batch.
m = 20

# A dedicated, seeded generator makes the sampled batches identical on every
# run and leaves the global `random` state untouched for other cells.
query_rng = random.Random(42)

# Each batch is drawn without replacement from `queries`; batches are
# independent draws, so the same query may appear in more than one batch.
query_lists = [query_rng.sample(queries, m) for _ in range(n)]


In [32]:
# Number of candidate-document lists to draw.
n = 60

# Number of documents in each list.
m = 50

# (docid, text) pairs for every document in the collection.
doc_data = [(doc["docid"], doc["text"]) for doc in new_data]

# A dedicated, seeded generator makes the sampled lists identical on every run.
doc_rng = random.Random(42)

# Each list is drawn without replacement from `doc_data`; lists are independent
# draws, so a document may appear in several lists.
doc_lists = [doc_rng.sample(doc_data, m) for _ in range(n)]


Each query is paired with one entry of `doc_lists`; each entry is a list of (docid, text) tuples.
key: query
value: list of (docid, text) tuples

60


In [35]:
# Sanity check: display the first sampled query of the first batch.
query_lists[0][0]


'What part of the body is a growth plate fracture?'

In [44]:
# Pair every sampled query with its own candidate-document list.
# `total_list` holds one dict per query batch, mapping query -> list of
# (docid, text) tuples. Document lists are consumed from `doc_lists` in order,
# one per query, so the number of batches and the batch size come from
# `query_lists` itself rather than from hard-coded 3 / 20.
total_list = []
doc_cursor = 0  # index of the next unused entry in doc_lists
for batch_queries in query_lists:
    batch_assignment = {}
    for batch_query in batch_queries:
        batch_assignment[batch_query] = doc_lists[doc_cursor]
        doc_cursor += 1
    total_list.append(batch_assignment)






In [46]:
# Number of distinct queries (dict keys) in the first batch's mapping.
len(total_list[0])


20

In [54]:
# Flatten the first batch into one row per (query, candidate document) pair.
first_batch = total_list[0]
rows = []
for batch_query, candidates in first_batch.items():
    for candidate_id, candidate_text in candidates:
        rows.append({'Query': batch_query, 'DocID': candidate_id, 'Text': candidate_text})

# Tabular view of the pairs; the shape is shown as the cell output.
df = pd.DataFrame(rows)
df.shape
# Optional export of the table (left disabled):
# df.to_csv("taimoor.csv", index=False)


(1000, 3)

# random baseline

In [79]:
import random
from random import randint  # no longer used here; kept in case another cell relies on the name

# Random baseline: for every query, "retrieve" 10 documents picked uniformly at
# random. `rand_baseline` mirrors the layout of `query_lists`: one list per
# batch, holding one list of docids per query.
baseline_rng = random.Random(42)  # seeded so the baseline is reproducible

# Sampling without replacement guarantees a ranking never lists the same
# document twice (independent randint draws could repeat a docid).
ranking_depth = min(10, len(doc_data))

rand_baseline = []
for query_batch in query_lists:
    batch_rankings = []
    for _query in query_batch:
        sampled_docs = baseline_rng.sample(doc_data, ranking_depth)
        # Keep only the docid from each (docid, text) pair.
        batch_rankings.append([docid for docid, _text in sampled_docs])
    rand_baseline.append(batch_rankings)



In [80]:
# Print the random-baseline rankings (one list of docids per query) for the first batch.
print(rand_baseline[0])


[[450, 175, 49, 1011, 790, 186, 158, 532, 477, 389], [217, 922, 342, 769, 503, 1049, 996, 520, 89, 826], [490, 818, 628, 247, 801, 4, 616, 111, 388, 754], [782, 277, 134, 906, 325, 309, 150, 1051, 680, 762], [590, 92, 67, 1026, 746, 674, 1027, 175, 482, 589], [594, 83, 307, 909, 229, 994, 666, 451, 732, 300], [317, 698, 291, 924, 835, 198, 270, 356, 354, 598], [244, 931, 997, 663, 470, 217, 848, 920, 20, 531], [276, 191, 530, 505, 665, 425, 734, 767, 181, 677], [349, 127, 245, 576, 272, 437, 924, 475, 569, 80], [1037, 478, 744, 68, 772, 964, 262, 33, 601, 978], [81, 934, 287, 918, 50, 565, 1034, 634, 947, 165], [855, 773, 151, 382, 134, 690, 2, 153, 51, 165], [864, 705, 918, 52, 195, 8, 549, 289, 3, 354], [108, 907, 737, 339, 72, 496, 807, 482, 444, 830], [183, 285, 636, 225, 299, 126, 704, 188, 9, 374], [187, 925, 872, 725, 336, 970, 282, 994, 146, 32], [948, 567, 634, 47, 809, 135, 711, 507, 869, 247], [378, 715, 358, 590, 658, 685, 923, 889, 451, 390], [669, 338, 947, 605, 418, 572,

# BM25 baseline

In [55]:
# Tokenizer that splits text into runs of word characters.
doc_preprocessor = RegexTokenizer('\\w+')

# Build an inverted index over the JSON Lines collection written earlier.
# NOTE(review): the empty set and 5 are presumably the stopword set and the
# minimum word frequency (the progress log mentions "MWF+Stopword removal");
# confirm against Indexer.create_index.
doc_index = Indexer.create_index(
    IndexType.InvertedIndex,
    'final_data.json',
    doc_preprocessor,
    set(),
    5,
)


loading collection: 1066it [00:00, 261209.80it/s]
tokenizing whole corpus: 100%|██████████| 1066/1066 [00:00<00:00, 60879.72it/s]
MWF+Stopword removal + add doc: 1066it [00:00, 23522.72it/s]
sorting index: 100%|██████████| 1823/1823 [00:00<00:00, 596568.32it/s]


In [58]:
# BM25 scorer built on the inverted index.
relevance_scorer = BM25(doc_index)

# Ranker that tokenizes queries with the same preprocessor used for indexing.
# NOTE(review): the empty set is presumably the stopword set; confirm against Ranker.
bm25_ranker = Ranker(
    doc_index,
    doc_preprocessor,
    set(),
    relevance_scorer,
)


In [62]:
# Run every sampled query through the BM25 ranker.
# `bm25_res` mirrors the layout of `query_lists`: one list per batch, holding
# the ranker's result for each query in that batch.
bm25_res = [
    [bm25_ranker.query(batch_query) for batch_query in query_batch]
    for query_batch in query_lists
]


In [81]:
# Re-print the first batch of random-baseline rankings for comparison with the BM25 results.
print(rand_baseline[0])


[[450, 175, 49, 1011, 790, 186, 158, 532, 477, 389], [217, 922, 342, 769, 503, 1049, 996, 520, 89, 826], [490, 818, 628, 247, 801, 4, 616, 111, 388, 754], [782, 277, 134, 906, 325, 309, 150, 1051, 680, 762], [590, 92, 67, 1026, 746, 674, 1027, 175, 482, 589], [594, 83, 307, 909, 229, 994, 666, 451, 732, 300], [317, 698, 291, 924, 835, 198, 270, 356, 354, 598], [244, 931, 997, 663, 470, 217, 848, 920, 20, 531], [276, 191, 530, 505, 665, 425, 734, 767, 181, 677], [349, 127, 245, 576, 272, 437, 924, 475, 569, 80], [1037, 478, 744, 68, 772, 964, 262, 33, 601, 978], [81, 934, 287, 918, 50, 565, 1034, 634, 947, 165], [855, 773, 151, 382, 134, 690, 2, 153, 51, 165], [864, 705, 918, 52, 195, 8, 549, 289, 3, 354], [108, 907, 737, 339, 72, 496, 807, 482, 444, 830], [183, 285, 636, 225, 299, 126, 704, 188, 9, 374], [187, 925, 872, 725, 336, 970, 282, 994, 146, 32], [948, 567, 634, 47, 809, 135, 711, 507, 869, 247], [378, 715, 358, 590, 658, 685, 923, 889, 451, 390], [669, 338, 947, 605, 418, 572,

In [64]:
# Print the BM25 results for the first batch of queries.
# NOTE(review): the printed output looks like (docid, score) pairs per query — confirm against Ranker.query.
print(bm25_res[0])


[[(932, 8.18485063292186), (263, 4.996326731314133), (720, 4.016133910430765), (60, 3.492449202679194), (843, 2.9170304677623395), (584, 2.7865173260220066), (977, 2.591917942762543), (607, 2.347544406230928), (1065, 2.1984945508528693), (95, 2.0952572864022816), (469, 1.961576667052805), (547, 1.8884641778168503), (958, 1.678925044566395), (207, 1.5354287805417517), (777, 1.478468092179599), (921, 1.4541487273075162), (703, 1.1528926686410017), (437, 0.8833845469989889), (748, 0.7419520132367566), (188, 0.6506589982829789), (245, 0.5922056968438145), (502, 0.5223658177669539), (762, 0.3265586666669298), (585, 0.298994148102147), (129, 0.28548465322861494), (451, 0.2637840834829883), (393, 0.24564410603142828), (217, 0.22812307300593815), (791, 0.16651290382036144), (169, 0.1611143358489595), (494, 0.13665093324683064), (888, 0.11789841070833174), (705, -0.033435186665955685), (219, -0.06132305109289393), (203, -0.27155584055880855), (445, -0.3102744289072201), (597, -0.315463696811907