In [78]:
import os
import re
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
nltk.download("stopwords")
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [79]:
with open('Medical_Abstracts.txt', 'r') as f:
    lines = f.readlines()

paragraphs = []
current_paragraph = ""

for line in lines:
    if line.startswith(".I"):
        if current_paragraph != "":
            paragraphs.append(current_paragraph)
            current_paragraph = ""
    elif line.startswith(".W"):
        continue
    else:
        current_paragraph += line.strip()
if current_paragraph != "":
    paragraphs.append(current_paragraph)

In [80]:
documents = {}
for i in range(len(paragraphs)):
  documents[i+1] = paragraphs[i]

In [81]:
import json
with open('data.txt', 'w') as f:
  json.dump(paragraphs,f)
import json
# with open('data.txt', 'w') as f:
#   file_name = json.load(f)

 preprocess the text by tokenizing, removing stopwords, and stemming
 

In [82]:
# preprocess the text by tokenizing, removing stopwords, and stemming
def preprocess(text):
    ps = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    words = [ps.stem(w) for w in tokens if w.isalpha() and w not in stop_words]
    return words

Building Inverted Index

In [83]:
from collections import defaultdict
# build the inverted index
def build_inverted_index(documents):
    inverted_index = defaultdict(list)
    for doc_id, text in documents.items():
        terms = preprocess(text)
        for term in set(terms):
            df = terms.count(term)
            inverted_index[term].append((doc_id, df))
    return inverted_index

In [84]:
inverted_index = build_inverted_index(documents)

In [85]:
# write the inverted index to disk
def write_inverted_index(inverted_index, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    for term, postings in inverted_index.items():
        df = len(postings)
        posting_list = [str(doc_id) + ',' + str(df) for doc_id, df in postings]
        posting_file = os.path.join(output_dir, term + '.txt')
        with open(posting_file, 'w') as f:
            f.write('\n'.join(posting_list))
    dictionary = [(term, len(postings), term + '.txt') for term, postings in inverted_index.items()]
    dictionary_file = os.path.join(output_dir, 'dictionary.txt')
    with open(dictionary_file, 'w') as f:
        for term, df, posting_file in dictionary:
            f.write(f'{term:<20}{df:<5}{posting_file}\n')

In [86]:
write_inverted_index(inverted_index,"/content/drive/MyDrive/colab files/A2P1")

In [87]:
from google.colab import files
files.download("/content/drive/MyDrive/colab files/A2P1")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [88]:
# read the inverted index from disk
def read_inverted_index(input_dir):
    inverted_index = {}
    dict_file = os.path.join(input_dir, "dictionary.txt")
    with open(dict_file, 'r') as f:
        for line in f:
            term = line[:20].strip()
            try:
                df = int(line[20:25].strip())
            except ValueError:
                # print("Skipping line due to invalid document frequency: {}".format(line))
                continue
            posting_file = line[25:].strip()
            try:
                with open(os.path.join(input_dir, posting_file), 'r') as g:
                    posting_list = []
                    for doc_id in g.read().split(","):
                        doc_id = doc_id.strip()
                        if doc_id.isnumeric():
                            posting_list.append(int(doc_id))
                    inverted_index[term] = (df, posting_list)
            except FileNotFoundError:
                print("Posting file not found: {}".format(posting_file))
                continue
    return inverted_index

In [89]:
inverted_index_read = read_inverted_index("/content/drive/MyDrive/colab files/A2P1")

Calculating tfidf scores

In [90]:
import math

# compute the TF-IDF score for a term in a document
def tfidf(term, doc_id, inverted_index, N):
    tf = 0
    for d, df in inverted_index[term]:
        if d == doc_id:
            tf = df
            break
    idf = math.log(N / len(inverted_index[term]))
    return tf * idf

# compute the TF-IDF vector for a query
def query_tfidf(query, inverted_index, N):
    query_terms = query.strip().lower().split()
    scores = {}
    for term in query_terms:
        if term in inverted_index:
            for doc_id, _ in inverted_index[term]:
                scores[doc_id] = scores.get(doc_id, 0) + tfidf(term, doc_id, inverted_index, N)
    return scores


Calculating the speed of execution of the querries

In [91]:
import time

# define the path to the inverted index
# inverted_index_path = "/path/to/inverted_index"

# # read in the inverted index from disk
# inverted_index = read_inverted_index(inverted_index_path)

# define the total number of documents
N = len(documents)

# define the number of results to return per query
k = 10
queries = [    "paracetamol",    "ibuprofen",    "paracetamol ibuprofen",    "hepatitis",    "virus",    "hepatitis virus",    "fever",    "cold",    "cough",    "sore throat",    "influenza",    "common cold",    "headache",    "muscle ache",    "back pain",    "stomach ache",    "nausea",    "vomiting",    "diarrhea",    "constipation"]

# loop through each query and print the top k results
for query in queries:
    start_time = time.time()
    scores = query_tfidf(query, inverted_index, N)
    #print('scores',scores)
    results = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:k]
    print(f"Query: {query}")
    for doc_id, score in results:
        print(f"Document {doc_id}: {score}")
    print(f"Time taken: {time.time() - start_time} seconds")
    print()

Query: paracetamol
Time taken: 0.001491546630859375 seconds

Query: ibuprofen
Time taken: 0.00022840499877929688 seconds

Query: paracetamol ibuprofen
Time taken: 5.14984130859375e-05 seconds

Query: hepatitis
Time taken: 0.00016760826110839844 seconds

Query: virus
Document 24: 6.198478716492308
Time taken: 8.273124694824219e-05 seconds

Query: hepatitis virus
Document 24: 6.198478716492308
Time taken: 7.748603820800781e-05 seconds

Query: fever
Document 28: 5.5053315359323625
Document 365: 5.5053315359323625
Time taken: 0.0002868175506591797 seconds

Query: cold
Document 82: 4.812184355372417
Document 83: 4.812184355372417
Document 126: 4.812184355372417
Document 421: 4.812184355372417
Time taken: 0.0004024505615234375 seconds

Query: cough
Document 372: 6.198478716492308
Time taken: 6.961822509765625e-05 seconds

Query: sore throat
Time taken: 4.124641418457031e-05 seconds

Query: influenza
Document 59: 11.010663071864725
Document 469: 5.5053315359323625
Time taken: 0.01100444793701

In [92]:
doc_relevance={}
doc_relevance['constipation']=[[203,251,372],[203,372],[203,251,372,442]]
doc_relevance['influenza']=[[59],[59,469],[59,71,469]]
doc_relevance['cough']=[[372,421],[372,83,126,],[372,83,126,421]]
doc_relevance['muscle ache']=[[37,65,76,126,343],[37,65,76,117,126,387],[37,65,76,83,100,117,126,206,277,343]]
doc_relevance['vomiting']=[[203,442],[203,372,442],[203,372,442]]
doc_relevance['hepatitis']=[[16,376],[16,376],[16,368,376]]
doc_relevance['diarrhea']=[[372,442],[372,203,372,442],[372,203,372,442]]
doc_relevance['cold']=[[83,126,421],[82,83,126,421],[41,82,83,126]]
doc_relevance['paracetamol ibuprofen']=[[216,200],[216,200,365,356],[216,200,205,356,214]]
doc_relevance['common cold']=[[83,126,421,372,28,365],[82,83,126,421,372,28],[41,82,83,126]]
doc_relevance['paracetamol']=[[28,214,364,83,126,421,372,28,365],[28,214,364,421,372,28],[28,214,364,41,82,83,126]]
doc_relevance['ibuprofen']=[[200,139,216],[200,356,139,216],[200,205,216]]
doc_relevance['fever']=[[28,214,365,251],[28,214,365,251],[28,214,365]]
doc_relevance['virus']=[[275,372,399,402,403,59,71,371],[275,372,399,400,402,403,475,59,71],[275,372,400,403,475,71,371]]
doc_relevance['hepatitis virus']=[[368,376,16,401],[368,376,16,399,401,402],[368,376,16,372,399,402]]
doc_relevance['sore throat']=[[477,483,71,177,214,252],[477,483,71,177,198,214],[477,71,177,198]]
doc_relevance['headache']=[[216,421,214,28,83,126,421,372,28,365,61,69,71],[216,421,214,365,28,82,83,126,421,372,28,61,62,69,71],[216,421,365,28,41,82,83,126,61,62,71]]
doc_relevance['back pain']=[[378,162,301,235,413,356],[378,162,235,301,235,348,356],[378,235,301,235,356]]
doc_relevance['stomach ache']=[[391,372,442],[391,200,372,203,372,442],[391,200,372,203,372,442]]
doc_relevance['nausea']=[[368,442,203,442],[368,442,203,372,442],[368,442,203,372,442]]

In [93]:
from collections import Counter, defaultdict
doc_labelled={}
this_num_relevance={}
unique_list_of_numbers = []
list_of_numbers = []
for key, value in doc_relevance.items():
  # create a defaultdict to store the counts of each number
  counts = defaultdict(int)
  # iterate through each sublist and count the occurrences of each number
  for sublist in value:
      for num in sublist:
          counts[num] += 1
          list_of_numbers.append(num)
  
  # create a new list of lists with unique numbers and their counts
  output_list = [[num, counts[num], 'RELEVANT' if counts[num] > 1 else 'NOT_RELEVANT'] for num in sorted(counts)]
  
  #print (output_list)                 
  doc_labelled[key] = output_list

print (doc_labelled)  
#Printing unique list of all numbers
unique_list_of_numbers = list(set(list_of_numbers))
#print (unique_list_of_numbers)

{'constipation': [[203, 3, 'RELEVANT'], [251, 2, 'RELEVANT'], [372, 3, 'RELEVANT'], [442, 1, 'NOT_RELEVANT']], 'influenza': [[59, 3, 'RELEVANT'], [71, 1, 'NOT_RELEVANT'], [469, 2, 'RELEVANT']], 'cough': [[83, 2, 'RELEVANT'], [126, 2, 'RELEVANT'], [372, 3, 'RELEVANT'], [421, 2, 'RELEVANT']], 'muscle ache': [[37, 3, 'RELEVANT'], [65, 3, 'RELEVANT'], [76, 3, 'RELEVANT'], [83, 1, 'NOT_RELEVANT'], [100, 1, 'NOT_RELEVANT'], [117, 2, 'RELEVANT'], [126, 3, 'RELEVANT'], [206, 1, 'NOT_RELEVANT'], [277, 1, 'NOT_RELEVANT'], [343, 2, 'RELEVANT'], [387, 1, 'NOT_RELEVANT']], 'vomiting': [[203, 3, 'RELEVANT'], [372, 2, 'RELEVANT'], [442, 3, 'RELEVANT']], 'hepatitis': [[16, 3, 'RELEVANT'], [368, 1, 'NOT_RELEVANT'], [376, 3, 'RELEVANT']], 'diarrhea': [[203, 2, 'RELEVANT'], [372, 5, 'RELEVANT'], [442, 3, 'RELEVANT']], 'cold': [[41, 1, 'NOT_RELEVANT'], [82, 2, 'RELEVANT'], [83, 3, 'RELEVANT'], [126, 3, 'RELEVANT'], [421, 2, 'RELEVANT']], 'paracetamol ibuprofen': [[200, 3, 'RELEVANT'], [205, 1, 'NOT_RELEVA

In [155]:
import random
sampled_querries=random.sample(queries, 10)
sampled_querries


['hepatitis virus',
 'sore throat',
 'fever',
 'hepatitis',
 'paracetamol ibuprofen',
 'virus',
 'vomiting',
 'muscle ache',
 'common cold',
 'stomach ache']

In [156]:
sampled_querries2=random.sample(queries, 10)
sampled_querries2

['ibuprofen',
 'common cold',
 'influenza',
 'paracetamol',
 'constipation',
 'back pain',
 'virus',
 'vomiting',
 'headache',
 'muscle ache']

In [256]:
rnr_doc=[[],[],[],[],[],[],[],[],[],[]]
c2=0
for i in sampled_querries:
  r_no=0
  for j in doc_labelled:
    if i==j:
      rd=[]
      val=doc_labelled.get(j)
      l=len(val)
      c1=0
     
      for a in range(l):
        if val[c1][2]=='RELEVANT':
          r_no=r_no+1
          #print('c2',c2,'c1',c1)
          rnr_doc[c2].append(val[c1][0])
        c1=c1+1
      print(j,'r_no',r_no)
      print(j,c2) 
      if(r_no==0):
        print(j)
  c2=c2+1
   


        



hepatitis virus r_no 6
hepatitis virus 0
sore throat r_no 6
sore throat 1
fever r_no 4
fever 2
hepatitis r_no 2
hepatitis 3
paracetamol ibuprofen r_no 3
paracetamol ibuprofen 4
virus r_no 10
virus 5
vomiting r_no 3
vomiting 6
muscle ache r_no 6
muscle ache 7
common cold r_no 6
common cold 8
stomach ache r_no 5
stomach ache 9


In [97]:
rnr_doc2=[[],[],[],[],[],[],[],[],[],[]]
c2=0
for i in sampled_querries2:
  r_no=0
  for j in doc_labelled:
    if i==j:
      rd=[]
      val=doc_labelled.get(j)
      l=len(val)
      c1=0
     
      for a in range(l):
        if val[c1][2]=='RELEVANT':
          r_no=r_no+1
          #print('c2',c2,'c1',c1)
          rnr_doc2[c2].append(val[c1][0])
        c1=c1+1
      print(j,'r_no',r_no)
      print(j,c2) 
      if(r_no==0):
        print(j)
  c2=c2+1
   

diarrhea r_no 3
diarrhea 0
paracetamol r_no 7
paracetamol 1
influenza r_no 2
influenza 2
hepatitis virus r_no 6
hepatitis virus 3
muscle ache r_no 6
muscle ache 4
common cold r_no 6
common cold 5
ibuprofen r_no 3
ibuprofen 6
stomach ache r_no 5
stomach ache 7
virus r_no 10
virus 8
vomiting r_no 3
vomiting 9


In [158]:
rnr_doc_f=[[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]
c2=0
for i in queries:
  r_no=0
  for j in doc_labelled:
    if i==j:
      rd=[]
      val=doc_labelled.get(j)
      l=len(val)
      c1=0
     
      for a in range(l):
        if val[c1][2]=='RELEVANT':
          r_no=r_no+1
          #print('c2',c2,'c1',c1)
          rnr_doc_f[c2].append(val[c1][0])
        c1=c1+1
      print(j,'r_no',r_no)
      print(j,c2) 
      if(r_no==0):
        print(j)
  c2=c2+1
   

paracetamol r_no 7
paracetamol 0
ibuprofen r_no 3
ibuprofen 1
paracetamol ibuprofen r_no 3
paracetamol ibuprofen 2
hepatitis r_no 2
hepatitis 3
virus r_no 10
virus 4
hepatitis virus r_no 6
hepatitis virus 5
fever r_no 4
fever 6
cold r_no 4
cold 7
cough r_no 4
cough 8
sore throat r_no 6
sore throat 9
influenza r_no 2
influenza 10
common cold r_no 6
common cold 11
headache r_no 13
headache 12
muscle ache r_no 6
muscle ache 13
back pain r_no 5
back pain 14
stomach ache r_no 5
stomach ache 15
nausea r_no 4
nausea 16
vomiting r_no 3
vomiting 17
diarrhea r_no 3
diarrhea 18
constipation r_no 3
constipation 19


In [159]:
queries

['paracetamol',
 'ibuprofen',
 'paracetamol ibuprofen',
 'hepatitis',
 'virus',
 'hepatitis virus',
 'fever',
 'cold',
 'cough',
 'sore throat',
 'influenza',
 'common cold',
 'headache',
 'muscle ache',
 'back pain',
 'stomach ache',
 'nausea',
 'vomiting',
 'diarrhea',
 'constipation']

In [141]:
rnr_doc_f

[[28, 83, 126, 214, 364, 372, 421],
 [139, 200, 216],
 [200, 216, 356],
 [16, 376],
 [59, 71, 275, 371, 372, 399, 400, 402, 403, 475],
 [16, 368, 376, 399, 401, 402],
 [28, 214, 251, 365],
 [82, 83, 126, 421],
 [83, 126, 372, 421],
 [71, 177, 198, 214, 477, 483],
 [59, 469],
 [28, 82, 83, 126, 372, 421],
 [28, 61, 62, 69, 71, 82, 83, 126, 214, 216, 365, 372, 421],
 [37, 65, 76, 117, 126, 343],
 [162, 235, 301, 356, 378],
 [200, 203, 372, 391, 442],
 [203, 368, 372, 442],
 [203, 372, 442],
 [203, 372, 442],
 [203, 251, 372]]

Sampled 100 documents

In [160]:
sampled_documents=[173, 124, 302, 67, 482, 241, 138, 431, 194, 412, 480, 163, 156, 386, 389, 64, 424, 434, 88, 422, 133, 131, 311, 276, 265, 324, 406, 178, 445, 341, 425, 454, 390, 40, 36, 30, 426, 46, 217, 3, 267, 437, 333, 358, 486, 74, 443, 282, 109, 371, 220, 115, 222, 197, 139, 233, 207, 84, 380, 280, 301, 94, 113, 73, 111, 444, 414, 174, 198, 387, 16, 277, 28, 421, 37, 41, 442, 59, 65, 71, 200, 203, 76, 205, 206, 82, 83, 469, 214, 343, 216, 100, 356, 365, 368, 372, 117, 376, 251, 126]

In [161]:
final_rnr=[[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]
c=-1

for q in queries:
  v=[]
  c=c+1
  for d in documents:
    v=rnr_doc_f[c]
    if d in v:
      final_rnr[c].append('R')
    else:
      final_rnr[c].append('N')
  #c=c+1




Calculating the no of retrieved relevant document in 10 runs for each querry

In [162]:
import random
len_retrieved_relevant_doc=[[],[],[],[],[],[],[],[],[],[]]
len_relevant_doc=[[],[],[],[],[],[],[],[],[],[]]
total_doc=len(paragraphs)
top_ten_doc=[]
e=[]
z=[]
k=1
j=-1
doc_id=[]
for k1 in range(10):
  top_ten_doc.append(e)
for id in documents:
  doc_id.append(id)
for k in range(10):
  z=random.sample(doc_id,10)
  top_ten_doc[k].append(z)
  z=[]
print(top_ten_doc)
c=0
c1=0
for t in range(10):
  t2=t*2
  t3=t2+79
  tt=top_ten_doc[t]
  t1=top_ten_doc+sampled_documents[t2:t3]
  for q in sampled_querries:
    count=0
    for d in t1:
      #print(c)
      if d in rnr_doc[c]:
        count=count+1
    len_retrieved_relevant_doc[t].append(count)
    c=c+1
  c=0
  tt=[]
  t1=[]
  
for q in sampled_querries:
  count1=0
  for d1 in range(total_doc):
    if d1 in rnr_doc[c1]:
      count1=count1+1
  len_relevant_doc[c1].append(count1)
  c1=c1+1



[[[320, 281, 238, 43, 395, 451, 97, 167, 213, 156], [11, 123, 365, 393, 304, 172, 280, 473, 63, 323], [267, 298, 423, 156, 485, 395, 401, 389, 70, 114], [352, 8, 163, 307, 484, 96, 229, 320, 72, 226], [473, 317, 31, 142, 33, 52, 401, 143, 59, 300], [307, 207, 484, 472, 356, 191, 5, 429, 312, 478], [404, 304, 26, 356, 27, 107, 331, 242, 118, 73], [420, 332, 364, 48, 350, 315, 105, 54, 51, 128], [468, 235, 239, 122, 375, 474, 93, 137, 280, 59], [68, 159, 189, 226, 375, 270, 348, 303, 406, 123]], [[320, 281, 238, 43, 395, 451, 97, 167, 213, 156], [11, 123, 365, 393, 304, 172, 280, 473, 63, 323], [267, 298, 423, 156, 485, 395, 401, 389, 70, 114], [352, 8, 163, 307, 484, 96, 229, 320, 72, 226], [473, 317, 31, 142, 33, 52, 401, 143, 59, 300], [307, 207, 484, 472, 356, 191, 5, 429, 312, 478], [404, 304, 26, 356, 27, 107, 331, 242, 118, 73], [420, 332, 364, 48, 350, 315, 105, 54, 51, 128], [468, 235, 239, 122, 375, 474, 93, 137, 280, 59], [68, 159, 189, 226, 375, 270, 348, 303, 406, 123]], [[3

In [102]:
import random
len_retrieved_relevant_doc2=[[],[],[],[],[],[],[],[],[],[]]
len_relevant_doc2=[[],[],[],[],[],[],[],[],[],[]]
total_doc=len(paragraphs)
top_ten_doc=[]
e=[]
z=[]
k=1
j=-1
doc_id=[]
for k1 in range(10):
  top_ten_doc.append(e)
for id in documents:
  doc_id.append(id)
for k in range(10):
  z=random.sample(doc_id,10)
  top_ten_doc[k].append(z)
  z=[]
print(top_ten_doc)
c=0
c1=0
for t in range(10):
  t2=t*2
  t3=t2+79
  tt=top_ten_doc[t]
  t1=top_ten_doc+sampled_documents[t2:t3]
  for q in sampled_querries2:
    count=0
    for d in t1:
      #print(c)
      if d in rnr_doc2[c]:
        count=count+1
    len_retrieved_relevant_doc2[t].append(count)
    c=c+1
  c=0
  tt=[]
  t1=[]
  
for q in sampled_querries2:
  count1=0
  for d1 in range(total_doc):
    if d1 in rnr_doc2[c1]:
      count1=count1+1
  len_relevant_doc2[c1].append(count1)
  c1=c1+1



[[[253, 177, 175, 23, 426, 85, 434, 423, 159, 161], [60, 272, 20, 284, 316, 353, 62, 48, 247, 403], [439, 275, 299, 472, 392, 296, 291, 479, 444, 406], [249, 373, 251, 233, 221, 339, 350, 336, 377, 276], [456, 364, 436, 47, 340, 313, 461, 459, 268, 361], [351, 121, 273, 22, 186, 16, 206, 289, 243, 8], [218, 483, 73, 172, 112, 458, 336, 418, 281, 230], [12, 458, 352, 145, 93, 173, 307, 88, 225, 14], [328, 230, 462, 3, 352, 326, 346, 89, 132, 310], [156, 260, 122, 107, 24, 180, 424, 468, 384, 237]], [[253, 177, 175, 23, 426, 85, 434, 423, 159, 161], [60, 272, 20, 284, 316, 353, 62, 48, 247, 403], [439, 275, 299, 472, 392, 296, 291, 479, 444, 406], [249, 373, 251, 233, 221, 339, 350, 336, 377, 276], [456, 364, 436, 47, 340, 313, 461, 459, 268, 361], [351, 121, 273, 22, 186, 16, 206, 289, 243, 8], [218, 483, 73, 172, 112, 458, 336, 418, 281, 230], [12, 458, 352, 145, 93, 173, 307, 88, 225, 14], [328, 230, 462, 3, 352, 326, 346, 89, 132, 310], [156, 260, 122, 107, 24, 180, 424, 468, 384, 23

In [103]:
len_retrieved_relevant_doc2

[[1, 2, 1, 1, 2, 2, 1, 1, 2, 1],
 [1, 2, 1, 1, 2, 2, 2, 2, 3, 1],
 [2, 2, 1, 1, 3, 2, 2, 3, 3, 2],
 [2, 2, 1, 1, 3, 2, 2, 3, 3, 2],
 [2, 3, 1, 1, 3, 4, 2, 3, 3, 2],
 [2, 4, 2, 1, 3, 4, 2, 3, 3, 2],
 [2, 4, 2, 1, 4, 4, 3, 3, 3, 2],
 [2, 4, 2, 1, 4, 4, 3, 3, 3, 2],
 [2, 4, 2, 2, 4, 4, 3, 3, 3, 2],
 [3, 5, 2, 2, 5, 5, 3, 4, 4, 3]]

In [104]:
len_relevant_doc2

[[3], [7], [2], [6], [6], [6], [3], [5], [10], [3]]

Calculating the Precision and recall for 10 querries for 10 runs

In [105]:
total_doc=len(paragraphs)
#total_retrieved_doc=len(sampled_documents)
total_retrieved_doc=10
Precision2=[[],[],[],[],[],[],[],[],[],[]]
Recall2=[[],[],[],[],[],[],[],[],[],[]]
top_ten_doc=[]
for i in range(10):
  top_ten_doc.append(i)
c=0
for k in range(10):

  for q in sampled_querries:
    tp=int(len_retrieved_relevant_doc[k][c])
    total_relevant_doc2=int(len_relevant_doc[c][0])
    p=tp/10
    r=tp/total_relevant_doc2
    Precision2[k].append(p)
    Recall2[k].append(r)
    p=0
    r=0
    c=c+1
  c=0

In [164]:
total_doc=len(paragraphs)
#total_retrieved_doc=len(sampled_documents)
total_retrieved_doc2=10
Precision=[[],[],[],[],[],[],[],[],[],[]]
Recall=[[],[],[],[],[],[],[],[],[],[]]
top_ten_doc=[]
for i in range(10):
  top_ten_doc.append(i)
c=0
for k in range(10):

  for q in sampled_querries:
    tp=int(len_retrieved_relevant_doc[k][c])
    total_relevant_doc=int(len_relevant_doc[c][0])
    p=tp/10
    r=tp/total_relevant_doc
    Precision[k].append(p)
    Recall[k].append(r)
    p=0
    r=0
    c=c+1
  c=0


In [165]:
Precision

[[0.1, 0.1, 0.1, 0.1, 0.0, 0.2, 0.1, 0.2, 0.2, 0.1],
 [0.1, 0.2, 0.1, 0.1, 0.1, 0.3, 0.1, 0.2, 0.2, 0.2],
 [0.1, 0.2, 0.1, 0.1, 0.1, 0.3, 0.2, 0.3, 0.2, 0.3],
 [0.1, 0.2, 0.1, 0.1, 0.1, 0.3, 0.2, 0.3, 0.2, 0.3],
 [0.1, 0.2, 0.1, 0.1, 0.1, 0.3, 0.2, 0.3, 0.4, 0.3],
 [0.1, 0.3, 0.2, 0.1, 0.1, 0.3, 0.2, 0.3, 0.4, 0.3],
 [0.1, 0.3, 0.2, 0.1, 0.2, 0.3, 0.2, 0.4, 0.4, 0.3],
 [0.1, 0.3, 0.2, 0.1, 0.3, 0.3, 0.2, 0.4, 0.4, 0.3],
 [0.2, 0.3, 0.3, 0.1, 0.3, 0.3, 0.2, 0.4, 0.4, 0.3],
 [0.2, 0.3, 0.3, 0.1, 0.3, 0.4, 0.3, 0.5, 0.5, 0.4]]

In [166]:
Recall

[[0.16666666666666666,
  0.16666666666666666,
  0.25,
  0.5,
  0.0,
  0.2,
  0.3333333333333333,
  0.3333333333333333,
  0.3333333333333333,
  0.2],
 [0.16666666666666666,
  0.3333333333333333,
  0.25,
  0.5,
  0.3333333333333333,
  0.3,
  0.3333333333333333,
  0.3333333333333333,
  0.3333333333333333,
  0.4],
 [0.16666666666666666,
  0.3333333333333333,
  0.25,
  0.5,
  0.3333333333333333,
  0.3,
  0.6666666666666666,
  0.5,
  0.3333333333333333,
  0.6],
 [0.16666666666666666,
  0.3333333333333333,
  0.25,
  0.5,
  0.3333333333333333,
  0.3,
  0.6666666666666666,
  0.5,
  0.3333333333333333,
  0.6],
 [0.16666666666666666,
  0.3333333333333333,
  0.25,
  0.5,
  0.3333333333333333,
  0.3,
  0.6666666666666666,
  0.5,
  0.6666666666666666,
  0.6],
 [0.16666666666666666,
  0.5,
  0.5,
  0.5,
  0.3333333333333333,
  0.3,
  0.6666666666666666,
  0.5,
  0.6666666666666666,
  0.6],
 [0.16666666666666666,
  0.5,
  0.5,
  0.5,
  0.6666666666666666,
  0.3,
  0.6666666666666666,
  0.6666666666666

Calculating Average Precision and Average Recall for 10 runs,

In [167]:
average_precision2=[]
average_recall2=[]
for q in range(10):
  p=0
  r=0
  for p2 in range (10):
    p1=Precision2[p2][q]
    p=p+p1
    r1=Recall2[p2][q]
    r=r+r1
  avgp=p/10
  avgr=r/10
  average_precision2.append(avgp)
  average_recall2.append(avgr)



In [147]:
average_precision=[]
average_recall=[]
for q in range(10):
  p=0
  r=0
  for p2 in range (10):
    p1=Precision[p2][q]
    p=p+p1
    r1=Recall[p2][q]
    r=r+r1
  avgp=p/10
  avgr=r/10
  average_precision.append(avgp)
  average_recall.append(avgr)

Calculating MAP

In [168]:
l=len(Precision)
c=0
count=0
map=0
for p1 in average_precision:
  c=c+average_precision[count]
  count=count+1
MAP=c/l


In [172]:
l=len(Precision2)
c22=0
count=0
map=0
for p1 in average_precision2:
  c22=c+average_precision2[count]
  count=count+1
MAP2=c22/l

In [174]:
MAP


0.24799999999999994

RESULTS

In [151]:
c=0
for q in sampled_querries:
  print('Query:',c,'Average Precision',average_precision[c],'Average Recall',average_recall[c])
  c=c+1
print('MAP',MAP)

Query: 0 Average Precision 0.32999999999999996 Average Recall 0.5499999999999999
Query: 1 Average Precision 0.29999999999999993 Average Recall 0.29999999999999993
Query: 2 Average Precision 0.32999999999999996 Average Recall 0.5499999999999999
Query: 3 Average Precision 0.16999999999999998 Average Recall 0.425
Query: 4 Average Precision 0.12999999999999998 Average Recall 0.25999999999999995
Query: 5 Average Precision 0.19 Average Recall 0.6333333333333333
Query: 6 Average Precision 0.53 Average Recall 0.4076923076923077
Query: 7 Average Precision 0.09999999999999999 Average Recall 0.5
Query: 8 Average Precision 0.24 Average Recall 0.4
Query: 9 Average Precision 0.16 Average Recall 0.5333333333333333
MAP 0.24799999999999994


**PART 1 C**

In [115]:
%%capture

!pip install elasticsearch==7.14.0
!apt install default-jdk > /dev/null

In [116]:
try:
  import os
  import elasticsearch
  from elasticsearch import Elasticsearch
  import numpy as np
  import pandas as pd
  import sys
  import json
  from ast import literal_eval
  from tqdm import tqdm 
  import datetime
  from elasticsearch import helpers
  import time
  
except Exception as e:
  print(f"error: {e}")

In [117]:
# Download & extract Elasticsearch 7.0.0

!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.0.0-linux-x86_64.tar.gz -q
!tar -xzf elasticsearch-7.0.0-linux-x86_64.tar.gz
!chown -R daemon:daemon elasticsearch-7.0.0

In [118]:
# Creating daemon instance of elasticsearch
import os
from subprocess import Popen, PIPE, STDOUT
es_server = Popen(['elasticsearch-7.0.0/bin/elasticsearch'], 
                  stdout=PIPE, stderr=STDOUT,
                  preexec_fn=lambda: os.setuid(1)  # as daemon
                 )

In [119]:
# This part is important, since it takes a little amount of time for instance to load

time.sleep(10)

In [120]:
%%bash
# If you get 1 root & 2 daemon process then Elasticsearch instance has started successfully
ps -ef | grep elasticsearch

daemon       950     130  3 14:27 ?        00:00:40 /content/elasticsearch-7.0.0/jdk/bin/java -Xms1g -Xmx1g -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=75 -XX:+UseCMSInitiatingOccupancyOnly -Des.networkaddress.cache.ttl=60 -Des.networkaddress.cache.negative.ttl=10 -XX:+AlwaysPreTouch -Xss1m -Djava.awt.headless=true -Dfile.encoding=UTF-8 -Djna.nosys=true -XX:-OmitStackTraceInFastThrow -Dio.netty.noUnsafe=true -Dio.netty.noKeySetOptimization=true -Dio.netty.recycler.maxCapacityPerThread=0 -Dlog4j.shutdownHookEnabled=false -Dlog4j2.disable.jmx=true -Djava.io.tmpdir=/tmp/elasticsearch-15688500636260520282 -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=data -XX:ErrorFile=logs/hs_err_pid%p.log -Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,pid,tags:filecount=32,filesize=64m -Djava.locale.providers=COMPAT -Dio.netty.allocator.type=unpooled -Des.path.home=/content/elasticsearch-7.0.0 -Des.path.conf=/content/elasticsearch-7.0.0/config -Des.distribution.flavor=default

In [121]:
es = Elasticsearch(hosts = [{"host":"localhost", "port":9200}], timeout = 180)
# Check if python is connected to elasticsearch
es.ping()

True

In [122]:
Settings = {
    "settings":{
        "number_of_shards":1,
        "number_of_replicas":0
    },"mappings":{
        "properties":{
            "medical_doc":{
                "type":"text"
            }
        }
    }
    
}

In [123]:
sampled_documents=[173, 124, 302, 67, 482, 241, 138, 431, 194, 412, 480, 163, 156, 386, 389, 64, 424, 434, 88, 422, 133, 131, 311, 276, 265, 324, 406, 178, 445, 341, 425, 454, 390, 40, 36, 30, 426, 46, 217, 3, 267, 437, 333, 358, 486, 74, 443, 282, 109, 371, 220, 115, 222, 197, 139, 233, 207, 84, 380, 280, 301, 94, 113, 73, 111, 444, 414, 174, 198, 387, 16, 277, 28, 421, 37, 41, 442, 59, 65, 71, 200, 203, 76, 205, 206, 82, 83, 469, 214, 343, 216, 100, 356, 365, 368, 372, 117, 376, 251, 126]

In [124]:
def json_formatter(documents, index_name, index_type='_doc'):
  try:
    List=[]
    c=0
    for key,val in documents.items():
      #if key in sampled_documents:
      #print('key',key)
      dic={}
      dic['_index'] = index_name
      dic['_type'] = index_type
      dic['_id']=c
      source = {}
      source['medical_doc']=val
      dic['_source']=source
      #print(dic)
      List.append(dic)
      c=c+1
    print(List)
    return List
  except Exception as e:
      print("There is a problem: {}".format(e))

In [125]:
MY_INDEX_1 = es.indices.create(index="medical_02",ignore=[400,404], body=Settings)

In [126]:
json_Formatted_dataset = json_formatter(documents=documents, index_name='medical_02', index_type='_doc')

[{'_index': 'medical_02', '_type': '_doc', '_id': 0, '_source': {'medical_doc': '\ufeff.I 1correlation between maternal and fetal plasma levels of glucose and freefatty acids .correlation coefficients have been determined between the levels ofglucose and ffa in maternal and fetal plasma collected at delivery .significant correlations were obtained between the maternal and fetalglucose levels and the maternal and fetal ffa levels . from the size ofthe correlation coefficients and the slopes of regression lines itappears that the fetal plasma glucose level at delivery is very stronglydependent upon the maternal level whereas the fetal ffa level atdelivery is only slightly dependent upon the maternal level .'}}, {'_index': 'medical_02', '_type': '_doc', '_id': 1, '_source': {'medical_doc': 'changes of the nucleic acid and phospholipid levels of the livers in thecourse of fetal and postnatal development .we have followed the evolution of dna, rna and pl in the livers of ratfoeti removed be

In [127]:
# For importing Data to elasticsearch we use elasticsearch's bulk API from elasticsearch.helpers
from elasticsearch import helpers
data=json_Formatted_dataset[1]
print(data)
try:
  res = helpers.bulk(es,json_Formatted_dataset)

  
except Exception as e:
    print(f"error: {e}")

{'_index': 'medical_02', '_type': '_doc', '_id': 1, '_source': {'medical_doc': 'changes of the nucleic acid and phospholipid levels of the livers in thecourse of fetal and postnatal development .we have followed the evolution of dna, rna and pl in the livers of ratfoeti removed between the fifteenth and the twenty-first day ofgestation and of young rats newly-born or at weaning . we can observethe followingfacts.. 1. dna concentration is 1100 ug p on the 15th day, it decreases fromthe 19th day until it reaches a value of 280 ug 5 days after weaning .2. rna concentration is 1400 ug p on the 15th day and decreases to 820during the same period .3. pl concentration is low on the 15th day and during foetal life (700ug) and increases abruptly at birth .4. the ratios rna cyto/dna and pl cyto/dna increase regularly from the18th day of foetal life .5. nuclear rna and pl contents are very high throughout thedevelopment .6. these results enable us to characterize three stages in thedevelopment of



In [128]:
# Get 10 sample of data
query = es.search(
    index="medical_02",
    body={
      "size":10,
      "query": {
        "match_all":{}
      }
    }
)
output = pd.json_normalize((query['hits']['hits']))
output

Unnamed: 0,_index,_type,_id,_score,_source.medical_doc
0,medical_02,_doc,0,1.0,﻿.I 1correlation between maternal and fetal pl...
1,medical_02,_doc,1,1.0,changes of the nucleic acid and phospholipid l...
2,medical_02,_doc,2,1.0,surfactant in fetal lamb tracheal fluid .lambs...
3,medical_02,_doc,3,1.0,placental and cord blood lipids.. comparison i...
4,medical_02,_doc,4,1.0,free fatty acid concentration in maternal plas...
5,medical_02,_doc,5,1.0,the concentration of non-esterified fatty acid...
6,medical_02,_doc,6,1.0,lipid metabolism in toxemia and normal pregnan...
7,medical_02,_doc,7,1.0,essential fatty acids and acids with trans-con...
8,medical_02,_doc,8,1.0,acetoacetate formation by livers from human fe...
9,medical_02,_doc,9,1.0,changes in blood glucose and non-esterified fa...


In [176]:
sampled_querries

['hepatitis virus',
 'sore throat',
 'fever',
 'hepatitis',
 'paracetamol ibuprofen',
 'virus',
 'vomiting',
 'muscle ache',
 'common cold',
 'stomach ache']

In [257]:
l=[]
c=-1
for q in sampled_querries:
  c=c+1
  query = es.search(
      index="medical_02",
      body={
          "size":20,
          "query":{
              "bool":{
                  "must":[
                          {"match":{"medical_doc":q}}
                  ]
              }
          }
      }
  )
  l.append(query)
  print('querry',q,l[c],'---------------------------------------------------')
retrieved=[]
o2=[[],[],[],[],[],[],[],[],[],[],[]]
for co in range(10):
  output={}
  o1=[]
  
  l1=0
  output=l[co]
  o1=output['hits']['hits']
  l1=len(o1)
  if l1!=0:
    
    for i in range (l1):
      c=output['hits']['hits'][i]['_id']
      #print(c,'co',co)
      list1.append(c)
    #print(list1,co)
    o2[co]=list1
  #print('Printing id----','q',q,o2)
  elif l1==0:
    #print('l1=0',l1,co)
    o2[co]=0
  list1=[]
  #print('l1',l1)
  retrieved.append(l1)
  #print(o1)
  #print('r',retrieved)
  #print('----------',co,output['hits']['hits'])
  #print('------o1----',co,len(o1))
print(retrieved)
print(o2)

querry hepatitis virus {'took': 8, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 19, 'relation': 'eq'}, 'max_score': 9.589279, 'hits': [{'_index': 'medical_02', '_type': '_doc', '_id': '15', '_score': 9.589279, '_source': {'medical_doc': "treatment of active chronic hepatitis and lupoid hepatitis with6-mercaptopurine and azothioprine .6-mercaptopurine or azothioprine ('imuran') was used successfully in 3patients with active chronic hepatitis and 2 with lupoid hepatitis, forperiods up to 1 year . these drugs allowed modification and evenabolition of discomforting corticosteroid regimes . their action inchronic hepatitis may be analogous to their anti-immune action insuppressing homograft rejection ."}}, {'_index': 'medical_02', '_type': '_doc', '_id': '375', '_score': 8.794051, '_source': {'medical_doc': '3645. autoimmune hepatitisin 30% of 301 patients with cirrhosis, the cause was uncertain.  of these 90cases the c

In [305]:
import random
len_retrieved_relevant_doc3=[[],[],[],[],[],[],[],[],[],[]]
len_relevant_doc3=[[],[],[],[],[],[],[],[],[],[]]
total_doc=len(paragraphs)
top_ten_doc=[]
e=[]
z=[]
k=1
j=-1
doc_id=[]
dd=[[],[],[],[],[],[],[],[],[],[]]
for k1 in range(10):
  top_ten_doc.append(e)
for id in documents:
  doc_id.append(id)
for k in range(10):
  z=random.sample(doc_id,10)
  top_ten_doc[k].append(z)
  z=[]
print(top_ten_doc)
c=0
c1=0
y=[]

for t1 in range (10):
  y=o2[t1]
  print(y)
  if y:
    for o in y:
      x=int(o)+1
      dd[t1].append(x)
      x=0
    else:dd[t1].append(0)

print(len(dd),'dd',dd)
print('o2',o2)
j=[]
for t in range(10):
  dd1=0
  o=o2[t]
  j=dd[t]
  print('o',o,'dd',j,'rnr',rnr_doc[t])
  if j:
    for d in j :
      if d in rnr_doc[t]:
        count=count+1
        print('inside',count)
      elif d not in rnr_doc[t]:
        count=count+0
    print('final_count',count)
  len_retrieved_relevant_doc3[t].append(count)
  c=c+1
  c=0
  dd1=dd1+1
  count=0
  
for q in sampled_querries2:
  count1=0
  for d1 in range(total_doc):
    if d1 in rnr_doc[c1]:
      count1=count1+1
  len_relevant_doc3[c1].append(count1)
  c1=c1+1


[[[413, 145, 115, 126, 295, 117, 137, 394, 411, 280], [398, 287, 251, 145, 471, 132, 221, 50, 352, 107], [405, 358, 60, 314, 72, 439, 244, 202, 213, 463], [454, 325, 10, 334, 359, 192, 331, 260, 158, 383], [321, 39, 88, 165, 29, 275, 38, 123, 362, 69], [397, 25, 8, 83, 483, 488, 456, 334, 74, 169], [68, 330, 81, 159, 473, 486, 244, 477, 438, 97], [237, 308, 416, 379, 280, 46, 45, 148, 88, 452], [32, 38, 22, 316, 261, 315, 136, 486, 154, 126], [128, 279, 454, 478, 70, 333, 299, 451, 121, 46]], [[413, 145, 115, 126, 295, 117, 137, 394, 411, 280], [398, 287, 251, 145, 471, 132, 221, 50, 352, 107], [405, 358, 60, 314, 72, 439, 244, 202, 213, 463], [454, 325, 10, 334, 359, 192, 331, 260, 158, 383], [321, 39, 88, 165, 29, 275, 38, 123, 362, 69], [397, 25, 8, 83, 483, 488, 456, 334, 74, 169], [68, 330, 81, 159, 473, 486, 244, 477, 438, 97], [237, 308, 416, 379, 280, 46, 45, 148, 88, 452], [32, 38, 22, 316, 261, 315, 136, 486, 154, 126], [128, 279, 454, 478, 70, 333, 299, 451, 121, 46]], [[413

In [306]:
len_retrieved_relevant_doc3

[[3], [0], [2], [2], [0], [6], [2], [6], [5], [1]]

In [314]:
len_retrieved_relevant_doc


[[1, 1, 1, 1, 0, 2, 1, 2, 2, 1],
 [1, 2, 1, 1, 1, 3, 1, 2, 2, 2],
 [1, 2, 1, 1, 1, 3, 2, 3, 2, 3],
 [1, 2, 1, 1, 1, 3, 2, 3, 2, 3],
 [1, 2, 1, 1, 1, 3, 2, 3, 4, 3],
 [1, 3, 2, 1, 1, 3, 2, 3, 4, 3],
 [1, 3, 2, 1, 2, 3, 2, 4, 4, 3],
 [1, 3, 2, 1, 3, 3, 2, 4, 4, 3],
 [2, 3, 3, 1, 3, 3, 2, 4, 4, 3],
 [2, 3, 3, 1, 3, 4, 3, 5, 5, 4]]

In [316]:
retrieved

[19, 0, 2, 3, 0, 16, 2, 9, 20, 1]

In [319]:
total_doc=len(paragraphs)
total_retrieved_doc2=[]
#total_retrieved_doc=len(sampled_documents)
for i in range(10):
  total_retrieved_doc2.append(retrieved[i])


Precision3=[[],[],[],[],[],[],[],[],[],[]]
Recall3=[[],[],[],[],[],[],[],[],[],[]]
top_ten_doc=[]
for i in range(10):
  top_ten_doc.append(i)
c=0
for k in range(10):

  for q in sampled_querries:
    tp=int(len_retrieved_relevant_doc3[k][0])
    total_relevant_doc=int(len_relevant_doc3[c][0])
    if(retrieved[k]):
      p=tp/retrieved[k]
    elif(retrieved[k]==0):p=0
    r=tp/total_relevant_doc
    Precision3[k].append(p)
    Recall3[k].append(r)
    p=0
    r=0
    c=c+1
  c=0

In [320]:
average_precision3=[]
average_recall3=[]
for q in range(10):
  p=0
  r=0
  for p2 in range (10):
    p1=Precision3[p2][q]
    p=p+p1
    r1=Recall3[p2][q]
    r=r+r1
  avgp=p/10
  avgr=r/10
  average_precision3.append(avgp)
  average_recall3.append(avgr)

In [321]:
l=len(Precision3)
c=0
count=0
map=0
for p1 in average_precision3:
  c=c+average_precision3[count]
  count=count+1
MAP3=c/l


In [322]:
c=0
for q in sampled_querries:
  print('Querries',q,'Average Precision',average_precision3[c],'Average Recall',average_recall3[c])
  c=c+1
print('----------------Elastic Search Results-----------')
print('MAP',MAP3)

Querries hepatitis virus Average Precision 0.5116228070175438 Average Recall 0.45
Querries sore throat Average Precision 0.5116228070175438 Average Recall 0.45
Querries fever Average Precision 0.5116228070175438 Average Recall 0.675
Querries hepatitis Average Precision 0.5116228070175438 Average Recall 1.35
Querries paracetamol ibuprofen Average Precision 0.5116228070175438 Average Recall 0.9
Querries virus Average Precision 0.5116228070175438 Average Recall 0.26999999999999996
Querries vomiting Average Precision 0.5116228070175438 Average Recall 0.9
Querries muscle ache Average Precision 0.5116228070175438 Average Recall 0.45
Querries common cold Average Precision 0.5116228070175438 Average Recall 0.45
Querries stomach ache Average Precision 0.5116228070175438 Average Recall 0.5399999999999999
----------------Elastic Search Results-----------
MAP 0.5116228070175437
