# Welcome to COVID-19 Information Retrieval Engine 
                                                                                
### Project is part of Text analysis and Retrieval, FER 

## 1.) Preprocessing of the dataset 

#### Note: This part can be replaced by different method of parsing, just be sure to finish with a same data structure as we have

In [1]:
#lets import our own Parser class that will parse dataset based on the source.
from Parser import *

parser = Parser([Dataset.FINAL])
parser.parse(indexByFile = False)
papers = parser.data_dicts

### Now we are going to combine all papers in one dictonary... We do this because we do want to test on all papers, but just by tweaking this little bit, you can determine which dataset you want.

In [4]:
all_papers = {}

current_id = 0
for dataset in papers: #<--- change this in order to change which dataset you want in dictonary
    for paper_id in papers[dataset]:
        all_papers[current_id] = papers[dataset][paper_id]
        current_id += 1

# 1.b) Importing queries

In [28]:
from TaskQuery import *

#We are importing all the queries here.
queries = TaskQuery.questions()
print(queries)

['What is known about Covid-19 transmission?', 'What is known about Covid-19 incubation?', 'What is known about Covid-19 environmental stability?']


## IMPORTANT: Here enter the Query for which you want the end result!

In [29]:
QUERY = "What is know about incubation?".lower()
print(QUERY)

what is know about incubation?


# 2.) BioNER filtering

### Here we are going to filter the dataset. We are going to take a query and extract keywords from it. Then, we will extract keywords from each paper and compare query keywords from paper keywords. Those papers that do not match with query keywords will be removed.

In [30]:
import scispacy
import spacy
import re

#download this https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_sm-0.2.4.tar.gz and put
#in the data folder
bioNER = spacy.load("./data/en_core_sci_sm")
covid_keywords = ["covid19","sars-cov-2","covid-19","2019-ncov","ncov-2019"]

In [31]:
#Getting keywords for Query
query_NER = bioNER(QUERY)
print(query_NER.ents)

(incubation,)


In [32]:
def checkKeyword(word):
    for keyword in covid_keywords:
        if keyword in word:
            return True
    return False

In [33]:
#now we are going through all the documents and parsing them to get the important BIO terms like - coronavirus etc.
#then we are going to count the number of times some word from query has appered in doc, and divide that counter with 
#number of words in document. That will give us a probability score that we can use.
from nltk.stem import WordNetLemmatizer 

lemmatizer = WordNetLemmatizer()

filtered_results = {}
alpha = 1 #parameter by which we are determining how much are we valuing COVID19 terms in body
beta = 3 #parametar by which we are determining how much are we valuing search terms in title
gamma = 2 #paramtar by which we are determining how much are we valuing search terms in abstract

NER_scores = []
if(len(set(query_NER.ents)) != 0):
    for i in range(len(all_papers)):
        isRelevant = False
        paper = all_papers[i]
        document = bioNER(paper.body)
        title = bioNER(paper.title)
        abstract = bioNER(paper.abstract)
        
        query_counter = 0
        for query in set(query_NER.ents):
            counter = 0
            for keyword in set(document.ents):
                lemma_keyword = lemmatizer.lemmatize(keyword.text.lower())
                if(lemma_keyword == lemmatizer.lemmatize(query.text.lower())):
                    counter += 1
                    counter *= beta
                if(lemma_keyword in covid_keywords or lemma_keyword.endswith("ncov") or checkKeyword(lemma_keyword)):
                    counter += 1
                if(lemma_keyword in covid_keywords):
                    isRelevant = True
            counter = counter/len(set(document.ents))
            query_counter += counter
            
        
        title_counter = 0
        for query in set(query_NER.ents):
            counter = 0
            for keyword in set(title.ents):
                lemma_keyword = lemmatizer.lemmatize(keyword.text.lower())
                if(lemma_keyword == lemmatizer.lemmatize(query.text.lower())):
                    counter += 1 
                    counter *= beta
                if(lemma_keyword in covid_keywords or lemma_keyword.endswith("ncov") or checkKeyword(lemma_keyword)):
                    counter += 1
                if(lemma_keyword in covid_keywords):
                    isRelevant = True
            if(len(set(title.ents)) != 0):
                counter = counter/len(set(title.ents))
            else:
                counter = 0
            title_counter += counter
        
            
        abstract_counter = 0
        for query in set(query_NER.ents):
            counter = 0
            for keyword in set(abstract.ents):
                lemma_keyword = lemmatizer.lemmatize(keyword.text.lower())
                if(lemma_keyword == lemmatizer.lemmatize(query.text.lower())):
                    counter += 1
                    counter *= beta
                if(lemma_keyword in covid_keywords or lemma_keyword.endswith("ncov") or checkKeyword(lemma_keyword)):
                    counter += 1
                if(lemma_keyword in covid_keywords):
                    isRelevant = True
            if(len(set(abstract.ents)) != 0):
                counter = counter/len(set(abstract.ents))
            else:
                counter = 0
            abstract_counter += counter
        
        
        if isRelevant:
            query_counter = (query_counter+title_counter+abstract_counter)/len(set(query.ents))
        else:
            query_counter = 0
        NER_scores.append((query_counter,i))
    NER_scores.sort(reverse = True)
    print(NER_scores)

[(6687379313292.725, 83), (1863529.072057915, 14), (1108014.2834224598, 45), (11.060687616087336, 73), (2.0279273739858086, 44), (1.94998713660921, 88), (0.45014578777465375, 48), (0.2715675417453803, 71), (0.21544715447154472, 57), (0.1561108017817372, 86), (0.15050232786081844, 51), (0.13704141138259393, 20), (0.10344827586206896, 1), (0.10074136998996061, 25), (0.05200320512820513, 0), (0.0270301357257879, 29), (0.020833333333333332, 68), (0.0017543859649122807, 31), (0, 97), (0, 96), (0, 95), (0, 94), (0, 93), (0, 92), (0, 91), (0, 90), (0, 89), (0, 87), (0, 85), (0, 84), (0, 82), (0, 81), (0, 80), (0, 79), (0, 78), (0, 77), (0, 76), (0, 75), (0, 74), (0, 72), (0, 70), (0, 69), (0, 67), (0, 66), (0, 65), (0, 64), (0, 63), (0, 62), (0, 61), (0, 60), (0, 59), (0, 58), (0, 56), (0, 55), (0, 54), (0, 53), (0, 52), (0, 50), (0, 49), (0, 47), (0, 46), (0, 43), (0, 42), (0, 41), (0, 40), (0, 39), (0, 38), (0, 37), (0, 36), (0, 35), (0, 34), (0, 33), (0, 32), (0, 30), (0, 28), (0, 27), (0,

In [34]:
#Now we have filtered and removed all the papers that have score zero.
for score in NER_scores:
    #if(score[0] != 0.0):
    filtered_results[all_papers[score[1]]] = score

# 2.) Word2vec

In [12]:
#User manual
#----------------------------------
#Install --> pip3 install gensim (apart from gensim, you will need numpy)
#Download word2vec file -->  https://code.google.com/archive/p/word2vec/
import gensim.models.keyedvectors as word2vec
import numpy as np

unable to import 'smart_open.gcs', disabling that module


In [13]:
#Here we initialize word2vec with already pretrained vectors
word2vec = word2vec.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)

In [35]:
import nltk
query = nltk.word_tokenize(QUERY)
print(query)

['what', 'is', 'know', 'about', 'incubation', '?']


In [36]:
keywords_that_are_not_in_word2vec = ["covid19","sars-cov-2","covid-19"]

query_vector = np.zeros(300)
for word in query:
    if(word in word2vec.vocab):
        query_vector += word2vec[word]
    if(word.lower() in keywords_that_are_not_in_word2vec):
        query_vector += word2vec["coronavirus"]

In [37]:
from numpy.linalg import norm

W2V_scores = []
for paper in filtered_results:
    paper_vector = np.zeros(300)
    title_vector = np.zeros(300)
    abstract_vector = np.zeros(300)
    tokens = bioNER(paper.body)
    title_tokens = bioNER(paper.title)
    abstract_tokens = bioNER(paper.abstract)
    
    for token in set(tokens.ents):
        keyword = lemmatizer.lemmatize(token.text)
        if(keyword in word2vec.vocab):
            if(keyword in query):
                paper_vector += word2vec[keyword]*alpha
            else:
                paper_vector += word2vec[keyword]
        
    for token in set(title_tokens.ents):
        keyword = lemmatizer.lemmatize(token.text)
        if(keyword in word2vec.vocab):
            if(keyword in query):
                title_vector += word2vec[keyword]*beta
            else:
                title_vector += word2vec[keyword]
    
    for token in set(abstract_tokens.ents):
        keyword = lemmatizer.lemmatize(token.text)
        if(keyword in word2vec.vocab):
            if(keyword in query):
                abstract_vector += word2vec[keyword]*gamma
            else:
                abstract_vector += word2vec[keyword]
        
    cos_sim_paper = np.inner(query_vector,paper_vector)/(norm(query_vector)*norm(paper_vector))
    cos_sim_title = np.inner(query_vector,title_vector)/(norm(query_vector)*norm(title_vector))
    cos_sim_abstract = np.inner(query_vector,abstract_vector)/(norm(query_vector)*norm(abstract_vector))
    
    cos_sim = cos_sim_paper*alpha + cos_sim_title*beta + cos_sim_abstract*gamma
    
    score = filtered_results[paper]
    w2v_score = (score[0]*cos_sim,score[1])
    
    filtered_results[paper] = w2v_score
    W2V_scores.append(w2v_score)
W2V_scores.sort(reverse = True)
print(W2V_scores)



[(17235615131948.322, 83), (4098257.9436287903, 14), (2697706.5716558257, 45), (12.456554692937472, 73), (4.837248143671472, 44), (3.689209495013166, 88), (0.568308938667792, 48), (0.5238970989793873, 71), (nan, 57), (0.27934590675398613, 51), (0.23722233040748167, 86), (0.2038984327545309, 20), (0.18339533804509744, 25), (0.14877129283825052, 1), (0.06396074320644037, 0), (0.05615135305794936, 29), (0.018608102103565952, 68), (0.0024921308580466038, 31), (nan, 97), (0.0, 96), (0.0, 95), (nan, 94), (0.0, 93), (0.0, 92), (0.0, 91), (0.0, 90), (0.0, 89), (0.0, 87), (0.0, 85), (0.0, 84), (0.0, 82), (0.0, 81), (0.0, 80), (0.0, 79), (0.0, 78), (0.0, 77), (0.0, 76), (0.0, 75), (0.0, 74), (0.0, 72), (nan, 70), (0.0, 69), (0.0, 67), (0.0, 66), (0.0, 65), (0.0, 64), (0.0, 63), (0.0, 62), (0.0, 61), (0.0, 60), (0.0, 59), (0.0, 58), (nan, 56), (0.0, 55), (nan, 54), (0.0, 53), (0.0, 52), (0.0, 50), (0.0, 49), (0.0, 47), (0.0, 46), (0.0, 43), (0.0, 42), (0.0, 41), (0.0, 40), (0.0, 39), (0.0, 38), (

In [41]:
print(all_papers[73])

[1mTitle[0m
Association of Population migration and Coronavirus Disease 2019 epidemic control

[1mAbstract[0m
To analyze the impact of different patterns of migration flow in two cities, Hefei and Shenzhen, on the epidemic and disease control of Coronavirus Disease 2019 , in order to provide insight for making differentiated controlling policies.Hefei and Shenzhen between January 19 and February 11, 2020, from data officially published by the provincial and municipal Centers for Disease Control and Prevention (CDC). From these data we calculated basic reproduction number R0 to reflect the rate of spread of COVID-19 in these cities.Aggregated data of population migration during the same period was extracted from Baidu Migration.The change of R0 in the two cites were analyzed and compared. Spearman correlation analysis between R0 and population inflow from epidemic focus were performed.

[1mBody[0m
In December 2019, a cluster of patients with pneumonia of unknown etiology was repor

In [42]:
for result in W2V_scores:
    print(str(all_papers[result[1]].id) +" : "+str(result[0]))

1f80f32ab88715153f9690b096115e53f2fd6121 : 17235615131948.322
af266fac8970a7960e96630a67d91bec5dda0335 : 4098257.9436287903
cbc05d14c57b91081970a232ab83bc993f998fe2 : 2697706.5716558257
a09c1ad0f05bbcc256cd667ad7411e6690134370 : 12.456554692937472
3c0e6714183f9935f79f8b2e2a708833b52f00e0 : 4.837248143671472
f3ff1ecae96700f41b83d2a034a3a959428388b0 : 3.689209495013166
6115e9fbaed5e154ff92cb8d02387fec72251dab : 0.568308938667792
f7b069911c90d9deab85becbad3e9a633e0bf57c : 0.5238970989793873
a33f451d8e2caff57c1133f14198454a64993c47 : nan
3b181a741d4beafda3ba3a8fc68239493d49f6aa : 0.27934590675398613
adb09e4e5c7331b2aa661b3d3bb0a643e00d11bc : 0.23722233040748167
67a2ae408efcb6686a22ae27435c7e4fa10c4a21 : 0.2038984327545309
8b2e50eb3ea84225580fbdbccccb4fcd3f062feb : 0.18339533804509744
dac1b1607ae72b9509ab26367e0d55016e8132a8 : 0.14877129283825052
0aff00101d5ccc6592987185ab833f95d842f98b : 0.06396074320644037
c8df44a3612e85e267351e936ddeb8fc5867afa1 : 0.05615135305794936
f3cbd842225279a42440

# 3.) Doc2vec 

In [85]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [86]:
tagged_data = []
for paper in filtered_results:
    score = filtered_results[paper]
    
    tokens = [token.text for token in bioNER(paper.title.lower()).ents]
    doc = TaggedDocument(words = tokens, tags = [str(score[1])])
    tagged_data.append(doc)

#### This is the training part. Here we are making our own word embeddings. That means we are basically going to make our own word2vec. In other words, for every word from our dataset our model will make a vector in 20 dimensional space. Furthermore, every vectors will be similar if the words they are representing are similar. E.g. vectors for word coronavirus and covid19 will be similar.

In [87]:
max_epochs = 1000
vec_size = 20 #word2vec has 300, but I left 100 here
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)

model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    model.train(tagged_data,total_examples=model.corpus_count,epochs=model.iter)
    print('iteration {0}'.format(epoch))
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
print("Done.")

  


iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

iteration 600
iteration 601
iteration 602
iteration 603
iteration 604
iteration 605
iteration 606
iteration 607
iteration 608
iteration 609
iteration 610
iteration 611
iteration 612
iteration 613
iteration 614
iteration 615
iteration 616
iteration 617
iteration 618
iteration 619
iteration 620
iteration 621
iteration 622
iteration 623
iteration 624
iteration 625
iteration 626
iteration 627
iteration 628
iteration 629
iteration 630
iteration 631
iteration 632
iteration 633
iteration 634
iteration 635
iteration 636
iteration 637
iteration 638
iteration 639
iteration 640
iteration 641
iteration 642
iteration 643
iteration 644
iteration 645
iteration 646
iteration 647
iteration 648
iteration 649
iteration 650
iteration 651
iteration 652
iteration 653
iteration 654
iteration 655
iteration 656
iteration 657
iteration 658
iteration 659
iteration 660
iteration 661
iteration 662
iteration 663
iteration 664
iteration 665
iteration 666
iteration 667
iteration 668
iteration 669
iteration 670
iterat

#### Here we are testing our word embeddings with some query. Our query will be "Coronavirus transmission" and we are hoping to  find all the documents that are talking about coronavirus transmission. 

In [88]:
test_data = word_tokenize(QUERY.lower()) #change this query to test different things 
v1 = model.infer_vector(test_data)
print(QUERY)

What is known about Covid-19 incubation?


In [89]:
similar_doc = model.docvecs.most_similar([v1],topn = len(tagged_data))
print(similar_doc)

[('32', 0.8288108110427856), ('36', 0.7724782228469849), ('29', 0.7410563230514526), ('21', 0.7408602833747864), ('56', 0.7141501307487488), ('61', 0.7138939499855042), ('73', 0.7138521671295166), ('5', 0.7123087644577026), ('90', 0.7116215229034424), ('22', 0.7050129771232605), ('7', 0.7047147750854492), ('67', 0.7039968967437744), ('51', 0.7011739015579224), ('92', 0.6980803608894348), ('80', 0.6914718151092529), ('1', 0.678401529788971), ('94', 0.6776716709136963), ('64', 0.6767700910568237), ('70', 0.6763937473297119), ('87', 0.6745656132698059), ('23', 0.6743944883346558), ('55', 0.6717816591262817), ('69', 0.6713817119598389), ('75', 0.6706154346466064), ('65', 0.6691190004348755), ('83', 0.6644261479377747), ('52', 0.6606899499893188), ('20', 0.6606761813163757), ('76', 0.6597611904144287), ('28', 0.6566731929779053), ('38', 0.6555248498916626), ('89', 0.6481664776802063), ('60', 0.6476477384567261), ('16', 0.6435396671295166), ('15', 0.6323037147521973), ('44', 0.63152194023132

In [90]:
results = {}
for doc in similar_doc:
    results[int(doc[0])] = doc[1]

d2v_results = []
for paper in filtered_results:
    score = filtered_results[paper]
    result = results[score[1]]
    freshed_score = (result*score[0],score[1])
    
    filtered_results[paper] = freshed_score
    d2v_results.append(freshed_score)
d2v_results.sort(reverse = True)

In [91]:
for result in d2v_results:
    print(str(all_papers[result[1]].id) +" : "+str(result[0]))

f7b069911c90d9deab85becbad3e9a633e0bf57c : 1.2003139796591633
cbc05d14c57b91081970a232ab83bc993f998fe2 : 0.7358148924637329
3b181a741d4beafda3ba3a8fc68239493d49f6aa : 0.392638305206255
6115e9fbaed5e154ff92cb8d02387fec72251dab : 0.390590956733092
af266fac8970a7960e96630a67d91bec5dda0335 : 0.2978944702408219
8b2e50eb3ea84225580fbdbccccb4fcd3f062feb : 0.032486395188119015
f3ff1ecae96700f41b83d2a034a3a959428388b0 : 0.009365771372346853
c8df44a3612e85e267351e936ddeb8fc5867afa1 : 0.009278198535456654
67a2ae408efcb6686a22ae27435c7e4fa10c4a21 : 0.00896006870301911
a09c1ad0f05bbcc256cd667ad7411e6690134370 : 0.008838653940377354
f3cbd842225279a42440e63e2bdc48d6c571efb6 : 0.0026316366580143268
dac1b1607ae72b9509ab26367e0d55016e8132a8 : 0.002211740988380312
9b5f5119bbfbded3245acc37859cefde967458e7 : 0.0019107883549362421
8ceb037798bd3fa6941261d1b888fe0cb79f2850 : 0.0018619937387311662
0e8773d0887abfa54cb1b618fcdf491e7a0a2c8a : 0.0013513093265090368
7b22c0d8cb7675bcc5aa283fe3bfef6c72052519 : 0.0009

# 4.) bm25? 

In [3]:
from rank_bm25 import BM25Okapi

In [24]:
corpus = []
for id in all_papers:
    doc = bioNER(all_papers[id].whole_text)
    tokens = []
    for word in doc:
        tokens.append(word.text.lower())
    corpus.append(tokens)
bm25 = BM25Okapi(corpus)

In [37]:
query = ["covid-19","cure"]
doc_scores = bm25.get_scores(query)

results = []
for i in range(len(doc_scores)):
    results.append((doc_scores[i],i))

results.sort(reverse = True)
print(results)

[(9.829746884352339, 25), (7.552576445117346, 85), (5.801097309775391, 1), (4.569421450374556, 72), (4.4880634589118245, 20), (4.479673770795969, 50), (4.444581740282896, 44), (4.389977630182508, 70), (4.281615707515277, 14), (4.184136719699542, 47), (3.814568027577314, 29), (3.768211432944475, 82), (3.0184902095204, 67), (2.8127342672591067, 56), (1.0395806124283071, 37), (0.0, 94), (0.0, 93), (0.0, 92), (0.0, 91), (0.0, 90), (0.0, 89), (0.0, 88), (0.0, 87), (0.0, 86), (0.0, 84), (0.0, 83), (0.0, 81), (0.0, 80), (0.0, 79), (0.0, 78), (0.0, 77), (0.0, 76), (0.0, 75), (0.0, 74), (0.0, 73), (0.0, 71), (0.0, 69), (0.0, 68), (0.0, 66), (0.0, 65), (0.0, 64), (0.0, 63), (0.0, 62), (0.0, 61), (0.0, 60), (0.0, 59), (0.0, 58), (0.0, 57), (0.0, 55), (0.0, 54), (0.0, 53), (0.0, 52), (0.0, 51), (0.0, 49), (0.0, 48), (0.0, 46), (0.0, 45), (0.0, 43), (0.0, 42), (0.0, 41), (0.0, 40), (0.0, 39), (0.0, 38), (0.0, 36), (0.0, 35), (0.0, 34), (0.0, 33), (0.0, 32), (0.0, 31), (0.0, 30), (0.0, 28), (0.0, 27