## Parsing of the dataset CORD-19


### Importing the parser and parsing the datasets

In [1]:
from Parser import *
from TaskQuery import *

In [2]:
#Creating a Parser and specifying what kind of dataset we want to parse
parser = Parser([Dataset.BIORXIV])
parser.parse(indexByFile = False);

### Example of accesing the paper by index

In [3]:
#We can access the date by index or by file name, but we have to change in the parse function
#what kind of invoke we want
# print(parser.data_dicts[Dataset.BIORXIV][2])

### Accesing certain elements of the paper

In [4]:
#By method titles(), abstracts() and bodies() you can access to certain elements of the paper
# paper_abstracts = parser.titles()
# for abstract in paper_abstracts[Dataset.BIORXIV].values():
#     print(abstract)

# Word2Vec demonstration

In [8]:
#User manual
#----------------------------------
#Install --> pip3 install gensim (apart from gensim, you will need numpy)
#Download word2vec file -->  https://code.google.com/archive/p/word2vec/
import gensim.models.keyedvectors as word2vec

In [9]:
#Here we initialize word2vec with already pretrained vectors
word2vec = word2vec.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)

In [10]:
#As you can see, coronavirus is extremely similar with other virus terms 
word2vec.most_similar("coronavirus")

[('corona_virus', 0.7276226282119751),
 ('coronaviruses', 0.7216538190841675),
 ('paramyxovirus', 0.7113003730773926),
 ('SARS_coronavirus', 0.6601907014846802),
 ('arenavirus', 0.6494410037994385),
 ('influenza_virus', 0.6449826955795288),
 ('H#N#_subtype', 0.6360139846801758),
 ('H#N#_strain', 0.6324741840362549),
 ('H7_virus', 0.6261191964149475),
 ('flu_virus', 0.6249204874038696)]

In [11]:
#So word2vec is basically a dict, where for word it returns us a 300 dimensional vector. The more the words are similiar
#so are the vectors going to be similar (talking here about cosine similarity!).
# word2vec["cure"]

# Doc2Vec demonstration

### So, now we are heading into the big guns! Doc2Vec is basically word2vec, but only for words that are appearing in our dataset. Meaning that words like Coronavirus, Covid19, Wuhan and other important phrases will be recognized here by our model. In contrast, word2vec couldn't recognize covid19, because that's new term for this disease.

In [12]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

#### Here we are basically making our own dataset. We are taking our own papers ( parser.toList() will return all papers in dataset) and tagging them.

In [13]:
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(parser.toList())]

#### This is the training part. Here we are making our own word embeddings. That means we are basically going to make our own word2vec. In other words, for every word from our dataset our model will make a vector in 20 dimensional space. Furthermore, every vectors will be similar if the words they are representing are similar. E.g. vectors for word coronavirus and covid19 will be similar.

In [14]:
max_epochs = 120
vec_size = 20 #word2vec has 300, but I left 100 here
alpha = 0.025

d2v_model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)

d2v_model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    #print('iteration {0}'.format(epoch))
    d2v_model.train(tagged_data,total_examples=d2v_model.corpus_count,epochs=d2v_model.iter)
    # decrease the learning rate
    d2v_model.alpha -= 0.0002
    # fix the learning rate, no decay
    d2v_model.min_alpha = d2v_model.alpha
print("Done.")

  from ipykernel import kernelapp as app


Done.


In [15]:
from nltk.corpus import stopwords
from nltk import word_tokenize

# query = TaskQuery.questions()[0]
query = TaskQuery.topics()[5]

print('QUERY > ', query)

QUERY >  Persistence of virus on surfaces of different materials


#### Here we are testing our word embeddings with some query. Our query will be "Coronavirus transmission" and we are hoping to  find all the documents that are talking about coronavirus transmission. 
#### Here we are finding the embeddings that will correnspond with our query. Function most_similar() will return us id and percentage of similarity with corrensponding query. E.g. ('43', 0.834 ) means that document with id 43 is 83% similar with query. ( although this isn't really percentage, this is similarity, but thats the gist :) )

In [16]:
#finding the most similar doc
def d2v_most_similar(query):
    test_data = word_tokenize(query.lower())
    v1 = d2v_model.infer_vector(test_data)
    return d2v_model.docvecs.most_similar([v1])
    
similar_docs = d2v_most_similar(query)
print(similar_docs)
print(len(similar_docs))

[('93', 0.7126882076263428), ('0', 0.6838240623474121), ('10', 0.668682336807251), ('20', 0.6443622708320618), ('96', 0.5813637971878052), ('67', 0.5454071760177612), ('26', 0.5204913020133972), ('91', 0.5128259658813477), ('89', 0.5120604038238525), ('16', 0.4907408654689789)]
10


### This is the most similar document with our query within our dataset of 100 papers.

In [17]:
# print(tagged_data[33])
# print(parser.toList()[53])

# Language Modeling - NLTK

In [27]:
from nltk import word_tokenize, sent_tokenize
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE

### Setup for training of LM models and getting query relevance

In [32]:
def train_lm_model(file, n_gram = 3):
#     tokenized_text = [list(map(str.lower, word_tokenize(sent))) for sent in sent_tokenize(file)]
    tokenized_text = []
    for sent in sent_tokenize(file):
        word_tokens = word_tokenize(sent) 
        filtered_sentence_tokens = [w.lower() for w in word_tokens if not w in set(stopwords.words('english'))] 
        tokenized_text.append(filtered_sentence_tokens)
    train_data, padded_sents = padded_everygram_pipeline(n_gram, tokenized_text)
    lm_model = MLE(n_gram)
    lm_model.fit(train_data, padded_sents)
    return lm_model

In [33]:
def lm_query_relevance(model, sent):
    return sum([ model.score(s) for s in list(map(str.lower, word_tokenize(sent))) ])

### Just a comparison for doc2vec's top 10 documents and the LM model scores they get

In [34]:
for index, topic_sim in similar_docs:
    lm_model = train_lm_model(parser.toList()[int(index)])
    lm_score = lm_query_relevance(lm_model, query)
    print(index, topic_sim, lm_score)

93 0.7126882076263428 0.0006792324673119375
0 0.6838240623474121 0.0
10 0.668682336807251 0.0003027550711474417
20 0.6443622708320618 0.0013429373702844585
96 0.5813637971878052 0.0005103776794828172
67 0.5454071760177612 0.0
26 0.5204913020133972 0.0
91 0.5128259658813477 0.0010104412260020209
89 0.5120604038238525 0.005164319248826291
16 0.4907408654689789 0.00045330915684496827


## Train LM models for the current parser documents (BIORXIV articles)

In [35]:
def lm_most_similar(query, most_similar_count = 10):
    scores = {}
    for index, model in enumerate(lm_models):
        scores[index] = lm_query_relevance(model, query)
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:most_similar_count]

In [26]:
def bold(data):
    return Paper.bold(str(data))

query_collections = [TaskQuery.questions(), TaskQuery.topics()]
n_grams = [1, 2, 3]

for collection in query_collections:
    for n_gram in n_grams:
        lm_models = [ train_lm_model(file, n_gram) for file in parser.toList() ]
        print(Paper.bold('N_GRAM used > ' + str(n_gram)))
        matches_by_n_gram = 0
        for query in collection:
            query = ' '.join([w for w in word_tokenize(query) if not w in set(stopwords.words('english'))])
            lm_scores = [ score[0] for score in lm_most_similar(query) ]
            d2v_scores = [ int(score[0]) for score in d2v_most_similar(query) ]
            matches = list(set(lm_scores).intersection(d2v_scores))
            matches_by_n_gram += len(matches)
    #       FIRST OUTPUT - a lot of details
    #         print('QUERY > ', bold(query))
    #         print('Top 10 - LM')
    #         print(lm_scores)
    #         print('Top 10 - d2v')
    #         print(d2v_scores)
    #         print('Matches: ', bold(str(matches) + '\n'))
    #       SECOND OUTPUT - only match numbers
            print('{:9}{:90}{}{}'.format('QUERY >', bold(query), '|> matches count:', (len(matches))))
        print('Matches by n_gram: ', bold(matches_by_n_gram))

[1mN_GRAM used > 1[0m
QUERY >  [1mWhat range incubation periods disease humans ?[0m                                    |> matches count:4
QUERY >  [1mWhat know basic reproduction number ?[0m                                             |> matches count:2
QUERY >  [1mHow long individuals contagious ?[0m                                                 |> matches count:1
QUERY >  [1mWhat know asymptomatic transmission children ?[0m                                    |> matches count:0
QUERY >  [1mWhat know seasonality transmission ?[0m                                              |> matches count:3
QUERY >  [1mWhat know viral shedding duration ?[0m                                               |> matches count:0
QUERY >  [1mHow long individuals contagious , even recovery ?[0m                                 |> matches count:1
QUERY >  [1mDoes range incubation period vary across age groups ?[0m                             |> matches count:2
QUERY >  [1mDoes range incubati