# Phase I - Data Preparation and Modelling
<hr>
This is a <b>one time process</b> to generate vector embeddings for the document corpus using ensemble approach and store the model objects/configuration states for the searching purposes. There are 3 stages in this phase - <br>
1. Data Preparation and Wrangling <br>
2. Generation of embeddings for the document corpus for different techniques in ensemble approach<br>
3. Saving the model objects and configuration for later use<br>

## Setup

### Importing packages

In [244]:
import pandas as pd
import csv
import json
import time
import pickle
import torch
import numpy as np 
import sys
import os 
import nltk
import pandas as pd
import spacy
import scipy
import gensim
from tqdm.auto import tqdm
from pprint import pprint

from flask import Flask, render_template, jsonify, request

from gensim.models import KeyedVectors

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn import mixture

from scipy import spatial
from sentence_transformers import SentenceTransformer
from transformers import *
from summarizer import Summarizer
from gensim.summarization.summarizer import summarize

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

### Setup the nltk library

In [116]:
nltk.data.path.append('../bin/')
nltk.download('stopwords',download_dir='../bin/', quiet=True)
nltk.download('punkt',download_dir='../bin/', quiet=True)
stop_words = stopwords.words('english')
nltk.download('averaged_perceptron_tagger',download_dir='../bin/', quiet=True)
nltk.download('wordnet',download_dir='../bin/', quiet=True)
nltk.download('omw',download_dir='../bin/', quiet=True)
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

### Helper Functions 

In [117]:
#To clean the abstracts
def clean_docs(doc_list):

    doc_df = pd.DataFrame({'document':doc_list})

    #Clean the data
    # removing everything except alphabets`
    doc_df['clean_doc'] = doc_df['document'].str.replace("[^a-zA-Z#]", " ")

    # removing short wordsstop_words = stopwords.words('english')
    doc_df['clean_doc'] = doc_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

    # make all text lowercase
    doc_df['clean_doc'] = doc_df['clean_doc'].apply(lambda x: x.lower())

    stop_words = stopwords.words('english')

    # tokenization
    tokenized_doc = doc_df['clean_doc'].apply(lambda x: x.split())

    # remove stop-words
    tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

    # de-tokenization
    detokenized_doc = []
    for i in range(len(doc_df)):

        try:
            t = ' '.join(tokenized_doc[i])
            detokenized_doc.append(t)
        except:
            print(f'Can not put {tokenized_doc[i]} back together')
            detokenized_doc.append('')


    detokenized_doc = np.array(detokenized_doc)

    return detokenized_doc

<a id="data"></a>
## Data loading and Cleaning
<hr>

### Cleaning the abstracts

In [141]:
#load the csv sources
raw_md_data = pd.read_csv('../data/metadata.csv')
raw_md_data.drop_duplicates(['abstract'], inplace=True) #drop duplicates by abstract
raw_md_data.dropna(subset=['abstract'], inplace=True) #remove missing abstracts

raw_md_data.drop_duplicates(['title'], inplace=True) #drop duplicates by title
raw_md_data.dropna(subset=['title'], inplace=True) #remove missing titles

#Remove quasi-duplicate titles
raw_md_data['clean_title'] = clean_docs(raw_md_data['title'].tolist())
raw_md_data.drop_duplicates(['clean_title'], inplace=True)

#Select required columns
df = raw_md_data[['cord_uid', 'title', 'abstract']]
#store as new file
df.to_csv('../data/cleaned_abstracts.csv', header=False, index=False)

#list of abstracts (document corpus)
clean_abstracts = df['abstract']
doc_corpus = list(clean_abstracts)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### Convert data into JSON format for ingestion into Elasticsearch index

In [142]:
csvfile = open('../data/cleaned_abstracts.csv', 'r', encoding='utf-8')
jsonfile = open('../data/cleaned_abstracts.json', 'w')

fieldnames = ("cord_uid", "title", "abstract") #corresponding to the required columns
reader = csv.DictReader(csvfile, fieldnames)
for row in reader:
    json.dump(row, jsonfile)
    jsonfile.write('\n')

<a id="Modelling and embedding generation"></a>
## Modelling and embedding generation

<a id="Topic-Modeling"></a>
### Topic Modeling with Cosine distance
<hr>
Topic modeling is a NLP unsupervised technique for assigning particular words to clusters. These clusters can be thought of as word clouds and contain similar terms. Latent semantic analysis (LSA) and latent Dirichlet allocation (LDA) are two of the most popular topic modeling methods. Here we use LSA for its computational speed, but LDA could also be considered here. Once again we use Cosine distance with the results of the topic modeling.
<img src="https://i.ibb.co/23sp1Gb/tm-abstracts.png">

In [245]:
NUM_TOPICS= 100

def make_tm_output(doc_list,num_tf_idf_features=1000,num_compons=NUM_TOPICS):
    """
    Make output for topic modeling
    :param doc_list:
    :return:
    """
    detokenized_doc = clean_docs(doc_list)

    # #Run the model
    vectorizer = TfidfVectorizer(stop_words='english',
    max_features= num_tf_idf_features, # keep top 1000 terms
    max_df = 0.25,
    smooth_idf=True)


    tfidf_output = vectorizer.fit_transform(detokenized_doc)

    # SVD represent documents and terms in vectors
    svd_model = TruncatedSVD(n_components=num_compons, algorithm='randomized', n_iter=100, random_state=42)

    svd_model.fit(tfidf_output)

    tm_output = svd_model.fit_transform(tfidf_output)
    return tm_output, vectorizer, svd_model, tfidf_output


In [246]:
start_time = time.time()
tm_output,vectorizer,svd_model,tfidf_output = make_tm_output(doc_corpus,num_compons=NUM_TOPICS)
print('Time taken:', time.time()-start_time, 'seconds')

Time taken: 1355.4940433502197 seconds


In [147]:
#Save models and vectors for the corpus for topic modeling
with open('../models/tm_vectors.pkl', 'wb') as f:
    pickle.dump(tm_output,f)
    
with open('../models/tm_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer,f)

with open('../models/tm_svd_model.pkl', 'wb') as f:
    pickle.dump(svd_model,f)

<a id="TF-IDF"></a>
### TF-IDF with Cosine distance
<hr>
Term Frequency-Inverse Document Frequency (TF-IDF) is a basic NLP method that determines the importance of an individual word relative to a document. That is, words are weighted based on how often then appear in a document and then inversely weighted based on how often they appear across a collection of documents.  Cosine distance (https://en.wikipedia.org/wiki/Cosine_similarity) is a common distance measure in the NLP literature and is used with many of the methods presented here. Cosine distance measures the difference in orientation. Thus, it is possible that two sentences or documents are far apart in Euclidean space but actually have similar orientations and are similar according to Cosine distance.
<a name="some-id"></a>

In [10]:
def tfidf_bow(doc_list):
    #clean the docs
    detokenized_doc = clean_docs(doc_list)
    gen_docs = [[w.lower() for w in word_tokenize(text)] for text in detokenized_doc]

    # create the dictionary
    dictionary = gensim.corpora.Dictionary(gen_docs)

    # Create bag of words
    corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
    tf_idf = gensim.models.TfidfModel(corpus)
    corpus_tfidf = tf_idf[corpus]
    lsi = gensim.models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)
    corpus_lsi = lsi[corpus_tfidf]
    return dictionary, lsi, corpus_lsi


In [11]:
start_time = time.time()
dictionary, lsi, corpus_lsi = tfidf_bow(doc_corpus)
print('Time taken:', time.time()-start_time, 'seconds')

I0703 12:54:15.540522 140392763574080 dictionary.py:205] adding document #0 to Dictionary(0 unique tokens: [])
I0703 12:54:17.418383 140392763574080 dictionary.py:205] adding document #10000 to Dictionary(47051 unique tokens: ['abdulaziz', 'acquired', 'admission', 'affected', 'arabia']...)
I0703 12:54:19.173717 140392763574080 dictionary.py:205] adding document #20000 to Dictionary(72989 unique tokens: ['abdulaziz', 'acquired', 'admission', 'affected', 'arabia']...)
I0703 12:54:21.081262 140392763574080 dictionary.py:205] adding document #30000 to Dictionary(84130 unique tokens: ['abdulaziz', 'acquired', 'admission', 'affected', 'arabia']...)
I0703 12:54:23.039579 140392763574080 dictionary.py:205] adding document #40000 to Dictionary(92947 unique tokens: ['abdulaziz', 'acquired', 'admission', 'affected', 'arabia']...)
I0703 12:54:25.002710 140392763574080 dictionary.py:205] adding document #50000 to Dictionary(101059 unique tokens: ['abdulaziz', 'acquired', 'admission', 'affected', 'a

I0703 13:00:09.226670 140392763574080 lsimodel.py:935] orthonormalizing (144354, 400) action matrix
I0703 13:00:32.284240 140392763574080 lsimodel.py:987] 2nd phase: running dense svd on (400, 20000) matrix
I0703 13:00:34.706219 140392763574080 lsimodel.py:1013] computing the final decomposition
I0703 13:00:34.707927 140392763574080 lsimodel.py:106] keeping 300 factors (discarding 10.954% of energy spectrum)
I0703 13:01:01.998676 140392763574080 lsimodel.py:261] merging projections: (144354, 300) + (144354, 300)
I0703 13:03:09.416321 140392763574080 lsimodel.py:106] keeping 300 factors (discarding 11.333% of energy spectrum)
I0703 13:03:44.486397 140392763574080 lsimodel.py:517] processed documents up to #60000
I0703 13:03:44.494930 140392763574080 lsimodel.py:704] topic #0(29.017): 0.233*"patients" + 0.162*"covid" + 0.144*"sars" + 0.097*"group" + 0.096*"cases" + 0.090*"infection" + 0.090*"disease" + 0.088*"respiratory" + 0.087*"clinical" + 0.087*"health"
I0703 13:03:44.506030 14039276

I0703 13:15:13.991231 140392763574080 lsimodel.py:704] topic #3(18.777): 0.448*"sars" + 0.237*"respiratory" + -0.191*"health" + 0.182*"patients" + 0.142*"children" + 0.136*"severe" + 0.123*"coronavirus" + 0.119*"acute" + 0.117*"mers" + 0.117*"syndrome"
I0703 13:15:13.996148 140392763574080 lsimodel.py:704] topic #4(18.526): 0.419*"influenza" + -0.383*"sars" + 0.232*"children" + 0.230*"viruses" + 0.191*"respiratory" + -0.168*"covid" + -0.163*"protein" + 0.157*"infections" + 0.126*"virus" + -0.123*"cells"


Time taken: 1490.5347256660461 seconds


In [12]:
#Save models and vectors for the corpus for TF-IDF
with open('../models/tfidf_dict.pkl', 'wb') as f:
    pickle.dump(dictionary,f)
    
with open('../models/tfidf_lsi.pkl', 'wb') as f:
    pickle.dump(lsi,f)

with open('../models/tfidf_vectors.pkl', 'wb') as f:
    pickle.dump(corpus_lsi,f)

<a id="Word2Vec"></a>
### BERT with Cosine Distance 
<hr>
Bidirectional Encoder Representations from Transformers (BERT) is a pre-trained model developed by Google. Unlike traditional RNNs or LSTMs, which only learn in one direction, BERT is trained in both directions and thus is better at understanding context. Once again we use Cosine distance.

In [13]:
# One time task (already done and stored embeddings in 'models') 
#To generate BERT Embeddings for CORD dataset and store the embeddings
def make_bert_embeddings(doc_list):
    generic_bert_model = SentenceTransformer('../bin/models/')
    return generic_bert_model.encode(doc_list,show_progress_bar=True)

start_time = time.time()
# make the embeddings
corpus_embed = make_bert_embeddings(list(doc_corpus))
print('Time taken:', time.time()-start_time, 'seconds')

with open('../models/bert_corpus_embed.pkl', 'wb') as f:
    pickle.dump(corpus_embed, f)

I0703 13:15:30.404550 140392763574080 SentenceTransformer.py:29] Load pretrained SentenceTransformer: ../bin/models/
I0703 13:15:30.406009 140392763574080 SentenceTransformer.py:67] Load SentenceTransformer from folder: ../bin/models/
I0703 13:15:30.454168 140392763574080 configuration_utils.py:281] loading configuration file ../bin/models/0_BERT/config.json
I0703 13:15:30.455682 140392763574080 configuration_utils.py:319] Model config BertConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0

Time taken: 9108.873574256897 seconds


<a id="BERT-cos"></a>
### Pre-trained Word2vec using Word Centroid Similarity 
<hr>
Word2vec is a NLP method for producing word embeddings using a neural network model such as a recurrent neural network (RNN). Word embeddings map words to vectors and thus can be used to represent words and thus documents/sentences. Here a pre-trained word2vec model on Google news is used. Once again we use Cosine distance.

In [14]:
def get_centroid_matrix(EMBEDDING_FILE, vectorizer, tfidf_output):
    w2v_model = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

    words = vectorizer.get_feature_names()

    idx_present = []
    for i, word in enumerate(words):
        try:
            vec = w2v_model[word]
            idx_present.append(i)
        except:
            print(word)

    vocab_words = [words[i] for i in range(len(words)) if i in idx_present]

    term_occurrence_matrix = tfidf_output.toarray()
    term_occurrence_matrix = term_occurrence_matrix[:,idx_present]

    word_embeddings = w2v_model[vocab_words]

    centroid_matrix = np.matmul(term_occurrence_matrix, word_embeddings)

    return centroid_matrix, word_embeddings, idx_present


In [15]:
start_time = time.time()
EMBEDDING_FILE = '../bin/GoogleNews-vectors-negative300.bin.gz'
centroid_matrix, word_embeddings, idx_present = get_centroid_matrix(EMBEDDING_FILE, vectorizer, tfidf_output)
print('Time taken:', time.time()-start_time, 'seconds')

I0703 16:10:05.175721 140392763574080 utils_any2vec.py:341] loading projection weights from ../bin/GoogleNews-vectors-negative300.bin.gz
I0703 16:13:45.143670 140392763574080 utils_any2vec.py:405] loaded (3000000, 300) matrix from ../bin/GoogleNews-vectors-negative300.bin.gz


analyses
covid
hcov
mrna
ncov
pedv
prrsv
syncytial
tgev
wuhan
Time taken: 974.4920225143433 seconds


In [16]:
#Save models and vectors for the corpus for word2vec
with open('../models/centroid_vectors.pkl', 'wb') as f:
    pickle.dump(centroid_matrix,f)
    
with open('../models/word_embeddings.pkl', 'wb') as f:
    pickle.dump(word_embeddings,f)

with open('../models/common_vocab_idx.pkl', 'wb') as f:
    pickle.dump(idx_present,f)

<a id="Doc2vec"></a>
## Doc2vec with Cosine Distance 
<hr>
Doc2vec is very similar to word2vec with a slight altercation that allows the model to consider which document a particular word comes from. The doc2vec model is trained on the dataset of medical abstracts, once again cosine distance is used to measure the similarity between documents.

In [17]:
def preprocess_doc2vec(doc_list):

    preprocess_list = []
    for count, doc in enumerate(doc_list):
        tokens = gensim.parsing.preprocess_string(doc)
        preprocess_list.append(gensim.models.doc2vec.TaggedDocument(tokens, [count]))
    return preprocess_list


def train_doc2vec(train_corpus):
    model = gensim.models.doc2vec.Doc2Vec(dm=1, vector_size=300, window=10, min_count=2, epochs=20, seed=42, workers=6)
    model.build_vocab(train_corpus)
    model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

    return model

In [18]:
start_time = time.time()
preprocess_list = preprocess_doc2vec(doc_corpus)
doc2vec_model = train_doc2vec(preprocess_list)
print(time.time()-start_time, 'seconds')

I0703 16:29:29.675451 140392763574080 doc2vec.py:1377] collecting all words and their counts
I0703 16:29:29.678025 140392763574080 doc2vec.py:1319] PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
I0703 16:29:30.050217 140392763574080 doc2vec.py:1319] PROGRESS: at example #10000, processed 1184549 words (3189369/s), 42368 word types, 10000 tags
I0703 16:29:30.399915 140392763574080 doc2vec.py:1319] PROGRESS: at example #20000, processed 2254253 words (3068558/s), 66679 word types, 20000 tags
I0703 16:29:30.801015 140392763574080 doc2vec.py:1319] PROGRESS: at example #30000, processed 3514489 words (3150392/s), 76160 word types, 30000 tags
I0703 16:29:31.200228 140392763574080 doc2vec.py:1319] PROGRESS: at example #40000, processed 4787273 words (3197073/s), 83759 word types, 40000 tags
I0703 16:29:31.601248 140392763574080 doc2vec.py:1319] PROGRESS: at example #50000, processed 6047450 words (3150772/s), 91397 word types, 50000 tags
I0703 16:29:31.955792 140392763

I0703 16:30:12.850034 140392763574080 base_any2vec.py:1302] EPOCH 2 - PROGRESS: at 43.71% examples, 521028 words/s, in_qsize 11, out_qsize 0
I0703 16:30:13.861137 140392763574080 base_any2vec.py:1302] EPOCH 2 - PROGRESS: at 48.01% examples, 521846 words/s, in_qsize 12, out_qsize 0
I0703 16:30:14.893269 140392763574080 base_any2vec.py:1302] EPOCH 2 - PROGRESS: at 52.41% examples, 520123 words/s, in_qsize 11, out_qsize 0
I0703 16:30:15.935516 140392763574080 base_any2vec.py:1302] EPOCH 2 - PROGRESS: at 57.08% examples, 518476 words/s, in_qsize 12, out_qsize 0
I0703 16:30:16.956444 140392763574080 base_any2vec.py:1302] EPOCH 2 - PROGRESS: at 61.80% examples, 516985 words/s, in_qsize 12, out_qsize 0
I0703 16:30:17.965720 140392763574080 base_any2vec.py:1302] EPOCH 2 - PROGRESS: at 66.23% examples, 513472 words/s, in_qsize 11, out_qsize 0
I0703 16:30:19.023603 140392763574080 base_any2vec.py:1302] EPOCH 2 - PROGRESS: at 70.91% examples, 511766 words/s, in_qsize 12, out_qsize 0
I0703 16:30:2

I0703 16:31:00.684846 140392763574080 base_any2vec.py:1302] EPOCH 4 - PROGRESS: at 51.71% examples, 515194 words/s, in_qsize 11, out_qsize 0
I0703 16:31:01.698748 140392763574080 base_any2vec.py:1302] EPOCH 4 - PROGRESS: at 56.23% examples, 513623 words/s, in_qsize 12, out_qsize 0
I0703 16:31:02.705533 140392763574080 base_any2vec.py:1302] EPOCH 4 - PROGRESS: at 60.79% examples, 512335 words/s, in_qsize 11, out_qsize 0
I0703 16:31:03.710139 140392763574080 base_any2vec.py:1302] EPOCH 4 - PROGRESS: at 65.32% examples, 509889 words/s, in_qsize 12, out_qsize 0
I0703 16:31:04.715250 140392763574080 base_any2vec.py:1302] EPOCH 4 - PROGRESS: at 69.85% examples, 508838 words/s, in_qsize 12, out_qsize 0
I0703 16:31:05.724549 140392763574080 base_any2vec.py:1302] EPOCH 4 - PROGRESS: at 74.30% examples, 507554 words/s, in_qsize 11, out_qsize 0
I0703 16:31:06.729360 140392763574080 base_any2vec.py:1302] EPOCH 4 - PROGRESS: at 78.74% examples, 506564 words/s, in_qsize 11, out_qsize 0
I0703 16:31:0

I0703 16:31:48.980621 140392763574080 base_any2vec.py:1302] EPOCH 6 - PROGRESS: at 61.25% examples, 511646 words/s, in_qsize 11, out_qsize 0
I0703 16:31:49.981704 140392763574080 base_any2vec.py:1302] EPOCH 6 - PROGRESS: at 65.87% examples, 509984 words/s, in_qsize 11, out_qsize 0
I0703 16:31:51.020069 140392763574080 base_any2vec.py:1302] EPOCH 6 - PROGRESS: at 70.39% examples, 507937 words/s, in_qsize 12, out_qsize 0
I0703 16:31:52.042986 140392763574080 base_any2vec.py:1302] EPOCH 6 - PROGRESS: at 75.08% examples, 507986 words/s, in_qsize 11, out_qsize 0
I0703 16:31:53.056710 140392763574080 base_any2vec.py:1302] EPOCH 6 - PROGRESS: at 79.36% examples, 505687 words/s, in_qsize 12, out_qsize 0
I0703 16:31:54.075161 140392763574080 base_any2vec.py:1302] EPOCH 6 - PROGRESS: at 83.86% examples, 504477 words/s, in_qsize 11, out_qsize 0
I0703 16:31:55.102126 140392763574080 base_any2vec.py:1302] EPOCH 6 - PROGRESS: at 88.13% examples, 502218 words/s, in_qsize 11, out_qsize 0
I0703 16:31:5

I0703 16:32:36.979693 140392763574080 base_any2vec.py:1302] EPOCH 8 - PROGRESS: at 69.75% examples, 506738 words/s, in_qsize 12, out_qsize 0
I0703 16:32:38.031572 140392763574080 base_any2vec.py:1302] EPOCH 8 - PROGRESS: at 74.47% examples, 506017 words/s, in_qsize 11, out_qsize 0
I0703 16:32:39.060507 140392763574080 base_any2vec.py:1302] EPOCH 8 - PROGRESS: at 79.18% examples, 506018 words/s, in_qsize 11, out_qsize 0
I0703 16:32:40.080228 140392763574080 base_any2vec.py:1302] EPOCH 8 - PROGRESS: at 83.86% examples, 505756 words/s, in_qsize 12, out_qsize 0
I0703 16:32:41.084450 140392763574080 base_any2vec.py:1302] EPOCH 8 - PROGRESS: at 88.30% examples, 504925 words/s, in_qsize 12, out_qsize 0
I0703 16:32:42.102885 140392763574080 base_any2vec.py:1302] EPOCH 8 - PROGRESS: at 92.83% examples, 504269 words/s, in_qsize 11, out_qsize 0
I0703 16:32:43.150170 140392763574080 base_any2vec.py:1302] EPOCH 8 - PROGRESS: at 97.51% examples, 503900 words/s, in_qsize 11, out_qsize 0
I0703 16:32:4

I0703 16:33:24.816732 140392763574080 base_any2vec.py:1302] EPOCH 10 - PROGRESS: at 78.84% examples, 505482 words/s, in_qsize 12, out_qsize 0
I0703 16:33:25.887544 140392763574080 base_any2vec.py:1302] EPOCH 10 - PROGRESS: at 83.42% examples, 503400 words/s, in_qsize 11, out_qsize 0
I0703 16:33:26.910867 140392763574080 base_any2vec.py:1302] EPOCH 10 - PROGRESS: at 87.97% examples, 502691 words/s, in_qsize 11, out_qsize 0
I0703 16:33:27.916259 140392763574080 base_any2vec.py:1302] EPOCH 10 - PROGRESS: at 92.39% examples, 502011 words/s, in_qsize 11, out_qsize 0
I0703 16:33:28.924147 140392763574080 base_any2vec.py:1302] EPOCH 10 - PROGRESS: at 96.80% examples, 501333 words/s, in_qsize 11, out_qsize 0
I0703 16:33:29.585647 140392763574080 base_any2vec.py:349] worker thread finished; awaiting finish of 5 more threads
I0703 16:33:29.608578 140392763574080 base_any2vec.py:349] worker thread finished; awaiting finish of 4 more threads
I0703 16:33:29.615708 140392763574080 base_any2vec.py:34

I0703 16:34:13.264767 140392763574080 base_any2vec.py:1302] EPOCH 12 - PROGRESS: at 88.82% examples, 504692 words/s, in_qsize 12, out_qsize 0
I0703 16:34:14.281805 140392763574080 base_any2vec.py:1302] EPOCH 12 - PROGRESS: at 93.36% examples, 504105 words/s, in_qsize 12, out_qsize 0
I0703 16:34:15.287469 140392763574080 base_any2vec.py:1302] EPOCH 12 - PROGRESS: at 97.77% examples, 503376 words/s, in_qsize 12, out_qsize 0
I0703 16:34:15.695731 140392763574080 base_any2vec.py:349] worker thread finished; awaiting finish of 5 more threads
I0703 16:34:15.718234 140392763574080 base_any2vec.py:349] worker thread finished; awaiting finish of 4 more threads
I0703 16:34:15.733360 140392763574080 base_any2vec.py:349] worker thread finished; awaiting finish of 3 more threads
I0703 16:34:15.748628 140392763574080 base_any2vec.py:349] worker thread finished; awaiting finish of 2 more threads
I0703 16:34:15.759448 140392763574080 base_any2vec.py:349] worker thread finished; awaiting finish of 1 mo

I0703 16:35:01.222326 140392763574080 base_any2vec.py:1302] EPOCH 14 - PROGRESS: at 97.77% examples, 503383 words/s, in_qsize 11, out_qsize 0
I0703 16:35:01.655866 140392763574080 base_any2vec.py:349] worker thread finished; awaiting finish of 5 more threads
I0703 16:35:01.664977 140392763574080 base_any2vec.py:349] worker thread finished; awaiting finish of 4 more threads
I0703 16:35:01.674319 140392763574080 base_any2vec.py:349] worker thread finished; awaiting finish of 3 more threads
I0703 16:35:01.692187 140392763574080 base_any2vec.py:349] worker thread finished; awaiting finish of 2 more threads
I0703 16:35:01.706180 140392763574080 base_any2vec.py:349] worker thread finished; awaiting finish of 1 more threads
I0703 16:35:01.709486 140392763574080 base_any2vec.py:349] worker thread finished; awaiting finish of 0 more threads
I0703 16:35:01.710280 140392763574080 base_any2vec.py:1344] EPOCH - 14 : training on 11945646 raw words (11577264 effective words) took 23.0s, 503473 effect

I0703 16:35:47.776496 140392763574080 base_any2vec.py:349] worker thread finished; awaiting finish of 4 more threads
I0703 16:35:47.778551 140392763574080 base_any2vec.py:349] worker thread finished; awaiting finish of 3 more threads
I0703 16:35:47.794995 140392763574080 base_any2vec.py:349] worker thread finished; awaiting finish of 2 more threads
I0703 16:35:47.813030 140392763574080 base_any2vec.py:349] worker thread finished; awaiting finish of 1 more threads
I0703 16:35:47.824632 140392763574080 base_any2vec.py:349] worker thread finished; awaiting finish of 0 more threads
I0703 16:35:47.825597 140392763574080 base_any2vec.py:1344] EPOCH - 16 : training on 11945646 raw words (11576662 effective words) took 23.0s, 503579 effective words/s
I0703 16:35:48.850425 140392763574080 base_any2vec.py:1302] EPOCH 17 - PROGRESS: at 4.10% examples, 523279 words/s, in_qsize 11, out_qsize 0
I0703 16:35:49.854895 140392763574080 base_any2vec.py:1302] EPOCH 17 - PROGRESS: at 8.69% examples, 511183

I0703 16:36:34.216783 140392763574080 base_any2vec.py:349] worker thread finished; awaiting finish of 2 more threads
I0703 16:36:34.222323 140392763574080 base_any2vec.py:349] worker thread finished; awaiting finish of 1 more threads
I0703 16:36:34.230087 140392763574080 base_any2vec.py:349] worker thread finished; awaiting finish of 0 more threads
I0703 16:36:34.230824 140392763574080 base_any2vec.py:1344] EPOCH - 18 : training on 11945646 raw words (11576480 effective words) took 23.1s, 500110 effective words/s
I0703 16:36:35.244275 140392763574080 base_any2vec.py:1302] EPOCH 19 - PROGRESS: at 4.17% examples, 537736 words/s, in_qsize 11, out_qsize 0
I0703 16:36:36.260840 140392763574080 base_any2vec.py:1302] EPOCH 19 - PROGRESS: at 8.76% examples, 514923 words/s, in_qsize 11, out_qsize 0
I0703 16:36:37.263648 140392763574080 base_any2vec.py:1302] EPOCH 19 - PROGRESS: at 13.31% examples, 485527 words/s, in_qsize 11, out_qsize 0
I0703 16:36:38.273782 140392763574080 base_any2vec.py:130

I0703 16:37:20.253400 140392763574080 base_any2vec.py:349] worker thread finished; awaiting finish of 0 more threads
I0703 16:37:20.254100 140392763574080 base_any2vec.py:1344] EPOCH - 20 : training on 11945646 raw words (11576751 effective words) took 23.0s, 504126 effective words/s
I0703 16:37:20.255048 140392763574080 base_any2vec.py:1380] training on a 238912920 raw words (231535595 effective words) took 460.5s, 502835 effective words/s


658.0700414180756 seconds


In [19]:
docvecs = doc2vec_model.docvecs

#Save models and vectors for the corpus for word2vec
with open('../models/d2v_model.pkl', 'wb') as f:
    pickle.dump(doc2vec_model,f)
    
with open('../models/d2v_vectors.pkl', 'wb') as f:
    pickle.dump(docvecs,f)

<a id="SciBert-cos"></a>
## SciBERT with Cosine distance
<hr>
SciBERT (https://github.com/allenai/scibert) is a BERT model that is specifically pre-trained on medical journal articles. Therefore the model is potentially more suitable for analyzing coronavirus related material than a typical pre-trained BERT model. Cosine distance is also used with the SciBERT model.

In [7]:
model = BertModel.from_pretrained("../bin/models/scibert_scivocab_uncased/")
tokenizer = AutoTokenizer.from_pretrained("../bin/models/scibert_scivocab_uncased/")
device = torch.device("cpu")
model.to(device)

def get_scibert_embedding(doc,model,tokenizer,device):

    # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
    input_ids = torch.tensor([tokenizer.encode(doc,add_special_tokens=True, max_length=512)],device=device)
    try:
        with torch.no_grad():
            last_hidden_states = model(input_ids)  # Models outputs are now tuples
            test = last_hidden_states[0].mean(1).detach()
        return np.array(test.cpu())
    except Exception as e:
        print(e)
        print(f'Was not able to get embeddings for {doc}')
        return []

I0705 08:31:14.933726 139995567662912 configuration_utils.py:281] loading configuration file ../bin/models/scibert_scivocab_uncased/config.json
I0705 08:31:14.935277 139995567662912 configuration_utils.py:319] Model config BertConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "bert",
  "no_repeat_ngram_size": 0,
  "num_attentio

In [8]:
start_time = time.time()
abstract_embedd_list = []
for index_f in tqdm(list(doc_corpus)):
    embed_vals = get_scibert_embedding(index_f,model,tokenizer,device)

    if len(embed_vals)>0:
        abstract_embedd_list.append(embed_vals)
print(time.time()-start_time)

with open('../models/scibert_corpus_embed.pkl', 'wb') as f:
    pickle.dump(abstract_embedd_list, f)

HBox(children=(FloatProgress(value=0.0, max=103482.0), HTML(value='')))


26110.16163277626


# Phase II - Indexing, Searching and Summarization
<hr> 
<b>Query based recurring tasks</b> <br>
Once Phase I completes, we index the vectors and documents in Elasticsearch and perform our user searches on the given index. There are 5 stages in this phase - <br>
1. Data/Embedding Ingestion into Elasticsearch as dense vectors <br>
2. Generation of embeddings for user search input <br>
3. Matching (user input embeddings and document vectors) using Dense Vector API and score using cosine similarity <br>
4. Summarizing the relevant retrieved documents using BERT summarizer <br>
5. Displaying search results in the Flask search UI <br>

## Indexing and Searching in Elasticsearch

### Helper functions

In [4]:
#Indexing data in ES
def index_data():
    print("Creating the index.")
    client.indices.delete(index=INDEX_NAME, ignore=[404])

    with open(INDEX_FILE) as index_file:
        source = index_file.read().strip()
        client.indices.create(index=INDEX_NAME, body=source)

    docs = []
    count = 0

    with open(DATA_FILE) as data_file:
        for line in data_file:
            line = line.strip()

            try:
                doc = json.loads(line)
            except:
                print(count, "Could not load doc:", line)
                continue
        
            docs.append(doc)
            count += 1

            if count % BATCH_SIZE == 0:
                index_batch(docs,count)
                docs = []
                print("Indexed {} documents.".format(count))

        if docs:
            index_batch(docs,count)
            print("Indexed {} documents.".format(count))

    client.indices.refresh(index=INDEX_NAME)
    print("Done indexing.")

def index_batch(docs,count):
    requests = []
    for i, doc in enumerate(docs):
        request = doc
        request["_op_type"] = "index"
        request["_index"] = INDEX_NAME
        request["tm_doc_vec"] = tm_doc_embed(i+count-BATCH_SIZE)
        request["tfidf_doc_vec"] = tfidf_doc_embed(i+count-BATCH_SIZE)
        request["bert_doc_vec"] = bert_doc_embed(i+count-BATCH_SIZE)
        request["w2v_doc_vec"] = w2v_doc_embed(i+count-BATCH_SIZE)
        request["scibert_doc_vec"] = scibert_doc_embed(i+count-BATCH_SIZE)
        request["d2v_doc_vec"] = d2v_doc_embed(i+count-BATCH_SIZE)
        requests.append(request)
    bulk(client, requests)

"""Intermediate func (not used independently anymore)
Created to test the code from within notebook and now, the same functionality is implemented in flask code""" 

def handle_query():
    query = input("Enter query: ")

    embedding_start = time.time()
    tm_query_vec = tm_query_embed(query)
    tfidf_query_vec = tfidf_query_embed(query)
    bert_query_vec = bert_query_embed(query)
    w2v_query_vec = w2v_query_embed(query)
    embedding_time = time.time() - embedding_start

    script_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.w2v_qv, doc['w2v_doc_vec']) + cosineSimilarity(params.tm_qv, doc['tm_doc_vec']) + cosineSimilarity(params.tfidf_qv, doc['tfidf_doc_vec']) + cosineSimilarity(params.bert_qv, doc['bert_doc_vec']) + 4.0",
                "params": {"w2v_qv": w2v_query_vec, "tm_qv": tm_query_vec, "tfidf_qv": tfidf_query_vec, "bert_qv": bert_query_vec}
            }
        }
    }

    search_start = time.time()
    response = client.search(
        index=INDEX_NAME,
        body={
            "size": SEARCH_SIZE,
            "query": script_query,
            "_source": {"includes": ["title", "abstract", "url", "authors"]}
        }
    )
    search_time = time.time() - search_start

    print()
    print("{} total hits.".format(response["hits"]["total"]["value"]))
    print("embedding time: {:.2f} ms".format(embedding_time * 1000))
    print("search time: {:.2f} ms".format(search_time * 1000))
    for hit in response["hits"]["hits"]:
        print("id: {}, score: {}".format(hit["_id"], hit["_score"]))
        print(hit["_source"])
        print()
    
    return response

# Wrapper to run ingestion to ES
def run_query_loop():
    while True:
        try:
            handle_query()
        except KeyboardInterrupt:
            return

## Embedding functions for all models

In [139]:
def tm_doc_embed(idx):
    doc_vec = tm_output[idx,]
    if(sum(doc_vec)==0): #to avoid all zeros (cosine similarity)
        doc_vec = doc_vec + 1e-6
    return doc_vec.tolist()

def tm_query_embed(query):
    clean_query = clean_docs([query])
    tfidf_query_output = vectorizer.transform(clean_query)
    target_vec = svd_model.transform(tfidf_query_output)[0]
    if(sum(target_vec)==0): #to avoid all zeros (cosine similarity)
        target_vec = target_vec + 1e-6
    return target_vec.tolist()

In [6]:
def tfidf_doc_embed(idx):
    doc_vec = [item[1] for item in corpus_lsi[idx]]
    if len(doc_vec)!=300: #to ensure size of vectors is 300
        doc_vec.extend((300-len(doc_vec))*[1e-6])
    return doc_vec

def tfidf_query_embed(query):
    detokenized_compare_doc = clean_docs([query])
    gen_compare_docs = [[w.lower() for w in word_tokenize(text)] for text in detokenized_compare_doc]
    query_doc_bow = dictionary.doc2bow(gen_compare_docs[0])
    query_lsi = lsi[query_doc_bow]
    query_vec_tfidf = [item[1] for item in query_lsi]
    return query_vec_tfidf

In [101]:
generic_bert_model = SentenceTransformer('../bin/models/')

def bert_doc_embed(idx):
    return bert_embeddings[idx].tolist()

def bert_query_embed(query):
    bert_vec = generic_bert_model.encode([query],show_progress_bar=False)
    return bert_vec[0].tolist()

I0707 15:00:59.041806 140149010638656 SentenceTransformer.py:29] Load pretrained SentenceTransformer: ../bin/models/
I0707 15:00:59.043388 140149010638656 SentenceTransformer.py:67] Load SentenceTransformer from folder: ../bin/models/
I0707 15:00:59.078465 140149010638656 configuration_utils.py:281] loading configuration file ../bin/models/0_BERT/config.json
I0707 15:00:59.079786 140149010638656 configuration_utils.py:319] Model config BertConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0

In [8]:
def w2v_doc_embed(idx):
    doc_vec = centroid_matrix[idx,]
    if(sum(doc_vec)==0): #to avoid all zeros (cosine similarity)
        doc_vec = doc_vec + 1e-6
    return doc_vec.tolist()

def w2v_query_embed(query):
    qvec = vectorizer.transform([query]).toarray()
    qvec = qvec[:,idx_present]
    qvec_transformed = np.matmul(qvec,word_embeddings)[0]
    if(sum(qvec_transformed)==0): #to avoid all zeros (cosine similarity)
        qvec_transformed = qvec_transformed + 1e-6
    return qvec_transformed.tolist()

In [122]:
scibert_model = BertModel.from_pretrained("../bin/models/scibert_scivocab_uncased/")
scibert_tokenizer = AutoTokenizer.from_pretrained("../bin/models/scibert_scivocab_uncased/")
device = torch.device("cpu")
scibert_model.to(device)

def scibert_doc_embed(idx):
    return scibert_embeddings[idx][0].tolist()

def scibert_query_embed(query):
    # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
    input_ids = torch.tensor([scibert_tokenizer.encode(query,add_special_tokens=True, max_length=512)],device=device)
#     try:
    with torch.no_grad():
        last_hidden_states = scibert_model(input_ids)  # Models outputs are now tuples
        test = last_hidden_states[0].mean(1).detach()
    return test.cpu()[0].tolist()
#     except Exception as e:
#         print(e)
#         print(f'Was not able to get embeddings for {query}')
#         return []

I0708 08:50:59.138453 140149010638656 configuration_utils.py:281] loading configuration file ../bin/models/scibert_scivocab_uncased/config.json
I0708 08:50:59.139977 140149010638656 configuration_utils.py:319] Model config BertConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "bert",
  "no_repeat_ngram_size": 0,
  "num_attentio

In [10]:
def d2v_doc_embed(idx):
    doc_vec = docvecs[idx]
    if(sum(doc_vec)==0): #to avoid all zeros (cosine similarity)
        doc_vec = doc_vec + 1e-6
    return doc_vec.tolist()

def d2v_query_embed(query):
    tokens = gensim.parsing.preprocess_string(query)
    qvec = doc2vec_model.infer_vector(tokens, steps=50, alpha=0.025)
    return qvec.tolist()

## Loading Saved Models

In [140]:
#load models and vectors for topic modeling
with open('../models/tm_vectors.pkl', 'rb') as f:
    tm_output = pickle.load(f)
    
with open('../models/tm_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

with open('../models/tm_svd_model.pkl', 'rb') as f:
    svd_model = pickle.load(f)

In [12]:
#load models and vectors for TF-IDF
with open('../models/tfidf_dict.pkl', 'rb') as f:
    dictionary = pickle.load(f)
    
with open('../models/tfidf_lsi.pkl', 'rb') as f:
    lsi = pickle.load(f)

with open('../models/tfidf_vectors.pkl', 'rb') as f:
    corpus_lsi = pickle.load(f)

In [13]:
#Load already saved BERT embeddings from models folder
with open('../models/bert_corpus_embed.pkl', 'rb') as f:
    bert_embeddings = pickle.load(f)

In [14]:
with open('../models/centroid_vectors.pkl', 'rb') as f:
    centroid_matrix = pickle.load(f)
    
with open('../models/word_embeddings.pkl', 'rb') as f:
    word_embeddings = pickle.load(f)

with open('../models/common_vocab_idx.pkl', 'rb') as f:
    idx_present = pickle.load(f)

In [15]:
#Load already saved SciBERT embeddings from models folder
with open('../models/scibert_corpus_embed.pkl', 'rb') as f:
    scibert_embeddings = pickle.load(f)

In [16]:
#Save models and vectors for the corpus for word2vec
with open('../models/d2v_model.pkl', 'rb') as f:
    doc2vec_model = pickle.load(f)
    
with open('../models/d2v_vectors.pkl', 'rb') as f:
    docvecs = pickle.load(f)

### Indexing and Querying

In [148]:
INDEX_NAME = "docs"
INDEX_FILE = "../resources/index.json"

DATA_FILE = "../data/cleaned_abstracts.json"
BATCH_SIZE = 10000

SEARCH_SIZE = 10

GPU_LIMIT = 0.5

client = Elasticsearch()

start_time = time.time()
index_data()
print('Time taken for indexing:', time.time()-start_time, 'seconds')

#run_query_loop()

I0708 11:24:03.564091 140149010638656 base.py:117] DELETE http://localhost:9200/docs [status:200 request:0.074s]


Creating the index.


I0708 11:24:03.696436 140149010638656 base.py:117] PUT http://localhost:9200/docs [status:200 request:0.130s]
I0708 11:24:24.686377 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:1.140s]
I0708 11:24:27.087969 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.897s]
I0708 11:24:29.467203 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.884s]
I0708 11:24:31.868253 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.923s]
I0708 11:24:34.311772 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.960s]
I0708 11:24:36.744939 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.956s]
I0708 11:24:39.164820 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.931s]
I0708 11:24:41.587873 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.

Indexed 10000 documents.


I0708 11:25:33.077422 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.841s]
I0708 11:25:35.438446 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.914s]
I0708 11:25:37.754664 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.885s]
I0708 11:25:40.229357 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:1.017s]
I0708 11:25:42.578551 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.919s]
I0708 11:25:44.970600 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.921s]
I0708 11:25:47.353524 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.908s]
I0708 11:25:49.710479 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.898s]
I0708 11:25:52.028752 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:

Indexed 20000 documents.


I0708 11:26:41.607923 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.965s]
I0708 11:26:44.011717 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.903s]
I0708 11:26:46.412694 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.940s]
I0708 11:26:48.892685 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.988s]
I0708 11:26:51.264645 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.946s]
I0708 11:26:53.633738 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.930s]
I0708 11:26:56.000675 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.946s]
I0708 11:26:58.413725 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.984s]
I0708 11:27:00.849692 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:

Indexed 30000 documents.


I0708 11:27:48.778278 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.872s]
I0708 11:27:51.133156 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.877s]
I0708 11:27:53.541770 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.945s]
I0708 11:27:55.837228 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.876s]
I0708 11:27:58.222113 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.947s]
I0708 11:28:00.549819 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.887s]
I0708 11:28:02.839428 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.852s]
I0708 11:28:05.275180 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.916s]
I0708 11:28:07.623777 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:

Indexed 40000 documents.


I0708 11:28:56.909611 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:1.000s]
I0708 11:28:59.388112 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.953s]
I0708 11:29:01.738186 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.892s]
I0708 11:29:04.151578 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.952s]
I0708 11:29:06.569805 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.953s]
I0708 11:29:09.134556 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.959s]
I0708 11:29:11.631788 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.931s]
I0708 11:29:14.036104 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.947s]
I0708 11:29:16.393944 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:

Indexed 50000 documents.


I0708 11:30:06.117877 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.856s]
I0708 11:30:08.554373 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.966s]
I0708 11:30:10.904803 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.895s]
I0708 11:30:13.245454 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.878s]
I0708 11:30:15.724876 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.966s]
I0708 11:30:18.121310 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.902s]
I0708 11:30:20.560724 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.899s]
I0708 11:30:22.908934 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.877s]
I0708 11:30:25.291285 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:

Indexed 60000 documents.


I0708 11:31:13.583811 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.911s]
I0708 11:31:15.980944 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.878s]
I0708 11:31:18.480574 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.974s]
I0708 11:31:20.962885 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.962s]
I0708 11:31:23.372315 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.910s]
I0708 11:31:25.714727 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.865s]
I0708 11:31:28.030007 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.865s]
I0708 11:31:30.512045 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:1.054s]
I0708 11:31:32.922219 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:

Indexed 70000 documents.


I0708 11:32:22.662954 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.885s]
I0708 11:32:25.122667 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.934s]
I0708 11:32:27.488504 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.879s]
I0708 11:32:29.869364 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.866s]
I0708 11:32:32.264664 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.906s]
I0708 11:32:34.694537 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.941s]
I0708 11:32:37.225919 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.932s]
I0708 11:32:39.605862 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.893s]
I0708 11:32:41.978754 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:

Indexed 80000 documents.


I0708 11:33:31.279854 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.880s]
I0708 11:33:33.659977 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.914s]
I0708 11:33:36.026925 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.855s]
I0708 11:33:38.350382 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.877s]
I0708 11:33:40.691431 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.861s]
I0708 11:33:43.106203 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.888s]
I0708 11:33:45.630390 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.882s]
I0708 11:33:48.068383 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.925s]
I0708 11:33:50.553371 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:

Indexed 90000 documents.


I0708 11:34:39.687284 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.899s]
I0708 11:34:42.087421 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.884s]
I0708 11:34:44.553508 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.936s]
I0708 11:34:46.987158 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.891s]
I0708 11:34:49.446610 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.949s]
I0708 11:34:52.003087 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:1.042s]
I0708 11:34:54.378403 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.862s]
I0708 11:34:56.787267 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.897s]
I0708 11:34:59.124922 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:

Indexed 100000 documents.


I0708 11:35:37.358636 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.889s]
I0708 11:35:40.711387 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:1.853s]
I0708 11:35:43.081490 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.903s]
I0708 11:35:45.448853 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.870s]
I0708 11:35:48.082733 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:1.152s]
I0708 11:35:50.421741 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.878s]
I0708 11:35:52.693308 140149010638656 base.py:117] POST http://localhost:9200/_bulk [status:200 request:0.830s]


Indexed 103482 documents.


I0708 11:35:52.900137 140149010638656 base.py:117] POST http://localhost:9200/docs/_refresh [status:200 request:0.201s]


Done indexing.
Time taken for indexing: 709.6641943454742 seconds


## Extractive Text Summarization

In [232]:
custom_config = AutoConfig.from_pretrained('../bin/models/bert/')
custom_config.output_hidden_states = True
custom_tokenizer = AutoTokenizer.from_pretrained('../bin/models/bert')
custom_model = AutoModel.from_pretrained('../bin/models/bert', config=custom_config)
summ_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)

#Pass the concatenated text string from top docs for summarisation
def get_summary(text):
    body = str(text)
#     model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
    summary = summ_model(body, min_length=60, ratio=0.1)
    return summary

I0715 09:13:15.733747 140149010638656 configuration_utils.py:281] loading configuration file ../bin/models/bert/config.json
I0715 09:13:15.735370 140149010638656 configuration_utils.py:319] Model config BertConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "bert",
  "no_repeat_ngram_size": 0,
  "num_attention_heads": 12,
  "num

## Searching via Flask Interface

In [264]:
app = Flask(__name__, template_folder='../resources')

@app.route('/')
def index():
    return render_template('index.html')
#     return render_template('testpage.html')


@app.route('/search')
def analyser():
    query = request.args.get('q')
    
    if query:
        embedding_start = time.time()
        tm_query_vec = tm_query_embed(query)
        tfidf_query_vec = tfidf_query_embed(query)
        bert_query_vec = bert_query_embed(query)
        w2v_query_vec = w2v_query_embed(query)
        scibert_query_vec = scibert_query_embed(query)
        d2v_query_vec = d2v_query_embed(query)
        embedding_time = time.time() - embedding_start
        
        tm_cos_sim = "cosineSimilarity(params.tm_qv, doc['tm_doc_vec']) + 1.0"
        tfidf_cos_sim = "cosineSimilarity(params.tfidf_qv, doc['tfidf_doc_vec']) + 1.0"
        bert_cos_sim = "cosineSimilarity(params.bert_qv, doc['bert_doc_vec']) + 1.0"
        w2v_cos_sim = "cosineSimilarity(params.w2v_qv, doc['w2v_doc_vec']) + 1.0"
        scibert_cos_sim = "cosineSimilarity(params.scibert_qv, doc['scibert_doc_vec']) + 1.0"
        d2v_cos_sim = "cosineSimilarity(params.d2v_qv, doc['d2v_doc_vec']) + 1.0"
        sep = " + "
        
        #choose models to include here
        similarity_func = d2v_cos_sim + sep + tfidf_cos_sim + sep + bert_cos_sim + sep + w2v_cos_sim
        
        script_query = {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": similarity_func,
                    "params": {"d2v_qv": d2v_query_vec, "scibert_qv": scibert_query_vec, "w2v_qv": w2v_query_vec, "tm_qv": tm_query_vec, "tfidf_qv": tfidf_query_vec, "bert_qv": bert_query_vec}
                }
            }
        }

        search_start = time.time()
        response = client.search(
            index=INDEX_NAME,
            body={
                "size": SEARCH_SIZE,
                "query": script_query,
                "_source": {"includes": ["title", "abstract", "url", "authors"]}
            }
        )
        search_time = time.time() - search_start

        #Call to summarizer
        output_text = ""
        for hit in response['hits']['hits']:
            selected_text = hit['_source']['abstract']
            output_text = output_text + " " + selected_text

        summary = summarize(output_text)

        result_disp = {"top_docs" : response, "summary" : summary}
        return jsonify(result_disp)
    
    else:
        return 
app.run(host='0.0.0.0', port=5000)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


I0727 11:24:11.520863 140149010638656 _internal.py:113]  * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)
I0727 11:24:19.447227 140142282270464 _internal.py:113] 10.7.28.135 - - [27/Jul/2020 11:24:19] "[37mGET / HTTP/1.1[0m" 200 -
E0727 11:24:20.481744 140142282270464 app.py:1891] Exception on /search [GET]
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/flask/app.py", line 2447, in wsgi_app
    response = self.full_dispatch_request()
  File "/usr/local/lib/python3.8/dist-packages/flask/app.py", line 1953, in full_dispatch_request
    return self.finalize_request(rv)
  File "/usr/local/lib/python3.8/dist-packages/flask/app.py", line 1968, in finalize_request
    response = self.make_response(rv)
  File "/usr/local/lib/python3.8/dist-packages/flask/app.py", line 2097, in make_response
    raise TypeError(
TypeError: The view function did not return a valid response. The function either returned None or ended without a return statement.
I0727 1

# Validation of results

## Calculating MRR using titles as queries

In [237]:
#function to find reciprocal rank for a given query in the topk results (edit similarity_func to choose model combination)
def find_rr(query, topk=20):
    if query:
        embedding_start = time.time()
        tm_query_vec = tm_query_embed(query)
        tfidf_query_vec = tfidf_query_embed(query)
        bert_query_vec = bert_query_embed(query)
        w2v_query_vec = w2v_query_embed(query)
        scibert_query_vec = scibert_query_embed(query)
        d2v_query_vec = d2v_query_embed(query)
        embedding_time = time.time() - embedding_start
        
        tm_cos_sim = "cosineSimilarity(params.tm_qv, doc['tm_doc_vec']) + 1.0"
        tfidf_cos_sim = "cosineSimilarity(params.tfidf_qv, doc['tfidf_doc_vec']) + 1.0"
        bert_cos_sim = "cosineSimilarity(params.bert_qv, doc['bert_doc_vec']) + 1.0"
        w2v_cos_sim = "cosineSimilarity(params.w2v_qv, doc['w2v_doc_vec']) + 1.0"
        scibert_cos_sim = "cosineSimilarity(params.scibert_qv, doc['scibert_doc_vec']) + 1.0"
        d2v_cos_sim = "cosineSimilarity(params.d2v_qv, doc['d2v_doc_vec']) + 1.0"
        sep = " + "
        
        #choose models to include here
        similarity_func = d2v_cos_sim + sep + tfidf_cos_sim + sep + bert_cos_sim
#         similarity_func = bert_cos_sim 
        
        script_query = {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": similarity_func,
                    "params": {"d2v_qv": d2v_query_vec, "scibert_qv": scibert_query_vec, "w2v_qv": w2v_query_vec, "tm_qv": tm_query_vec, "tfidf_qv": tfidf_query_vec, "bert_qv": bert_query_vec}
                }
            }
        }

        search_start = time.time()
        response = client.search(
            index=INDEX_NAME,
            body={
                "size": topk,
                "query": script_query,
                "_source": {"includes": ["title", "abstract", "url", "authors"]}
            }
        )
        search_time = time.time() - search_start
        
        rr = 0
        for i, hit in enumerate(response['hits']['hits']):
            if (query == hit['_source']['title']):
                rr = 1/(i+1)
                break
                
        return rr, embedding_time, search_time

In [242]:
#find MRR over a set of <query_set_size> randomly chosen titles in the <topk> results, averaged over <n_iter> runs
n_iter = 10
query_set_size = 100
topk = 20

avg_rr = 0
avg_et = 0
avg_st = 0
    
for j in range(n_iter):
    print("Iteration", j+1, "of", n_iter)
    title_queries = list(np.random.choice(df['title'], query_set_size))

    rrs = np.zeros(query_set_size)
    ets = np.zeros(query_set_size)
    sts = np.zeros(query_set_size)

    for i, query in enumerate(title_queries):
        rrs[i], ets[i], sts[i] = find_rr(query)
        
    avg_rr += np.mean(rrs)
    avg_et += np.mean(ets)
    avg_st += np.mean(sts)
    
avg_rr /= n_iter
avg_et /= n_iter
avg_st /= n_iter

Iteration 1 of 10


I0715 09:59:16.636134 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.204s]
I0715 09:59:16.942436 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.194s]
I0715 09:59:17.247847 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.190s]
I0715 09:59:17.584470 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.192s]
I0715 09:59:17.886944 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.191s]
I0715 09:59:18.184077 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.190s]
I0715 09:59:18.503269 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.198s]
I0715 09:59:18.789981 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.179s]
I0715 09:59:19.092944 140149010638656 base.py:117] GET h

I0715 09:59:38.584737 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.193s]
I0715 09:59:38.888407 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.184s]
I0715 09:59:39.222038 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.189s]
I0715 09:59:39.545061 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.191s]
I0715 09:59:39.852816 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.193s]
I0715 09:59:40.166856 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.199s]
I0715 09:59:40.477202 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.196s]
I0715 09:59:40.776271 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.187s]
I0715 09:59:41.096950 140149010638656 base.py:117] GET h

Iteration 2 of 10


I0715 09:59:48.118515 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.188s]
I0715 09:59:48.531098 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.198s]
I0715 09:59:48.874617 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.185s]
I0715 09:59:49.222990 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.197s]
I0715 09:59:49.533231 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.189s]
I0715 09:59:49.848009 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.201s]
I0715 09:59:50.139804 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.182s]
I0715 09:59:50.428166 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.179s]
I0715 09:59:50.805745 140149010638656 base.py:117] GET h

I0715 10:00:11.165320 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.179s]
I0715 10:00:11.493344 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.194s]
I0715 10:00:11.810235 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.188s]
I0715 10:00:12.131139 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.187s]
I0715 10:00:12.431967 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.192s]
I0715 10:00:12.728568 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.187s]
I0715 10:00:13.051220 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.200s]
I0715 10:00:13.355598 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.189s]
I0715 10:00:13.747704 140149010638656 base.py:117] GET h

Iteration 3 of 10


I0715 10:00:20.939037 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.189s]
I0715 10:00:21.269016 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.183s]
I0715 10:00:21.593097 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.183s]
I0715 10:00:21.901836 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.190s]
I0715 10:00:22.230304 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.197s]
I0715 10:00:22.557034 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.195s]
I0715 10:00:22.864130 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.192s]
I0715 10:00:23.171424 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.195s]
I0715 10:00:23.490894 140149010638656 base.py:117] GET h

I0715 10:00:43.748319 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.178s]
I0715 10:00:44.059580 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.181s]
I0715 10:00:44.378105 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.177s]
I0715 10:00:44.685174 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.180s]
I0715 10:00:45.115767 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.182s]
I0715 10:00:45.440958 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.180s]
I0715 10:00:45.741094 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.183s]
I0715 10:00:46.048059 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.181s]
I0715 10:00:46.365478 140149010638656 base.py:117] GET h

Iteration 4 of 10


I0715 10:00:53.558895 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.182s]
I0715 10:00:53.892428 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.193s]
I0715 10:00:54.197854 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.181s]
I0715 10:00:54.513282 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.181s]
I0715 10:00:54.840099 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.203s]
I0715 10:00:55.159186 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.190s]
I0715 10:00:55.467224 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.188s]
I0715 10:00:55.785732 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.185s]
I0715 10:00:56.097991 140149010638656 base.py:117] GET h

I0715 10:01:15.994292 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.186s]
I0715 10:01:16.313211 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.191s]
I0715 10:01:16.685378 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.191s]
I0715 10:01:17.078302 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.190s]
I0715 10:01:17.400734 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.189s]
I0715 10:01:17.725008 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.189s]
I0715 10:01:18.108032 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.193s]
I0715 10:01:18.426303 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.190s]
I0715 10:01:18.809200 140149010638656 base.py:117] GET h

Iteration 5 of 10


I0715 10:01:25.977411 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.190s]
I0715 10:01:26.304103 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.188s]
I0715 10:01:26.640372 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.185s]
I0715 10:01:26.969425 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.194s]
I0715 10:01:27.266829 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.186s]
I0715 10:01:27.596783 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.183s]
I0715 10:01:28.001995 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.183s]
I0715 10:01:28.304457 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.182s]
I0715 10:01:28.595406 140149010638656 base.py:117] GET h

I0715 10:01:48.375416 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.194s]
I0715 10:01:48.694302 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.184s]
I0715 10:01:49.006045 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.179s]
I0715 10:01:49.333216 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.186s]
I0715 10:01:49.646908 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.181s]
I0715 10:01:49.976167 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.191s]
I0715 10:01:50.343359 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.191s]
I0715 10:01:50.653695 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.190s]
I0715 10:01:50.959171 140149010638656 base.py:117] GET h

Iteration 6 of 10


I0715 10:01:57.902716 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.185s]
I0715 10:01:58.199816 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.178s]
I0715 10:01:58.498861 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.186s]
I0715 10:01:58.920367 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.186s]
I0715 10:01:59.263663 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.206s]
I0715 10:01:59.599479 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.191s]
I0715 10:01:59.901544 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.182s]
I0715 10:02:00.223704 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.184s]
I0715 10:02:00.584192 140149010638656 base.py:117] GET h

I0715 10:02:20.521561 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.181s]
I0715 10:02:20.833362 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.191s]
I0715 10:02:21.140510 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.194s]
I0715 10:02:21.440089 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.188s]
I0715 10:02:21.755851 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.191s]
I0715 10:02:22.088447 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.186s]
I0715 10:02:22.396240 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.182s]
I0715 10:02:22.724439 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.195s]
I0715 10:02:23.038322 140149010638656 base.py:117] GET h

Iteration 7 of 10


I0715 10:02:29.893767 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.183s]
I0715 10:02:30.194873 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.183s]
I0715 10:02:30.495238 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.184s]
I0715 10:02:30.821798 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.178s]
I0715 10:02:31.145308 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.183s]
I0715 10:02:31.445630 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.178s]
I0715 10:02:31.740606 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.185s]
I0715 10:02:32.053350 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.181s]
I0715 10:02:32.367421 140149010638656 base.py:117] GET h

I0715 10:02:52.950793 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.198s]
I0715 10:02:53.254799 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.189s]
I0715 10:02:53.551855 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.189s]
I0715 10:02:53.852004 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.187s]
I0715 10:02:54.227219 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.193s]
I0715 10:02:54.535700 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.190s]
I0715 10:02:54.830188 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.183s]
I0715 10:02:55.121441 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.178s]
I0715 10:02:55.431805 140149010638656 base.py:117] GET h

Iteration 8 of 10


I0715 10:03:02.560256 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.195s]
I0715 10:03:02.887798 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.179s]
I0715 10:03:03.207778 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.203s]
I0715 10:03:03.499158 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.181s]
I0715 10:03:03.810112 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.189s]
I0715 10:03:04.103625 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.185s]
I0715 10:03:04.389471 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.178s]
I0715 10:03:04.686956 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.182s]
I0715 10:03:05.042856 140149010638656 base.py:117] GET h

I0715 10:03:25.068977 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.185s]
I0715 10:03:25.384561 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.190s]
I0715 10:03:25.782269 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.193s]
I0715 10:03:26.098645 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.197s]
I0715 10:03:26.394313 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.184s]
I0715 10:03:26.781532 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.191s]
I0715 10:03:27.152243 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.199s]
I0715 10:03:27.473193 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.193s]
I0715 10:03:27.775945 140149010638656 base.py:117] GET h

Iteration 9 of 10


I0715 10:03:34.908559 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.188s]
I0715 10:03:35.243477 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.216s]
I0715 10:03:35.546101 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.187s]
I0715 10:03:35.888500 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.190s]
I0715 10:03:36.181612 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.187s]
I0715 10:03:36.473277 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.185s]
I0715 10:03:36.829771 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.190s]
I0715 10:03:37.144437 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.182s]
I0715 10:03:37.455854 140149010638656 base.py:117] GET h

I0715 10:03:57.843369 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.187s]
I0715 10:03:58.148857 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.185s]
I0715 10:03:58.459106 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.185s]
I0715 10:03:58.885739 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.187s]
I0715 10:03:59.200381 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.189s]
I0715 10:03:59.510032 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.184s]
I0715 10:03:59.813336 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.194s]
I0715 10:04:00.110513 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.181s]
I0715 10:04:00.543137 140149010638656 base.py:117] GET h

Iteration 10 of 10


I0715 10:04:07.297820 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.189s]
I0715 10:04:07.607109 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.200s]
I0715 10:04:07.901597 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.182s]
I0715 10:04:08.388042 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.184s]
I0715 10:04:08.700510 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.199s]
I0715 10:04:09.005194 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.191s]
I0715 10:04:09.311671 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.194s]
I0715 10:04:09.615206 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.190s]
I0715 10:04:09.922093 140149010638656 base.py:117] GET h

I0715 10:04:29.733191 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.189s]
I0715 10:04:30.051321 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.183s]
I0715 10:04:30.381744 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.193s]
I0715 10:04:30.690433 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.177s]
I0715 10:04:30.993816 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.179s]
I0715 10:04:31.317763 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.192s]
I0715 10:04:31.647501 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.188s]
I0715 10:04:31.964856 140149010638656 base.py:117] GET http://localhost:9200/docs/_search [status:200 request:0.187s]
I0715 10:04:32.285277 140149010638656 base.py:117] GET h

In [243]:
#print MRR, mean embedding and query times
print("Mean Reciprocal Rank (MRR) :", avg_rr)
print("Mean embedding time :", avg_et, 'seconds')
print("Mean search time :", avg_st, 'seconds')

Mean Reciprocal Rank (MRR) : 0.3434526698104098
Mean embedding time : 0.13016126799583433 seconds
Mean search time : 0.19235228180885314 seconds
