In [1]:
%reload_ext memory_profiler

# NOTE: this silences *every* warning for the rest of the kernel session,
# including deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings(action='ignore')

# ----------------- Classics -------------------- #
import numpy as np
import pandas as pd

# ---------------- Pandas settings --------------- #
# Removes rows and columns truncation of '...'
# (None = no limit, so displaying a large frame prints all of it)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# ------------------- Python libs ---------------- #
import os, sys, re
import pprint
pp = pprint.PrettyPrinter(indent=4)
from pathlib import Path
# Path() is the current working directory, so ROOT_PATH is its parent.
# Presumably the notebook is run from a sub-folder of the project root -- TODO confirm.
ROOT_PATH = Path().resolve().parent
sys.path.append(str(ROOT_PATH)) # Add folder root path so `utils` (imported below) resolves

import typing as t
import timeit

from tqdm import tqdm
# Registers `progress_apply` on pandas objects (used later in this notebook).
tqdm.pandas()

# ------------------- NLP libs ---------------------- #
# Project-local tokenizer; found through the sys.path entry added above.
from utils.tokenizer import Tokenizer

We know that searching with `query+question` improves on the baseline TF-IDF model. The next step is to improve the query further by identifying only the key entities in `question` and adding those to it. Before doing that, let's look at the terms in each query, see how stopword removal affects them, and explore how SciSpacy can help refine the query and possibly drive query expansion.


# 1. Load topics

In [2]:
def load_queries(input_fpath: Path, dtype: str = 'csv', cols_to_keep=['topic-id', 'query', 'question'], index_col=['topic-id']) -> pd.DataFrame:
    """Load the queries (topics) file and return it as a pandas data frame.

    Every string column is normalised: leading/trailing whitespace is
    stripped and any run of two or more whitespace characters is collapsed
    into a single space.

    Args:
        input_fpath: Path to the queries file.
        dtype: File format of `input_fpath`. Only 'csv' is supported.
        cols_to_keep: Columns to read from the file (passed to `usecols`).
        index_col: Column(s) to use as the data frame index.

    Returns:
        The loaded data frame, indexed by `index_col`.

    Raises:
        ValueError: If `dtype` is not a supported format.
    """
    # Fail loudly instead of falling through to `return df` with `df` unbound.
    if dtype != 'csv':
        raise ValueError(f"Unsupported dtype {dtype!r}; only 'csv' is supported")

    df = pd.read_csv(input_fpath, quotechar='"', index_col=index_col, usecols=cols_to_keep)
    for col in df.columns:
        # only text columns have the `.str` accessor
        if pd.api.types.is_string_dtype(df[col]):
            df[col] = (
                df[col]
                .str.strip()  # removes front and end white spaces
                # regex=True must be explicit: newer pandas treats the pattern
                # as a literal string by default, which would match nothing.
                .str.replace(r'\s{2,}', ' ', regex=True)
            )
    return df

# Round-3 topics file, given relative to the notebook's working directory.
QUERY_FPATH = Path('../data/CORD-19/CORD-19/topics-rnd3.csv')
query_df = load_queries(QUERY_FPATH)
# Concatenate the short keyword query and the natural-language question
# (separated by one space) into a single search string per topic.
query_df['query+question'] = query_df['query'] + ' ' + query_df['question']
query_df.head()

Unnamed: 0_level_0,query,question,query+question
topic-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,coronavirus origin,what is the origin of COVID-19,coronavirus origin what is the origin of COVID-19
2,coronavirus response to weather changes,how does the coronavirus respond to changes in...,coronavirus response to weather changes how do...
3,coronavirus immunity,will SARS-CoV2 infected people develop immunit...,coronavirus immunity will SARS-CoV2 infected p...
4,how do people die from the coronavirus,what causes death from Covid-19?,how do people die from the coronavirus what ca...
5,animal models of COVID-19,what drugs have been active against SARS-CoV o...,animal models of COVID-19 what drugs have been...


### Effects of Stopwords removal and using SciSpacy pre_processing

In [3]:
# 1. Init three tokenizers, each adding one more normalisation step than the
#    previous one. The mode string is a '-'-separated list of flags; the
#    meaning of each flag (per the inline notes) is defined in utils.tokenizer.
tk_nltk_stop = Tokenizer('l-s') # lowercase + stopwords
punct_nltk_stop = Tokenizer('l-s-pt') # lowercase + stopwords + punctuation
lem_nltk_stop = Tokenizer('l-s-pt-lm') # lowercase + stopwords + punctuation + lemmatizing

In [4]:
# 2. Init custom stopwords tokenizer
# NOTE(review): the flags here are ordered 'l-s-lm-pt' whereas the NLTK
# variant uses 'l-s-pt-lm'; whether flag order matters depends on Tokenizer
# -- TODO confirm.
tk_cus_stop = Tokenizer('l-s-lm-pt')
custom_stopwords_path = ROOT_PATH / 'utils/stopwords.txt'
# combine_w_nltk=True: presumably merges the custom list with NLTK's
# stopwords rather than replacing them -- verify in utils.tokenizer.
tk_cus_stop.load_stop_words(custom_stopwords_path, combine_w_nltk=True)

	Custom STOPWORDS loaded successfully !(^^)!


In [5]:
# 3. Load Scispacy model
# `en_core_sci_md` must be installed in the kernel's environment; `nlp` is
# used below for entity extraction via `doc.ents`.
import spacy
nlp = spacy.load('en_core_sci_md')

In [6]:
# Pre-processing variants to compare, as (label printed in the output, tokenizer).
tokenizer_variants = (
    ("NLTK low + stop", tk_nltk_stop),
    ("NLTK low + stop + punct", punct_nltk_stop),
    ("NLTK low + stop + punct + lemma", lem_nltk_stop),
    ("CUSTOM STOPWORDS", tk_cus_stop),
)

for topic_id, row in query_df.iterrows():
    topic, question = row['query'], row['question']
    print(f"{topic_id}")
    print(f"\t[BASE]: {topic}, {question}")
    # One output line per variant: cleaned query, then cleaned question.
    for label, tokenizer in tokenizer_variants:
        cleaned_topic = ' '.join(tokenizer.tokenize(topic))
        cleaned_question = ' '.join(tokenizer.tokenize(question))
        print(f"\t[{label}]: {cleaned_topic}, {cleaned_question}")
    # Entities SciSpacy finds in the raw (un-normalised) question.
    ents = [{ent.label_: ent.text} for ent in nlp(question).ents]
    print(f"\t[NER]: {ents}")

1
	[BASE]: coronavirus origin, what is the origin of COVID-19
	[NLTK low + stop]: coronavirus origin, origin covid-19
	[NLTK low + stop + punct]: coronavirus origin, origin covid-19
	[NLTK low + stop + punct + lemma]: coronavirus origin, origin covid-19
	[CUSTOM STOPWORDS]: coronavirus origin, origin covid-19
	[NER]: [{'ENTITY': 'origin'}, {'ENTITY': 'COVID-19'}]
2
	[BASE]: coronavirus response to weather changes, how does the coronavirus respond to changes in the weather
	[NLTK low + stop]: coronavirus response weather changes, coronavirus respond changes weather
	[NLTK low + stop + punct]: coronavirus response weather changes, coronavirus respond changes weather
	[NLTK low + stop + punct + lemma]: coronavirus response weather change, coronavirus respond change weather
	[CUSTOM STOPWORDS]: coronavirus response weather change, coronavirus respond change weather
	[NER]: [{'ENTITY': 'coronavirus'}, {'ENTITY': 'changes'}, {'ENTITY': 'weather'}]
3
	[BASE]: coronavirus immunity, will SARS-C

The `[NLTK low + stop + punct + lemma]` combination appears to retain more of the query's context than `[CUSTOM STOPWORDS]`. For example, in Topic #15 the custom stopword list removes `outside` from the query, which changes the query's meaning.



## Cleaning `query+question`

In [7]:
# Tokenizer used by `clean_text`; defined first so the function's dependency
# is visible before it is read.
tk = Tokenizer('l-s-pt-lm')


def clean_text(text: str) -> str:
    """Tokenize `text` with the module-level `tk` and re-join the tokens with single spaces."""
    tokens = tk.tokenize(text)
    return ' '.join(tokens)


query_df['cleaned_query'] = query_df['query+question'].progress_apply(clean_text)
query_df.head()

100%|██████████| 40/40 [00:00<00:00, 725.33it/s]


Unnamed: 0_level_0,query,question,query+question,cleaned_query
topic-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,coronavirus origin,what is the origin of COVID-19,coronavirus origin what is the origin of COVID-19,coronavirus origin origin covid-19
2,coronavirus response to weather changes,how does the coronavirus respond to changes in...,coronavirus response to weather changes how do...,coronavirus response weather change coronaviru...
3,coronavirus immunity,will SARS-CoV2 infected people develop immunit...,coronavirus immunity will SARS-CoV2 infected p...,coronavirus immunity sars-cov2 infected people...
4,how do people die from the coronavirus,what causes death from Covid-19?,how do people die from the coronavirus what ca...,people die coronavirus cause death covid-19
5,animal models of COVID-19,what drugs have been active against SARS-CoV o...,animal models of COVID-19 what drugs have been...,animal model covid-19 drug active sars-cov sar...


## Connect to Elasticsearch Index

In [8]:
import json
import time
import os
from elasticsearch import Elasticsearch

# Name of the index queried by the search helpers below.
INDEX_NAME = 'abstract'

# Connection settings (cloud id, user, password) are read from a local
# `elastic.json` so that no credentials are hard-coded in the notebook.
with open("elastic.json") as elastic_file:
    ELASTIC_SETTINGS = json.load(elastic_file)

es_client = Elasticsearch(
    cloud_id=ELASTIC_SETTINGS["cloud_id"],
    http_auth=(ELASTIC_SETTINGS["user"], ELASTIC_SETTINGS["password"]),
)
# ping() returns a boolean instead of raising, so a failed connection shows up here as False.
print(f'Is ES client connected ? - {es_client.ping()}')

Is ES client connected ? - True


In [9]:
def search_on_abstract(query, qid, run_name, es_client, top_k=10, verbose=False, index_name=None):
    """Search the `abstract` field with Elasticsearch and format the hits as TREC run lines.

    Args:
        query: Text matched against the `abstract` field.
        qid: Topic/query id written in the first column of every line.
        run_name: Run tag written in the last column of every line.
        es_client: Client object exposing `search(index=..., body=...)`.
        top_k: Maximum number of hits to request.
        verbose: If True, print cord_uid, title and score of every hit.
        index_name: Index to search. Defaults to the module-level
            `INDEX_NAME` when None.

    Returns:
        A list of strings, one per hit and in rank order, formatted as
        "<qid> Q0 <cord_uid> <rank> <score> <run_name>\n" (rank is 1-based).

    Raises:
        KeyError: If a hit has no `cord_uid` in its `_source`.
    """
    # 1. create trec-covid template
    template = "{} Q0 {} {} {:.6f} {}\n"
    # 2. create ES search query
    search = {
        "size": top_k,
        "query": {"match": {"abstract": query}},
        "_source": {"includes": ["cord_uid", "title"]}
    }
    response = es_client.search(
        index=INDEX_NAME if index_name is None else index_name,
        body=json.dumps(search)
    )
    ranked_lists = []
    for rank, hit in enumerate(response["hits"]["hits"], start=1):
        source = hit["_source"]
        cord_uid = source["cord_uid"]
        score = hit["_score"]
        ranked_lists.append(template.format(qid, cord_uid, rank, score, run_name))
        if verbose:
            print("\tcord_id: {}".format(cord_uid))
            # The title is only needed for display; a document indexed
            # without one must not abort the whole run, so use .get().
            print("\ttitle: {}".format(source.get("title")))
            print("\tscore: {}".format(score))
            print()
    return ranked_lists

def write_results(out_fpath, query_df, query_txt_col, es_client, run_name, top_k=1000):
    """Run every query in `query_df` against Elasticsearch and save the ranked hits.

    For each row, the text in column `query_txt_col` is searched with
    `search_on_abstract` (the row's index value is used as the query id) and
    the returned run lines are appended to `out_fpath`, which is overwritten
    if it already exists.
    """
    with open(out_fpath, 'w', encoding='utf-8') as out_file:
        for qid, row in query_df.iterrows():
            out_file.writelines(
                search_on_abstract(row[query_txt_col], qid, run_name, es_client, top_k=top_k)
            )
    print(f"Wrote file @ {out_fpath}\n")

In [10]:
%%time
# Run tag: used both as the output file name and as the last column of every run line.
run_name = 'elasticsearch_baseline_abstract_cleaned_query'
# Column of `query_df` whose text is sent to Elasticsearch.
query_txt_col = 'cleaned_query'
out_fpath = Path('../data/output') / f'{run_name}.txt'
# top_k is left at write_results' default (1000 hits per topic).
write_results(out_fpath, query_df, query_txt_col, es_client, run_name)

Wrote file @ ../data/output/elasticsearch_baseline_abstract_cleaned_query.txt

CPU times: user 258 ms, sys: 6.7 ms, total: 265 ms
Wall time: 24.2 s


In [11]:
import subprocess

run_name = "elasticsearch_baseline_abstract_cleaned_query"
path_to_qrel_file = "../data/qrels/qrels-covid_d3_j0.5-3.txt"
path_to_result_file = f"../data/output/{run_name}.txt"
output_result_path = f"../data/results/{run_name}_trec_eval.txt"

# Make sure the results folder exists before redirecting output into it.
os.makedirs(os.path.dirname(output_result_path), exist_ok=True)

# Run trec_eval with an argument list (no shell string to build or quote) and
# check=True, so a missing binary or a failed evaluation raises here instead
# of silently leaving an empty or stale metrics file to be printed below.
with open(output_result_path, 'wb') as eval_file:
    subprocess.run(
        ["trec_eval", "-c", "-m", "all_trec", path_to_qrel_file, path_to_result_file],
        stdout=eval_file,
        check=True,
    )

with open(output_result_path, encoding='utf-8') as f:
    print(f.read())

runid                 	all	elasticsearch_baseline_abstract_cleaned_query
num_q                 	all	40
num_ret               	all	40000
num_rel               	all	10001
num_rel_ret           	all	3996
map                   	all	0.1607
gm_map                	all	0.1107
Rprec                 	all	0.2514
bpref                 	all	0.3462
recip_rank            	all	0.7767
iprec_at_recall_0.00  	all	0.8260
iprec_at_recall_0.10  	all	0.4528
iprec_at_recall_0.20  	all	0.3348
iprec_at_recall_0.30  	all	0.2322
iprec_at_recall_0.40  	all	0.1438
iprec_at_recall_0.50  	all	0.0801
iprec_at_recall_0.60  	all	0.0374
iprec_at_recall_0.70  	all	0.0096
iprec_at_recall_0.80  	all	0.0000
iprec_at_recall_0.90  	all	0.0000
iprec_at_recall_1.00  	all	0.0000
P_5                   	all	0.6000
P_10                  	all	0.5775
P_15                  	all	0.5467
P_20                  	all	0.5262
P_30                  	all	0.4933
P_100                 	all	0.3472
P_200                 	all	0.2583
P_500            

- `MAP` - 0.1607
- `NDCG@10` - 0.5356
- `P@5` - 0.6000
- `R@1000` - 0.4199

The results are very close with or without query normalization (stopword removal, case folding), so cleaning the query makes little difference here. Next, let's try query expansion using word embeddings trained on the dataset.

## Query Expansion

In [12]:
from gensim.models import KeyedVectors
from IPython.display import display_html
# Load the CBOW word2vec embeddings saved by gensim.
# NOTE(review): later cells access `cbow_vectors.wv`, so the saved object is
# presumably a full Word2Vec model (or gensim 3.x vectors) -- confirm against
# the installed gensim version.
cbow_vectors = KeyedVectors.load("../data/embeddings/cbow_word2vec.bin")

In [13]:
# Sanity check: the 10 nearest neighbours of 'quarantine' in the embedding space.
# NOTE(review): `.wv` presumably does not exist on plain gensim 4 KeyedVectors -- confirm.
cbow_vectors.wv.most_similar(positive='quarantine', topn=10)

[('distancing', 0.8528013229370117),
 ('strict', 0.8384310007095337),
 ('containment', 0.833786129951477),
 ('quarantining', 0.8291466236114502),
 ('quarantined', 0.8246265649795532),
 ('tracing', 0.8222387433052063),
 ('self-quarantine', 0.8033556938171387),
 ('lock-down', 0.7976255416870117),
 ('self-isolation', 0.7923141717910767),
 ('pre-symptomatic', 0.7876335382461548)]

In [16]:
def expand_terms(text):
    """Return expansion terms for the entities found in `text`.

    Entities are extracted with the module-level `nlp` pipeline. Each entity
    is split on spaces and its words are passed together to `most_similar`
    on the module-level `cbow_vectors`; the 5 nearest neighbours are
    collected. Entities containing an out-of-vocabulary word are skipped.

    Args:
        text: Raw query text.

    Returns:
        The unique neighbour terms joined by single spaces, in the order they
        were first found (deterministic across runs). An empty string if
        nothing could be expanded.
    """
    # A full Word2Vec model keeps its vectors under `.wv`; fall back to the
    # object itself when it has no such attribute (presumably the case for
    # plain gensim 4 KeyedVectors -- TODO confirm).
    vectors = getattr(cbow_vectors, 'wv', cbow_vectors)

    # dict used as an insertion-ordered set: unlike `set`, the join order does
    # not depend on string hashing, so the expanded query is reproducible.
    terms = {}
    for ent in nlp(text).ents:
        words = ent.text.split(' ')
        try:
            neighbours = vectors.most_similar(positive=words, topn=5)
        except KeyError:
            # A word of this entity is not in the embedding vocabulary: skip
            # the entity. Any other error is a real problem and propagates.
            continue
        terms.update(dict.fromkeys(word for word, _similarity in neighbours))
    return ' '.join(terms)

In [17]:
# Try the expansion on one topic first.
topic_id = 12
# Single label-based lookup (row, column) instead of chained indexing.
# Despite its name, `query_question` holds the short `query` text only.
query_question = query_df.loc[topic_id, 'query']
print(query_question)
expand_terms(query_question)

coronavirus quarantine


'syndrome-related (sars)-cov-2 corona coronavirus-2 2019'

In [18]:
# Expanded query = original `query+question` text followed by the embedding
# neighbours of the entities found in the short `query` (the question is not
# used for expansion).
query_df['expanded_query'] = query_df['query+question'] + ' ' + query_df['query'].progress_apply(expand_terms)
query_df.head()

100%|██████████| 40/40 [00:00<00:00, 63.01it/s]


Unnamed: 0_level_0,query,question,query+question,cleaned_query,expanded_query
topic-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,coronavirus origin,what is the origin of COVID-19,coronavirus origin what is the origin of COVID-19,coronavirus origin origin covid-19,coronavirus origin what is the origin of COVID...
2,coronavirus response to weather changes,how does the coronavirus respond to changes in...,coronavirus response to weather changes how do...,coronavirus response weather change coronaviru...,coronavirus response to weather changes how do...
3,coronavirus immunity,will SARS-CoV2 infected people develop immunit...,coronavirus immunity will SARS-CoV2 infected p...,coronavirus immunity sars-cov2 infected people...,coronavirus immunity will SARS-CoV2 infected p...
4,how do people die from the coronavirus,what causes death from Covid-19?,how do people die from the coronavirus what ca...,people die coronavirus cause death covid-19,how do people die from the coronavirus what ca...
5,animal models of COVID-19,what drugs have been active against SARS-CoV o...,animal models of COVID-19 what drugs have been...,animal model covid-19 drug active sars-cov sar...,animal models of COVID-19 what drugs have been...


In [19]:
%%time
# Run tag: used both as the output file name and as the last column of every run line.
run_name = 'elasticsearch_baseline_abstract_expanded_query'
# Column of `query_df` whose text is sent to Elasticsearch.
query_txt_col = 'expanded_query'
out_fpath = Path('../data/output') / f'{run_name}.txt'
# top_k is left at write_results' default (1000 hits per topic).
write_results(out_fpath, query_df, query_txt_col, es_client, run_name)

Wrote file @ ../data/output/elasticsearch_baseline_abstract_expanded_query.txt

CPU times: user 281 ms, sys: 7.81 ms, total: 289 ms
Wall time: 24.6 s


In [20]:
import subprocess

run_name = "elasticsearch_baseline_abstract_expanded_query"
path_to_qrel_file = "../data/qrels/qrels-covid_d3_j0.5-3.txt"
path_to_result_file = f"../data/output/{run_name}.txt"
output_result_path = f"../data/results/{run_name}_trec_eval.txt"

# Make sure the results folder exists before redirecting output into it.
os.makedirs(os.path.dirname(output_result_path), exist_ok=True)

# Run trec_eval with an argument list (no shell string to build or quote) and
# check=True, so a missing binary or a failed evaluation raises here instead
# of silently leaving an empty or stale metrics file to be printed below.
with open(output_result_path, 'wb') as eval_file:
    subprocess.run(
        ["trec_eval", "-c", "-m", "all_trec", path_to_qrel_file, path_to_result_file],
        stdout=eval_file,
        check=True,
    )

with open(output_result_path, encoding='utf-8') as f:
    print(f.read())

runid                 	all	elasticsearch_baseline_abstract_expanded_query
num_q                 	all	40
num_ret               	all	40000
num_rel               	all	10001
num_rel_ret           	all	3147
map                   	all	0.1123
gm_map                	all	0.0601
Rprec                 	all	0.1861
bpref                 	all	0.2941
recip_rank            	all	0.6378
iprec_at_recall_0.00  	all	0.6922
iprec_at_recall_0.10  	all	0.3455
iprec_at_recall_0.20  	all	0.2187
iprec_at_recall_0.30  	all	0.1343
iprec_at_recall_0.40  	all	0.0817
iprec_at_recall_0.50  	all	0.0445
iprec_at_recall_0.60  	all	0.0177
iprec_at_recall_0.70  	all	0.0078
iprec_at_recall_0.80  	all	0.0036
iprec_at_recall_0.90  	all	0.0000
iprec_at_recall_1.00  	all	0.0000
P_5                   	all	0.5250
P_10                  	all	0.5175
P_15                  	all	0.4667
P_20                  	all	0.4463
P_30                  	all	0.3967
P_100                 	all	0.2602
P_200                 	all	0.1891
P_500           

In this case, query expansion hurt the results (MAP dropped from 0.1607 to 0.1123) even though the nearest-neighbour terms from the word embeddings looked closely related, so query expansion does not always help.

**Key Metrics**

- `MAP` - 0.1123
- `NDCG@10` - 0.4557
- `P@5` - 0.5250
- `R@1000` - 0.3430