In [1]:
%reload_ext memory_profiler

import warnings
warnings.filterwarnings(action='ignore')

# ----------------- Classics -------------------- #
import numpy as np
import pandas as pd

# ---------------- Pandas settings --------------- #
# Removes rows and columns truncation of '...'
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# ------------------- Python libs ---------------- #
import os, sys, re
import pprint
pp = pprint.PrettyPrinter(indent=4)
from pathlib import Path
ROOT_PATH = Path().resolve().parent
sys.path.append(str(ROOT_PATH)) # Add folder root path

import typing as t
import timeit

from tqdm import tqdm
tqdm.pandas()

# ------------------- NLP libs ---------------------- #
from utils.tokenizer import Tokenizer

We know that adding `query+question` improves the baseline TF-IDF model, so let's now improve the query by identifying only the key entities in `question` and including those. But first, let's look at term frequencies in the queries, how stopword removal affects them, and how we can use SciSpacy to improve the query and possibly do query expansion.


# 1. Load topics

In [2]:
def load_queries(input_fpath: Path, dtype: str = 'csv', cols_to_keep=['topic-id', 'query', 'question'], index_col=['topic-id']) -> pd.DataFrame:
    """Load the queries (topics) file and return it as a pandas data frame.

    Every string column is stripped of leading/trailing whitespace and has
    runs of two or more whitespace characters collapsed into a single space.

    Args:
        input_fpath: Path of the file to read.
        dtype: Format of the input file. Only ``'csv'`` is supported.
        cols_to_keep: Columns to read from the file (forwarded as ``usecols``).
        index_col: Column(s) used as the index (forwarded as ``index_col``).
            Both list defaults are only read here, never mutated.

    Returns:
        The loaded data frame with normalized string columns.

    Raises:
        ValueError: If ``dtype`` is not a supported format.
    """
    if dtype != 'csv':
        # Fail loudly instead of reaching `return df` with `df` never assigned.
        raise ValueError(f"Unsupported dtype {dtype!r}; only 'csv' is supported")

    df = pd.read_csv(input_fpath, quotechar='"', index_col=index_col, usecols=cols_to_keep)
    for col in df.columns:
        # only text columns expose the `.str` accessor
        if pd.api.types.is_string_dtype(df[col]):
            df[col] = (
                df[col]
                .str.strip()  # removes front and end white spaces
                # regex=True must be explicit: pandas >= 2.0 treats the pattern
                # as a literal string by default, which silently matches nothing.
                .str.replace(r'\s{2,}', ' ', regex=True)
            )
    return df

# Topics CSV; the path is relative to the notebook's current working directory.
QUERY_FPATH = Path('../data/CORD-19/CORD-19/topics-rnd3.csv')
# Uses the loader defaults: keeps `query` and `question`, indexed by `topic-id`.
query_df = load_queries(QUERY_FPATH)
# Preview the first rows only.
query_df.head()

Unnamed: 0_level_0,query,question
topic-id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,coronavirus origin,what is the origin of COVID-19
2,coronavirus response to weather changes,how does the coronavirus respond to changes in...
3,coronavirus immunity,will SARS-CoV2 infected people develop immunit...
4,how do people die from the coronavirus,what causes death from Covid-19?
5,animal models of COVID-19,what drugs have been active against SARS-CoV o...


### Effects of stopword removal and SciSpacy pre-processing

In [3]:
# 1. Init the three NLTK-stopword tokenizers compared below, one per flag string:
#      'l-s'       -> lowercase + stopwords
#      'l-s-pt'    -> lowercase + stopwords + punctuation
#      'l-s-pt-lm' -> lowercase + stopwords + punctuation + lemmatizing
tk_nltk_stop, punct_nltk_stop, lem_nltk_stop = (
    Tokenizer(flags) for flags in ('l-s', 'l-s-pt', 'l-s-pt-lm')
)

In [4]:
# 2. Tokenizer that uses the project's custom stopword file
custom_stopwords_path = ROOT_PATH.joinpath('utils/stopwords.txt')
tk_cus_stop = Tokenizer('l-s-lm-pt')
# combine_w_nltk=True: presumably merges the custom list with NLTK's — confirm in utils.tokenizer
tk_cus_stop.load_stop_words(custom_stopwords_path, combine_w_nltk=True)

	Custom STOPWORDS loaded successfully !(^^)!


In [5]:
# 3. Load Scispacy model
# `nlp` is applied to each question further down to extract its entities (`doc.ents`).
# NOTE(review): 'en_core_sci_md' presumably has to be installed separately from spaCy — confirm.
import spacy
nlp = spacy.load('en_core_sci_md')

In [8]:
# For every topic, print what each pre-processing variant keeps from the
# query and the question, then the SciSpacy entities found in the question.
labelled_tokenizers = (
    ("NLTK low + stop", tk_nltk_stop),
    ("NLTK low + stop + punct", punct_nltk_stop),
    ("NLTK low + stop + punct + lemma", lem_nltk_stop),
    ("CUSTOM STOPWORDS", tk_cus_stop),
)

for topic_id, row in query_df.iterrows():
    topic, question = row  # the two columns of the frame: query, question
    print(f"{topic_id}")
    print(f"\t[BASE]: {topic}, {question}")

    # One output line per tokenizer variant, in the order declared above.
    for variant_label, variant_tk in labelled_tokenizers:
        kept_topic = ' '.join(variant_tk.tokenize(topic))
        kept_question = ' '.join(variant_tk.tokenize(question))
        print(f"\t[{variant_label}]: {kept_topic}, {kept_question}")

    # Entities are extracted from the raw (un-tokenized) question.
    doc = nlp(question)
    ents = [{ent.label_: ent.text} for ent in doc.ents]
    print(f"\t[NER]: {ents}")

1
	[BASE]: coronavirus origin, what is the origin of COVID-19
	[NLTK low + stop]: coronavirus origin, origin covid-19
	[NLTK low + stop + punct]: coronavirus origin, origin covid-19
	[NLTK low + stop + punct + lemma]: coronavirus origin, origin covid-19
	[CUSTOM STOPWORDS]: coronavirus origin, origin covid-19
	[NER]: [{'ENTITY': 'origin'}, {'ENTITY': 'COVID-19'}]
2
	[BASE]: coronavirus response to weather changes, how does the coronavirus respond to changes in the weather
	[NLTK low + stop]: coronavirus response weather changes, coronavirus respond changes weather
	[NLTK low + stop + punct]: coronavirus response weather changes, coronavirus respond changes weather
	[NLTK low + stop + punct + lemma]: coronavirus response weather change, coronavirus respond change weather
	[CUSTOM STOPWORDS]: coronavirus response weather change, coronavirus respond change weather
	[NER]: [{'ENTITY': 'coronavirus'}, {'ENTITY': 'changes'}, {'ENTITY': 'weather'}]
3
	[BASE]: coronavirus immunity, will SARS-C

It looks like the `[NLTK low + stop + punct + lemma]` combination retains the context of the query better than `[CUSTOM STOPWORDS]`: as seen in Topic #15, the custom stopword list removes `outside` from the query, which changes the context of the query.



## Query Expansion