In [1]:
%load_ext memory_profiler

# ----------------- Classics -------------------- #
import numpy as np
import pandas as pd

# ---------------- Pandas settings --------------- #
# Show every row and column when a frame is displayed instead of eliding the
# middle with '...'. With no row limit, displaying a large frame in full will
# render all of its rows, so use `.head()` for previews.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# ------------------- Python libs ---------------- #
import os, sys, re
from pathlib import Path
# Parent of the current working directory. It is appended to `sys.path`,
# presumably so the `utils` package imported below resolves -- confirm that
# the notebook is always launched from a direct sub-folder of the project root.
ROOT_PATH = Path().resolve().parent
sys.path.append(str(ROOT_PATH)) # Add folder root path

import typing as t
import timeit

from tqdm import tqdm
# Registers tqdm's `progress_apply` helpers on pandas objects.
tqdm.pandas()

import warnings
# NOTE(review): this silences every warning for the whole session, including
# pandas deprecation and chained-assignment warnings raised by later cells.
warnings.filterwarnings(action='ignore')

# ------------------- NLP libs ---------------------- #
# Project-local tokenizer (found through the `sys.path` entry added above).
from utils.tokenizer import Tokenizer

## 1. Load corpus

In [2]:
# Corpus CSV location, relative to the notebook's working directory.
CORD19_PATH = Path('../data/trec_cord19_v0.csv')

# Read the whole corpus into memory and preview the first five rows.
df = pd.read_csv(CORD19_PATH)
df.head()

Unnamed: 0,cord_uid,title,abstract,title+abstract
0,ug7v899j,Clinical features of culture-proven Mycoplasma...,OBJECTIVE: This retrospective chart review des...,Clinical features of culture-proven Mycoplasma...
1,02tnwd4m,Nitric oxide: a pro-inflammatory mediator in l...,Inflammatory diseases of the respiratory tract...,Nitric oxide: a pro-inflammatory mediator in l...
2,ejv2xln0,Surfactant protein-D and pulmonary host defense,Surfactant protein-D (SP-D) participates in th...,Surfactant protein-D and pulmonary host defens...
3,2b73a28n,Role of endothelin-1 in lung disease,Endothelin-1 (ET-1) is a 21 amino acid peptide...,Role of endothelin-1 in lung disease Endotheli...
4,9785vg6d,Gene expression in epithelial cells in respons...,Respiratory syncytial virus (RSV) and pneumoni...,Gene expression in epithelial cells in respons...


In [3]:
# Column dtypes and non-null counts of the corpus frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127617 entries, 0 to 127616
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   cord_uid        127617 non-null  object
 1   title           127612 non-null  object
 2   abstract        101395 non-null  object
 3   title+abstract  127617 non-null  object
dtypes: object(4)
memory usage: 3.9+ MB


In [4]:
# Number of missing values in each column.
df.isnull().sum()

cord_uid              0
title                 5
abstract          26222
title+abstract        0
dtype: int64

### a) Separate title, abstract, and title+abstract data frames

In [5]:
# Build one single-column frame per text field and drop the rows whose text is
# missing. `.dropna()` is chained (rather than called with `inplace=True` on a
# frame sliced out of `df`) so that each result is an independent new frame:
# this avoids pandas' chained-assignment warning and keeps the cell idempotent
# when it is re-run.
title_only = df[['title']].dropna()
abstract_only = df[['abstract']].dropna()
title_abstract = df[['title+abstract']].dropna()

# Row/column counts that remain for each text field.
title_only.shape, abstract_only.shape, title_abstract.shape

((127612, 1), (101395, 1), (127617, 1))

## 2. Load queries (round 3 topics)

Filename: `topics-rnd3.csv`


In [6]:
def load_queries(input_fpath: Path, dtype: str = 'csv') -> pd.DataFrame:
    """Load a queries (topics) file and return it as a pandas data frame.

    Every string column is normalised: leading and trailing whitespace is
    stripped, and any run of two or more whitespace characters is collapsed
    into a single space.

    Args:
        input_fpath: Path of the queries file to read.
        dtype: Format of the file. Only ``'csv'`` is supported.

    Returns:
        The loaded queries with whitespace-normalised string columns.

    Raises:
        ValueError: If ``dtype`` is not a supported format.
    """
    if dtype != 'csv':
        # Fail loudly instead of falling through to an unbound local variable.
        raise ValueError(
            "Unsupported queries file type {!r}; only 'csv' is supported".format(dtype)
        )

    queries = pd.read_csv(input_fpath, quotechar='"')
    for col in queries.columns:
        # Only text columns have whitespace to clean up.
        if pd.api.types.is_string_dtype(queries[col]):
            queries[col] = (
                queries[col]
                .str.strip()
                # `regex=True` is required: since pandas 2.0 the pattern is
                # treated as a literal string by default, which would leave
                # repeated whitespace untouched.
                .str.replace(r'\s{2,}', ' ', regex=True)
            )
    return queries

In [7]:
# Round-3 topics file, relative to the notebook's working directory.
QUERY_FPATH = Path('../data/CORD-19/CORD-19/topics-rnd3.csv')
# Load the topics with whitespace-normalised text columns and preview them.
query_df = load_queries(QUERY_FPATH)
query_df.head()

Unnamed: 0,topic-id,query,question,narrative
0,1,coronavirus origin,what is the origin of COVID-19,seeking range of information about the SARS-Co...
1,2,coronavirus response to weather changes,how does the coronavirus respond to changes in...,seeking range of information about the SARS-Co...
2,3,coronavirus immunity,will SARS-CoV2 infected people develop immunit...,seeking studies of immunity developed due to i...
3,4,how do people die from the coronavirus,what causes death from Covid-19?,Studies looking at mechanisms of death from Co...
4,5,animal models of COVID-19,what drugs have been active against SARS-CoV o...,Papers that describe the results of testing dr...


## 3. TF-IDF

We will perform the following steps to build the document-term matrix, run the topic queries, and evaluate the results using the TREC-EVAL tool.

Step 1. Build TF-IDF matrix
Step 2. Query the topics
Step 3. Save the results so that they can be evaluated with the TREC-EVAL tool


In [8]:
%%writefile tfidf.py

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def create_tfidf_features(df, txt_col, tokenizer=None, max_features=None, min_df=2, max_df=0.9, ngram_range=(1,1)):
    """
    Fit a scikit-learn ``TfidfVectorizer`` on one text column of ``df``.

    Args:
        df: Container indexed by column name (``df[txt_col]`` must yield the documents).
        txt_col: Name of the column holding the raw text.
        tokenizer: Optional object exposing a ``tokenize`` method; that method is
            handed to the vectorizer. ``None`` keeps the vectorizer's own tokenization.
        max_features, min_df, max_df, ngram_range: Forwarded unchanged to ``TfidfVectorizer``.

    Returns:
        A ``(document_term_matrix, fitted_vectorizer)`` tuple.
    """
    # The vectorizer expects a callable, so pass the tokenizer's bound method.
    tokenize_fn = None if tokenizer is None else tokenizer.tokenize

    # Fixed weighting choices (L2 norm, smoothed IDF, sublinear TF) together
    # with the caller-tunable vocabulary settings.
    vectorizer_options = dict(
        decode_error='replace',
        strip_accents='unicode',
        tokenizer=tokenize_fn,
        ngram_range=ngram_range,
        max_features=max_features,
        norm='l2',
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
        max_df=max_df,
        min_df=min_df,
    )
    vectorizer = TfidfVectorizer(**vectorizer_options)

    # Learn the vocabulary and produce the document-term matrix in one pass.
    doc_term_matrix = vectorizer.fit_transform(df[txt_col])
    return doc_term_matrix, vectorizer

def calculate_similarity(X, vectorizor, query, top_k=None):
    """Rank the documents in `X` by cosine similarity to `query`.

    Args:
        X: Document-term matrix produced by `vectorizor`.
        vectorizor: Fitted vectorizer used to project the query into the
            same feature space as `X`.
        query: Query text, either a single string or an iterable of strings.
        top_k: Number of best-matching documents to return. `None` (the
            default) returns the full ranking.

    Returns:
        A `(most_similar_doc_indices, cosine_similarities)` tuple: the document
        indices ordered from most to least similar (truncated to `top_k`), and
        the flattened similarity scores for all documents.

    Raises:
        ValueError: If `top_k` is negative.
    """
    if top_k is not None and top_k < 0:
        raise ValueError("top_k must be non-negative or None, got {!r}".format(top_k))

    # scikit-learn vectorizers expect an iterable of documents and reject a
    # bare string, so wrap a single query in a list.
    if isinstance(query, str):
        query = [query]

    # Vectorize the query to the same length as documents
    query_vec = vectorizor.transform(query)
    # Compute the cosine similarity between query_vec and all the documents
    cosine_similarities = cosine_similarity(X, query_vec).flatten()
    # Sort from most to least similar. The array's own `argsort` is used so
    # that this module does not depend on a numpy import it never declares.
    ranked_doc_indices = cosine_similarities.argsort()[::-1]
    # Slicing with `None` keeps the whole ranking.
    most_similar_doc_indices = ranked_doc_indices[:top_k]
    return (most_similar_doc_indices, cosine_similarities)

Overwriting tfidf.py


In [9]:
# Reload edited modules automatically so that changes written to tfidf.py are
# picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# NOTE(review): the star import also pulls in every public name tfidf.py itself
# imports; listing the needed names explicitly would make dependencies clearer.
from tfidf import *

In [10]:
%%time
# Tokenizer built from the 'l-s-lm' option string (its meaning is defined in
# utils.tokenizer, which is not shown here).
tk = Tokenizer('l-s-lm')
# Fit TF-IDF on the titles using unigrams and bigrams while measuring memory.
# NOTE: create_tfidf_features returns a (matrix, vectorizer) tuple, so `dtm`
# holds both objects, not just the document-term matrix.
%memit dtm = create_tfidf_features(title_only, txt_col='title', tokenizer=tk, min_df=2, max_df=0.9, ngram_range=(1,2))

peak memory: 889.05 MiB, increment: 252.63 MiB
CPU times: user 2min 39s, sys: 1.41 s, total: 2min 40s
Wall time: 2min 40s


In [16]:

# Preview the first topic as a small human-readable summary.
topic = query_df.loc[0]
# Every field is read with bracket lookup: attribute-style access
# (`topic.query`) only resolves to the column value as long as the label does
# not collide with an existing Series attribute, and it cannot express labels
# such as 'topic-id' at all.
query = """
    Topic id: {}
    Query: {}
    Question: {}
""".format(topic['topic-id'], topic['query'], topic['question'])
print(query)


    Topic id: 1
    Query: coronavirus origin
    Question: what is the origin of COVID-19

