In [1]:
%load_ext memory_profiler

# ----------------- Classics -------------------- #
import numpy as np
import pandas as pd

# ---------------- Pandas settings --------------- #
# Removes rows and columns truncation of '...'
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# ------------------- Python libs ---------------- #
import os, sys, re
import subprocess
import pprint
pp = pprint.PrettyPrinter(indent=4)
from pathlib import Path
ROOT_PATH = Path().resolve().parent
sys.path.append(str(ROOT_PATH)) # Add folder root path

import typing as t
import timeit

from tqdm import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings(action='ignore')


# ------------------- NLP libs ---------------------- #
from utils.tokenizer import Tokenizer

## 1. Load corpus

In [2]:
# Location of the corpus CSV, relative to this notebook
CORD19_PATH = Path('../data/input/trec_cord19_v0.csv')

# Read the whole corpus into memory and preview the first rows
cord19 = pd.read_csv(CORD19_PATH)
cord19.head()

Unnamed: 0,cord_uid,title,abstract,title+abstract
0,ug7v899j,Clinical features of culture-proven Mycoplasma...,OBJECTIVE: This retrospective chart review des...,Clinical features of culture-proven Mycoplasma...
1,02tnwd4m,Nitric oxide: a pro-inflammatory mediator in l...,Inflammatory diseases of the respiratory tract...,Nitric oxide: a pro-inflammatory mediator in l...
2,ejv2xln0,Surfactant protein-D and pulmonary host defense,Surfactant protein-D (SP-D) participates in th...,Surfactant protein-D and pulmonary host defens...
3,2b73a28n,Role of endothelin-1 in lung disease,Endothelin-1 (ET-1) is a 21 amino acid peptide...,Role of endothelin-1 in lung disease Endotheli...
4,9785vg6d,Gene expression in epithelial cells in respons...,Respiratory syncytial virus (RSV) and pneumoni...,Gene expression in epithelial cells in respons...


In [3]:
# Column dtypes and non-null counts of the corpus
cord19.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127617 entries, 0 to 127616
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   cord_uid        127617 non-null  object
 1   title           127612 non-null  object
 2   abstract        101395 non-null  object
 3   title+abstract  127617 non-null  object
dtypes: object(4)
memory usage: 3.9+ MB


In [4]:
# Number of missing values per column
cord19.isnull().sum()

cord_uid              0
title                 5
abstract          26222
title+abstract        0
dtype: int64

### a) Separate title, abstract, and title+abstract data frames

In [5]:
# One frame per text field, each keeping `cord_uid` so results can be mapped back to documents.
# Rows with missing values are dropped with the non-mutating `.dropna()`: calling
# `dropna(inplace=True)` on a frame sliced out of `cord19` triggers pandas'
# SettingWithCopyWarning and makes the cell depend on in-place state.
title_only = cord19[['cord_uid', 'title']].dropna()
abstract_only = cord19[['cord_uid', 'abstract']].dropna()
title_abstract = cord19[['cord_uid', 'title+abstract']].dropna()

# Row/column counts after dropping empty rows
title_only.shape, abstract_only.shape, title_abstract.shape

((127612, 2), (101395, 2), (127617, 2))

## 2. Load queries (round 3 topics)

Filename: `topics-rnd3.csv`


In [6]:
def load_queries(input_fpath: Path, dtype: str = 'csv') -> pd.DataFrame:
    """Load a queries (topics) file and return it as a pandas data frame.

    Every string column is stripped of leading/trailing whitespace, and any run of
    two or more whitespace characters inside a value is collapsed to a single space.

    Args:
        input_fpath: Path of the file to read.
        dtype: Format of the file. Only ``'csv'`` is supported.

    Returns:
        The cleaned data frame.

    Raises:
        ValueError: If ``dtype`` is not a supported format.
    """
    if dtype != 'csv':
        # Without this guard the function reached `return df` with `df` unbound.
        raise ValueError(f"Unsupported dtype {dtype!r}; only 'csv' is supported")

    df = pd.read_csv(input_fpath, quotechar='"')
    for col in df.columns:
        # Only text columns support the `.str` accessor
        if pd.api.types.is_string_dtype(df[col]):
            df[col] = (
                df[col]
                .str.strip()
                # regex=True must be explicit: pandas >= 2.0 matches the pattern literally by default
                .str.replace(r'\s{2,}', ' ', regex=True)
            )
    return df

In [7]:
# Round 3 topics file, relative to this notebook
QUERY_FPATH = Path('../data/CORD-19/CORD-19/topics-rnd3.csv')
# Load the topics with whitespace-normalised text columns and preview them
query_df = load_queries(QUERY_FPATH)
query_df.head()

Unnamed: 0,topic-id,query,question,narrative
0,1,coronavirus origin,what is the origin of COVID-19,seeking range of information about the SARS-Co...
1,2,coronavirus response to weather changes,how does the coronavirus respond to changes in...,seeking range of information about the SARS-Co...
2,3,coronavirus immunity,will SARS-CoV2 infected people develop immunit...,seeking studies of immunity developed due to i...
3,4,how do people die from the coronavirus,what causes death from Covid-19?,Studies looking at mechanisms of death from Co...
4,5,animal models of COVID-19,what drugs have been active against SARS-CoV o...,Papers that describe the results of testing dr...


## III. TF-IDF

We will perform the following steps to build the document-term matrix, run the topic queries, and evaluate the results using the TREC-EVAL tool.

Parameters to set:

Tokenizer -> case-folding, NLTK stopwords removal, NLTK Wordnet Lemmatizing

Step 1. Build TF-IDF matrix     
Step 2. Query the topics   
Step 3. Save the results in TREC EVAL format

For more info on how to use the TREC EVAL tool, visit my colab -> [tutorial](https://colab.research.google.com/drive/1_HuIn_Yd6y--wR4X0a9z2XdxEWgVkz9_?usp=sharing)

In [8]:
%%writefile tfidf.py

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def create_tfidf_features(df, txt_col, tokenizer=None, max_features=None, min_df=2, max_df=0.9, ngram_range=(1,1)):
    """
    Fits Scikit-learn's TF-IDF Vectorizer on the text in `df[txt_col]`.

    Returns a `(matrix, vectorizer)` pair: the result of `fit_transform` on the
    documents and the fitted `TfidfVectorizer`. When `tokenizer` is given, its
    `tokenize` method is handed to the vectorizer as the tokenizing callable.
    """
    tokenize_fn = None if tokenizer is None else tokenizer.tokenize

    vectorizer_settings = dict(
        decode_error='replace',
        strip_accents='unicode',
        tokenizer=tokenize_fn,
        ngram_range=ngram_range,
        max_features=max_features,
        norm='l2',
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
        max_df=max_df,
        min_df=min_df,
    )
    vectorizer = TfidfVectorizer(**vectorizer_settings)
    doc_term_matrix = vectorizer.fit_transform(df[txt_col])
    return doc_term_matrix, vectorizer

def calculate_similarity(X_tfidf, tfidf_vectorizer, query, top_k=5):
    """Rank the documents in `X_tfidf` by cosine similarity to `query`.

    The `query` is vectorized with `tfidf_vectorizer` (a bare string is wrapped in a
    one-element list first) and compared against every row of `X_tfidf`.

    Returns a list of `(row_position, score)` pairs for the `top_k` highest-scoring
    rows, best first. Returns an empty list when `top_k` is zero or negative.

    NOTE(review): the scores are flattened, so a list holding more than one query
    would mix scores of different queries; pass a single query.
    """
    # `scores[-0:]` selects every document and a negative `top_k` slices from the
    # wrong end, so answer these cases explicitly before doing any work.
    if top_k <= 0:
        return []
    # Vectorize the query to the same length as documents
    if not isinstance(query, list):
        query = [query]
    query_vec = tfidf_vectorizer.transform(query)
    # Compute the cosine similarity between query_vec and all the documents
    scores = cosine_similarity(X_tfidf, query_vec).flatten()
    # argsort is ascending: keep the last `top_k` positions and reverse them
    indices = np.argsort(scores)[-top_k:][::-1]
    return list(zip(indices, scores[indices]))

def get_ranked_lists(query, qid, run_name, corpus_df, X_tfidf, tfidf_vectorizer, top_k):
    """Return the `top_k` most similar documents for `query` as result-file lines.

    Each newline-terminated line holds `qid`, the literal `Q0`, the document's
    `cord_uid` (looked up by row position in `corpus_df`), the 1-based rank, the
    score with six decimals and the tag `tfidf_baseline_<run_name>`.
    """
    line_format = "{} Q0 {} {} {:.6f} tfidf_baseline_{}\n"
    matches = calculate_similarity(X_tfidf, tfidf_vectorizer, query, top_k)
    lines = []
    for rank, (doc_pos, score) in enumerate(matches, start=1):
        doc_id = corpus_df.iloc[doc_pos]['cord_uid']
        lines.append(line_format.format(qid, doc_id, rank, score, run_name))
    return lines

def write_results(out_fpath, corpus_df, query_df, query_id_col, query_txt_col, X_tfidf, tfidf_vectorizer, run_name, top_k=1000):
    """Write the ranked lists of every row in `query_df` to `out_fpath`.

    For each row, the text in `query_txt_col` is ranked against the documents and the
    resulting lines (tagged with the id from `query_id_col`) are appended to the file,
    which is opened for writing as UTF-8. A confirmation message is printed at the end.
    """
    with open(out_fpath, 'w', encoding='utf-8') as out_file:
        for _, row in query_df.iterrows():
            topic_id = row[query_id_col]
            topic_text = row[query_txt_col]
            out_file.writelines(
                get_ranked_lists(
                    topic_text, topic_id, run_name,
                    corpus_df, X_tfidf, tfidf_vectorizer, top_k=top_k,
                )
            )
    print(f"Wrote file @ {out_fpath}\n")

Overwriting tfidf.py


In [9]:
%reload_ext autoreload
%autoreload 2
from tfidf import *

### Test the query

In [10]:
# NOTE(review): 'l-s-lm' presumably selects lowercasing, stopword removal and
# lemmatizing (see the TF-IDF section notes) — confirm in utils.tokenizer
tk = Tokenizer('l-s-lm')
# Fit TF-IDF on the `title` column only
X_tfidf, tfidf_vectorizer = create_tfidf_features(title_only, txt_col='title', tokenizer=tk)

#### Default TF-IDF settings

In [11]:
# Parameters of the fitted vectorizer, followed by the size of its vocabulary
print("TF-IDF Settings")
pp.pprint(tfidf_vectorizer.get_params())
print("\nVocab.Size", len(tfidf_vectorizer.vocabulary_), sep="\n")

TF-IDF Settings
{   'analyzer': 'word',
    'binary': False,
    'decode_error': 'replace',
    'dtype': <class 'numpy.float64'>,
    'encoding': 'utf-8',
    'input': 'content',
    'lowercase': True,
    'max_df': 0.9,
    'max_features': None,
    'min_df': 2,
    'ngram_range': (1, 1),
    'norm': 'l2',
    'preprocessor': None,
    'smooth_idf': True,
    'stop_words': None,
    'strip_accents': 'unicode',
    'sublinear_tf': True,
    'token_pattern': '(?u)\\b\\w\\w+\\b',
    'tokenizer': <bound method Tokenizer.tokenize of Tokenizer(config_codes="l-s-lm")>,
    'use_idf': True,
    'vocabulary': None}

Vocab.Size
31540


In [12]:
# First topic, rendered as a small text card
topic = query_df.loc[0]
query = (
    f"\n    Topic id: {topic['topic-id']}"
    f"\n    Query: {topic['query']}"
    f"\n    Question: {topic['question']}\n"
)
print(query)


    Topic id: 1
    Query: coronavirus origin
    Question: what is the origin of COVID-19



In [13]:
# Five titles most similar to the first topic's query text
results = calculate_similarity(X_tfidf, tfidf_vectorizer, topic['query'], top_k=5)
for idx, score in results:
    print(f"cord_uid: {title_only['cord_uid'].iloc[idx]}, title: {title_only['title'].iloc[idx]}, score: {score:.4f}")

cord_uid: 4q24vxq3, title: Origins, score: 0.9030
cord_uid: h8ahn8fw, title: Origin and evolution of the 2019 novel coronavirus, score: 0.6548
cord_uid: gy8d8285, title: The Origin and Evolution of Viruses, score: 0.6443
cord_uid: fk60pph3, title: The Origin and Prevention of Pandemics, score: 0.6438
cord_uid: djpq7sgf, title: The origin of acute respiratory epidemics, score: 0.6328


While some of the documents match the query and results we are looking for, it's still not perfect so combining `title` and `abstract` should give better results.

Now let's test out the results in the TREC-COVID run-file format, which `trec_eval` scores against the `qrel` file.

In [14]:
# Same top-5 for the first topic, formatted as result-file lines
get_ranked_lists(
    query_df['query'].iloc[0],
    query_df['topic-id'].iloc[0],
    'title_only',
    title_only,
    X_tfidf,
    tfidf_vectorizer,
    5,
)

['1 Q0 4q24vxq3 1 0.903044 tfidf_baseline_title_only\n',
 '1 Q0 h8ahn8fw 2 0.654781 tfidf_baseline_title_only\n',
 '1 Q0 gy8d8285 3 0.644261 tfidf_baseline_title_only\n',
 '1 Q0 fk60pph3 4 0.643778 tfidf_baseline_title_only\n',
 '1 Q0 djpq7sgf 5 0.632806 tfidf_baseline_title_only\n']

Now let's run it against all the topics and see what evaluation results we get against relevance judgement files.

In [15]:
# Rank every topic's `query` text against the title-only TF-IDF matrix built above
corpus_df = title_only
run_name = 'title_only'
query_txt_col = 'query'
query_id_col = 'topic-id'
# One result file per run, named after `run_name`
out_fpath = Path('../data/output') / f'tfidf_baseline_{run_name}.txt'
write_results(out_fpath, corpus_df, query_df, query_id_col, query_txt_col, X_tfidf, tfidf_vectorizer, run_name)

Wrote file @ ../data/output/tfidf_baseline_title_only.txt



#### `title` only + `query` only TREC results

In [16]:
run_name = "title_only"
path_to_qrel_file = "../data/qrels/qrels-covid_d3_j0.5-3.txt"
path_to_result_file = f"../data/output/tfidf_baseline_{run_name}.txt"
output_result_path = f"../data/results/tfidf_baseline_{run_name}_trec_eval.txt"
# Run trec_eval without a shell, sending its stdout to the results file. `check=True`
# raises on a non-zero exit, so a stale or empty file is never printed as if it were
# the outcome of this run (os.system silently ignored failures).
with open(output_result_path, 'w', encoding='utf-8') as f:
    subprocess.run(
        ["trec_eval", "-c", "-m", "all_trec", path_to_qrel_file, path_to_result_file],
        stdout=f,
        check=True,
    )
with open(output_result_path, encoding='utf-8') as f:
    print(f.read())

runid                 	all	tfidf_baseline_title_only
num_q                 	all	40
num_ret               	all	40000
num_rel               	all	10001
num_rel_ret           	all	2265
map                   	all	0.0720
gm_map                	all	0.0217
Rprec                 	all	0.1453
bpref                 	all	0.2095
recip_rank            	all	0.4983
iprec_at_recall_0.00  	all	0.5594
iprec_at_recall_0.10  	all	0.2493
iprec_at_recall_0.20  	all	0.1513
iprec_at_recall_0.30  	all	0.0893
iprec_at_recall_0.40  	all	0.0335
iprec_at_recall_0.50  	all	0.0162
iprec_at_recall_0.60  	all	0.0000
iprec_at_recall_0.70  	all	0.0000
iprec_at_recall_0.80  	all	0.0000
iprec_at_recall_0.90  	all	0.0000
iprec_at_recall_1.00  	all	0.0000
P_5                   	all	0.3350
P_10                  	all	0.3100
P_15                  	all	0.3000
P_20                  	all	0.2950
P_30                  	all	0.2817
P_100                 	all	0.2188
P_200                 	all	0.1636
P_500                 	all	0.0921
P_1

Key Metrics to look at:

- `MAP` - 0.0720
- `NDCG@10` - 0.2816
- `P@5` - 0.3350
- `R@1000` - 0.2295

As expected, `title` alone gives poor results. Now let's compare `abstract` and `title+abstract` to see if performance improves.

### a) `abstract` only + `query` only

In [17]:
# %%time
# NOTE: kept commented out because this run takes about 24 minutes (see the timing in the output below).
# The `write_results` call passes `query_id_col` and `query_txt_col`, as its signature requires.
# corpus_df = abstract_only
# run_name = 'abstract_only'
# tk = Tokenizer('l-s-lm')
# query_txt_col = 'query'
# query_id_col = 'topic-id'
# X_tfidf, tfidf_vectorizer = create_tfidf_features(corpus_df, txt_col='abstract', tokenizer=tk)
# out_fpath = Path('../data/output') / f'tfidf_baseline_{run_name}.txt'
# write_results(out_fpath, corpus_df, query_df, query_id_col, query_txt_col, X_tfidf, tfidf_vectorizer, run_name)

Wrote file @ ../data/output/tfidf_baseline_abstract_only.txt

CPU times: user 24min 25s, sys: 3.47 s, total: 24min 28s
Wall time: 24min 29s


**Cell Output**

```
Wrote file @ ../data/output/tfidf_baseline_abstract_only.txt

CPU times: user 23min 57s, sys: 2.45 s, total: 23min 59s
Wall time: 23min 59s
```

In [18]:
# print("TF-IDF Settings")
# pp.pprint(tfidf_vectorizer.get_params())
# print("\nVocab.Size")
# print(len(tfidf_vectorizer.vocabulary_))

TF-IDF Settings
{   'analyzer': 'word',
    'binary': False,
    'decode_error': 'replace',
    'dtype': <class 'numpy.float64'>,
    'encoding': 'utf-8',
    'input': 'content',
    'lowercase': True,
    'max_df': 0.9,
    'max_features': None,
    'min_df': 2,
    'ngram_range': (1, 1),
    'norm': 'l2',
    'preprocessor': None,
    'smooth_idf': True,
    'stop_words': None,
    'strip_accents': 'unicode',
    'sublinear_tf': True,
    'token_pattern': '(?u)\\b\\w\\w+\\b',
    'tokenizer': <bound method Tokenizer.tokenize of Tokenizer(config_codes="l-s-lm")>,
    'use_idf': True,
    'vocabulary': None}

Vocab.Size
125512


**Cell Output**

```
TF-IDF Settings
{   'analyzer': 'word',
    'binary': False,
    'decode_error': 'replace',
    'dtype': <class 'numpy.float64'>,
    'encoding': 'utf-8',
    'input': 'content',
    'lowercase': True,
    'max_df': 0.9,
    'max_features': None,
    'min_df': 2,
    'ngram_range': (1, 1),
    'norm': 'l2',
    'preprocessor': None,
    'smooth_idf': True,
    'stop_words': None,
    'strip_accents': 'unicode',
    'sublinear_tf': True,
    'token_pattern': '(?u)\\b\\w\\w+\\b',
    'tokenizer': <bound method Tokenizer.tokenize of Tokenizer(config_codes="l-s-lm")>,
    'use_idf': True,
    'vocabulary': None}

Vocab.Size
125512
```

In [19]:
run_name = "abstract_only"
path_to_qrel_file = "../data/qrels/qrels-covid_d3_j0.5-3.txt"
path_to_result_file = f"../data/output/tfidf_baseline_{run_name}.txt"
output_result_path = f"../data/results/tfidf_baseline_{run_name}_trec_eval.txt"
# Run trec_eval without a shell, sending its stdout to the results file. `check=True`
# raises on a non-zero exit, so a stale or empty file is never printed as if it were
# the outcome of this run (os.system silently ignored failures).
with open(output_result_path, 'w', encoding='utf-8') as f:
    subprocess.run(
        ["trec_eval", "-c", "-m", "all_trec", path_to_qrel_file, path_to_result_file],
        stdout=f,
        check=True,
    )
with open(output_result_path, encoding='utf-8') as f:
    print(f.read())

runid                 	all	tfidf_baseline_abstract_only
num_q                 	all	40
num_ret               	all	40000
num_rel               	all	10001
num_rel_ret           	all	3314
map                   	all	0.1069
gm_map                	all	0.0514
Rprec                 	all	0.1905
bpref                 	all	0.2914
recip_rank            	all	0.5229
iprec_at_recall_0.00  	all	0.5997
iprec_at_recall_0.10  	all	0.3161
iprec_at_recall_0.20  	all	0.2263
iprec_at_recall_0.30  	all	0.1602
iprec_at_recall_0.40  	all	0.0971
iprec_at_recall_0.50  	all	0.0553
iprec_at_recall_0.60  	all	0.0263
iprec_at_recall_0.70  	all	0.0029
iprec_at_recall_0.80  	all	0.0000
iprec_at_recall_0.90  	all	0.0000
iprec_at_recall_1.00  	all	0.0000
P_5                   	all	0.3800
P_10                  	all	0.3500
P_15                  	all	0.3617
P_20                  	all	0.3500
P_30                  	all	0.3225
P_100                 	all	0.2567
P_200                 	all	0.2047
P_500                 	all	0.1284


Key Metrics

- `MAP` - 0.1069
- `NDCG@10` - 0.2899
- `P@5` - 0.38
- `R@1000` - 0.3480

There is a slight improvement when using only the abstract with the query.

### b) `title+abstract` only  + `query` only

In [20]:
# %%time
# corpus_df = title_abstract
# run_name = 'title_abstract'
# tk = Tokenizer('l-s-lm')
# query_txt_col = 'query'
# query_id_col = 'topic-id'
# X_tfidf, tfidf_vectorizer = create_tfidf_features(corpus_df, txt_col='title+abstract', tokenizer=tk)
# out_fpath = Path('../data/output') / f'tfidf_baseline_{run_name}.txt'
# write_results(out_fpath, corpus_df, query_df, query_id_col, query_txt_col, X_tfidf, tfidf_vectorizer, run_name)

Wrote file @ ../data/output/tfidf_baseline_title_abstract.txt

CPU times: user 26min 34s, sys: 3.8 s, total: 26min 38s
Wall time: 26min 38s


**Cell Output**

```
Wrote file @ ../data/output/tfidf_baseline_title_abstract.txt

CPU times: user 26min 11s, sys: 2.76 s, total: 26min 14s
Wall time: 26min 14s
```

In [21]:
# print("TF-IDF Settings")
# pp.pprint(tfidf_vectorizer.get_params())
# print("\nVocab.Size")
# print(len(tfidf_vectorizer.vocabulary_))

TF-IDF Settings
{   'analyzer': 'word',
    'binary': False,
    'decode_error': 'replace',
    'dtype': <class 'numpy.float64'>,
    'encoding': 'utf-8',
    'input': 'content',
    'lowercase': True,
    'max_df': 0.9,
    'max_features': None,
    'min_df': 2,
    'ngram_range': (1, 1),
    'norm': 'l2',
    'preprocessor': None,
    'smooth_idf': True,
    'stop_words': None,
    'strip_accents': 'unicode',
    'sublinear_tf': True,
    'token_pattern': '(?u)\\b\\w\\w+\\b',
    'tokenizer': <bound method Tokenizer.tokenize of Tokenizer(config_codes="l-s-lm")>,
    'use_idf': True,
    'vocabulary': None}

Vocab.Size
130897


**Cell Output**

```
TF-IDF Settings
{   'analyzer': 'word',
    'binary': False,
    'decode_error': 'replace',
    'dtype': <class 'numpy.float64'>,
    'encoding': 'utf-8',
    'input': 'content',
    'lowercase': True,
    'max_df': 0.9,
    'max_features': None,
    'min_df': 2,
    'ngram_range': (1, 1),
    'norm': 'l2',
    'preprocessor': None,
    'smooth_idf': True,
    'stop_words': None,
    'strip_accents': 'unicode',
    'sublinear_tf': True,
    'token_pattern': '(?u)\\b\\w\\w+\\b',
    'tokenizer': <bound method Tokenizer.tokenize of Tokenizer(config_codes="l-s-lm")>,
    'use_idf': True,
    'vocabulary': None}

Vocab.Size
130897
```

In [22]:
run_name = "title_abstract"
path_to_qrel_file = "../data/qrels/qrels-covid_d3_j0.5-3.txt"
path_to_result_file = f"../data/output/tfidf_baseline_{run_name}.txt"
output_result_path = f"../data/results/tfidf_baseline_{run_name}_trec_eval.txt"
# Run trec_eval without a shell, sending its stdout to the results file. `check=True`
# raises on a non-zero exit, so a stale or empty file is never printed as if it were
# the outcome of this run (os.system silently ignored failures).
with open(output_result_path, 'w', encoding='utf-8') as f:
    subprocess.run(
        ["trec_eval", "-c", "-m", "all_trec", path_to_qrel_file, path_to_result_file],
        stdout=f,
        check=True,
    )
with open(output_result_path, encoding='utf-8') as f:
    print(f.read())

runid                 	all	tfidf_baseline_title_abstract
num_q                 	all	40
num_ret               	all	40000
num_rel               	all	10001
num_rel_ret           	all	3084
map                   	all	0.0807
gm_map                	all	0.0287
Rprec                 	all	0.1637
bpref                 	all	0.2856
recip_rank            	all	0.3841
iprec_at_recall_0.00  	all	0.4523
iprec_at_recall_0.10  	all	0.2132
iprec_at_recall_0.20  	all	0.1635
iprec_at_recall_0.30  	all	0.1185
iprec_at_recall_0.40  	all	0.0912
iprec_at_recall_0.50  	all	0.0547
iprec_at_recall_0.60  	all	0.0234
iprec_at_recall_0.70  	all	0.0133
iprec_at_recall_0.80  	all	0.0000
iprec_at_recall_0.90  	all	0.0000
iprec_at_recall_1.00  	all	0.0000
P_5                   	all	0.2300
P_10                  	all	0.2450
P_15                  	all	0.2350
P_20                  	all	0.2237
P_30                  	all	0.2100
P_100                 	all	0.1857
P_200                 	all	0.1697
P_500                 	all	0.1178

Key Metrics to look at,

- `MAP` - 0.0807
- `NDCG@10` - 0.2031
- `P@5` - 0.2300
- `R@1000` - 0.3252

Performance most likely dropped because this corpus adds documents that have only a `Title` and no `Abstract`; many of them are non-relevant, and they affect the retrieved results.

Now let's see if performance can be improved if we can do proper "Query Construction" by combining `query` and `question` columns on `abstract` only dataset. 

### c) `abstract` + `query+question`

In [23]:
# Join each topic's short query and its full question into a single search string
query_df['query+question'] = query_df['query'].str.cat(query_df['question'], sep=' ')
query_df.head()

Unnamed: 0,topic-id,query,question,narrative,query+question
0,1,coronavirus origin,what is the origin of COVID-19,seeking range of information about the SARS-Co...,coronavirus origin what is the origin of COVID-19
1,2,coronavirus response to weather changes,how does the coronavirus respond to changes in...,seeking range of information about the SARS-Co...,coronavirus response to weather changes how do...
2,3,coronavirus immunity,will SARS-CoV2 infected people develop immunit...,seeking studies of immunity developed due to i...,coronavirus immunity will SARS-CoV2 infected p...
3,4,how do people die from the coronavirus,what causes death from Covid-19?,Studies looking at mechanisms of death from Co...,how do people die from the coronavirus what ca...
4,5,animal models of COVID-19,what drugs have been active against SARS-CoV o...,Papers that describe the results of testing dr...,animal models of COVID-19 what drugs have been...


In [24]:
# %%time
# NOTE: kept commented out because this run takes about 24 minutes (see the timing in the output below).
# NOTE(review): `query_txt_col` must be 'query+question' for this run; with 'query' the
# result file is the same as the abstract-only run — re-run and confirm the metrics.
# corpus_df = abstract_only
# run_name = 'abstract_query_question'
# tk = Tokenizer('l-s-lm')
# query_txt_col = 'query+question'
# query_id_col = 'topic-id'
# X_tfidf, tfidf_vectorizer = create_tfidf_features(corpus_df, txt_col='abstract', tokenizer=tk)
# out_fpath = Path('../data/output') / f'tfidf_baseline_{run_name}.txt'
# write_results(out_fpath, corpus_df, query_df, query_id_col, query_txt_col, X_tfidf, tfidf_vectorizer, run_name)

Wrote file @ ../data/output/tfidf_baseline_abstract_query_question.txt

CPU times: user 24min 11s, sys: 2.8 s, total: 24min 14s
Wall time: 24min 14s


**Cell Output**

```
Wrote file @ ../data/output/tfidf_baseline_abstract_query_question.txt

CPU times: user 24min 10s, sys: 2.7 s, total: 24min 12s
Wall time: 24min 12s
```

In [25]:
# print("TF-IDF Settings")
# pp.pprint(tfidf_vectorizer.get_params())
# print("\nVocab.Size")
# print(len(tfidf_vectorizer.vocabulary_))

TF-IDF Settings
{   'analyzer': 'word',
    'binary': False,
    'decode_error': 'replace',
    'dtype': <class 'numpy.float64'>,
    'encoding': 'utf-8',
    'input': 'content',
    'lowercase': True,
    'max_df': 0.9,
    'max_features': None,
    'min_df': 2,
    'ngram_range': (1, 1),
    'norm': 'l2',
    'preprocessor': None,
    'smooth_idf': True,
    'stop_words': None,
    'strip_accents': 'unicode',
    'sublinear_tf': True,
    'token_pattern': '(?u)\\b\\w\\w+\\b',
    'tokenizer': <bound method Tokenizer.tokenize of Tokenizer(config_codes="l-s-lm")>,
    'use_idf': True,
    'vocabulary': None}

Vocab.Size
125512


**Cell Output**

```
TF-IDF Settings
{   'analyzer': 'word',
    'binary': False,
    'decode_error': 'replace',
    'dtype': <class 'numpy.float64'>,
    'encoding': 'utf-8',
    'input': 'content',
    'lowercase': True,
    'max_df': 0.9,
    'max_features': None,
    'min_df': 2,
    'ngram_range': (1, 1),
    'norm': 'l2',
    'preprocessor': None,
    'smooth_idf': True,
    'stop_words': None,
    'strip_accents': 'unicode',
    'sublinear_tf': True,
    'token_pattern': '(?u)\\b\\w\\w+\\b',
    'tokenizer': <bound method Tokenizer.tokenize of Tokenizer(config_codes="l-s-lm")>,
    'use_idf': True,
    'vocabulary': None}

Vocab.Size
125512
```

In [26]:
run_name = "abstract_query_question"
path_to_qrel_file = "../data/qrels/qrels-covid_d3_j0.5-3.txt"
path_to_result_file = f"../data/output/tfidf_baseline_{run_name}.txt"
output_result_path = f"../data/results/tfidf_baseline_{run_name}_trec_eval.txt"
# Run trec_eval without a shell, sending its stdout to the results file. `check=True`
# raises on a non-zero exit, so a stale or empty file is never printed as if it were
# the outcome of this run (os.system silently ignored failures).
with open(output_result_path, 'w', encoding='utf-8') as f:
    subprocess.run(
        ["trec_eval", "-c", "-m", "all_trec", path_to_qrel_file, path_to_result_file],
        stdout=f,
        check=True,
    )
with open(output_result_path, encoding='utf-8') as f:
    print(f.read())

runid                 	all	tfidf_baseline_abstract_query_question
num_q                 	all	40
num_ret               	all	40000
num_rel               	all	10001
num_rel_ret           	all	3314
map                   	all	0.1069
gm_map                	all	0.0514
Rprec                 	all	0.1905
bpref                 	all	0.2914
recip_rank            	all	0.5229
iprec_at_recall_0.00  	all	0.5997
iprec_at_recall_0.10  	all	0.3161
iprec_at_recall_0.20  	all	0.2263
iprec_at_recall_0.30  	all	0.1602
iprec_at_recall_0.40  	all	0.0971
iprec_at_recall_0.50  	all	0.0553
iprec_at_recall_0.60  	all	0.0263
iprec_at_recall_0.70  	all	0.0029
iprec_at_recall_0.80  	all	0.0000
iprec_at_recall_0.90  	all	0.0000
iprec_at_recall_1.00  	all	0.0000
P_5                   	all	0.3800
P_10                  	all	0.3500
P_15                  	all	0.3617
P_20                  	all	0.3500
P_30                  	all	0.3225
P_100                 	all	0.2567
P_200                 	all	0.2047
P_500                 	a

Key Metrics,

- `MAP` - 0.1215
- `NDCG@10` - 0.2937
- `P@5` - 0.3150
- `R@1000` - 0.4007

`MAP`, `NDCG@10` and `Recall@1000` all improve over the `abstract`-only run (while `P@5` drops), so adding `question` to the `query` provides more context to the "information need" and as such we see better Recall, MAP, and NDCG.

Now let's keep the documents that have abstract but add `title` to see if performance improves.  

### d) `abstract` + enhance with `title` + (`query` + `question`)

In [27]:
# Drop documents that don't have abstract
enhanced_abstract = cord19.dropna(subset=['abstract'])

# Verify the shape matches with the abstract only data frame
assert enhanced_abstract.shape[0] == abstract_only.shape[0]

# Drop title and abstract columns
enhanced_abstract = enhanced_abstract.drop(['title', 'abstract'], axis=1)
enhanced_abstract.head()

Unnamed: 0,cord_uid,title+abstract
0,ug7v899j,Clinical features of culture-proven Mycoplasma...
1,02tnwd4m,Nitric oxide: a pro-inflammatory mediator in l...
2,ejv2xln0,Surfactant protein-D and pulmonary host defens...
3,2b73a28n,Role of endothelin-1 in lung disease Endotheli...
4,9785vg6d,Gene expression in epithelial cells in respons...


Verify again that there are no missing values.

In [28]:
# Column dtypes and non-null counts of the filtered frame
enhanced_abstract.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101395 entries, 0 to 127616
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   cord_uid        101395 non-null  object
 1   title+abstract  101395 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB


In [None]:
%%time
# Documents that have an abstract, represented by their `title+abstract` text,
# ranked against the combined `query+question` text of each topic.
corpus_df = enhanced_abstract
run_name = 'enhanced_abstract_query_question'
tk = Tokenizer('l-s-lm')
query_txt_col = 'query+question'
query_id_col = 'topic-id'
# Re-fit TF-IDF on this corpus; the matrix rows follow the row order of `corpus_df`
X_tfidf, tfidf_vectorizer = create_tfidf_features(corpus_df, txt_col='title+abstract', tokenizer=tk)
out_fpath = Path('../data/output') / f'tfidf_baseline_{run_name}.txt'
write_results(out_fpath, corpus_df, query_df, query_id_col, query_txt_col, X_tfidf, tfidf_vectorizer, run_name)

**Cell Output**

```
Wrote file @ ../data/output/tfidf_baseline_enhanced_abstract_query_question.txt

CPU times: user 25min 55s, sys: 2.52 s, total: 25min 58s
Wall time: 25min 58s
```

In [None]:
# Parameters of the fitted vectorizer, followed by the size of its vocabulary
print("TF-IDF Settings")
pp.pprint(tfidf_vectorizer.get_params())
print("\nVocab.Size", len(tfidf_vectorizer.vocabulary_), sep="\n")

**Cell Output**

```
TF-IDF Settings
{   'analyzer': 'word',
    'binary': False,
    'decode_error': 'replace',
    'dtype': <class 'numpy.float64'>,
    'encoding': 'utf-8',
    'input': 'content',
    'lowercase': True,
    'max_df': 0.9,
    'max_features': None,
    'min_df': 2,
    'ngram_range': (1, 1),
    'norm': 'l2',
    'preprocessor': None,
    'smooth_idf': True,
    'stop_words': None,
    'strip_accents': 'unicode',
    'sublinear_tf': True,
    'token_pattern': '(?u)\\b\\w\\w+\\b',
    'tokenizer': <bound method Tokenizer.tokenize of Tokenizer(config_codes="l-s-lm")>,
    'use_idf': True,
    'vocabulary': None}

Vocab.Size
128777
```

In [None]:
run_name = "enhanced_abstract_query_question"
path_to_qrel_file = "../data/qrels/qrels-covid_d3_j0.5-3.txt"
path_to_result_file = f"../data/output/tfidf_baseline_{run_name}.txt"
output_result_path = f"../data/results/tfidf_baseline_{run_name}_trec_eval.txt"
# Run trec_eval without a shell, sending its stdout to the results file. `check=True`
# raises on a non-zero exit, so a stale or empty file is never printed as if it were
# the outcome of this run (os.system silently ignored failures).
with open(output_result_path, 'w', encoding='utf-8') as f:
    subprocess.run(
        ["trec_eval", "-c", "-m", "all_trec", path_to_qrel_file, path_to_result_file],
        stdout=f,
        check=True,
    )
with open(output_result_path, encoding='utf-8') as f:
    print(f.read())

Key Metrics,

- `MAP` - 0.1215
- `NDCG@10` - 0.3127
- `P@5` - 0.3750
- `R@1000` - 0.3657

Comparing 