# Preprocess

## Load the data

Data was downloaded from Kaggle, and saved in the 'data' directory. 

Cite: [Dataset Parsing Code | Kaggle, COVID EDA: Initial Exploration Tool](https://www.kaggle.com/ivanegapratama/covid-eda-initial-exploration-tool)


In [80]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import glob
import json
import os

root_path = 'data/'
metadata_path = f'{root_path}/metadata.csv'
meta_df = pd.read_csv(metadata_path, dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str,
    'doi': str,
    'arxiv_id': str
})
meta_df.head()


Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,arxiv_id,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,zjufx4fo,b2897e1277f56641193a6db73825f707eed3e4c9,PMC,Sequence requirements for RNA strand transfer ...,10.1093/emboj/20.24.7220,PMC125340,11742998,unk,Nidovirus subgenomic mRNAs contain a leader se...,2001-12-17,"Pasternak, Alexander O.; van den Born, Erwin; ...",The EMBO Journal,,,,True,True,custom_license,http://europepmc.org/articles/pmc125340?pdf=re...
1,ymceytj3,e3d0d482ebd9a8ba81c254cc433f314142e72174,PMC,"Crystal structure of murine sCEACAM1a[1,4]: a ...",10.1093/emboj/21.9.2076,PMC125375,11980704,unk,CEACAM1 is a member of the carcinoembryonic an...,2002-05-01,"Tan, Kemin; Zelus, Bruce D.; Meijers, Rob; Liu...",The EMBO Journal,,,,True,True,custom_license,http://europepmc.org/articles/pmc125375?pdf=re...
2,wzj2glte,00b1d99e70f779eb4ede50059db469c65e8c1469,PMC,Synthesis of a novel hepatitis C virus protein...,10.1093/emboj/20.14.3840,PMC125543,11447125,no-cc,Hepatitis C virus (HCV) is an important human ...,2001-07-16,"Xu, Zhenming; Choi, Jinah; Yen, T.S.Benedict; ...",EMBO J,,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
3,2sfqsfm1,cf584e00f637cbd8f1bb35f3f09f5ed07b71aeb0,PMC,Structure of coronavirus main proteinase revea...,10.1093/emboj/cdf327,PMC126080,12093723,unk,The key enzyme in coronavirus polyprotein proc...,2002-07-01,"Anand, Kanchan; Palm, Gottfried J.; Mesters, J...",The EMBO Journal,,,,True,True,custom_license,http://europepmc.org/articles/pmc126080?pdf=re...
4,i0zym7iq,dde02f11923815e6a16a31dd6298c46b109c5dfa,PMC,Discontinuous and non-discontinuous subgenomic...,10.1093/emboj/cdf635,PMC136939,12456663,unk,"Arteri-, corona-, toro- and roniviruses are ev...",2002-12-01,"van Vliet, A.L.W.; Smits, S.L.; Rottier, P.J.M...",The EMBO Journal,,,,True,True,custom_license,http://europepmc.org/articles/pmc136939?pdf=re...


In [81]:
all_json = glob.glob(f'{root_path}/**/*.json', recursive=True)
len(all_json)

71261

Add helper class and functions. 

The `check_words` function is used to sieve the set to keep only articles with the keyword "case fatality" or "case-fatality" and "united states". 

The `check_publish_time` is used to remove any article published before 2020. 

In [82]:
class FileReader:
    def __init__(self, file_path):
        with open(file_path) as file:
            content = json.load(file)
            self.paper_id = content['paper_id']
            self.abstract = []
            self.body_text = []
            # Abstract
            for entry in content['abstract']:
                self.abstract.append(entry['text'])
            # Body text
            for entry in content['body_text']:
                self.body_text.append(entry['text'])
            self.abstract = '\n'.join(self.abstract)
            self.body_text = '\n'.join(self.body_text)
    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'

def add_breaks(content, length=40):
    '''
    Add breaks <br> every 'length' characters
    '''
    return '<br>'.join([content[i:i+length] for i in range(0, len(content), length)])

def get_abstract_summary(abstact):
    return add_breaks(' '.join(abstract.split(' ')[:100])) if abstract else 'Not provided.'

def get_authors(authors):
    return add_breaks('. '.join(authors)) if len(authors) > 2 else '. '.join(authors)

def check_words(text):
    text = text.lower()
    return ('case fatality' in text or 'case-fatality' in text) and 'united states' in text

def check_publish_time(publish_time):
    return publish_time is not None and publish_time >= '2020-01-01'

Read files into a data frame. Keep some metadata for display in the visualization. 

In [83]:
dict_ = {'paper_id': [], 'doi': [], 'abstract': [], 'body_text': [], 'authors': [], 'title': [], 'journal': [], 'abstract_summary': [], 'publish_time': []}
for idx, entry in enumerate(all_json):
    if idx % (len(all_json) // 10) == 0:
        print(f'Processing index: {idx} of {len(all_json)}')
    try: 
        content = FileReader(entry)
        metadata = meta_df.loc[meta_df['sha'] == content.paper_id]   
        paper_id = content.paper_id
        abstract = content.abstract
        body_text = content.body_text
        title = add_breaks(metadata['title'].values[0]) if metadata['title'].values.size > 0 else None
        if not check_words(abstract + ' ' + body_text + ' ' + title):
            continue
        publish_time = metadata['publish_time'].values[0] if metadata['publish_time'].values.size > 0 else None         
        if not check_publish_time(publish_time):
            continue
        abstract_summary = get_abstract_summary(abstract)
        authors = get_authors(metadata['authors'].values[0].split(';'))  if metadata['authors'].values.size > 0 else None
        title = add_breaks(metadata['title'].values[0]) if metadata['title'].values.size > 0 else None
        journal = metadata['journal'].values[0] if metadata['journal'].values.size > 0 else None
        doi = metadata['doi'].values[0] if metadata['doi'].values.size > 0 else None
        dict_['paper_id'].append(paper_id)
        dict_['abstract'].append(abstract)
        dict_['body_text'].append(body_text)
        dict_['abstract_summary'].append(abstract_summary)
        dict_['authors'].append(authors)
        dict_['title'].append(title)
        dict_['journal'].append(journal)
        dict_['doi'].append(doi)
        dict_['publish_time'].append(publish_time)
    except Exception as e:  
        continue # skip invalid format
df_covid = pd.DataFrame(dict_, columns=['paper_id', 'doi', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary', 'publish_time'])
df_covid.head()

Processing index: 0 of 71261
Processing index: 7126 of 71261
Processing index: 14252 of 71261
Processing index: 21378 of 71261
Processing index: 28504 of 71261
Processing index: 35630 of 71261
Processing index: 42756 of 71261
Processing index: 49882 of 71261
Processing index: 57008 of 71261
Processing index: 64134 of 71261
Processing index: 71260 of 71261


Unnamed: 0,paper_id,doi,abstract,body_text,authors,title,journal,abstract_summary,publish_time
0,424bd09fc65e5f8b04ca407ed5cf79891e8cc5b3,10.1016/j.soncn.2020.151028,To provide a critical reflection of COVID-19 i...,"Since late December 2019, a novel coronavirus ...","Paterson, C.. Gobel, B.. Gosselin, T..<br> ...",Oncology Nursing During a Pandemic: Crit<br>ic...,Seminars in Oncology Nursing,To provide a critical reflection of COVI<br>D-...,2020-04-23
1,67ae6cc09106e080cd5019f09becb78eb1bd5f14,10.1016/j.jgo.2020.04.008,J o u r n a l P r e -p r o o f Journal Pre-pro...,"The novel coronavirus (SARS-CoV-2, COVID- 19) ...","Mian, Hira. Grant, Shakira J.. Engelha<br>rd...",Caring for older adults with multiple my<br>el...,Journal of Geriatric Oncology,J o u r n a l P r e -p r o o f Journal P<br>re...,2020-04-17
2,4c0d77e951761c73efee8ae7e014d09cf37b1907,10.1093/ajcp/aqaa029,,"In the past two decades, the world has seen th...","Guarner, Jeannette",Three Emerging Coronaviruses in Two Deca<br>de...,Am J Clin Pathol,Not provided.,2020-02-13
3,e16308d26591fdca9707c427e39a8d35f29c98cd,10.1016/j.bbi.2020.04.046,,"Coronavirus disease 2019 (COVID-2019), which i...","Zhang, Jiancheng. Xie, Bing. Hashimoto<br>, ...",Current status of potential therapeutic <br>ca...,"Brain, Behavior, and Immunity",Not provided.,2020-04-22
4,8b49086072cc3911c754a7a3565606d937581878,10.1016/j.bbi.2020.04.027,"Please cite this article as: Troyer, E.A., Koh...",The coronavirus disease 19 pandemic continues ...,"Troyer, Emily A.. Kohn, Jordan N.. Hon<br>g,...",Are we facing a crashing wave of neurops<br>yc...,"Brain, Behavior, and Immunity","Please cite this article as: Troyer, E.A<br>.,...",2020-04-13


## Feature Engineering

Adding word count column for abstract and body text. We'll use them to see the distribution of number of words in the abstract and body text. 

In [84]:
df_covid['abstract_word_count'] = df_covid['abstract'].apply(lambda x: len(x.strip().split()))
df_covid['body_word_count'] = df_covid['body_text'].apply(lambda x: len(x.strip().split()))
df_covid['body_unique_words'] = df_covid['body_text'].apply(lambda x: len(set(str(x).split())))
df_covid.head()

Unnamed: 0,paper_id,doi,abstract,body_text,authors,title,journal,abstract_summary,publish_time,abstract_word_count,body_word_count,body_unique_words
0,424bd09fc65e5f8b04ca407ed5cf79891e8cc5b3,10.1016/j.soncn.2020.151028,To provide a critical reflection of COVID-19 i...,"Since late December 2019, a novel coronavirus ...","Paterson, C.. Gobel, B.. Gosselin, T..<br> ...",Oncology Nursing During a Pandemic: Crit<br>ic...,Seminars in Oncology Nursing,To provide a critical reflection of COVI<br>D-...,2020-04-23,133,5958,2298
1,67ae6cc09106e080cd5019f09becb78eb1bd5f14,10.1016/j.jgo.2020.04.008,J o u r n a l P r e -p r o o f Journal Pre-pro...,"The novel coronavirus (SARS-CoV-2, COVID- 19) ...","Mian, Hira. Grant, Shakira J.. Engelha<br>rd...",Caring for older adults with multiple my<br>el...,Journal of Geriatric Oncology,J o u r n a l P r e -p r o o f Journal P<br>re...,2020-04-17,51,1696,783
2,4c0d77e951761c73efee8ae7e014d09cf37b1907,10.1093/ajcp/aqaa029,,"In the past two decades, the world has seen th...","Guarner, Jeannette",Three Emerging Coronaviruses in Two Deca<br>de...,Am J Clin Pathol,Not provided.,2020-02-13,0,1009,535
3,e16308d26591fdca9707c427e39a8d35f29c98cd,10.1016/j.bbi.2020.04.046,,"Coronavirus disease 2019 (COVID-2019), which i...","Zhang, Jiancheng. Xie, Bing. Hashimoto<br>, ...",Current status of potential therapeutic <br>ca...,"Brain, Behavior, and Immunity",Not provided.,2020-04-22,0,8051,2529
4,8b49086072cc3911c754a7a3565606d937581878,10.1016/j.bbi.2020.04.027,"Please cite this article as: Troyer, E.A., Koh...",The coronavirus disease 19 pandemic continues ...,"Troyer, Emily A.. Kohn, Jordan N.. Hon<br>g,...",Are we facing a crashing wave of neurops<br>yc...,"Brain, Behavior, and Immunity","Please cite this article as: Troyer, E.A<br>.,...",2020-04-13,262,3236,1269


In [85]:
df_covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187 entries, 0 to 186
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   paper_id             187 non-null    object
 1   doi                  171 non-null    object
 2   abstract             187 non-null    object
 3   body_text            187 non-null    object
 4   authors              187 non-null    object
 5   title                187 non-null    object
 6   journal              106 non-null    object
 7   abstract_summary     187 non-null    object
 8   publish_time         187 non-null    object
 9   abstract_word_count  187 non-null    int64 
 10  body_word_count      187 non-null    int64 
 11  body_unique_words    187 non-null    int64 
dtypes: int64(3), object(9)
memory usage: 17.7+ KB


In [86]:
df = df_covid

### Language

Some articles might not be written in English. Remove them if any. It turns out that all articles are in English, probably because we've searched with the keyword "united states". 

In [87]:
from tqdm import tqdm
from langdetect import detect
from langdetect import DetectorFactory

# set seed
DetectorFactory.seed = 0

# hold label - language
languages = []

# go through each text
for ii in tqdm(range(0,len(df))):
    # split by space into list, take the first x intex, join with space
    text = df.iloc[ii]['body_text'].split(" ")
    
    lang = "en"
    try:
        if len(text) > 50:
            lang = detect(" ".join(text[:50]))
        elif len(text) > 0:
            lang = detect(" ".join(text[:len(text)]))
    # ught... beginning of the document was not in a good format
    except Exception as e:
        all_words = set(text)
        try:
            lang = detect(" ".join(all_words))
        # what!! :( let's see if we can find any text in abstract...
        except Exception as e:
            
            try:
                # let's try to label it through the abstract then
                lang = detect(df.iloc[ii]['abstract_summary'])
            except Exception as e:
                lang = "unknown"
                pass
    
    # get the language    
    languages.append(lang)
    
from pprint import pprint

languages_dict = {}
for lang in set(languages):
    languages_dict[lang] = languages.count(lang)
    
print("Total: {}\n".format(len(languages)))
pprint(languages_dict)



100%|██████████| 187/187 [00:00<00:00, 225.90it/s]

Total: 187

{'en': 187}





In [88]:
df['language'] = languages
df = df[df['language'] == 'en'] 
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 187 entries, 0 to 186
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   paper_id             187 non-null    object
 1   doi                  171 non-null    object
 2   abstract             187 non-null    object
 3   body_text            187 non-null    object
 4   authors              187 non-null    object
 5   title                187 non-null    object
 6   journal              106 non-null    object
 7   abstract_summary     187 non-null    object
 8   publish_time         187 non-null    object
 9   abstract_word_count  187 non-null    int64 
 10  body_word_count      187 non-null    int64 
 11  body_unique_words    187 non-null    int64 
 12  language             187 non-null    object
dtypes: int64(3), object(10)
memory usage: 20.5+ KB


## Stopwords and lemmatization

Remove stopwords (commons words that will act as noise in the text analysis). In addition to the default stopwords from the `string` module, extra stopwords that are common in publications are also used. 

Cite: [Custom Stop Words | Topic Modeling: Finding Related Articles](https://www.kaggle.com/danielwolffram/topic-modeling-finding-related-articles)


In [90]:
import string

punctuations = string.punctuation
stopwords = list(STOP_WORDS)
custom_stop_words = [
    'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure', 
    'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 
    'al.', 'Elsevier', 'PMC', 'CZI', 'www'
]

stopwords = list(set().union(stopwords, custom_stop_words))

In [None]:
Use the `en_core_sci_lg` model to tokenize the texts. 

In [91]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_sci_lg  # model downloaded in previous step

parser = en_core_sci_lg.load(disable=["tagger", "ner"])
parser.max_length = 7000000

def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    mytokens = " ".join([i for i in mytokens])
    return mytokens

Use body text to process. 

In [98]:
from tqdm import tqdm
tqdm.pandas()
text = df['body_text']
df["processed_text"] = text.progress_apply(spacy_tokenizer)

100%|██████████| 187/187 [00:45<00:00,  4.11it/s]


In [99]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 187 entries, 0 to 186
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   paper_id             187 non-null    object
 1   doi                  171 non-null    object
 2   abstract             187 non-null    object
 3   body_text            187 non-null    object
 4   authors              187 non-null    object
 5   title                187 non-null    object
 6   journal              106 non-null    object
 7   abstract_summary     187 non-null    object
 8   publish_time         187 non-null    object
 9   abstract_word_count  187 non-null    int64 
 10  body_word_count      187 non-null    int64 
 11  body_unique_words    187 non-null    int64 
 12  language             187 non-null    object
 13  processed_text       187 non-null    object
dtypes: int64(3), object(11)
memory usage: 21.9+ KB


In [100]:
import pickle
pickle.dump(df, open(f'{root_path}/df_covid.p', 'wb'))