## Imports

In [1]:
import pandas as pd
import numpy as np

## Read in data

In [2]:
articles = pd.read_csv('datasets/metadata.csv', low_memory=False)

In [3]:
articles.head(2)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,arxiv_id,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,zjufx4fo,b2897e1277f56641193a6db73825f707eed3e4c9,PMC,Sequence requirements for RNA strand transfer ...,10.1093/emboj/20.24.7220,PMC125340,11742998.0,unk,Nidovirus subgenomic mRNAs contain a leader se...,2001-12-17,"Pasternak, Alexander O.; van den Born, Erwin; ...",The EMBO Journal,,,,True,True,custom_license,http://europepmc.org/articles/pmc125340?pdf=re...
1,ymceytj3,e3d0d482ebd9a8ba81c254cc433f314142e72174,PMC,"Crystal structure of murine sCEACAM1a[1,4]: a ...",10.1093/emboj/21.9.2076,PMC125375,11980704.0,unk,CEACAM1 is a member of the carcinoembryonic an...,2002-05-01,"Tan, Kemin; Zelus, Bruce D.; Meijers, Rob; Liu...",The EMBO Journal,,,,True,True,custom_license,http://europepmc.org/articles/pmc125375?pdf=re...


## Filter by coronavirus related keywords

Sources: https://www.cdc.gov/coronavirus/types.html | https://www.ncbi.nlm.nih.gov/books/NBK92442/ | https://en.wikipedia.org/wiki/Coronaviridae

We're filtering the dataframe by searching the titles and abstracts for coronavirus related keywords to get a dataframe of articles related to coronaviruses only. 

Other common human coronaviruses besides SARS, MERS, and COVID-19:
- 229E
- NL63
- OC43
- HKU1

Diseases caused by animal coronaviruses:
- PEDV: Porcine epidemic diarrhea virus (domestic pigs)
- TGEV: transmissible gastroenteritis (domestic pigs)
- FIP: feline infectious peritonitis
- IBV: infectious bronchitis virus (avian)
- MHV: Mouse hepatitis virus
- SDAV: Sialodacryoadenitis virus (mice)

In [5]:
# Original shape of dataframe
articles.shape

(59887, 19)

In [10]:
# Filter dataframe with keywords
keywords = ['coronavirus', 'covid', 'sars', 'mers', '229E', 'NL63', 'OC43', 'HKU', 'PEDV', 'TGEV', 'FIP',
            'IBV', 'MHV', 'SDAV', 'FSC']

coronavirus = articles[(articles['title'].str.contains('|'.join(keywords), case=False)) |
                  (articles['abstract'].str.contains('|'.join(keywords), case=False))].copy()

# Shape of filtered dataframe
coronavirus.shape

(20855, 19)

## Check number of missing values

In [11]:
# Missing values
coronavirus.isnull().sum()

cord_uid                           0
sha                             5060
source_x                           0
title                              0
doi                             1871
pmcid                           6782
pubmed_id                       8277
license                            0
abstract                        3572
publish_time                       8
authors                          367
journal                         4365
Microsoft Academic Paper ID    19986
WHO #Covidence                 19220
arxiv_id                       20294
has_pdf_parse                      0
has_pmc_xml_parse                  0
full_text_file                  3237
url                              407
dtype: int64

## Cast publish time as datetime variable

In [12]:
# Datatypes of each column
coronavirus.dtypes

cord_uid                        object
sha                             object
source_x                        object
title                           object
doi                             object
pmcid                           object
pubmed_id                      float64
license                         object
abstract                        object
publish_time                    object
authors                         object
journal                         object
Microsoft Academic Paper ID    float64
WHO #Covidence                  object
arxiv_id                        object
has_pdf_parse                     bool
has_pmc_xml_parse                 bool
full_text_file                  object
url                             object
dtype: object

In [13]:
# Cast publish_time as datetime 
coronavirus['publish_time'] = pd.to_datetime(coronavirus['publish_time'])

In [14]:
coronavirus.dtypes

cord_uid                               object
sha                                    object
source_x                               object
title                                  object
doi                                    object
pmcid                                  object
pubmed_id                             float64
license                                object
abstract                               object
publish_time                   datetime64[ns]
authors                                object
journal                                object
Microsoft Academic Paper ID           float64
WHO #Covidence                         object
arxiv_id                               object
has_pdf_parse                            bool
has_pmc_xml_parse                        bool
full_text_file                         object
url                                    object
dtype: object

## Save as csv file

In [15]:
coronavirus.to_csv('filtered_articles.csv', index=False)