## Imports

In [1]:
import pandas as pd
import numpy as np

## Read in data

In [2]:
# Load both inputs: 'combined_df.csv' becomes `text`, 'metadata.csv' becomes `articles`.
# low_memory=False makes pandas parse each file in one pass rather than in chunks,
# which avoids mixed-type inference within a column.
text, articles = (
    pd.read_csv(csv_name, low_memory=False)
    for csv_name in ('combined_df.csv', 'metadata.csv')
)

In [3]:
# Preview the first two rows of `text`
text.head(2)

Unnamed: 0.1,Unnamed: 0,paper_id,title,discussion,text_body
0,0,d23435612750f9edf20673d47975786980fb98ee,The Flow Physics of COVID-19,,Transmission of respiratory infections such as...
1,1,6b8271b465a5f5ce296ef370f4989a05b1d31833,THE SECOND WORLDWIDE WAVE OF INTEREST IN CORON...,The key finding is that GT forecasted the rise...,"As of 7 pm Central European Time on 24 March, ..."


In [4]:
# Preview the first two rows of `articles`
articles.head(2)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,arxiv_id,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,zjufx4fo,b2897e1277f56641193a6db73825f707eed3e4c9,PMC,Sequence requirements for RNA strand transfer ...,10.1093/emboj/20.24.7220,PMC125340,11742998.0,unk,Nidovirus subgenomic mRNAs contain a leader se...,2001-12-17,"Pasternak, Alexander O.; van den Born, Erwin; ...",The EMBO Journal,,,,True,True,custom_license,http://europepmc.org/articles/pmc125340?pdf=re...
1,ymceytj3,e3d0d482ebd9a8ba81c254cc433f314142e72174,PMC,"Crystal structure of murine sCEACAM1a[1,4]: a ...",10.1093/emboj/21.9.2076,PMC125375,11980704.0,unk,CEACAM1 is a member of the carcinoembryonic an...,2002-05-01,"Tan, Kemin; Zelus, Bruce D.; Meijers, Rob; Liu...",The EMBO Journal,,,,True,True,custom_license,http://europepmc.org/articles/pmc125375?pdf=re...


## Check for missing values

In [5]:
# Dimensions of `text` as a (rows, columns) tuple
text.shape

(41564, 5)

In [6]:
# Count the missing (NaN) entries in each column of `text`
text.isna().sum()

Unnamed: 0        0
paper_id          0
title          4364
discussion    24099
text_body         0
dtype: int64

In [7]:
# Dimensions of `articles` as a (rows, columns) tuple
articles.shape

(59887, 19)

In [8]:
# Count the missing (NaN) entries in each column of `articles`
articles.isna().sum()

cord_uid                           0
sha                            14124
source_x                           0
title                            163
doi                             4086
pmcid                          11748
pubmed_id                      18559
license                            0
abstract                       11130
publish_time                       8
authors                         2591
journal                         7059
Microsoft Academic Paper ID    58923
WHO #Covidence                 58119
arxiv_id                       59211
has_pdf_parse                      0
has_pmc_xml_parse                  0
full_text_file                  8461
url                              440
dtype: int64

## Join dataframes on paper ID (`sha` / `paper_id`)

In [9]:
# Inner join: keep only the rows where an article's `sha` equals a `paper_id` in `text`.
# Columns present in both frames (such as `title`) receive the `_x` (articles)
# and `_y` (text) suffixes.
coronavirus = articles.merge(
    text,
    how='inner',
    left_on='sha',
    right_on='paper_id',
    suffixes=('_x', '_y'),
)

In [10]:
# Dimensions of the merged frame as a (rows, columns) tuple
coronavirus.shape

(37571, 24)

In [11]:
# Preview the first two rows of the merged frame
coronavirus.head(2)

Unnamed: 0.1,cord_uid,sha,source_x,title_x,doi,pmcid,pubmed_id,license,abstract,publish_time,...,arxiv_id,has_pdf_parse,has_pmc_xml_parse,full_text_file,url,Unnamed: 0,paper_id,title_y,discussion,text_body
0,zjufx4fo,b2897e1277f56641193a6db73825f707eed3e4c9,PMC,Sequence requirements for RNA strand transfer ...,10.1093/emboj/20.24.7220,PMC125340,11742998.0,unk,Nidovirus subgenomic mRNAs contain a leader se...,2001-12-17,...,,True,True,custom_license,http://europepmc.org/articles/pmc125340?pdf=re...,22114,b2897e1277f56641193a6db73825f707eed3e4c9,,,The genetic information of RNA viruses is orga...
1,ymceytj3,e3d0d482ebd9a8ba81c254cc433f314142e72174,PMC,"Crystal structure of murine sCEACAM1a[1,4]: a ...",10.1093/emboj/21.9.2076,PMC125375,11980704.0,unk,CEACAM1 is a member of the carcinoembryonic an...,2002-05-01,...,,True,True,custom_license,http://europepmc.org/articles/pmc125375?pdf=re...,15547,e3d0d482ebd9a8ba81c254cc433f314142e72174,,,Carcinoembryonic antigen (CEA; CD66e) was init...


In [12]:
# Count the missing (NaN) entries in each column of the merged frame
coronavirus.isna().sum()

cord_uid                           0
sha                                0
source_x                           0
title_x                            9
doi                              970
pmcid                           7362
pubmed_id                      11818
license                            0
abstract                        5024
publish_time                       0
authors                          438
journal                         3441
Microsoft Academic Paper ID    37217
WHO #Covidence                 37054
arxiv_id                       36970
has_pdf_parse                      0
has_pmc_xml_parse                  0
full_text_file                     0
url                              119
Unnamed: 0                         0
paper_id                           0
title_y                         4071
discussion                     21950
text_body                          0
dtype: int64

## Drop unused rows and columns

In [13]:
# Remove the columns that are not used further. `paper_id` is among them because
# the inner join made it equal to `sha` on every row.
coronavirus = coronavirus.drop(columns=[
    'cord_uid',
    'doi',
    'pmcid',
    'pubmed_id',
    'license',
    'Microsoft Academic Paper ID',
    'WHO #Covidence',
    'arxiv_id',
    'has_pdf_parse',
    'has_pmc_xml_parse',
    'full_text_file',
    'Unnamed: 0',
    'paper_id',
])

In [14]:
# Preview the first two rows after dropping the unused columns
coronavirus.head(2)

Unnamed: 0,sha,source_x,title_x,abstract,publish_time,authors,journal,url,title_y,discussion,text_body
0,b2897e1277f56641193a6db73825f707eed3e4c9,PMC,Sequence requirements for RNA strand transfer ...,Nidovirus subgenomic mRNAs contain a leader se...,2001-12-17,"Pasternak, Alexander O.; van den Born, Erwin; ...",The EMBO Journal,http://europepmc.org/articles/pmc125340?pdf=re...,,,The genetic information of RNA viruses is orga...
1,e3d0d482ebd9a8ba81c254cc433f314142e72174,PMC,"Crystal structure of murine sCEACAM1a[1,4]: a ...",CEACAM1 is a member of the carcinoembryonic an...,2002-05-01,"Tan, Kemin; Zelus, Bruce D.; Meijers, Rob; Liu...",The EMBO Journal,http://europepmc.org/articles/pmc125375?pdf=re...,,,Carcinoembryonic antigen (CEA; CD66e) was init...


In [15]:
# Inspect the 9 rows whose article title (`title_x`) is missing
coronavirus.loc[coronavirus['title_x'].isna()]

Unnamed: 0,sha,source_x,title_x,abstract,publish_time,authors,journal,url,title_y,discussion,text_body
19216,bdfee1e149cda6e1d449de68cd48d17897015101,Elsevier,,Unknown,2011-12-31,,Treatise on Estuarine and Coastal Science,https://doi.org/10.1016/b978-0-12-374711-2.090...,SUBJECT INDEX Notes,,"natural hazards, 12: 272 nitrogen cycling, 5: ..."
19217,d9b6d041197762c390ca9397ef6c778693bcba15,Elsevier,,Unknown,2012-12-31,,Comprehensive Biophysics,https://doi.org/10.1016/b978-0-12-374920-8.090...,,,Cross-reference terms in italic are general cr...
19322,106db157fd5d6ee766feabadf0d35a24d6d37895,Elsevier,,Unknown,2008-12-31,,Travel Medicine,https://doi.org/10.1016/b978-0-323-03453-1.100...,AFRICAN TRYPANOSOMIASIS (SLEEPING SICKNESS) Ep...,,insect-borne and transfusion-associated transm...
19355,0b87046abff5d934edc840da7e39a09ced213d7e,Elsevier,,Unknown,2014-12-31,,Encyclopedia of Agriculture and Food Systems,https://doi.org/10.1016/b978-0-444-52512-3.090...,,,industry-level analysis 1:54-56 market structu...
19460,48b0dfada83cdf2b9a3de65a15ef635cff2b505b,Elsevier,,Unknown,2008-12-31,,Pediatric Emergency Medicine,https://doi.org/10.1016/b978-141600087-7.50204-x,,,"ABCDs of resuscitation and, 9-12 asphyxial, 95..."
19463,dedd72d0a6bc6cc0d065b2755ef5b52880c29989,Elsevier,,Unknown,2007-12-31,,Equine Infectious Diseases,https://doi.org/10.1016/b978-1-4160-2406-4.500...,,,. See also under individual diseases. employee...
20229,05389a55ca4f5de2b1c3ee206d75d3574313d684,Elsevier,,,2020-03-31,"Eccles, Ron",Journal of Hospital Infection,https://doi.org/10.1016/j.jhin.2020.03.026,,,When considering the persistence of viruses on...
21695,fdd1978e558dd4d78be3f032f06959ba521db5e8,Elsevier,,,2018-04-30,"Greenberg, Harry B.; Griffin, Diane E.",Current Opinion in Virology,https://doi.org/10.1016/j.coviro.2018.04.005,,,Viruses have and will continue to play a major...
21828,9f089d009338c25da7711ee4e6336ebdcad42a7a,Elsevier,,,2020-04-22,"Shaker, Marcus S.; Oppenheimer, John; Grayson,...",The Journal of Allergy and Clinical Immunology...,https://doi.org/10.1016/j.jaip.2020.04.016,,,To the Editor:We appreciate the comments and p...


In [16]:
# Those 9 rows are missing many other fields as well, so we drop every row
# that has no `title_x`. The frame is modified in place.
coronavirus.dropna(subset=['title_x'], inplace=True)

In [17]:
# Confirm that no missing titles remain (expected result: 0)
coronavirus['title_x'].isna().sum()

0

In [18]:
# Drop the second title column (`title_y`), then rename `sha`, `source_x`
# and `title_x` to their suffix-free names.
coronavirus = (
    coronavirus
    .drop(columns='title_y')
    .rename(columns={
        'sha': 'paper_id',
        'source_x': 'source',
        'title_x': 'title',
    })
)

In [19]:
# Preview the first two rows after the rename
coronavirus.head(2)

Unnamed: 0,paper_id,source,title,abstract,publish_time,authors,journal,url,discussion,text_body
0,b2897e1277f56641193a6db73825f707eed3e4c9,PMC,Sequence requirements for RNA strand transfer ...,Nidovirus subgenomic mRNAs contain a leader se...,2001-12-17,"Pasternak, Alexander O.; van den Born, Erwin; ...",The EMBO Journal,http://europepmc.org/articles/pmc125340?pdf=re...,,The genetic information of RNA viruses is orga...
1,e3d0d482ebd9a8ba81c254cc433f314142e72174,PMC,"Crystal structure of murine sCEACAM1a[1,4]: a ...",CEACAM1 is a member of the carcinoembryonic an...,2002-05-01,"Tan, Kemin; Zelus, Bruce D.; Meijers, Rob; Liu...",The EMBO Journal,http://europepmc.org/articles/pmc125375?pdf=re...,,Carcinoembryonic antigen (CEA; CD66e) was init...


## Check column dtypes for corrupted columns

In [20]:
# List the dtype of every column
coronavirus.dtypes

paper_id        object
source          object
title           object
abstract        object
publish_time    object
authors         object
journal         object
url             object
discussion      object
text_body       object
dtype: object

## Cast publish time as datetime variable

In [21]:
# Parse the `publish_time` values into datetimes; the column keeps its name and position
coronavirus = coronavirus.assign(
    publish_time=lambda frame: pd.to_datetime(frame['publish_time'])
)

In [22]:
# Re-check the dtypes to confirm `publish_time` is now a datetime column
coronavirus.dtypes

paper_id                object
source                  object
title                   object
abstract                object
publish_time    datetime64[ns]
authors                 object
journal                 object
url                     object
discussion              object
text_body               object
dtype: object

## Save dataframe as csv file

In [23]:
# Write the merged frame to 'merged_data.csv'; index=False leaves the row index out of the file
coronavirus.to_csv(path_or_buf='merged_data.csv', index=False)