In [10]:
import pandas as pd

In [11]:
"""
Input

metadata_csv_path - Path to latest metadata.csv file from Kaggle's CORD-19 dataset.
covid_19_term_list_path - Path to text file containing a list of Covid-19 synonyms (1 per line).

"""

# Locations of the two input files.
metadata_csv_path = 'resources/metadata.csv'
covid_19_term_list_path = 'resources/covid_19_terms_200427.txt'

# Cutoff handed to filter_metadata_df_by_publ_date_cutoff in the main cell;
# written as a 'YYYY-MM-DD' string.
pub_date_cutoff = '2019-10-01'

# Output
#
# Filtered metadata dataframe is saved to csv file.
filt_metadata_outpath = 'resources/metadata_covid19_df_200430.csv'

In [12]:
def filter_metadata_df_by_title_abstract_terms(metadata_df, covid_19_term_list_path):
    """
    Filter metadata dataframe to publications containing a Covid-19 synonym in title or abstract.

    Each line of the term file is treated as a literal, case-insensitive
    substring to look for in the lower-cased "title + ' ' + abstract" text.
    Blank (or whitespace-only) lines in the term file are ignored.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        Must contain 'title' and 'abstract' columns. It is not modified.
    covid_19_term_list_path : str
        Path to a text file holding one search term per line.

    Returns
    -------
    pandas.DataFrame
        The matching rows. Null values in every column have been replaced by
        '' and a helper column 'title_abstract' has been added. If the term
        file contains no usable term, an empty dataframe is returned.
    """
    import re  # local import: keeps this cell runnable without touching the import cell

    #Concatenate title and abstract text into a single, lower-cased column.
    #fillna returns a new dataframe, so the caller's dataframe is left untouched.
    metadata_df = metadata_df.fillna('')
    metadata_df['title_abstract'] = metadata_df['title'].str.lower() + ' ' + metadata_df['abstract'].str.lower()
    metadata_df['title_abstract'] = metadata_df['title_abstract'].fillna('')

    #Load text file containing Covid-19 synonyms (1 per line)
    with open(covid_19_term_list_path) as f:
        covid_19_terms = f.read().splitlines()

    #A blank line would add an empty alternative to the pattern, and an empty
    #alternative matches every row, so such lines are dropped.
    covid_19_terms = [term.lower() for term in covid_19_terms if term.strip()]

    if not covid_19_terms:
        #No terms means nothing can match (an empty pattern would match everything).
        return metadata_df.iloc[0:0]

    #Escape each term so characters such as '(', '.', '+' are matched literally
    #instead of being interpreted as regular-expression syntax.
    covid_19_term_pattern = '|'.join(re.escape(term) for term in covid_19_terms)

    term_mask = metadata_df['title_abstract'].str.contains(covid_19_term_pattern, regex=True)
    covid19_df = metadata_df.loc[term_mask]

    return covid19_df

def filter_metadata_df_by_publ_date_cutoff(metadata_df, pub_date_cutoff):
    """
    Filter metadata dataframe to rows whose 'publish_time' is greater than pub_date_cutoff.

    The column is compared with `>` exactly as stored (no date parsing happens
    here), so rows equal to the cutoff are not kept.
    """
    is_after_cutoff = metadata_df['publish_time'] > pub_date_cutoff
    return metadata_df.loc[is_after_cutoff]


#QC functions
def filter_metadata_df_to_null_value_in_col(metadata_df, col_name):
    """
    Return the rows of metadata_df whose value in column col_name is null.
    """
    return metadata_df.loc[metadata_df[col_name].isnull()]

def count_df_rows_with_null_values_in_cols(metadata_df, col_names):
    """
    Identify rows with null values in specified columns and return as dict: 
    
    {cord_uid : [columns with null values]}

    For every column in col_names the number of rows that are null in that
    column is also printed.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        Must contain a 'cord_uid' column plus every column in col_names.
    col_names : iterable of str
        Columns to check for null values.

    Returns
    -------
    dict
        Maps each cord_uid that has at least one null value to the list of
        column names in which it is null. A cord_uid that appears on several
        null rows for the same column gets that column name once per row.
    """
    cord_uid_null_col_dict = {}
    
    for col_name in col_names:
        null_value_df = filter_metadata_df_to_null_value_in_col(metadata_df, col_name)
        null_value_cord_uids = null_value_df.cord_uid.tolist()
        
        #len() takes only the list; the column name is the second format argument.
        print("%d rows have null values in column %s" % (len(null_value_cord_uids), col_name))
        
        for cord_uid in null_value_cord_uids:
            cord_uid_null_col_dict.setdefault(cord_uid, []).append(col_name)
            
    return cord_uid_null_col_dict



In [13]:
#Main

#Load metadata.csv as dataframe and filter to publications with a Covid-19 term in title or abstract
metadata_df = pd.read_csv(metadata_csv_path)
covid19_df = filter_metadata_df_by_title_abstract_terms(metadata_df, covid_19_term_list_path)

#Filter covid19_df by publication date cutoff
covid19_date_filtered_df = filter_metadata_df_by_publ_date_cutoff(covid19_df, pub_date_cutoff)

#Write to the output path declared in the configuration cell (filt_metadata_outpath)
#instead of a separate hard-coded file name, so the declared output is the real one.
covid19_date_filtered_df.to_csv(filt_metadata_outpath)

In [14]:
"""
Duplicated cord_uids.

Some cord_uids are non-unique.  This appears to be the result of the same paper being provided by two different sources?
"""

# keep=False marks every row whose cord_uid occurs more than once, not only the repeats.
dup_metadata_df_mask = metadata_df['cord_uid'].duplicated(keep=False)
dup_metadata_df = metadata_df.loc[dup_metadata_df_mask]

# Distinct cord_uids among the duplicated rows.
dup_cord_uids = set(dup_metadata_df['cord_uid'])
print("Number of duplicated cord uids: %d" % len(dup_cord_uids))

print("Examples of duplicated cord uids:")
display(dup_metadata_df.sort_values('cord_uid'))

Number of duplicated cord uids: 34
Examples of duplicated cord uids:


Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
29489,0klupmep,,Elsevier,Infectious disease surveillance update,10.1016/s1473-3099(19)30075-1,,30833065.0,els-covid,,2019-03-31,"Zwizwai, Ruth",The Lancet Infectious Diseases,,,False,False,custom_license,https://doi.org/10.1016/s1473-3099(19)30075-1
16421,0klupmep,,PMC,Infectious disease surveillance update,10.1016/s1473-3099(19)30075-1,PMC7129894,30833064.0,no-cc,,2019-02-27,"Zwizwai, Ruth",Lancet Infect Dis,,,False,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...
16419,0z5wacxs,7e787fd2ae5b544add6281d3d40ad322de26aa17,PMC,Transportation capacity for patients with high...,10.1111/1469-0691.12290,PMC7128608,25636943.0,no-cc,Highly infectious diseases (HIDs) are defined ...,2015-06-22,"Schilling, S.; Maltezou, H.C.; Fusco, F.M.; De...",Clin Microbiol Infect,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...
29293,0z5wacxs,7e787fd2ae5b544add6281d3d40ad322de26aa17,Elsevier,Transportation capacity for patients with high...,10.1111/1469-0691.12290,,24750421.0,els-covid,Abstract Highly infectious diseases (HIDs) are...,2019-04-30,"Schilling, S.; Maltezou, H.C.; Fusco, F.M.; De...",Clinical Microbiology and Infection,,,True,False,custom_license,https://doi.org/10.1111/1469-0691.12290
28740,21htepa1,a25e212b03cc65c44dcc336775b101934e30f041,Elsevier,Panspermia—true or false?,10.1016/s0140-6736(03)14040-8,,12907025.0,els-covid,,2003-08-02,"de Leon, Samuel Ponce; Lazcano, Antonio",The Lancet,,,True,False,custom_license,https://doi.org/10.1016/s0140-6736(03)14040-8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28771,vp5358rr,f3eefad19d628cbe1fc0d65a6322a60d6cc5a3ab,Elsevier,PARENTAL ORIGIN OF CHROMOSOME 15 DELETION IN P...,10.1016/s0140-6736(83)92745-9,,6134086.0,els-covid,,1983-06-04,"Butler, MerlinG.; Palmer, CatherineG.",The Lancet,,,True,False,custom_license,https://doi.org/10.1016/s0140-6736(83)92745-9
15934,vqbreyna,2513ddf57215cb9297e05db838156d8856c8d6a5,PMC,"COVID-19, A Clinical Syndrome Manifesting as H...",10.3947/ic.2020.52.1.110,PMC7113449,32153144.0,cc-by-nc,,2020-03-10,"Song, Young Goo; Shin, Hyoung-Shik",Infect Chemother,,,True,True,noncomm_use_subset,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...
41120,vqbreyna,,WHO,"COVID-19, A Clinical Syndrome Manifesting as H...",,,32153144.0,unk,,2020,"Song, Young Goo; Shin, Hyoung Shik",Infect Chemother,,#6103,False,False,,https://www.ncbi.nlm.nih.gov/pubmed/32153144/
28439,xjpev4jw,c05522c7132d3162d433c02ade1fc80604625a4f,Elsevier,"Virus-Specific Antibody, in the Absence of T C...",10.1016/s0002-9440(10)62301-2,,15743792.0,els-covid,Mice infected with mouse hepatitis virus strai...,2005-03-31,"Kim, Taeg S.; Perlman, Stanley",The American Journal of Pathology,,,True,False,custom_license,https://doi.org/10.1016/s0002-9440(10)62301-2


In [15]:
"""
Date filtered publications.
"""

covid19_cord_uids = covid19_df['cord_uid'].tolist()
date_incl_cord_uids = covid19_date_filtered_df['cord_uid'].tolist()
# cord_uids found in the term-filtered frame but not in the date-filtered frame
date_excl_cord_uids = set(covid19_cord_uids).difference(date_incl_cord_uids)

# Counts are over distinct cord_uids, not rows.
print("Covid-19 cord_uids: %d" % len(set(covid19_cord_uids)))
print(
    "Covid-19 cord_uids published after date cutoff %s : %d"
    % (pub_date_cutoff, len(set(date_incl_cord_uids)))
)
print(
    "Covid-19 cord_uids published before date cutoff %s : %d"
    % (pub_date_cutoff, len(date_excl_cord_uids))
)

Covid-19 cord_uids: 6170
Covid-19 cord_uids published after date cutoff 2019-10-01 : 5886
Covid-19 cord_uids published before date cutoff 2019-10-01 : 284


In [18]:
# Index the full metadata table by its publish_time values as stored.
# NOTE(review): a later cell reassigns pubdate_df from metadata_df with a parsed datetime index.
pubdate_df = metadata_df.set_index(keys='publish_time')

In [42]:
# Build pubdate_df from metadata_df without modifying it: parse publish_time
# with pd.to_datetime, then use the parsed column as the index.
pubdate_df = (
    metadata_df
    .assign(publish_time=lambda frame: pd.to_datetime(frame['publish_time']))
    .set_index('publish_time')
)



Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal',
       'Microsoft Academic Paper ID', 'WHO #Covidence', 'has_pdf_parse',
       'has_pmc_xml_parse', 'full_text_file', 'url'],
      dtype='object')

In [49]:
covid19_uids = covid19_df.cord_uid.tolist()
# Flag rows whose cord_uid is in the Covid-19 subset (1) or not (0).
# isin does the membership test in one vectorised pass; the earlier per-row
# `x in covid19_uids` scanned the whole list once for every row.
pubdate_df['covid19'] = pubdate_df['cord_uid'].isin(covid19_uids).astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
