In [11]:
import os
import pandas as pd
import scholarly
os.chdir('C:\\Users\\Cafral\\Desktop\\kaggle\\CORD-19-research-challenge\\data_v7')

# Load the data

In [4]:
df_method = pd.read_csv('method_df.csv')
df_result = pd.read_csv('result_df.csv')

print("No of unique papers in method section : ", df_method['paper_id'].nunique(), " out of ", 
      len(df_method), " rows in dataframe")
print("No of unique papers in result section : ", df_result['paper_id'].nunique(), " out of ", 
      len(df_result), " rows in dataframe")

df_method.info()

No of unique papers in method section :  9693  out of  188137  rows in dataframe
No of unique papers in result section :  7820  out of  178851  rows in dataframe
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188137 entries, 0 to 188136
Data columns (total 38 columns):
Unnamed: 0                         188137 non-null int64
paper_id                           188137 non-null object
language                           188137 non-null object
section                            188137 non-null object
sentence                           188137 non-null object
lemma                              188137 non-null object
UMLS                               188137 non-null object
GGP                                188137 non-null object
SO                                 188137 non-null object
TAXON                              188137 non-null object
CHEBI                              188137 non-null object
GO                                 188137 non-null object
CL                              

# Extracting sentences which contain topic ngrams

In [5]:
def find_ngrams(dataframe,columnToSearch,keywords):
    df_w_ngrams = dataframe[dataframe[columnToSearch].str.contains('|'.join(keywords), case=False) == True]
    return df_w_ngrams

population_density_ngrams = ['population density','number of people in','number of people per',
                             'highly populated areas','highly populated countries',
                             'densely populated countries','densely populated areas',
                             'high density areas','high density countries'
                             ,'population densities', 'density of population','sparsely populated',
                             'densely populated','density of the population','dense population','populated areas',
                             'densely inhabited','housing density','densely-populated','concentration of people',
                             'population pressure','population studies','populated regions','populous',
                             'high population densities','residential densities','overpopulated']

#Extracting sentences which contain ngrams

df_method_p = find_ngrams(df_method,'sentence',population_density_ngrams)

df_result_p = find_ngrams(df_result,'sentence',population_density_ngrams)

print("There are {} sentences containing keywords/ngrams in Method section.".format(len(df_method_p)))
print("There are {} sentences containing keywords/ngrams in Result section.".format(len(df_result_p)))

# Merging the method and result section sentences into single dataframe
df_real = pd.concat([df_method_p, df_result_p])

print("Total unique papers in Method section : {}".format(df_method_p['paper_id'].nunique()))
print("Total unique papers in Result section : {}".format(df_result_p['paper_id'].nunique()))
print("Total unique papers in combined section : {}".format(df_real['paper_id'].nunique()))

There are 121 sentences containing keywords/ngrams in Method section.
There are 62 sentences containing keywords/ngrams in Result section.
Total unique papers in Method section : 84
Total unique papers in Result section : 39
Total unique papers in combined section : 117


# Keeping all the sentences from papers that had topic ngrams

In [6]:
df_method_all_sentence = pd.merge(df_method[['paper_id','sentence']],df_method_p['paper_id'],on='paper_id',how='right')
df_method_all_sentence.rename(columns={'sentence_x':'all_sentences','sentence_y':'ngram_sentence'},inplace=True)

df_result_all_sentence = pd.merge(df_result[['paper_id','sentence']],df_result_p['paper_id'],on='paper_id',how='right')
df_result_all_sentence.rename(columns={'sentence_x':'all_sentences','sentence_y':'ngram_sentence'},inplace=True)

df_all_sentences = pd.concat([df_method_all_sentence, df_result_all_sentence])
print("Total unique papers in combined section : {}".format(df_all_sentences['paper_id'].nunique()))

Total unique papers in combined section : 117


# Extracting methodolody,sample size,causal nature,sentences refering to coronavirus, fatality

In [7]:
def extract_features(ngramDf,allSentdataFrame):
    # extracting methodology
    methods_list = ['regression','OLS','logistic','time series','model','modelling','simulation','forecast','forecasting']
    methodology = find_ngrams(allSentdataFrame,'sentence',methods_list)

    #extracting sample size
    sample_size_list = ['population size','sample size','number of samples','number of observations','number of subjects']
    sample_size = find_ngrams(allSentdataFrame,'sentence',sample_size_list)

    #extracting nature of correlation
    causal_list =['statistically significant','statistical significance',
                  'correlation','positively correlated','negatively correlated','correlated',
                  'p value','p-value','chi square','chi-square',
                  'confidence interval','CI','odds ratio','OR','coefficient']

    causality_type = find_ngrams(allSentdataFrame,'sentence',causal_list)

    # extracting coronavirus related sentence #can someone check and update this list?
    coronavirus_list = ['severe acute respiratory syndrome','sars-cov','sars-like',
                        'middle east respiratory syndrome','mers-cov','mers-like',
                        'covid-19','sars-cov-2','2019-ncov','sars-2',
                        'sarscov-2','novel coronavirus','corona virus','coronaviruses',
                        'sars','mers','covid19','covid 19']

    coronavirus = find_ngrams(allSentdataFrame,'sentence',coronavirus_list)

    # extracting outcome
    disease_stage_list = ['lethal', 'morbid',"death", "fatality", "mortality","lethal", "lethality", "morbidity"]

    fatality = find_ngrams(allSentdataFrame,'sentence',disease_stage_list)

    df_list = [methodology,sample_size,causality_type,coronavirus,fatality]
    df_list_name = ['methodology','sample_size','causality_type','coronavirus','fatality']
    i=0
    for one_df in df_list:
        one_df.rename(columns={'sentence':df_list_name[i]},inplace=True)
        grouped_one_df = one_df.groupby(['paper_id'], as_index=False)[df_list_name[i]].sum()
        ngramDf = pd.merge(ngramDf,grouped_one_df,on='paper_id',how='left')
        i=i+1
    return ngramDf

In [8]:
df_real = extract_features(df_real,df_all_sentences)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


# Merge with Metadata

In [9]:
metadata = pd.read_csv('clean_metadata.csv')
metadata.rename(columns={'sha':'paper_id'}, inplace = True)
metadata['paper_id'] = metadata['paper_id'].astype("str")

#Merging the given papers with their metadata
df_real = df_real.merge(metadata[['paper_id', 'title', 'abstract', 'publish_time', 'authors',
                                  'url']], on='paper_id', how='left') #'title_w_ngram','abstract_w_ngram'

#Keeping only the fields which are relevant to us.
df_real = df_real[['paper_id','language', 'section', 'sentence', 'lemma', 'UMLS', 'sentence_id', 
                   'publish_time', 'authors', 'url','methodology','sample_size','causality_type','coronavirus',
                   'fatality','title','abstract','publish_time','authors',
                   'url','TAXON']]#'title_w_ngram','abstract_w_ngram',
df_real.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 183 entries, 0 to 182
Data columns (total 21 columns):
paper_id          183 non-null object
language          183 non-null object
section           183 non-null object
sentence          183 non-null object
lemma             183 non-null object
UMLS              183 non-null object
sentence_id       181 non-null object
publish_time      140 non-null object
authors           139 non-null object
url               139 non-null object
methodology       116 non-null object
sample_size       46 non-null object
causality_type    183 non-null object
coronavirus       40 non-null object
fatality          34 non-null object
title             140 non-null object
abstract          139 non-null object
publish_time      140 non-null object
authors           139 non-null object
url               139 non-null object
TAXON             183 non-null object
dtypes: object(21)
memory usage: 31.5+ KB


In [12]:
grouped = df_real.groupby('paper_id')
def keywordcounter(sentences, keywords_list):
    '''
    Input : List of sentences, List of keywords
    Returns : Keywords present in sentences, Total count of all keywords present in Input
    '''
    keyword = {}
    sent = " ".join(sentences)
    for pol in keywords_list:
        counter = sent.lower().count(pol)
        if (counter > 0):
            keyword[pol] = counter
    return list(keyword.keys()), sum(keyword.values())

def aggregation(item,keyWordList,RiskFactor):
    '''
    Input : Dataframe of sentences of a paper
    Return : Datframe in Standard Output format
    '''
    dfo = {}
    
    dfo['Risk Factor'] = RiskFactor
    dfo['Title'] = item['title'].iloc[0]
    dfo['Keyword/Ngram'], dfo['No of keyword occurence in Paper'] = keywordcounter(item['sentence'].tolist(),
                                                                                 keyWordList)
    dfo['paper_id'] = item['paper_id'].iloc[0]
    
    if (item['url'].iloc[0].isnull().any()==False):
        dfo['URL'] = item['url'].iloc[0].tolist()
    else:
        dfo['URL']=''
    #dfo['Sentences from Title']= item['title_w_ngram'].iloc[0]                        
    #dfo['Sentences from Abstract']= item['abstract_w_ngram'].iloc[0]
    dfo['Sentences from Method'] = item[item['section']=='methods']['sentence'].tolist()
    dfo['Sentences from Result'] = item[item['section']=='results']['sentence'].tolist()
    
    if (item['authors'].iloc[0].isnull().any()==False):#(item['authors'].iloc[0].isnull()==False):
        dfo['Authors'] = item['authors'].iloc[0].tolist()
    else:
         dfo['Authors'] = ''
    # For papers which do not have title (not in metadata) we have to resolve exceptions
    try:
        dfo['No of Citations'] = next(scholarly.search_pubs_query(item['title'].iloc[0])).citedby
    except:
        dfo['No of Citations'] = 0
        
    dfo['Correlation'] = item['causality_type'].iloc[0]
    dfo['Design Methodology'] = item['methodology'].iloc[0]
    dfo['Sample Size'] = item['sample_size'].iloc[0]
    dfo['Coronavirus'] = item['coronavirus'].iloc[0]
    dfo['Fatality'] = item['fatality'].iloc[0]
    dfo['TAXON'] =item['TAXON'].iloc[0]
    
    return dfo

df_output = pd.DataFrame(columns=['Risk Factor', 'Title','Keyword/Ngram', 'No of keyword occurence in Paper',
                                  'paper_id', 'URL',
                                  'Sentences from Result', 'Sentences from Method',
                                  'Authors','No of Citations', 'Correlation', 
                                  'Design Methodology', 'Sample Size',
                                 'Coronavirus','Fatality','TAXON'])#Sentences from Title','Sentences from Abstract',
for key, item in grouped:
    df_output = pd.concat([df_output, pd.DataFrame([aggregation(item,population_density_ngrams,'Population Density')])])

df_output = df_output.reset_index()
df_output.to_excel('population_density_json.xlsx')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


