In [1]:
import os
import pandas as pd
import scholarly
from nltk import sent_tokenize,word_tokenize
# NOTE(review): machine-specific absolute Windows path. Every relative path used later
# (e.g. 'clean_metadata.csv', 'population_density_metadata.xlsx') resolves against this
# directory, so it has to be adjusted before the notebook can run on another machine.
os.chdir('C:\\Users\\Cafral\\Desktop\\kaggle\\CORD-19-research-challenge\\data_v7')

# Load the data

In [2]:
# Read the cleaned metadata and normalise the columns used downstream.
# NOTE: astype("str") / fillna('nan') turn missing ids, titles and abstracts into the
# literal string 'nan' rather than leaving them as NaN.
metadata = (
    pd.read_csv('clean_metadata.csv')
    .rename(columns={'sha': 'paper_id'})
    .assign(
        paper_id=lambda frame: frame['paper_id'].astype("str"),
        title=lambda frame: frame['title'].fillna('nan'),
        abstract=lambda frame: frame['abstract'].fillna('nan'),
    )
)

# Searchable text = title followed by abstract; keep one row per distinct text.
metadata['text'] = metadata['title'] + ' ' + metadata['abstract']
metadata = metadata.drop_duplicates(['text'])

# Extracting papers which contain ngrams

In [3]:
def find_ngrams(dataframe,columnToSearch,keywords):
    """Return the rows of ``dataframe`` whose ``columnToSearch`` text matches any keyword.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to filter; it is not modified.
    columnToSearch : str
        Name of the text column to search.
    keywords : iterable of str
        Alternatives that are joined with '|' and applied as ONE case-insensitive
        regular expression, so regex metacharacters inside a keyword keep their
        regex meaning.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows (original index preserved), so
        callers can rename/modify it without a SettingWithCopyWarning.
    """
    keywords = list(keywords)
    if not keywords:
        # '|'.join([]) is the empty pattern, which matches every string;
        # "no keywords" should select no rows instead.
        return dataframe.iloc[0:0].copy()

    pattern = '|'.join(keywords)
    # na=False: rows with missing / non-string text count as non-matches.
    mask = dataframe[columnToSearch].str.contains(pattern, case=False, na=False)
    return dataframe[mask].copy()

# Keyword n-grams that signal a discussion of population density.
ngrams = [
    'population density',
    'number of people in',
    'number of people per',
    'highly populated areas',
    'highly populated countries',
    'densely populated countries',
    'densely populated areas',
    'high density areas',
    'high density countries',
    'population densities',
    'density of population',
    'sparsely populated',
    'densely populated',
    'density of the population',
    'dense population',
    'populated areas',
    'densely inhabited',
    'housing density',
    'densely-populated',
    'concentration of people',
    'population pressure',
    'population studies',
    'populated regions',
    'populous',
    'high population densities',
    'residential densities',
    'overpopulated',
]

# Keep only the papers whose title + abstract text mentions at least one n-gram.
metadata_ngrams = find_ngrams(dataframe=metadata, columnToSearch='text', keywords=ngrams)
#ngrams_titles.rename(columns={'title':'title_w_ngram'},inplace=True)

# Extracting all sentences from the relevant papers

In [4]:
def metadata_sentences(dataFrame):
    """Split the abstracts of every paper into individual sentences.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Must contain the columns 'paper_id' and 'abstract'.

    Returns
    -------
    pandas.DataFrame
        Columns ['paper_id', 'sentence'] with a fresh 0..n-1 index, one row per
        sentence. Papers appear in order of first occurrence in ``dataFrame``.

    Notes
    -----
    When several rows share a paper_id, the sentences of ALL their abstracts are
    kept (previously each row overwrote the one before, so only the last
    abstract survived). Non-string abstracts are skipped.
    """
    records = []
    # sort=False keeps the papers in order of first appearance.
    for paper_id, paper_rows in dataFrame.groupby('paper_id', sort=False):
        for abstract in paper_rows['abstract']:
            if not isinstance(abstract, str):
                continue
            records.extend(
                {'paper_id': paper_id, 'sentence': sentence}
                for sentence in sent_tokenize(abstract)
            )

    # Build the frame once instead of concatenating inside the loop (quadratic).
    return pd.DataFrame(records, columns=['paper_id', 'sentence'])

In [5]:
# Tokenise the abstracts of the keyword-matching papers into a (paper_id, sentence) table.
meta_sent_df = metadata_sentences(metadata_ngrams)

# Extracting methodology, sample size, causal nature, sentences referring to coronavirus, fatality

In [6]:
def extract_features(ngramDf,allSentdataFrame,ngrams):
    """Attach per-paper feature columns built from keyword-matching sentences.

    For each feature, the sentences of ``allSentdataFrame`` that match the
    feature's keyword list are selected with ``find_ngrams``, joined per paper
    with a single space, and left-merged onto ``ngramDf`` by 'paper_id'.

    Parameters
    ----------
    ngramDf : pandas.DataFrame
        One row per paper; must contain 'paper_id'. It is not modified.
    allSentdataFrame : pandas.DataFrame
        Columns 'paper_id' and 'sentence'. It is not modified.
    ngrams : list of str
        Keywords for the risk factor itself (feeds the 'sentences' column).

    Returns
    -------
    pandas.DataFrame
        ``ngramDf`` plus the columns 'sentences', 'methodology', 'sample_size',
        'causality_type', 'coronavirus' and 'fatality'. Papers without a
        matching sentence get NaN in that column.
    """
    # Keywords are combined into one case-insensitive regex by find_ngrams.
    methods_list = ['regression','OLS','logistic','time series','model','modelling','simulation','forecast','forecasting']

    sample_size_list = ['population size','sample size','number of samples','number of observations','number of subjects']

    # 'CI' and 'OR' are acronyms: as case-insensitive substrings they would match
    # "incidence", "for", "or", ... and flag nearly every sentence. The scoped
    # flag group (?-i:...) (Python 3.6+) makes just these two case-sensitive,
    # and \b restricts them to whole words.
    causal_list =['statistically significant','statistical significance',
                  'correlation','positively correlated','negatively correlated','correlated',
                  'p value','p-value','chi square','chi-square',
                  'confidence interval',r'(?-i:\bCI\b)','odds ratio',r'(?-i:\bOR\b)','coefficient']

    # can someone check and update this list?
    # \bmers\b: whole word only, so "primers" / "polymers" no longer count as MERS.
    coronavirus_list = ['severe acute respiratory syndrome','sars-cov','sars-like',
                        'middle east respiratory syndrome','mers-cov','mers-like',
                        'covid-19','sars-cov-2','2019-ncov','sars-2',
                        'sarscov-2','novel coronavirus','corona virus','coronaviruses',
                        'sars',r'\bmers\b','covid19','covid 19']

    disease_stage_list = ['lethal', 'morbid',"death", "fatality", "mortality", "lethality", "morbidity"]

    # (output column, keyword list) in the order the columns are appended.
    features = [
        ('sentences', ngrams),
        ('methodology', methods_list),
        ('sample_size', sample_size_list),
        ('causality_type', causal_list),
        ('coronavirus', coronavirus_list),
        ('fatality', disease_stage_list),
    ]

    for column_name, keywords in features:
        matched = find_ngrams(allSentdataFrame, 'sentence', keywords)
        if matched.empty:
            # Nothing matched anywhere: still add the column so downstream
            # code can rely on its presence.
            ngramDf = ngramDf.assign(**{column_name: float('nan')})
            continue

        # Join with a space; summing strings would glue sentences together.
        per_paper = (
            matched.groupby('paper_id')['sentence']
            .agg(' '.join)
            .rename(column_name)
            .reset_index()
        )
        ngramDf = pd.merge(ngramDf, per_paper, on='paper_id', how='left')
    return ngramDf

In [7]:
# Attach the per-paper feature columns to the keyword-matching papers.
# NOTE(review): this rebinds metadata_ngrams to the merged result, so running the cell a
# second time feeds the already-augmented frame back in — re-run cell 3 first to start clean.
metadata_ngrams = extract_features(metadata_ngrams,meta_sent_df,ngrams)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [8]:
# Bookkeeping columns that are not needed in the exported table.
unused_columns = [
    'Unnamed: 0',
    'cord_uid',
    'source_x',
    'pmcid',
    'pubmed_id',
    'license',
    'Microsoft Academic Paper ID',
    'WHO #Covidence',
    'has_full_text',
    'full_text_file',
]
metadata_ngrams = metadata_ngrams.drop(columns=unused_columns)

In [10]:
def keywordcounter(sentences, keywords_list):
    '''
    Count case-insensitive keyword occurrences in a list of sentences.

    Input : List of sentences, List of keywords
    Returns : (keywords present in the sentences, total count of all keyword
              occurrences). Keywords are returned exactly as given, in input order.

    The sentences are joined with single spaces and searched as one lower-cased
    string; each keyword is lower-cased too, so keywords containing capitals
    (e.g. 'OLS') are found. Counting uses non-overlapping substring matches, so
    a keyword contained in another keyword is counted for both. Empty keywords
    are ignored.
    '''
    # Lower-case the text once instead of once per keyword.
    haystack = " ".join(sentences).lower()
    keyword = {}
    for pol in keywords_list:
        if not pol:
            # str.count('') returns len(text) + 1, which is never a real hit.
            continue
        counter = haystack.count(pol.lower())
        if counter > 0:
            keyword[pol] = counter
    return list(keyword.keys()), sum(keyword.values())

def aggregation(item,keyWordList,RiskFactor):
    '''
    Build the output record for one paper.

    Input : item        - DataFrame with the row(s) of a single paper; per-paper
                          fields are read from its first row, keyword counting
                          uses every value of its 'text' column
            keyWordList - keywords passed to keywordcounter
            RiskFactor  - label stored under 'Risk Factor'
    Return : dict in Standard Output format (one key per output column)

    The citation count is looked up through scholarly on a best-effort basis:
    any ordinary failure yields 0, while KeyboardInterrupt / SystemExit are
    allowed to propagate so a long run can still be stopped.
    '''
    def first(column):
        # Value of `column` in the first row of this paper's frame.
        return item[column].iloc[0]

    dfo = {}

    dfo['Risk Factor'] = RiskFactor
    dfo['Title'] = first('title')
    dfo['Keyword/Ngram'], dfo['No of keyword occurence in Paper'] = keywordcounter(item['text'].tolist(),
                                                                                 keyWordList)
    dfo['Sentences'] = first('sentences')
    dfo['paper_id'] = first('paper_id')

    dfo['URL'] = first('url')

    dfo['Authors'] = first('authors')

    try:
        dfo['No of Citations'] = next(scholarly.search_pubs_query(first('title'))).citedby
    except Exception:
        # No search hit (StopIteration from next), lookup failure or a result
        # without `citedby`: fall back to 0 instead of aborting the export.
        dfo['No of Citations'] = 0

    dfo['Correlation'] = first('causality_type')
    dfo['Design Methodology'] = first('methodology')
    dfo['Sample Size'] = first('sample_size')
    dfo['Coronavirus'] = first('coronavirus')
    dfo['Fatality'] = first('fatality')
    #dfo['TAXON'] =item['TAXON'].iloc[0]

    return dfo

#del df_output
# Column order of the exported sheet; lists every key that aggregation() produces
# (including 'Sentences'), so no column has to be added and re-sorted on the fly.
output_columns = ['Risk Factor', 'Title', 'Keyword/Ngram', 'No of keyword occurence in Paper',
                  'Sentences', 'paper_id', 'URL',
                  'Authors', 'No of Citations', 'Correlation',
                  'Design Methodology', 'Sample Size',
                  'Coronavirus', 'Fatality']

grouped = metadata_ngrams.groupby('paper_id')

# One record per paper. The frame is built once from the list instead of growing it
# with pd.concat inside the loop, which copied the whole frame on every iteration.
# aggregation() performs one scholarly lookup per paper, so this step can be slow.
records = [aggregation(item, ngrams, 'Population Density') for key, item in grouped]

# Building from records yields a clean 0..n-1 index, so no reset_index() is needed
# (it previously added an all-zero 'index' column to the sheet).
df_output = pd.DataFrame(records, columns=output_columns)
df_output.to_excel('population_density_metadata.xlsx')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


