In [52]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
import gensim
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import string
import json
import re

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

- IDs: 4743095, 2077273, Marcus Dreyer, 4894652, 4833887
- usernames: helenlord, katieymo, Marcus Dreyer, dincerti, shemrarizzo
- Emails: lord.helen [at] gene [dot] com, katieymo [at] gmail [dot] com, dreyer [at] itprodqs-consultng [dot] com, devin.incerti [at] gmail [dot] com, shem.rizzo [at] gmail [dot] com


## Goal: to identify papers which discuss specific non-pharmaceutical interventions to decrease the spread of COVID-19

*Focus: Methods to control the spread in communities, barriers to compliance and how these vary among different populations*


## Methodology: 
1) Cleaned abstracts

2) Use LDA on cleaned abstracts to identify papers most relevant to NPI topics

3) Use keyword search to pull out NPI papers which focus on: methods to control the spread in communities, barriers to compliance and how these vary among different populations

4) Pull out specific sentences and paragraphs from the identified papers with the keywords for quick identification

## Notes:
- The keyword search alone on the metadata is a useful methodology to identify relevant papers. It can be completed on either the title or the abstract. However, finding pertinent, specific keywords is very important for success
- Cleaning the text and using topic modeling focuses on a smaller, more relevant subset of papers, allowing this methodology to scale. However, a some relevant papers may be eliminated by the topic modeling

In [53]:
m = pd.read_csv('/kaggle/input/CORD-19-research-challenge/metadata.csv')

In [54]:
m = m[(m['title'].notna() & m['abstract'].notna())]

## Clean Abstracts

- lowercase
- remove punctuation
- remove stopwords
- lemmatize
- bigrams/trigrams

In [55]:
w_tokenizer = WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

In [56]:
def preprocess(sentence):
    sentence = sentence.lower()
    sentence_no_punctuation = sentence.translate(str.maketrans('', '', string.punctuation))
    lemmatized_list = [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(sentence_no_punctuation) 
                  if w not in stopwords.words('english')]
    return lemmatized_list

In [57]:
m['abstract_lemmatized']=m['abstract'].map(lambda s:preprocess(s)) 

In [58]:
data_words = list(m['abstract_lemmatized'])

In [59]:
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)

In [60]:
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [61]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

In [62]:
def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [63]:
m['abstract_lemmatized_grams']= make_trigrams(m['abstract_lemmatized'])

In [64]:
def abstract_to_string(text):
    return ' '.join(word for word in text)

In [65]:
m['cleanAbstract'] = m['abstract_lemmatized_grams'].map(lambda s:abstract_to_string(s))

## LDA on Cleaned Abstracts

In [66]:
count_vectorizer = CountVectorizer(stop_words='english')

In [67]:
data_vectorized = count_vectorizer.fit_transform(m['cleanAbstract'])

## Compared sklean and gensim LDA models, completed GridSearch for n_components (options 5, 10, 15, 20, 25, 30, 35). The best model was sklearn and parameter was 5 - used below

In [68]:
number_topics = 5

In [69]:
lda = LDA(n_components=number_topics, n_jobs=-1)

In [70]:
lda.fit(data_vectorized)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=-1,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

## Print top words associated with Topics

In [71]:
# Helper function
def print_topics(model, count_vectorizer, n_top_words=10):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [72]:
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer)

Topics found via LDA:

Topic #0:
protein virus cell viral rna gene replication sequence host activity

Topic #1:
virus influenza infection respiratory study sample viral human method case

Topic #2:
cell infection response vaccine mouse virus immune expression disease study

Topic #3:
disease health data model outbreak infectious study case risk epidemic

Topic #4:
patient infection group virus day study clinical disease result treatment


In [73]:
topics = lda.transform(data_vectorized)

In [74]:
for idx in range(number_topics):
    col_name = 'Topic ' + str(idx)
    m[col_name] = topics[:, idx]

## Looking for Topic associated with NPI (non-pharm in abstract)

In [75]:
non_pharm = m[(m['abstract'].str.contains('non-pharm'))]

In [76]:
topic_cols = [x for x in m.columns if 'Topic ' in x]

In [77]:
non_pharm_topics = non_pharm[topic_cols].idxmax(axis=1)

In [78]:
def most_frequent(List): 
    return max(set(List), key = List.count)

## Take the topics that match most for NPI modeling papers, then find all papers with that as their top topic

In [79]:
Counter(non_pharm_topics)

Counter({'Topic 3': 50, 'Topic 1': 5, 'Topic 4': 2})

In [80]:
top_topic = most_frequent(list(non_pharm_topics))

top_topic

'Topic 3'

In [81]:
m['Top_Topic'] = m[topic_cols].idxmax(axis=1)

In [82]:
m.groupby('Top_Topic').size()

Top_Topic
Topic 0    10939
Topic 1     7218
Topic 2     7439
Topic 3    10932
Topic 4     5675
dtype: int64

In [83]:
top_topic_papers = m[m['Top_Topic'] == top_topic]

## Keywords

- Need a core covid keyword
- And need a topic keyword

In [84]:
covid_keywords = ['corona', 'covid']

In [85]:
intervention_keywords = ['social distancing',
                        'contact tracing',
                        'case isolation',
                        'shelter-in-place',
                        'stay-at-home',
                        'movement restriction',
                        'event cancel',
                        'face mask',
                        'facial mask',
                        'travel ban',
                        'school closure']

In [86]:
def find_papers_w_keywords(topic_keywords, papers):
    for keyword in topic_keywords:
        num_papers_title = len(papers[(papers['title'].str.contains(keyword)) & 
                                        (papers['title'])])
        num_papers_abstract = len(papers[papers['abstract'].str.contains(keyword)])
        print ('Identified {} papers with "{}" in title, {} relevant papers with "{}" in abstract'\
                       .format(num_papers_title, keyword, num_papers_abstract, keyword)) 

## Identify core papers - about COVID-19 (keyword search and published date)

In [87]:
date_filter = '2019-12-01'

In [88]:
find_papers_w_keywords(covid_keywords, top_topic_papers)

Identified 512 papers with "corona" in title, 1407 relevant papers with "corona" in abstract
Identified 5 papers with "covid" in title, 37 relevant papers with "covid" in abstract


In [89]:
top_topic_papers['core_abstract'] = top_topic_papers['abstract'].apply(lambda x: any([k in x for k in covid_keywords]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [90]:
covid_papers = top_topic_papers[(top_topic_papers['core_abstract'] == True) & 
                                (top_topic_papers['publish_time'] >= date_filter)]

## Intervention Papers

In [91]:
for keyword in intervention_keywords:
    covid_papers[keyword] = covid_papers['abstract'].str.contains(keyword)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [92]:
covid_papers['# Keywords in Abstract'] = covid_papers[intervention_keywords].sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [93]:
find_papers_w_keywords(intervention_keywords, covid_papers)

Identified 3 papers with "social distancing" in title, 36 relevant papers with "social distancing" in abstract
Identified 1 papers with "contact tracing" in title, 16 relevant papers with "contact tracing" in abstract
Identified 0 papers with "case isolation" in title, 5 relevant papers with "case isolation" in abstract
Identified 0 papers with "shelter-in-place" in title, 0 relevant papers with "shelter-in-place" in abstract
Identified 0 papers with "stay-at-home" in title, 1 relevant papers with "stay-at-home" in abstract
Identified 0 papers with "movement restriction" in title, 2 relevant papers with "movement restriction" in abstract
Identified 0 papers with "event cancel" in title, 0 relevant papers with "event cancel" in abstract
Identified 0 papers with "face mask" in title, 4 relevant papers with "face mask" in abstract
Identified 0 papers with "facial mask" in title, 2 relevant papers with "facial mask" in abstract
Identified 0 papers with "travel ban" in title, 9 relevant pap

In [94]:
intervention_papers = covid_papers[covid_papers['# Keywords in Abstract'] > 1]

In [95]:
len(intervention_papers)

12

In [96]:
intervention_papers.to_csv("intervention_papers_metadata.csv", index=False)

## Search Full Papers for relevant sentences and paragraphs

In [97]:
def find_keyword(keywords, text):
    """
    Iterates through a list of keywords and searches them in a string of text.

    inputs:
      keywords: list of keywords
      text: string of text

    output: number of times keywords are found in the text
    """
    find = []
    for keyword in keywords:
        find.extend(re.findall(keyword, text.lower()))
    return len(find)

In [98]:
def search_body_text(sha, folder1, folder2, keywords, sentence_only):
    """
    Searches a single full length text for sentences/paragraphs which contain a list of keywords.

    inputs:
      sha: sha file name
      folder1: text folder name
      folder2: pdf or pmc folder name
      keywords: list of keywords to search for
      sentence_only: whether or not to show sentence only or full paragraph
    
    output: list of sentences/paragraphs found containing keywords
    """

    #open text file
    with open('/kaggle/input/CORD-19-research-challenge/'+folder1+'/'+folder1+'/'+folder2+'/'+sha+'.json') as f:
        file = json.load(f)
    
    found = []
    for text_dict in file["body_text"]:
        
        #if show_sentence_only, then split the paragraph into sentences, then look for keywords
        if sentence_only:
            sentences = text_dict["text"].split(". ")
            for sentence in sentences:
                count = find_keyword(keywords, sentence)
                if count > 0:
                    found.append(sentence)
                    
        #otherwise, show the whole paragraph
        else:
            count = find_keyword(keywords, text_dict["text"])
            if count > 0:
                #print(text_dict["section"])
                found.append(text_dict["text"])
                
    return(found)

In [99]:
def automated_lit_search(metadata_subset, keywords, sentence_only=True):
    """
    Creates a table keyword findings.
    
    inputs:
      metadata_subset: subset of metadata file to search
      keywords: list of keywords to search
      sentence_only: whether or not to show sentence only or full paragraph
    
    output: dataframe table of results with columns containing index, title, and text snippet
    """
    results = []
    
    indices = metadata_subset[metadata_subset['has_pdf_parse'] == True].index
    indices_pmc = metadata_subset[metadata_subset['has_pmc_xml_parse'] == True].index
    indices.append(indices_pmc)
    
    for index in indices:
        
        #find text location
        sha = metadata_subset["sha"][index].split(';')[0]
        folder1 = metadata_subset["full_text_file"][index]
        if metadata_subset['has_pdf_parse'][index] == True:
            folder2 = 'pdf_json'
        elif metadata_subset['has_pmc_xml_parse'][index] == True:
            folder2 = 'pmc_json'
        
        #open text and search for keywords
        found = search_body_text(sha, folder1, folder2, keywords, sentence_only)
        if len(found) > 0:
            for f in found:
                results.append([index, metadata_subset["title"][index], f])
                
    results_df = pd.DataFrame(results, columns=["index","title","text"])
    return(results_df)

In [100]:
intervention_sentences = automated_lit_search(intervention_papers, intervention_keywords, True)
intervention_sentences.to_csv('intervention_sentences.csv', index=False)

In [101]:
intervention_paragraphs = automated_lit_search(intervention_papers, intervention_keywords, False)
intervention_paragraphs.to_csv('intervention_paragraphs.csv', index=False)

In [102]:
list(intervention_papers['title'])

['Interventions to mitigate early spread of SARS-CoV-2 in Singapore: a modelling study',
 'School closure and management practices during coronavirus outbreaks including COVID-19: a rapid systematic review',
 'Impact of school closures for COVID-19 on the US health-care workforce and net mortality: a modelling study',
 'Sentinel Event Surveillance to Estimate Total SARS-CoV-2 Infections, United States',
 'Age profile of susceptibility, mixing, and social distancing shape the dynamics of the novel coronavirus disease 2019 outbreak in China',
 'The Effectiveness of Social Distancing in Mitigating COVID-19 Spread: a modelling analysis',
 'A Social Network Model of the COVID-19 Pandemic',
 'A Genomic Survey of SARS-CoV-2 Reveals Multiple Introductions into Northern California without a Predominant Lineage',
 'Pandemic Politics: Timing State-Level Social Distancing Responses to COVID-19',
 'Will novel virus go pandemic or be contained?',
 'Estimating Risk for Death from 2019 Novel Coronavir