In [88]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
import string
import json
import re
import gensim
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from collections import Counter

In [49]:
pd.set_option('display.max_colwidth', 500)

In [50]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lordh1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/lordh1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Import file
- Only keep rows with non-null titles and abstracts

In [51]:
m = pd.read_csv('metadata 4.csv')

In [52]:
m = m[(m['title'].notna() & m['abstract'].notna())]

## Lemmatize Abstracts and Titles (get word roots)

- lowercase
- remove punctuation
- remove stopwords
- get root of words

In [53]:
w_tokenizer = WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

In [54]:
def preprocess(sentence):
    sentence = sentence.lower()
    sentence_no_punctuation = sentence.translate(str.maketrans('', '', string.punctuation))
    lemmatized_list = [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(sentence_no_punctuation) 
                  if w not in stopwords.words('english')]
    return lemmatized_list

In [55]:
m['abstract_lemmatized']=m['abstract'].map(lambda s:preprocess(s)) 

In [56]:
data_words = list(m['abstract_lemmatized'])

In [59]:
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)

In [60]:
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [61]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

In [62]:
def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [63]:
m['abstract_lemmatized_grams']= make_trigrams(m['abstract_lemmatized'])

In [64]:
def abstract_to_string(text):
    return ' '.join(word for word in text)

In [66]:
m['cleanAbstract'] = m['abstract_lemmatized_grams'].map(lambda s:abstract_to_string(s))

## LDA on abstracts - grid search for best number of topics

In [69]:
count_vectorizer = CountVectorizer(stop_words='english')

In [70]:
data_vectorized = count_vectorizer.fit_transform(m['cleanAbstract'])

## Last time best n_components was 5 (options 5, 10, 15, 20, 25, 30, 35)

In [72]:
search_params = {'n_components': [5]}

In [73]:
lda = LDA()

In [74]:
model = GridSearchCV(lda, param_grid=search_params)

In [75]:
model.fit(data_vectorized)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LatentDirichletAllocation(batch_size=128,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method='batch',
                                                 learning_offset=10.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=10,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=None,
                                                 perp_tol=0.1,
                                                 random_state=None,
                                                 topic_word_prior=None,
                                   

In [76]:
model.best_params_

{'n_components': 5}

In [77]:
best_lda_model = model.best_estimator_

In [78]:
number_topics = model.best_params_['n_components']

In [79]:
# Helper function
def print_topics(model, count_vectorizer, n_top_words=10):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [80]:
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(best_lda_model, count_vectorizer)

Topics found via LDA:

Topic #0:
protein virus rna viral sequence gene genome cell coronavirus study

Topic #1:
patient infection virus respiratory study case clinical sample result influenza

Topic #2:
disease outbreak health data model study infectious epidemic method transmission

Topic #3:
cell virus infection viral expression protein host study response disease

Topic #4:
virus infection cell vaccine response mouse antibody immune human study


## Looking for Topic associated with NPI + modelling efforts

In [81]:
topics = best_lda_model.transform(data_vectorized)

In [82]:
for idx in range(number_topics):
    col_name = 'Topic ' + str(idx)
    m[col_name] = topics[:, idx]

## Most basic - non-pharm in title

In [83]:
non_pharm = m[(m['abstract'].str.contains('non-pharm'))]

In [84]:
topic_cols = [x for x in m.columns if 'Topic ' in x]

In [85]:
non_pharm_topics = non_pharm[topic_cols].idxmax(axis=1)

In [86]:
def most_frequent(List): 
    return max(set(List), key = List.count) 

## Take the topics that match most for NPI modeling papers, then find all papers with that as their top topic

In [89]:
Counter(non_pharm_topics)

Counter({'Topic 2': 37, 'Topic 1': 8})

In [90]:
top_topic = most_frequent(list(non_pharm_topics))

top_topic

'Topic 2'

In [91]:
m['Top_Topic'] = m[topic_cols].idxmax(axis=1)

In [92]:
m.groupby('Top_Topic').size()

Top_Topic
Topic 0     9119
Topic 1     7818
Topic 2    10135
Topic 3     6522
Topic 4     5305
dtype: int64

In [93]:
top_topic_papers = m[m['Top_Topic'] == top_topic]

In [133]:
date_filter = '2019-12-01'
top_topic_papers[top_topic_papers['publish_time']>date_filter].shape

(1694, 28)

## Keywords

- Need a core covid keyword
- And need a topic keyword

## Make sure keywords are lemmatized version

In [95]:
check_words = ['isolation',
                'social',
                'distancing',
                'contact',
                'tracing',
                'event',
                'cancelation',
                'case',
                'isolation',
                'shelter',
                'place', 
                'stay',
                'home', 
                'movement',
                'restriction',
                'economic', 
                'unemployment',
                'depression',
                'financial',
                'crisis',
                'market',
                'stock',
                'macroeconomics',
                'dsge',
                'face',
                'facial',
                'mask',
                'travel',
                'ban',
                'school',
                'closure',
                'benefits',
                'costs',
                'economy',
                'poverty',
                'health_care'
                'bankrupt'] 

In [96]:
[lemmatizer.lemmatize(w) for w in check_words if lemmatizer.lemmatize(w) != w]

['benefit', 'cost']

## Keywords chosen

In [97]:
covid_keywords = ['corona', 'covid']

In [98]:
intervention_keywords = [#'isolation', #can't use - too broad 
         # i.e. isolation of sequences from a random-sequence expression library that mimic viral epitopes
                        'social distancing',
                        'contact tracing',
                        'case isolation',
                        'shelter place', # in/at is removed stopword
                        'stay home', # at is removed stopword
                        'movement restriction',
                        'event cancellation',
                        'face mask',
                        'facial mask',
                        'travel ban',
                        'school closure']

In [99]:
economic_keywords = ['economic', 
                    'unemployment',
                     'unemploy',
                   # 'depression', # mental health articles here
                # i.e. 148. acute bipolar disorder depression is associated with immune activation
                    'financial crisis',
                  #  'market', # related to physical markets
            # i.e. wet markets—a continuing source of severe acute respiratory syndrome and influenza?
                #    ' stock', # often completly unrelated (or related to livestock if no space)
            # i.e. plaque assay for human coronavirus nl63 using human colon carcinoma cells
                    'stock market',
                    'macroeconomics',
                    'dsge',
                    'benefit',
                'cost',
                'economy',
                'poverty',
                'health_care'
                'bankrupt']

## Create Functions

In [104]:
def find_papers_w_keywords(topic_keywords, papers):
    for keyword in topic_keywords:
        num_papers_title = len(papers[(papers['title'].str.contains(keyword)) & 
                                        (papers['title'])])
        num_papers_abstract = len(papers[papers['cleanAbstract'].str.contains(keyword)])
        print ('Identified {} papers with "{}" in title, {} relevant papers with "{}" in abstract'\
                       .format(num_papers_title, keyword, num_papers_abstract, keyword)) 

In [105]:
# 0 for title search, 1 for abstract search
def return_papers(topic_keywords, papers, abstract_search=1):
    relevant_papers = pd.DataFrame(columns=['title', 'abstract', 'sha', 'full_text_file', 'has_pdf_parse', 'has_pmc_xml_parse', 'publish_time'])
    if abstract_search == 1:
        for keyword in topic_keywords:
            relevant_papers = pd.concat((relevant_papers, papers[['title', 'abstract', 
                                'sha', 'full_text_file', 'has_pdf_parse', 'has_pmc_xml_parse', 'publish_time']][papers['cleanAbstract'].str.contains(keyword)]))
    else:
        for keyword in topic_keywords:
            relevant_papers = pd.concat((relevant_papers, papers[['title', 'abstract', 
                        'sha', 'full_text_file', 'has_pdf_parse', 'has_pmc_xml_parse', 'publish_time']][papers['cleanTitle'].str.contains(keyword)]))
    
    return relevant_papers.drop_duplicates()

## Identify core papers

In [106]:
find_papers_w_keywords(covid_keywords, top_topic_papers)

Identified 480 papers with "corona" in title, 1443 relevant papers with "corona" in abstract
Identified 4 papers with "covid" in title, 785 relevant papers with "covid" in abstract


In [103]:
top_topic_papers['core_abstract'] = top_topic_papers['cleanAbstract'].apply(lambda x: any([k in x for k in covid_keywords]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [107]:
len(top_topic_papers), len(top_topic_papers[top_topic_papers['core_abstract'] == True])

(10135, 1764)

In [108]:
covid_papers = top_topic_papers[top_topic_papers['core_abstract'] == True]

## Intervention Papers


In [109]:
find_papers_w_keywords(intervention_keywords, covid_papers)

Identified 4 papers with "social distancing" in title, 2 relevant papers with "social distancing" in abstract
Identified 3 papers with "contact tracing" in title, 0 relevant papers with "contact tracing" in abstract
Identified 0 papers with "case isolation" in title, 7 relevant papers with "case isolation" in abstract
Identified 0 papers with "shelter place" in title, 2 relevant papers with "shelter place" in abstract
Identified 0 papers with "stay home" in title, 0 relevant papers with "stay home" in abstract
Identified 0 papers with "movement restriction" in title, 4 relevant papers with "movement restriction" in abstract
Identified 1 papers with "event cancellation" in title, 3 relevant papers with "event cancellation" in abstract
Identified 0 papers with "face mask" in title, 0 relevant papers with "face mask" in abstract
Identified 0 papers with "facial mask" in title, 0 relevant papers with "facial mask" in abstract
Identified 2 papers with "travel ban" in title, 1 relevant paper

In [110]:
intervention_papers = return_papers(intervention_keywords, covid_papers, 1)

In [111]:
len(intervention_papers)

19

In [112]:
intervention_papers = intervention_papers[intervention_papers['publish_time']>date_filter]
len(intervention_papers)

19

In [132]:
list(intervention_papers['title'])

['Society of Cardiovascular Computed Tomography Guidance for Use of Cardiac Computed Tomography Amidst the COVID-19 Pandemic',
 'Coronavirus nixes conference, twilight zone beckons and a faded star brightens',
 'The Effectiveness of Social Distancing in Mitigating COVID-19 Spread: a modelling analysis',
 'A Social Network Model of the COVID-19 Pandemic',
 'Suppression and Mitigation Strategies for Control of COVID-19 in New Zealand',
 'Containing 2019-nCoV (Wuhan) coronavirus',
 'Early containment strategies and core measures for prevention and control of novel coronavirus pneumonia in China',
 'What further should be done to control COVID-19 outbreaks in addition to cases isolation and contact tracing measures?',
 'Feasibility of controlling COVID-19 outbreaks by isolation of cases and contacts',
 'Window of Opportunity for Mitigation to Prevent Overflow of ICU capacity in Chicago by COVID-19',
 'Characterizing occupations that cannot work from home: a means to identify susceptible wo

In [114]:
intervention_papers.to_csv("intervention_abstracts.csv", index=False)

## Economic Papers

In [115]:
find_papers_w_keywords(economic_keywords, covid_papers)

Identified 1 papers with "economic" in title, 103 relevant papers with "economic" in abstract
Identified 0 papers with "unemployment" in title, 0 relevant papers with "unemployment" in abstract
Identified 0 papers with "unemploy" in title, 0 relevant papers with "unemploy" in abstract
Identified 0 papers with "financial crisis" in title, 0 relevant papers with "financial crisis" in abstract
Identified 0 papers with "stock market" in title, 0 relevant papers with "stock market" in abstract
Identified 0 papers with "macroeconomics" in title, 1 relevant papers with "macroeconomics" in abstract
Identified 0 papers with "dsge" in title, 0 relevant papers with "dsge" in abstract
Identified 1 papers with "benefit" in title, 44 relevant papers with "benefit" in abstract
Identified 5 papers with "cost" in title, 53 relevant papers with "cost" in abstract
Identified 0 papers with "economy" in title, 38 relevant papers with "economy" in abstract
Identified 0 papers with "poverty" in title, 3 rele

In [116]:
economic_papers = return_papers(economic_keywords, covid_papers, 1)

In [117]:
len(economic_papers)

207

In [118]:
economic_papers = economic_papers[economic_papers['publish_time']>date_filter]
len(economic_papers)

131

In [119]:
economic_papers[economic_papers['abstract'].str.contains('economic')]

Unnamed: 0,title,abstract,sha,full_text_file,has_pdf_parse,has_pmc_xml_parse,publish_time
2565,Human Coronaviruses: General Features,"Abstract Human coronaviruses (HCoVs), including HCoV-229E, HCoV-OC43, HCoV-NL63, and HCoV-HKU1, are traditionally known to cause symptoms of common cold with only moderate clinical impact. Severe acute respiratory syndrome coronavirus (SARS-CoV) and Middle East respiratory syndrome coronavirus (MERS-CoV), on the other hand, have strike humans in the past two decades as highly fatal human pathogens leading to considerable mortality and economic loss. This article summaries the updates on the ...",90798ce4da0c11f8eba3a943743b0a1584ff046a,custom_license,True,False,2019-12-31
2811,Chapter 3 Infectious Bronchitis Virus in Poultry: Molecular Epidemiology and Factors Leading to the Emergence and Reemergence of Novel Strains of Infectious Bronchitis Virus,"Abstract Infectious bronchitis virus (IBV) is a coronavirus that causes an acute and highly contagious disease in chickens. The virus can cause substantial economic losses throughout the poultry industry worldwide. It can affect the upper respiratory tract and the reproductive tract, and some strains can cause nephritis. The causative agent IBV is an RNA virus with great ability for mutation and recombination, thus capable of generating new virus strains that are difficult to control. There ...",a0de24c5fdb4a0e77b1b5e8c002ced81ebe9b227,custom_license,True,False,2020-12-31
4256,Coronavirus Disease 2019 (COVID-19) Pandemic and Pregnancy,"Abstract The current coronavirus disease 2019 (COVID-19) pneumonia pandemic, caused by the severe acute respiratory syndrome 2 (SARS-CoV-2) virus, is spreading globally at an accelerated rate, with a basic reproduction number (R0) of 2 – 2.5, indicating that 2 – 3 persons will be infected from an index patient. A serious public health emergency, it is particularly deadly in vulnerable populations and communities in which healthcare providers are insufficiently prepared to manage the infectio...",5dc4268a42adf3d5c55c87b7f6518de600b057c5,custom_license,True,False,2020-03-23
16134,Mesenchymal stem cells and management of COVID-19 pneumonia,"Abstract Human coronavirus, hCoV-19, is highly pathogenic with severe pneumonia associated with rapid virus replication. Arising in Wuhan China December 2019, the current COVID-19 epidemic has rapidly grown with person-to-person infection expanding to become a global health emergency now on pandemic scale. In mitigation of this current COVID-19 pandemic, according to Anderson et al. 2020 [1], governments will not be able to minimise both deaths from COVID-19 and the economic impact of viral ...",f3ed8362aa46c7eb327ee008b02e1849df7b110e,custom_license,True,False,2020-03-19
28640,Complete Genome Sequence of Avian Coronavirus Strain GA08 (GI-27 Lineage),"Avian coronavirus, also known as infectious bronchitis virus, is a highly contagious respiratory pathogen of chickens that is responsible for major economic losses to the poultry industry around the globe. Here, we report the complete genome sequence of strain GA08 of the GI-27 lineage, isolated from a fecal sample from a broiler chicken collected in Georgia in 2015.",24dcca53dd31df074a77242d2e2e6053825e1761,custom_license,True,True,2020-02-27
30665,An Ounce of Prevention: Coronavirus (COVID-19) and Mass Gatherings,"Widespread, non-stop, and often sensational coverage of the coronavirus (COVID-19) has caught many governments flat-footed in efforts to protect the health and safety of their citizens. In response to the current global health event, the World Health Organization (WHO) declared COVID-19 a pandemic. Mass gatherings present a historic challenge in protecting the health and safety of attendees. The majority of the prominent mass gatherings are religious in nature. Global sporting events, such a...",7eb8da93320ea097a81497d97ed22bdad6e708dc,comm_use_subset,True,True,2020-03-20
31577,"Knowledge, attitudes, and practices towards COVID-19 among Chinese residents during the rapid rise period of the COVID-19 outbreak: a quick online cross-sectional survey","Unprecedented measures have been adopted to control the rapid spread of the ongoing COVID-19 epidemic in China. People's adherence to control measures is affected by their knowledge, attitudes, and practices (KAP) towards COVID-19. In this study, we investigated Chinese residents' KAP towards COVID-19 during the rapid rise period of the outbreak. An online sample of Chinese residents was successfully recruited via the authors' networks with residents and popular media in Hubei, China. A self...",ae836e53c3ca9f8a84c0dbb3f2757ec7f17b6f49,comm_use_subset,True,True,2020-03-15
41557,Assessing the impact of reduced travel on exportation dynamics of novel coronavirus infection (COVID-19),"The impact of the drastic reduction in travel volume within mainland China in January and February 2020 was quantified with respect to reports of novel coronavirus (COVID-19) infections outside China. Data on confirmed cases diagnosed outside China were analyzed using statistical models to estimate the impact of travel reduction on three epidemiological outcome measures: (i) the number of exported cases, (ii) the probability of a major epidemic, and (iii) the time delay to a major epidemic. ...",f4d76fb8161f4cee6987d7d412798e1276348858,biorxiv_medrxiv,True,False,2020-02-17
41581,Risk map of the novel coronavirus (2019-nCoV) in China: proportionate control is needed,"Background China is running a national level antivirus campaign against the novel coronavirus (2019-nCoV). Strict control measures are being enforced in either the populated areas and remote regions. While the virus is closed to be under control, tremendous economic loss has been caused. Methods and findings We assessed the pandemic risk of 2019-nCoV for all cities/regions in China using the random forest algorithm, taking into account the effect of five factors: the accumulative and increas...",fbebe4b66073c44cface2e842754bce26e3e2913,biorxiv_medrxiv,True,False,2020-02-18
41607,Effective containment explains sub-exponential growth in confirmed cases of recent COVID-19 outbreak in Mainland China,"The recent outbreak of COVID-19 in Mainland China is characterized by a distinctive algebraic, sub-exponential increase of confirmed cases with time during the early phase of the epidemic, contrasting an initial exponential growth expected for an unconstrained outbreak with sufficiently large reproduction rate. Although case counts vary significantly between affected provinces in Mainland China, the scaling law t^μ is surprisingly universal, with a range of exponents μ = 2.1 ± 0.3. The unive...",fc7a6b5d1852c5ecce2d20fd0d73d5f957ed7055,biorxiv_medrxiv,True,False,2020-02-20


In [120]:
economic_papers.to_csv("economic_abstracts.csv", index=False)

## Economic seems to be the only keyword, and works in title, not abstract

In [122]:
return_papers(economic_keywords, covid_papers, 1)

Unnamed: 0,title,abstract,sha,full_text_file,has_pdf_parse,has_pmc_xml_parse,publish_time
1934,Chapter 1 Agents of Emerging Infectious Diseases,"Abstract Dramatic improvements in the control of infectious diseases in developed countries owing to socioeconomic changes, vaccines, and antibiotics during the first seven decades of the 20th century led to the mistakened concept that infectious diseases would no longer be a concern. Since the declaration of victory in the war against infectious diseases in 1967, approximately 50 new disease agents have been identified. Nearly every type of etiologic agent and clinical manifestation have be...",023b89a5ec6dec38e943ec4cfc67598845d3b0ff,custom_license,True,False,2009-12-31
2565,Human Coronaviruses: General Features,"Abstract Human coronaviruses (HCoVs), including HCoV-229E, HCoV-OC43, HCoV-NL63, and HCoV-HKU1, are traditionally known to cause symptoms of common cold with only moderate clinical impact. Severe acute respiratory syndrome coronavirus (SARS-CoV) and Middle East respiratory syndrome coronavirus (MERS-CoV), on the other hand, have strike humans in the past two decades as highly fatal human pathogens leading to considerable mortality and economic loss. This article summaries the updates on the ...",90798ce4da0c11f8eba3a943743b0a1584ff046a,custom_license,True,False,2019-12-31
2736,Chapter 11 Structural Insight Into the Viral 3C-Like Protease Inhibitors: Comparative SAR/QSAR Approaches,"Abstract Severe acute respiratory syndrome (SARS), caused by SARS-coronavirus (SARS-CoV), is a dreadful infection worldwide having economic and medical importance and a global threat for health. It was turned into an epidemic in South China followed by a chain of infections across three generations. A number of pathogeneses in human may occur due to the virus. This infection has not been taken into account before the SARS outbreak, and still it is a neglected one. Therefore, there is an urge...",5779e410753e13a0a79f9f872321fe42df49e65c,custom_license,True,False,2017-12-31
2811,Chapter 3 Infectious Bronchitis Virus in Poultry: Molecular Epidemiology and Factors Leading to the Emergence and Reemergence of Novel Strains of Infectious Bronchitis Virus,"Abstract Infectious bronchitis virus (IBV) is a coronavirus that causes an acute and highly contagious disease in chickens. The virus can cause substantial economic losses throughout the poultry industry worldwide. It can affect the upper respiratory tract and the reproductive tract, and some strains can cause nephritis. The causative agent IBV is an RNA virus with great ability for mutation and recombination, thus capable of generating new virus strains that are difficult to control. There ...",a0de24c5fdb4a0e77b1b5e8c002ced81ebe9b227,custom_license,True,False,2020-12-31
4256,Coronavirus Disease 2019 (COVID-19) Pandemic and Pregnancy,"Abstract The current coronavirus disease 2019 (COVID-19) pneumonia pandemic, caused by the severe acute respiratory syndrome 2 (SARS-CoV-2) virus, is spreading globally at an accelerated rate, with a basic reproduction number (R0) of 2 – 2.5, indicating that 2 – 3 persons will be infected from an index patient. A serious public health emergency, it is particularly deadly in vulnerable populations and communities in which healthcare providers are insufficiently prepared to manage the infectio...",5dc4268a42adf3d5c55c87b7f6518de600b057c5,custom_license,True,False,2020-03-23
6314,Wet markets—a continuing source of severe acute respiratory syndrome and influenza?,"Summary Context Live-animal markets (wet markets) provide a source of vertebrate and invertebrate animals for customers in tropical and subtropical regions of the world. Wet markets sell live poultry, fish, reptiles, and mammals of every kind. Live-poultry markets (mostly chicken, pigeon, quail, ducks, geese, and a wide range of exotic wild-caught and farm-raised fowl) are usually separated from markets selling fish or red-meat animals, but the stalls can be near each other with no physical ...",7dd2342dec468a14a74dded901f70ec7d84a38aa,custom_license,True,False,2004-01-17
6842,Genotyping of avian infectious bronchitis virus in Iran: Detection of D274 and changing in the genotypes rate,"Abstract The coronavirus avian Infectious bronchitis virus (IBV) poses economic threats to poultry farms worldwide, affecting the performance of both meat-type and egg-laying birds. To define the evolution of recent IBVs in Iran, a genetic analysis based on hypervariable nucleotide sequences of S1 gene was carried out. Tracheal swab samples were collected from 170 Broiler flocks during 2017. Ten tracheal swabs from each flock pooled. From a total number of 170 flocks tested, 84.71% found to ...",52467f330dbc2d3ec0e98abd1ac58eb15a9b6aa2,custom_license,True,False,2019-08-31
8336,Porcine epidemic diarrhea: A retrospect from Europe and matters of debate,"Abstract A retrospect is given on the emergence of porcine epidemic diarrhea (PED) during the early seventies in Europe. While, at first, it appeared as a disease affecting feeder pigs, fattening- and adult swine, it later also became pathogenic for neonatal and suckling pigs hereby drastically increasing its economic impact. Isolation of the causative virus revealed a new porcine coronavirus, the origin of which has never been clarified. Pathogenesis studies with the prototype strain CV777 ...",b9fbe0887c389e6c5202eaf5d7e21c9d19ab6bc6,custom_license,True,False,2016-12-02
8343,"Evolution, antigenicity and pathogenicity of global porcine epidemic diarrhea virus strains","Abstract Emerging and re-emerging coronaviruses cause morbidity and mortality in human and animal populations, resulting in serious public and animal health threats and economic losses. The ongoing outbreak of a highly contagious and deadly porcine epidemic diarrhea virus (PEDV) in Asia, the Americas and Europe is one example. Genomic sequence analyses of PEDV variants have revealed important insights into the evolution of PEDV. However, the antigenic variations among different PEDV strains ...",40812d7b28ab3c043610f6e224b3f62806af6eb6,custom_license,True,False,2016-12-02
11643,miR-146a-5p promotes replication of infectious bronchitis virus by targeting IRAK2 and TNFRSF18,"Abstract Avian infectious bronchitis virus (IBV) is a coronavirus which infects chickens (Gallus gallus) of all ages and causes significant economic losses to the poultry industry worldwide. The present study aims to analyze the miRNAs related to pathogenicity of nephropathogenic IBVs. It was found that four miRNAs (miR-1454, miR-3538, miR-146a-5p and miR-215-5p) were related to the infection of virulent nephropathogenic IBV with transcript per million (TPM) > 500 and more than a 2-fold alte...",dd5ffacaf860a7b441e4bd4496b4b1c02fda278d,custom_license,True,False,2018-07-31


## Corona may also be too broad, but not sure how to limit otherwise

## Search full text files

In [123]:
def find_keyword(keywords, text):
    """
    Iterates through a list of keywords and searches them in a string of text.

    inputs:
      keywords: list of keywords
      text: string of text

    output: number of times keywords are found in the text
    """
    find = []
    for keyword in keywords:
        find.extend(re.findall(keyword, text.lower()))
    return len(find)

In [124]:
def search_body_text(sha, folder1, folder2, keywords, sentence_only):
    """
    Searches a single full length text for sentences/paragraphs which contain a list of keywords.

    inputs:
      sha: sha file name
      folder1: text folder name
      folder2: pdf or pmc folder name
      keywords: list of keywords to search for
      sentence_only: whether or not to show sentence only or full paragraph
    
    output: list of sentences/paragraphs found containing keywords
    """

    #open text file
    with open('./CORD-19-research-challenge/'+folder1+'/'+folder1+'/'+folder2+'/'+sha+'.json') as f:
        file = json.load(f)
    
    found = []
    for text_dict in file["body_text"]:
        
        #if show_sentence_only, then split the paragraph into sentences, then look for keywords
        if sentence_only:
            sentences = text_dict["text"].split(". ")
            for sentence in sentences:
                count = find_keyword(keywords, sentence)
                if count > 0:
                    found.append(sentence)
                    
        #otherwise, show the whole paragraph
        else:
            count = find_keyword(keywords, text_dict["text"])
            if count > 0:
                #print(text_dict["section"])
                found.append(text_dict["text"])
                
    return(found)

In [125]:
def automated_lit_search(metadata_subset, keywords, sentence_only=True):
    """
    Creates a table keyword findings.
    
    inputs:
      metadata_subset: subset of metadata file to search
      keywords: list of keywords to search
      sentence_only: whether or not to show sentence only or full paragraph
    
    output: dataframe table of results with columns containing index, title, and text snippet
    """
    results = []
    
    indices = metadata_subset[metadata_subset['has_pdf_parse'] == True].index
    indices_pmc = metadata_subset[metadata_subset['has_pmc_xml_parse'] == True].index
    indices.append(indices_pmc)
    
    for index in indices:
        
        #find text location
        sha = metadata_subset["sha"][index].split(';')[0]
        folder1 = metadata_subset["full_text_file"][index]
        if metadata_subset['has_pdf_parse'][index] == True:
            folder2 = 'pdf_json'
        elif metadata_subset['has_pmc_xml_parse'][index] == True:
            folder2 = 'pmc_json'
        
        #open text and search for keywords
        found = search_body_text(sha, folder1, folder2, keywords, sentence_only)
        if len(found) > 0:
            for f in found:
                results.append([index, metadata_subset["title"][index], f])
                
    results_df = pd.DataFrame(results, columns=["index","title","text"])
    return(results_df)

In [127]:
# intervention_sentences = automated_lit_search(intervention_papers, intervention_keywords, True)
# intervention_sentences.to_csv('intervention_sentences_v3.csv', index=False)

In [128]:
# intervention_paragraphs = automated_lit_search(intervention_papers, intervention_keywords, False)
# intervention_paragraphs.to_csv('intervention_paragraphs_v3.csv', index=False)

In [129]:
# economic_sentences = automated_lit_search(economic_papers, economic_keywords, True)
# economic_sentences.to_csv('economic_sentences_v3.csv', index=False)

In [130]:
# economic_paragraphs = automated_lit_search(economic_papers, economic_keywords, False)
# economic_paragraphs.to_csv('economic_paragraphs_v3.csv', index=False)