In [56]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
import string
import json
import re

In [2]:
pd.set_option('display.max_colwidth', 500)

In [10]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mok\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mok\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

## Import file
- Only keep rows with non-null titles and abstracts

In [3]:
# Load the CORD-19 metadata table; the path is relative to the notebook's working directory.
# NOTE(review): the truncated output captured under this cell looks like the tail of a
# pandas warning (possibly a mixed-dtype DtypeWarning) — confirm, and consider passing
# explicit dtype=... for the affected columns if so.
m = pd.read_csv('./CORD-19-research-challenge/metadata.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Keep only rows that have both a title and an abstract.
# The explicit .copy() makes `m` an independent frame, so the column assignments in
# later cells (cleanTitle, cleanAbstract, core_abstract) do not write into a slice
# of the originally loaded frame (avoids pandas' SettingWithCopyWarning).
m = m[m['title'].notna() & m['abstract'].notna()].copy()

## Lemmatize Abstracts and Titles (get word roots)

- lowercase
- remove punctuation
- remove stopwords
- get root of words

In [5]:
w_tokenizer = WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

In [6]:
def preprocess(sentence):
    """
    Normalizes a piece of text for keyword matching.

    Steps: lowercase, strip ASCII punctuation (string.punctuation), split on
    whitespace, drop English stopwords, lemmatize each remaining token.

    inputs:
      sentence: string of text (e.g. a title or an abstract)

    output: the cleaned tokens joined back together with single spaces
    """
    # Build the stopword collection once per call instead of once per token;
    # a set also makes each membership test constant-time.
    english_stopwords = set(stopwords.words('english'))

    lowered = sentence.lower()
    no_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))
    lemmatized_list = [lemmatizer.lemmatize(w)
                       for w in w_tokenizer.tokenize(no_punctuation)
                       if w not in english_stopwords]
    return ' '.join(lemmatized_list)

In [14]:
# Cleaned (lowercased, de-punctuated, stopword-free, lemmatized) version of every title.
m['cleanTitle'] = m['title'].map(preprocess)

In [15]:
# Cleaned (lowercased, de-punctuated, stopword-free, lemmatized) version of every abstract.
m['cleanAbstract'] = m['abstract'].map(preprocess)

In [16]:
m.to_csv('./CORD-19-research-challenge/metadata_lemmatized.csv', index=False)

## Keywords

- A paper needs to match a core COVID keyword
- And it also needs to match a topic keyword

## Make sure the keywords are in their lemmatized form

In [18]:
# Individual words that make up the candidate search keywords. They are run through
# the lemmatizer in the next cell to confirm each one is already in root form, since
# the keywords are matched against lemmatized titles and abstracts.
check_words = [
    # intervention-related words
    'isolation',
    'social',
    'distancing',
    'contact',
    'tracing',
    'event',
    'cancellation',  # spelled as in the 'event cancellation' search keyword
    'case',
    'shelter',
    'place',
    'stay',
    'home',
    'movement',
    'restriction',
    # economics-related words
    'economic',
    'unemployment',
    'depression',
    'financial',
    'crisis',
    'market',
    'stock',
    'macroeconomics',
    'dsge',
    # remaining intervention-related words
    'face',
    'facial',
    'mask',
    'travel',
    'ban',
    'school',
    'closure',
]

In [19]:
# Lemmas that differ from the word they came from; an empty list means every
# check word is already in root form.
[lemma
 for word, lemma in ((w, lemmatizer.lemmatize(w)) for w in check_words)
 if lemma != word]

[]

## Confirmed: all words are already in root form

In [20]:
covid_keywords = ['corona', 'covid']

In [21]:
intervention_keywords = [#'isolation', #can't use - too broad 
         # i.e. isolation of sequences from a random-sequence expression library that mimic viral epitopes
                        'social distancing',
                        'contact tracing',
                        'case isolation',
                        'shelter place', # in/at is removed stopword
                        'stay home', # at is removed stopword
                        'movement restriction',
                        'event cancellation',
                        'face mask',
                        'facial mask',
                        'travel ban',
                        'school closure']

In [22]:
economic_keywords = ['economic', 
                    'unemployment',
                     'unemploy',
                   # 'depression', # excluded: matches mental health articles
                # e.g. "148. acute bipolar disorder depression is associated with immune activation"
                    'financial crisis',
                  #  'market', # excluded: matches physical markets
            # e.g. "wet markets—a continuing source of severe acute respiratory syndrome and influenza?"
                #    ' stock', # excluded: often completely unrelated (or matches "livestock" if the leading space is dropped)
            # e.g. "plaque assay for human coronavirus nl63 using human colon carcinoma cells"
                    'stock market',
                    'macroeconomics',
                    'dsge']

## Create Functions

In [23]:
def find_papers_w_keywords(topic_keywords, papers):
    """
    Prints, for each keyword, how many papers mention it in the cleaned title
    and how many mention it in the cleaned abstract.

    inputs:
      topic_keywords: list of keywords; each one is handed to Series.str.contains,
        which interprets it as a regular expression by default
      papers: dataframe with 'cleanTitle' and 'cleanAbstract' text columns

    output: None (one summary line is printed per keyword)
    """
    for keyword in topic_keywords:
        # na=False counts a missing title/abstract as "no match" instead of
        # propagating NaN into the mask.
        title_hits = papers['cleanTitle'].str.contains(keyword, na=False)
        abstract_hits = papers['cleanAbstract'].str.contains(keyword, na=False)

        num_papers_title = int(title_hits.sum())
        num_papers_abstract = int(abstract_hits.sum())
        print('Identified {} papers with "{}" in title, {} relevant papers with "{}" in abstract'
              .format(num_papers_title, keyword, num_papers_abstract, keyword))

In [24]:
def return_papers(topic_keywords, papers, abstract_search=1):
    """
    Collects the papers whose cleaned abstract (or cleaned title) contains any keyword.

    inputs:
      topic_keywords: list of keywords; each one is handed to Series.str.contains,
        which interprets it as a regular expression by default
      papers: dataframe holding 'cleanAbstract', 'cleanTitle' and the output columns
      abstract_search: 1 (default) searches the cleaned abstracts; any other
        value (e.g. 0) searches the cleaned titles

    output: dataframe with columns title, abstract, sha, full_text_file,
      has_pdf_parse and has_pmc_xml_parse; matches are listed keyword by keyword
      with duplicate rows removed
    """
    output_columns = ['title', 'abstract', 'sha', 'full_text_file',
                      'has_pdf_parse', 'has_pmc_xml_parse']
    search_column = 'cleanAbstract' if abstract_search == 1 else 'cleanTitle'

    # One slice per keyword, concatenated once at the end (instead of growing a
    # frame inside the loop). na=False treats missing text as "no match".
    matches = [
        papers.loc[papers[search_column].str.contains(keyword, na=False), output_columns]
        for keyword in topic_keywords
    ]
    if not matches:
        # pd.concat rejects an empty list; return an empty frame with the same columns.
        return pd.DataFrame(columns=output_columns)

    return pd.concat(matches).drop_duplicates()

## Identify core papers

In [25]:
find_papers_w_keywords(covid_keywords, m)

Identified 5207 papers with "corona" in title, 8427 relevant papers with "corona" in abstract
Identified 1124 papers with "covid" in title, 1547 relevant papers with "covid" in abstract


In [26]:
# Flag papers whose cleaned abstract contains at least one core COVID keyword (substring match).
m['core_abstract'] = m['cleanAbstract'].apply(
    lambda clean_abstract: any(keyword in clean_abstract for keyword in covid_keywords)
)

In [27]:
# (papers with title and abstract, papers flagged as core COVID papers)
m.shape[0], m[m['core_abstract'] == True].shape[0]

(38899, 8965)

In [28]:
# Core set: only the papers whose abstract matched a COVID keyword.
covid_papers = m.loc[m['core_abstract'].eq(True)]

## Intervention Papers


In [29]:
find_papers_w_keywords(intervention_keywords, covid_papers)

Identified 9 papers with "social distancing" in title, 73 relevant papers with "social distancing" in abstract
Identified 7 papers with "contact tracing" in title, 54 relevant papers with "contact tracing" in abstract
Identified 1 papers with "case isolation" in title, 7 relevant papers with "case isolation" in abstract
Identified 0 papers with "shelter place" in title, 2 relevant papers with "shelter place" in abstract
Identified 0 papers with "stay home" in title, 5 relevant papers with "stay home" in abstract
Identified 0 papers with "movement restriction" in title, 4 relevant papers with "movement restriction" in abstract
Identified 1 papers with "event cancellation" in title, 3 relevant papers with "event cancellation" in abstract
Identified 0 papers with "face mask" in title, 14 relevant papers with "face mask" in abstract
Identified 0 papers with "facial mask" in title, 1 relevant papers with "facial mask" in abstract
Identified 2 papers with "travel ban" in title, 14 relevant p

In [30]:
intervention_papers = return_papers(intervention_keywords, covid_papers, 1)

In [31]:
len(intervention_papers)

163

In [32]:
intervention_papers.head()

Unnamed: 0,title,abstract,sha,full_text_file,has_pdf_parse,has_pmc_xml_parse
4256,Coronavirus Disease 2019 (COVID-19) Pandemic and Pregnancy,"Abstract The current coronavirus disease 2019 (COVID-19) pneumonia pandemic, caused by the severe acute respiratory syndrome 2 (SARS-CoV-2) virus, is spreading globally at an accelerated rate, with a basic reproduction number (R0) of 2 – 2.5, indicating that 2 – 3 persons will be infected from an index patient. A serious public health emergency, it is particularly deadly in vulnerable populations and communities in which healthcare providers are insufficiently prepared to manage the infectio...",5dc4268a42adf3d5c55c87b7f6518de600b057c5,custom_license,True,False
6622,COVID-19 and Italy: what next?,"Summary The spread of severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) has already taken on pandemic proportions, affecting over 100 countries in a matter of weeks. A global response to prepare health systems worldwide is imperative. Although containment measures in China have reduced new cases by more than 90%, this reduction is not the case elsewhere, and Italy has been particularly affected. There is now grave concern regarding the Italian national health system's capacity to ...",849f0d4e93647e40b3a6f0841ebb2dd6a890a6b7,custom_license,True,False
10895,Unknown unknowns – COVID-19 and potential global mortality,"Abstract COVID-19 (SARS-CoV-2) is currently a global pandemic. This paper will attempt to estimate global infection rates and potential resultant mortality in the absence of effective treatment and/or vaccination. Calculations are based on World Health Organisation data from Wuhan in China: 14% of infected cases are severe, 5% require intensive care and 4% die. Estimated infection rates and mortality rates at the level of continents and some individual countries (when these are of sufficient...",b039cc050d0d94bf4783a550e68f3a0ce2f1f796,custom_license,True,False
11739,Using psychoneuroimmunity against COVID-19,"Abstract The worldwide outbreak of coronavirus disease 2019 (COVID-19) raises concerns of widespread panic and anxiety in individuals subjected to the real or perceived threat of the virus. Compared to general populations, patients who are institutionalized in a closed unit are also very vulnerable to COVID-19 infection and complications. This crisis touched on difficult issues of not only psychiatric care and ethics, but also psychological impacts to psychiatric care givers. In this Viewpoi...",4f47977f3202bca320c42ccbe3e0432c8bfab2f5,custom_license,True,False
13910,Transmission potential and severity of COVID-19 in South Korea,"Abstract Objectives Since the first case of 2019 novel coronavirus (COVID-19) identified on Jan 20, 2020 in South Korea, the number of cases rapidly increased, resulting in 6,284 cases including 42 deaths as of March 6, 2020. To examine the growth rate of the outbreak, we aimed to present the first study to report the reproduction number of COVID-19 in South Korea. Methods The daily confirmed cases of COVID-19 in South Korea were extracted from publicly available sources. By using the empiri...",1aed8cc899c77553312d72e2ec022fd77e50b4fb,custom_license,True,False


## Economic Papers

In [33]:
find_papers_w_keywords(economic_keywords, covid_papers)

Identified 10 papers with "economic" in title, 290 relevant papers with "economic" in abstract
Identified 0 papers with "unemployment" in title, 0 relevant papers with "unemployment" in abstract
Identified 0 papers with "unemploy" in title, 0 relevant papers with "unemploy" in abstract
Identified 0 papers with "financial crisis" in title, 0 relevant papers with "financial crisis" in abstract
Identified 0 papers with "stock market" in title, 0 relevant papers with "stock market" in abstract
Identified 0 papers with "macroeconomics" in title, 1 relevant papers with "macroeconomics" in abstract
Identified 0 papers with "dsge" in title, 0 relevant papers with "dsge" in abstract


In [34]:
economic_papers = return_papers(economic_keywords, covid_papers, 1)

In [35]:
len(economic_papers)

290

In [36]:
economic_papers[economic_papers['abstract'].str.contains('economic')]

Unnamed: 0,title,abstract,sha,full_text_file,has_pdf_parse,has_pmc_xml_parse
118,Comparison of ascites production for monoclonal antibodies in BALB/c and BALB/c-derived cross-bred mice,"Abstract BALB/c male mice were mated with either Swiss-Webster or MF1 females to produce first generation cross-bred offspring. Hybridoma cell lines, from the fusion of P3-NS1-Ag4/1 myeloma cells with spleen cells sensitised to the porcine coronavirus causing transmissible gastroenteritis, were injected intraperitoneally into these mice to produce ascitic fluid containing monoclonal antibodies. Mice of 11 weeks of age weighing between 26 and 34 g were used. The volume of ascites produced by ...",c93a0161a57b9a8ebcc1c6395d81ad237b38e66a,custom_license,True,False
1878,CHAPTER 28 Coronaviridae,"Publisher Summary Coronaviruses are ssRNA viruses that infect a wide range of mammalian and avian species; they are important causes of respiratory and enteric disease, encephalomyelitis, hepatitis, serositis, and vasculitis in domestic animals. In humans, coronaviruses are one of several groups of viruses that cause the common cold. The prototype of the family, avian infectious brochitis virus, is one of the most infectious of all viruses and causes an acute respiratory disease, which in yo...",0d9de5c910f092a3bb01beb690c06445d6cf1ca2,custom_license,True,False
1934,Chapter 1 Agents of Emerging Infectious Diseases,"Abstract Dramatic improvements in the control of infectious diseases in developed countries owing to socioeconomic changes, vaccines, and antibiotics during the first seven decades of the 20th century led to the mistakened concept that infectious diseases would no longer be a concern. Since the declaration of victory in the war against infectious diseases in 1967, approximately 50 new disease agents have been identified. Nearly every type of etiologic agent and clinical manifestation have be...",023b89a5ec6dec38e943ec4cfc67598845d3b0ff,custom_license,True,False
2427,Avian Infectious Bronchitis Virus,"Publisher Summary This chapter provides an overview of the classification, description, hosts, key developments, diagnostic techniques, and diagnostic reagents for avian infectious bronchitis virus. Avian infectious bronchitis virus belongs to the family Coronaviridae; genus Coronavirus; and species Avian infectious bronchitis virus. The virus causes acute contagious respiratory illness and includes reproductive tissue disease in chickens. The hosts of avian infectious bronchitis virus are o...",7601f80f8b2d0ad5ca0079439ef90136664e9cfb,custom_license,True,False
2565,Human Coronaviruses: General Features,"Abstract Human coronaviruses (HCoVs), including HCoV-229E, HCoV-OC43, HCoV-NL63, and HCoV-HKU1, are traditionally known to cause symptoms of common cold with only moderate clinical impact. Severe acute respiratory syndrome coronavirus (SARS-CoV) and Middle East respiratory syndrome coronavirus (MERS-CoV), on the other hand, have strike humans in the past two decades as highly fatal human pathogens leading to considerable mortality and economic loss. This article summaries the updates on the ...",90798ce4da0c11f8eba3a943743b0a1584ff046a,custom_license,True,False
...,...,...,...,...,...,...
46516,Investigation of nonlinear epidemiological models for analyzing and controlling the MERS outbreak in Korea,"Abstract Much concern has arisen regarding serious epidemics due to the Middle East Respiratory Syndrome (MERS) coronavirus. The first MERS case of Korea was reported on 20 May 2015, and since then, the MERS outbreak in Korea has resulted in hundreds of confirmed cases and tens of deaths. Deadly infectious diseases such as MERS have significant direct and indirect social impacts, which include disease-induced mortality and economic losses. Also, a delayed response to the outbreak and underes...",a2c4f10b9e13efcf246bca26e5933080a571ba2a,custom_license,True,False
46742,A genome-wide association study identifies major loci affecting the immune response against infectious bronchitis virus in chicken,"Abstract Coronaviruses are a hot research topic because they can cause severe diseases in humans and animals. Infectious bronchitis virus (IBV), belonging to gamma-coronaviruses, causes a highly infectious respiratory viral disease and can result in catastrophic economic losses to the poultry industry worldwide. Unfortunately, the genetic basis of the host immune responses against IBV is poorly understood. In the present study, the antibody levels against IBV post-immunization were measured ...",4939a8ffb831e245e793678ca58921a9cea68872,custom_license,True,False
46749,"The SARS-coronavirus papain-like protease: Structure, function and inhibition by designed antiviral compounds","Abstract Over 10 years have passed since the deadly human coronavirus that causes severe acute respiratory syndrome (SARS-CoV) emerged from the Guangdong Province of China. Despite the fact that the SARS-CoV pandemic infected over 8500 individuals, claimed over 800 lives and cost billions of dollars in economic loss worldwide, there still are no clinically approved antiviral drugs, vaccines or monoclonal antibody therapies to treat SARS-CoV infections. The recent emergence of the deadly huma...",a33b2defd1b4103bac921e592dad8749c4ec8ccb; 63be870a50a6f2736945462450822bc94c63a254,custom_license,True,False
46817,Associations between bovine coronavirus and bovine respiratory syncytial virus infections and animal performance in Swedish dairy herds,"Abstract To assess the economic impact of bovine coronavirus (BCV) and bovine respiratory syncytial virus (BRSV) infections, accurate estimates of their associated effects on animal performance are needed. This study aimed to quantify the variation in individual test-day milk yield and somatic cell count, risk of reproductive failure after first service of dairy cows, and risk of death of calves and heifers according to the BCV and BRSV status of the herd. Three types of status were defined ...",c635d21a8dd780ebae0bfe87fd7ef1d7d717c4ec,custom_license,True,False


## "economic" seems to be the only productive keyword, and it works when searching titles but is too broad when searching abstracts

In [37]:
return_papers(economic_keywords, covid_papers, 0)

Unnamed: 0,title,abstract,sha,full_text_file,has_pdf_parse,has_pmc_xml_parse
12071,"Global epidemiology of coronavirus disease 2019 (COVID-19): disease incidence, daily cumulative index, mortality, and their association with country healthcare resources and economic status","ABSTRACT It has been 2 months since the first case of coronavirus disease 2019 (COVID-19) was reported in Wuhan, China. So far, COVID-19 has affected 85 403 patients in 57 countries/territories and has caused 2924 deaths in 9 countries. However, epidemiological data differ between countries. Although China had higher morbidity and mortality than other sites, the number of new daily cases in China has been lower than outside of China since 26 February 2020. The incidence ranged from 61.44 per...",dc1e6aa9ceaf4d2129fa5c16f2649e4688405832,custom_license,True,False
22116,Estimates and determinants of economic impacts from influenza‐like illnesses caused by respiratory viruses in Australian children attending childcare: a cohort study,"BACKGROUND: Influenza and other respiratory infections cause excess winter morbidity in children. This study assessed the economic impact of influenza‐like illness (ILI) on families with children attending childcare using a societal perspective. METHODS: We conducted a prospective cohort study in 90 childcare centres and one general practitioner clinics in Sydney, Australia, during 2010. Healthy children aged ≥6 months to <3 years were enrolled. Economic impacts of ILI (temperature ≥37·8°C o...",b3fd1c9139c31dc46f7b65873260adcb21f47192,custom_license,True,True
30629,"Seroprevalence of economically important viral pathogens in swine populations of Trinidad and Tobago, West Indies","The objective of this study was to evaluate the seroprevalence and identify the strains of swine influenza virus (SwIV), as well as the seroprevalence of porcine parvovirus (PPV), transmissible gastroenteritis virus (TGEV), porcine reproductive and respiratory syndrome virus (PRRSV), porcine respiratory coronavirus (PRCV), porcine circovirus type 2 (PCV-2), and classical swine fever virus (CSFV) in pigs in Trinidad and Tobago (T&T). Blood samples (309) were randomly collected from pigs at fa...",479307c95d4d31ef10f0c813e00261b72d9c755e,custom_license,True,True
37298,"A reliable, practical, and economical protocol for inducing diarrhea and severe dehydration in the neonatal calf.","Fifteen healthy, colostrum-fed, male dairy calves, aged 2 to 7 d were used in a study to develop a diarrhea protocol for neonatal calves that is reliable, practical, and economical. After instrumentation and recording baseline data, diarrhea and dehydration were induced by administering milk replacer [16.5 mL/kg of body weight (BW), PO], sucrose (2 g/kg in a 20% aqueous solution, p.o.), spironolactone and hydrochlorothiazide (1 mg/kg, PO) every 8 h, and furosemide (2 mg/kg, i.m., q6h). Calve...",,,False,False
41989,Impacts of social and economic factors on the transmission of coronavirus disease (COVID-19) in China,"This paper examines the role of various socioeconomic factors in mediating the local and cross-city transmissions of the novel coronavirus 2019 (COVID-19) in China. We implement a machine learning approach to select instrumental variables that strongly predict virus transmission among the rich exogenous weather characteristics. Our 2SLS estimates show that the stringent quarantine, massive lockdown and other public health measures imposed in late January significantly reduced the transmissio...",de055f09fef2776bc78bac5d58c4131301b2025f,biorxiv_medrxiv,True,False
42970,Economic Impacts of Wuhan 2019-nCoV on China and the World,"Uncertainties over the Wuhan 2019 Novel Coronavirus (2019-nCoV), which has killed 1,017 people and sickened more than 43,100 as of Feb 11,(1) has interrupted global trade and supply chains, depressed asset prices, and forced multinational businesses to make hard decisions with limited information. This article is protected by copyright. All rights reserved.",,,False,False
43084,Modelling the Economic Impact and Ripple Effects of Disease Outbreaks,"The Coronavirus Disease 2019 (COVID-19) outbreak has had alarming effects on human lives and the economies of affected countries. With the world’s manufacturing hubs experiencing a period of extended factory closures, the economic impact transcends territorial borders via global supply chains. This paper provides a roadmap on how to evaluate the vulnerability that cascades through the supply chain due to a disease outbreak at the firm level, national level, and global scale. The final extent...",,,False,False
43256,World Economic Prospects Monthly,"Overview: Coronavirus to cut global growth to new lows ▀ The rapid spread of coronavirus will weaken China's GDP growth sharply in the short term, causing disruption for the rest of the world. We now expect global GDP growth to slow to just 1.9% y/y in Q1 this year and have lowered our forecast for 2020 as a whole from 2.5% to 2.3%, down from 2.6% in 2019. ▀ Prior to the coronavirus outbreak, there had been signs that the worst was over for both world trade and the manufacturing sector. Howe...",,,False,False
44732,"Potential scenarios for the progression of a COVID-19 epidemic in the European Union and the European Economic Area, March 2020","Two months after the emergence of severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), the possibility of established and widespread community transmission in the European Union and European Economic Area (EU/EEA) is becoming more likely. We provide scenarios for use in preparedness for a possible widespread epidemic. The EU/EEA is moving towards the ‘limited sustained transmission’ phase. We propose actions to prepare for potential mitigation phases and coordinate efforts to protec...",c118f8139e55632ad0f4d7267c40b0419126e874,comm_use_subset,True,True
45982,"Rapidly increasing cumulative incidence of coronavirus disease (COVID-19) in the European Union/European Economic Area and the United Kingdom, 1 January to 15 March 2020","The cumulative incidence of coronavirus disease (COVID-19) cases is showing similar trends in European Union/European Economic Area countries and the United Kingdom confirming that, while at a different stage depending on the country, the COVID-19 pandemic is progressing rapidly in all countries. Based on the experience from Italy, countries, hospitals and intensive care units should increase their preparedness for a surge of patients with COVID-19 who will require healthcare, and in particu...",3aacd806eb100b38f64a8c5cab039e18ee0e9d73,comm_use_subset,True,True


## "corona" may also be too broad as a core keyword, but it is not clear how else to limit the results

## Search full text files

In [54]:
def find_keyword(keywords, text):
    """
    Counts keyword occurrences in a piece of text.

    Each keyword is used as a regular-expression pattern and matched against
    the lowercased text; the keywords themselves are used as given.

    inputs:
      keywords: list of keywords
      text: string of text

    output: total number of matches over all keywords
    """
    return sum(len(re.findall(pattern, text.lower())) for pattern in keywords)

In [95]:
def search_body_text(sha, folder1, folder2, keywords, sentence_only):
    """
    Searches a single full length text for sentences/paragraphs which contain a list of keywords.

    inputs:
      sha: sha file name (without the .json extension)
      folder1: text folder name (appears twice in the path: folder1/folder1/...)
      folder2: pdf or pmc folder name
      keywords: list of keywords to search for
      sentence_only: whether or not to show sentence only or full paragraph

    output: list of sentences/paragraphs found containing keywords

    Raises FileNotFoundError (from open) when the JSON file does not exist.
    """
    path = './CORD-19-research-challenge/'+folder1+'/'+folder1+'/'+folder2+'/'+sha+'.json'

    # Read as UTF-8 explicitly: the platform default encoding (e.g. cp1252 on
    # Windows) can fail on, or garble, non-ASCII characters in the JSON text.
    with open(path, encoding='utf-8') as f:
        document = json.load(f)

    found = []
    for text_dict in document["body_text"]:
        paragraph = text_dict["text"]
        # Sentence mode splits the paragraph on ". "; paragraph mode checks it as a whole.
        candidates = paragraph.split(". ") if sentence_only else [paragraph]
        found.extend(snippet for snippet in candidates
                     if find_keyword(keywords, snippet) > 0)

    return found

In [89]:
def automated_lit_search(metadata_subset, keywords, sentence_only=True):
    """
    Creates a table of keyword findings.

    Every row flagged has_pdf_parse or has_pmc_xml_parse is searched; the PDF
    parse is preferred when a row has both.

    inputs:
      metadata_subset: subset of metadata file to search (needs the columns title,
        sha, full_text_file, has_pdf_parse and has_pmc_xml_parse)
      keywords: list of keywords to search
      sentence_only: whether or not to show sentence only or full paragraph

    output: dataframe table of results with columns containing index, title, and text snippet
    """
    import warnings  # function-scope import: the notebook's import cell does not provide it

    has_pdf = metadata_subset['has_pdf_parse'] == True
    has_pmc = metadata_subset['has_pmc_xml_parse'] == True

    results = []
    # A single combined mask keeps the original row order and visits each paper once.
    # (Index.append returns a new Index, so calling it without using the result
    # silently dropped the PMC-only papers.)
    for index, row in metadata_subset[has_pdf | has_pmc].iterrows():

        #find text location
        sha_field = row["sha"]
        folder1 = row["full_text_file"]
        if not isinstance(sha_field, str) or not isinstance(folder1, str):
            # Missing sha or folder (NaN): no file path can be built for this row.
            continue
        sha = sha_field.split(';')[0]  # several hashes may be listed; use the first
        folder2 = 'pdf_json' if row['has_pdf_parse'] == True else 'pmc_json'

        #open text and search for keywords
        # NOTE(review): PMC parses may be stored under a different file name than
        # the sha — confirm against the dataset layout. Until then a missing file
        # is reported and skipped rather than aborting the whole search.
        try:
            found = search_body_text(sha, folder1, folder2, keywords, sentence_only)
        except FileNotFoundError as err:
            warnings.warn('Skipping paper at index {}: {}'.format(index, err))
            continue

        for snippet in found:
            results.append([index, row["title"], snippet])

    results_df = pd.DataFrame(results, columns=["index","title","text"])
    return results_df

In [90]:
intervention_sentences = automated_lit_search(intervention_papers, intervention_keywords, True)
intervention_sentences.to_csv('intervention_sentences.csv', index=False)

In [96]:
intervention_paragraphs = automated_lit_search(intervention_papers, intervention_keywords, False)
intervention_paragraphs.to_csv('intervention_paragraphs.csv', index=False)

In [91]:
economic_sentences = automated_lit_search(economic_papers, economic_keywords, True)
economic_sentences.to_csv('economic_sentences.csv', index=False)

In [97]:
economic_paragraphs = automated_lit_search(economic_papers, economic_keywords, False)
economic_paragraphs.to_csv('economic_paragraphs.csv', index=False)