In [55]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
import string

In [2]:
# Show up to 500 characters per cell so long titles/abstracts are not cut short.
pd.options.display.max_colwidth = 500

## Import file
- Only keep rows with non-null titles and abstracts

In [3]:
# Load the paper metadata table; the relative path is resolved against the
# current working directory.
m = pd.read_csv('metadata.csv')

In [4]:
# Keep only rows that have both a title and an abstract.
# The explicit .copy() makes `m` an independent frame, so adding columns to it
# later (cleanTitle, cleanAbstract, core_abstract) does not raise
# SettingWithCopyWarning or risk writing to a temporary slice.
m = m[m['title'].notna() & m['abstract'].notna()].copy()

## Lemmatize Abstracts and Titles (get word roots)

- lowercase
- remove punctuation
- remove stopwords
- get root of words

In [37]:
# Shared tokenizer and lemmatizer instances: created once here and reused by
# preprocess() for every title and abstract.
w_tokenizer = WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

In [102]:
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading the corpus only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess(sentence):
    """Normalise a piece of text so it can be searched for lemmatized keywords.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, split on whitespace, drop English stopwords and
    lemmatize the remaining tokens.

    Parameters
    ----------
    sentence : str
        Raw text (a title or an abstract).

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.
    """
    # Fetched once per call from a cached frozenset; previously the stopword
    # list was re-read from the corpus and scanned linearly for every token.
    stop_words = _english_stopwords()

    # Punctuation is deleted, not replaced by a space, so hyphenated terms
    # collapse into one token (e.g. "covid-19" -> "covid19").
    no_punctuation = sentence.lower().translate(
        str.maketrans('', '', string.punctuation))

    return ' '.join(lemmatizer.lemmatize(token)
                    for token in w_tokenizer.tokenize(no_punctuation)
                    if token not in stop_words)

In [104]:
# Cleaned (lowercased, de-punctuated, lemmatized) version of every title.
m['cleanTitle'] = m['title'].map(preprocess)

In [107]:
# Cleaned (lowercased, de-punctuated, lemmatized) version of every abstract.
m['cleanAbstract'] = m['abstract'].map(preprocess)

## Keywords

- Need a core covid keyword
- And need a topic keyword

## Make sure the keywords are in their lemmatized form

In [79]:
# Individual words that make up the search keywords; each must already be in
# its lemmatized form because the keywords are matched against lemmatized text.
# 'cancellation' is spelled exactly as in the 'event cancellation' keyword so
# that the check covers the word that is actually searched for.
check_words = ['isolation',
                'social',
                'distancing',
                'contact',
                'tracing',
                'event',
                'cancellation',
                'case',
                'shelter',
                'place',
                'stay',
                'home',
                'movement',
                'restriction',
                'economic',
                'unemployment',
                'depression',
                'financial',
                'crisis',
                'market',
                'stock',
                'macroeconomics',
                'dsge']

In [82]:
# Lemmas of any check word that is not already its own lemma (empty list = all good).
[lemma
 for word, lemma in ((w, lemmatizer.lemmatize(w)) for w in check_words)
 if lemma != word]

[]

## Confirmed: all words are already in their root form

In [109]:
# Core-topic markers; they are matched as substrings of the cleaned text, so
# 'corona' also hits e.g. 'coronavirus' and 'coronaviridae'.
covid_keywords = ['corona', 'covid']

In [156]:
intervention_keywords = [# 'isolation' on its own is excluded: too broad, e.g. it matches
                        # "isolation of sequences from a random-sequence expression
                        # library that mimic viral epitopes".
                        'social distancing',
                        'contact tracing',
                        'case isolation',
                        'shelter place',  # "in"/"at" are removed as stopwords
                        'stay home',  # "at" is removed as a stopword
                        'movement restriction',
                        'event cancellation']

In [182]:
economic_keywords = ['economic', 
                    'unemployment',
                     'unemploy',
                    # 'depression' is excluded: it pulls in mental-health articles,
                    # e.g. "acute bipolar disorder depression is associated with immune activation".
                    'financial crisis',
                    # 'market' is excluded: it matches physical markets,
                    # e.g. "wet markets—a continuing source of severe acute respiratory syndrome and influenza?".
                    # ' stock' is excluded: often completely unrelated (and, without the
                    # leading space, it also matches "livestock"),
                    # e.g. "plaque assay for human coronavirus nl63 using human colon carcinoma cells".
                    'stock market',
                    'macroeconomics',
                    'dsge']

## Create Functions

In [89]:
def find_papers_w_keywords(topic_keywords, papers):
    """Print how many papers mention each keyword in the title and in the abstract.

    Parameters
    ----------
    topic_keywords : iterable of str
        Keywords to look for. Each one is passed to ``Series.str.contains``
        with its default settings, i.e. it is interpreted as a regular
        expression.
    papers : pandas.DataFrame
        Must contain the string columns ``cleanTitle`` and ``cleanAbstract``.

    Returns
    -------
    None
        One summary line per keyword is written to stdout.
    """
    for keyword in topic_keywords:
        # Count directly on the boolean "contains" masks. The title mask used to
        # be AND-ed with the string column itself (bool & str), which adds
        # nothing and raises TypeError on current pandas. na=False makes rows
        # with a missing value count as "no match" instead of breaking the mask.
        num_papers_title = int(
            papers['cleanTitle'].str.contains(keyword, na=False).sum())
        num_papers_abstract = int(
            papers['cleanAbstract'].str.contains(keyword, na=False).sum())
        print('Identified {} papers with "{}" in title, {} relevant papers with "{}" in abstract'
              .format(num_papers_title, keyword, num_papers_abstract, keyword))

In [151]:
def return_papers(topic_keywords, papers, abstract_search=1):
    """Return the papers whose cleaned abstract (or title) contains any keyword.

    Parameters
    ----------
    topic_keywords : iterable of str
        Keywords to look for. Each one is passed to ``Series.str.contains``
        with its default settings, i.e. it is interpreted as a regular
        expression.
    papers : pandas.DataFrame
        Must contain ``title``, ``abstract`` and ``sha`` plus the searched
        column (``cleanAbstract`` or ``cleanTitle``).
    abstract_search : int, default 1
        1 searches ``cleanAbstract``; any other value (e.g. 0) searches
        ``cleanTitle``.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``abstract``, ``sha`` with the original index
        labels. Rows are ordered keyword by keyword (in the order given) and
        exact duplicate rows are dropped, keeping the first occurrence.
    """
    output_columns = ['title', 'abstract', 'sha']
    search_column = 'cleanAbstract' if abstract_search == 1 else 'cleanTitle'

    # One frame of hits per keyword, selected with a single .loc call each;
    # na=False treats rows with a missing value as "no match".
    hits_per_keyword = [
        papers.loc[papers[search_column].str.contains(keyword, na=False),
                   output_columns]
        for keyword in topic_keywords
    ]

    # pd.concat refuses an empty list, so return the empty result explicitly.
    if not hits_per_keyword:
        return pd.DataFrame(columns=output_columns)

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(hits_per_keyword).drop_duplicates()

## Identify core papers

In [110]:
find_papers_w_keywords(covid_keywords, m)

Identified 4672 papers with "corona" in title, 7383 relevant papers with "corona" in abstract
Identified 658 papers with "covid" in title, 943 relevant papers with "covid" in abstract


In [111]:
# Flag rows whose cleaned abstract contains at least one core keyword (substring match).
m['core_abstract'] = m['cleanAbstract'].apply(
    lambda abstract: any(keyword in abstract for keyword in covid_keywords))

In [112]:
len(m), len(m[m['core_abstract'] == True])

(35657, 7644)

In [113]:
# Papers flagged as mentioning a core keyword in the abstract; the flag column
# is already boolean, so it is used directly as the row mask.
covid_papers = m[m['core_abstract']]

## Intervention Papers


In [157]:
find_papers_w_keywords(intervention_keywords, covid_papers)

Identified 2 papers with "social distancing" in title, 23 relevant papers with "social distancing" in abstract
Identified 7 papers with "contact tracing" in title, 41 relevant papers with "contact tracing" in abstract
Identified 1 papers with "case isolation" in title, 4 relevant papers with "case isolation" in abstract
Identified 0 papers with "shelter place" in title, 0 relevant papers with "shelter place" in abstract
Identified 0 papers with "stay home" in title, 4 relevant papers with "stay home" in abstract
Identified 0 papers with "movement restriction" in title, 4 relevant papers with "movement restriction" in abstract
Identified 1 papers with "event cancellation" in title, 2 relevant papers with "event cancellation" in abstract


In [161]:
intervention_papers = return_papers(intervention_keywords, covid_papers, 1)

In [162]:
len(intervention_papers)

69

In [165]:
intervention_papers.head()

Unnamed: 0,title,abstract,sha
7956,covid-19 and italy: what next?,"summary the spread of severe acute respiratory syndrome coronavirus 2 (sars-cov-2) has already taken on pandemic proportions, affecting over 100 countries in a matter of weeks. a global response to prepare health systems worldwide is imperative. although containment measures in china have reduced new cases by more than 90%, this reduction is not the case elsewhere, and italy has been particularly affected. there is now grave concern regarding the italian national health system's capacity to ...",849f0d4e93647e40b3a6f0841ebb2dd6a890a6b7
18237,"short-term forecasts of the covid-19 epidemic in guangdong and zhejiang, china: february 13–23, 2020","the ongoing covid-19 epidemic continues to spread within and outside of china, despite several social distancing measures implemented by the chinese government. limited epidemiological data are available, and recent changes in case definition and reporting further complicate our understanding of the impact of the epidemic, particularly in the epidemic&rsquo;s epicenter. here we use previously validated phenomenological models to generate short-term forecasts of cumulative reported cases in g...",80993091f576dc7fdbec10552b45b4af5eec2b8b
18520,"coronavirus nixes conference, twilight zone beckons and a faded star brightens","coronavirus enters dangerous new phase the new coronavirus has spread to more than 70 nations and the total number of infections worldwide had passed 90,000 as nature went to press (see ‘rapid spread’). researchers have warned that the surge in outbreaks outside china, where the virus emerged and most cases have occurred, means that the coronavirus is becoming unstoppable. the world health organization has resisted describing the situation as a pandemic. director-general tedros adhanom ghebr...",
18714,will novel virus go pandemic or be contained?,"the repatriation of 565 japanese citizens from wuhan, china, in late january offered scientists an unexpected opportunity to learn a bit more about the novel coronavirus (2019-ncov) raging in that city. to avoid domestic spread of the virus, japanese officials screened every passenger for disease symptoms and tested them for the virus after they landed. eight tested positive, but four of those had no symptoms at all, says epidemiologist hiroshi nishiura of hokkaido university, sapporo—which ...",
18886,"estimating risk for death from 2019 novel coronavirus disease, china, january-february 2020","since december 2019, when the first case of 2019 novel coronavirus disease (covid-19) was identified in the city of wuhan in the hubei province of china, the epidemic has generated tens of thousands of cases throughout china. as of february 28, 2020, the cumulative number of reported deaths in china was 2,858. we estimated the time-delay adjusted risk for death from covid-19 in wuhan, as well as for china excluding wuhan, to assess the severity of the epidemic in the country. our estimates o...",


## Economic Papers

In [183]:
find_papers_w_keywords(economic_keywords, covid_papers)

Identified 8 papers with "economic" in title, 237 relevant papers with "economic" in abstract
Identified 0 papers with "unemployment" in title, 0 relevant papers with "unemployment" in abstract
Identified 0 papers with "unemploy" in title, 0 relevant papers with "unemploy" in abstract
Identified 0 papers with "financial crisis" in title, 0 relevant papers with "financial crisis" in abstract
Identified 0 papers with "stock market" in title, 0 relevant papers with "stock market" in abstract
Identified 0 papers with "macroeconomics" in title, 0 relevant papers with "macroeconomics" in abstract
Identified 0 papers with "dsge" in title, 0 relevant papers with "dsge" in abstract


In [184]:
economic_papers = return_papers(economic_keywords, covid_papers, 1)

In [185]:
len(economic_papers)

237

In [187]:
economic_papers[economic_papers['abstract'].str.contains('economic')]

Unnamed: 0,title,abstract,sha
120,comparison of ascites production for monoclonal antibodies in balb/c and balb/c-derived cross-bred mice,"abstract balb/c male mice were mated with either swiss-webster or mf1 females to produce first generation cross-bred offspring. hybridoma cell lines, from the fusion of p3-ns1-ag4/1 myeloma cells with spleen cells sensitised to the porcine coronavirus causing transmissible gastroenteritis, were injected intraperitoneally into these mice to produce ascitic fluid containing monoclonal antibodies. mice of 11 weeks of age weighing between 26 and 34 g were used. the volume of ascites produced by ...",c93a0161a57b9a8ebcc1c6395d81ad237b38e66a
2046,chapter 28 coronaviridae,"publisher summary coronaviruses are ssrna viruses that infect a wide range of mammalian and avian species; they are important causes of respiratory and enteric disease, encephalomyelitis, hepatitis, serositis, and vasculitis in domestic animals. in humans, coronaviruses are one of several groups of viruses that cause the common cold. the prototype of the family, avian infectious brochitis virus, is one of the most infectious of all viruses and causes an acute respiratory disease, which in yo...",0d9de5c910f092a3bb01beb690c06445d6cf1ca2
2108,chapter 1 agents of emerging infectious diseases,"abstract dramatic improvements in the control of infectious diseases in developed countries owing to socioeconomic changes, vaccines, and antibiotics during the first seven decades of the 20th century led to the mistakened concept that infectious diseases would no longer be a concern. since the declaration of victory in the war against infectious diseases in 1967, approximately 50 new disease agents have been identified. nearly every type of etiologic agent and clinical manifestation have be...",023b89a5ec6dec38e943ec4cfc67598845d3b0ff
2665,avian infectious bronchitis virus,"publisher summary this chapter provides an overview of the classification, description, hosts, key developments, diagnostic techniques, and diagnostic reagents for avian infectious bronchitis virus. avian infectious bronchitis virus belongs to the family coronaviridae; genus coronavirus; and species avian infectious bronchitis virus. the virus causes acute contagious respiratory illness and includes reproductive tissue disease in chickens. the hosts of avian infectious bronchitis virus are o...",7601f80f8b2d0ad5ca0079439ef90136664e9cfb
2821,human coronaviruses: general features,"abstract human coronaviruses (hcovs), including hcov-229e, hcov-oc43, hcov-nl63, and hcov-hku1, are traditionally known to cause symptoms of common cold with only moderate clinical impact. severe acute respiratory syndrome coronavirus (sars-cov) and middle east respiratory syndrome coronavirus (mers-cov), on the other hand, have strike humans in the past two decades as highly fatal human pathogens leading to considerable mortality and economic loss. this article summaries the updates on the ...",90798ce4da0c11f8eba3a943743b0a1584ff046a
2902,chapter 18 family arteriviridae,"abstract the family arteriviridae is one of four families in the order nidovirales. arteriviruses are enveloped, plus-strand rna viruses with genomes of 12.7–15.7kb. the overall genome organization and gene expression strategy of the arteriviruses is highly similar to the coronaviruses. notably they use a discontinuous transcription strategy for synthesis of subgenomic mrnas. there are no recognized human pathogens among the arteriviruses. members of the family include equine arteritis virus...",d0aa9e7ce12b10d7a289f64b6279b6c1455b9d9a
3024,chapter 11 structural insight into the viral 3c-like protease inhibitors: comparative sar/qsar approaches,"abstract severe acute respiratory syndrome (sars), caused by sars-coronavirus (sars-cov), is a dreadful infection worldwide having economic and medical importance and a global threat for health. it was turned into an epidemic in south china followed by a chain of infections across three generations. a number of pathogeneses in human may occur due to the virus. this infection has not been taken into account before the sars outbreak, and still it is a neglected one. therefore, there is an urge...",5779e410753e13a0a79f9f872321fe42df49e65c
3123,chapter 3 infectious bronchitis virus in poultry: molecular epidemiology and factors leading to the emergence and reemergence of novel strains of infectious bronchitis virus,"abstract infectious bronchitis virus (ibv) is a coronavirus that causes an acute and highly contagious disease in chickens. the virus can cause substantial economic losses throughout the poultry industry worldwide. it can affect the upper respiratory tract and the reproductive tract, and some strains can cause nephritis. the causative agent ibv is an rna virus with great ability for mutation and recombination, thus capable of generating new virus strains that are difficult to control. there ...",a0de24c5fdb4a0e77b1b5e8c002ced81ebe9b227
4969,biochemical and biophysical characterization of the transmissible gastroenteritis coronavirus fusion core,"abstract transmissible gastroenteritis coronavirus (tgev) is one of the most destructive agents, responsible for the enteric infections that are lethal for suckling piglets, causing enormous economic loss to the porcine fostering industry every year. although it has been known that tgev spiker protein is essential for the viral entry for many years, the detail knowledge of the tgev fusion protein core is still very limited. here, we report that tgev fusion core (hr1-sggrgg-hr2), in vitro exp...",e54a7e2b13048a2c79e39de20d940aaab85136ad
5583,histopathological and immunohistochemical study of air sac lesions induced by two strains of infectious bronchitis virus,"summary infectious bronchitis virus (ibv) is a highly contagious respiratory coronavirus of domestic chickens. although mortality is low, infection with ibv results in substantial losses for the egg and meat chicken industries. despite the economic importance of ibv and decades of research into the pathogenesis of infection, significant gaps in our knowledge exist. the aim of this study was to compare the early progression of air sac lesions in birds receiving a vaccine strain of the virus o...",03302abf698fc986829cbbbd9f660abdbf9321e3


## "economic" seems to be the only keyword with matches, and it works when searching titles, not abstracts

In [188]:
return_papers(economic_keywords, covid_papers, 0)

Unnamed: 0,title,abstract,sha
18338,economic impacts of wuhan 2019-ncov on china and the world,"uncertainties over the wuhan 2019 novel coronavirus (2019-ncov), which has killed 1,017 people and sickened more than 43,100 as of feb 11,(1) has interrupted global trade and supply chains, depressed asset prices, and forced multinational businesses to make hard decisions with limited information. this article is protected by copyright. all rights reserved.",
18460,modelling the economic impact and ripple effects of disease outbreaks,"the coronavirus disease 2019 (covid-19) outbreak has had alarming effects on human lives and the economies of affected countries. with the world’s manufacturing hubs experiencing a period of extended factory closures, the economic impact transcends territorial borders via global supply chains. this paper provides a roadmap on how to evaluate the vulnerability that cascades through the supply chain due to a disease outbreak at the firm level, national level, and global scale. the final extent...",
18667,world economic prospects monthly,"overview: coronavirus to cut global growth to new lows ▀ the rapid spread of coronavirus will weaken china's gdp growth sharply in the short term, causing disruption for the rest of the world. we now expect global gdp growth to slow to just 1.9% y/y in q1 this year and have lowered our forecast for 2020 as a whole from 2.5% to 2.3%, down from 2.6% in 2019. ▀ prior to the coronavirus outbreak, there had been signs that the worst was over for both world trade and the manufacturing sector. howe...",
18876,"rapidly increasing cumulative incidence of coronavirus disease (covid-19) in the european union/european economic area and the united kingdom, 1 january to 15 march 2020","the cumulative incidence of coronavirus disease (covid-19) cases is showing similar trends in european union/european economic area countries and the united kingdom confirming that, while at a different stage depending on the country, the covid-19 pandemic is progressing rapidly in all countries. based on the experience from italy, countries, hospitals and intensive care units should increase their preparedness for a surge of patients with covid-19 who will require healthcare, and in particu...",
18880,"potential scenarios for the progression of a covid-19 epidemic in the european union and the european economic area, march 2020","two months after the emergence of severe acute respiratory syndrome coronavirus 2 (sars-cov-2), the possibility of established and widespread community transmission in the european union and european economic area (eu/eea) is becoming more likely. we provide scenarios for use in preparedness for a possible widespread epidemic. the eu/eea is moving towards the 'limited sustained transmission' phase. we propose actions to prepare for potential mitigation phases and coordinate efforts to protec...",
29035,estimates and determinants of economic impacts from influenza‐like illnesses caused by respiratory viruses in australian children attending childcare: a cohort study,"background: influenza and other respiratory infections cause excess winter morbidity in children. this study assessed the economic impact of influenza‐like illness (ili) on families with children attending childcare using a societal perspective. methods: we conducted a prospective cohort study in 90 childcare centres and one general practitioner clinics in sydney, australia, during 2010. healthy children aged ≥6 months to <3 years were enrolled. economic impacts of ili (temperature ≥37·8°c o...",b3fd1c9139c31dc46f7b65873260adcb21f47192
37946,"a reliable, practical, and economical protocol for inducing diarrhea and severe dehydration in the neonatal calf.","fifteen healthy, colostrum-fed, male dairy calves, aged 2 to 7 d were used in a study to develop a diarrhea protocol for neonatal calves that is reliable, practical, and economical. after instrumentation and recording baseline data, diarrhea and dehydration were induced by administering milk replacer [16.5 ml/kg of body weight (bw), po], sucrose (2 g/kg in a 20% aqueous solution, p.o.), spironolactone and hydrochlorothiazide (1 mg/kg, po) every 8 h, and furosemide (2 mg/kg, i.m., q6h). calve...",
40491,impacts of social and economic factors on the transmission of coronavirus disease (covid-19) in china,"this paper examines the role of various socioeconomic factors in mediating the local and cross-city transmissions of the novel coronavirus 2019 (covid-19) in china. we implement a machine learning approach to select instrumental variables that strongly predict virus transmission among the rich exogenous weather characteristics. our 2sls estimates show that the stringent quarantine, massive lockdown and other public health measures imposed in late january significantly reduced the transmissio...",de055f09fef2776bc78bac5d58c4131301b2025f


## "corona" may also be too broad a keyword, but it is not clear how else to narrow the search