In [1]:
import pandas as pd
import numpy as np
import json
import re
from collections import Counter

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the metadata table from the local CORD-19 dataset directory
# (path is relative to the notebook's working directory).
metadata = pd.read_csv("./CORD-19-research-challenge/metadata.csv")

In [3]:
# Preview the first rows to see which columns are available.
metadata.head()

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file
0,,Elsevier,Intrauterine virus infections and congenital h...,10.1016/0002-8703(72)90077-4,,4361535.0,els-covid,Abstract The etiologic basis for the vast majo...,1972-12-31,"Overall, James C.",American Heart Journal,,,False,custom_license
1,,Elsevier,Coronaviruses in Balkan nephritis,10.1016/0002-8703(80)90355-5,,6243850.0,els-covid,,1980-03-31,"Georgescu, Leonida; Diosi, Peter; Buţiu, Ioan;...",American Heart Journal,,,False,custom_license
2,,Elsevier,Cigarette smoking and coronary heart disease: ...,10.1016/0002-8703(80)90356-7,,7355701.0,els-covid,,1980-03-31,"Friedman, Gary D",American Heart Journal,,,False,custom_license
3,aecbc613ebdab36753235197ffb4f35734b5ca63,Elsevier,Clinical and immunologic studies in identical ...,10.1016/0002-9343(73)90176-9,,4579077.0,els-covid,"Abstract Middle-aged female identical twins, o...",1973-08-31,"Brunner, Carolyn M.; Horwitz, David A.; Shann,...",The American Journal of Medicine,,,True,custom_license
4,,Elsevier,Epidemiology of community-acquired respiratory...,10.1016/0002-9343(85)90361-4,,4014285.0,els-covid,Abstract Upper respiratory tract infections ar...,1985-06-28,"Garibaldi, Richard A.",The American Journal of Medicine,,,False,custom_license


In [4]:
# Number of records per value of the "full_text_file" column.
metadata["full_text_file"].value_counts()

custom_license        20873
comm_use_subset        8803
noncomm_use_subset     2133
biorxiv_medrxiv        1020
Name: full_text_file, dtype: int64

In [5]:
# Number of records with and without the "has_full_text" flag set.
metadata["has_full_text"].value_counts()

True     28462
False    15758
Name: has_full_text, dtype: int64

In [6]:
# Number of records whose abstract is missing.
metadata["abstract"].isnull().sum()

8414

In [7]:
# Number of records whose title is missing.
metadata["title"].isnull().sum()

224

In [8]:
def find_keyword(keywords, text):
    """Count the occurrences of any keyword pattern in ``text``.

    Matching is case-insensitive. Each keyword is passed to ``re.findall`` as a
    regular expression, so regex metacharacters in a keyword keep their special
    meaning; wrap a keyword in ``re.escape`` for a purely literal match.

    Parameters
    ----------
    keywords : iterable of str
        Patterns to search for.
    text : str
        Text to search.

    Returns
    -------
    int
        Number of non-overlapping matches, summed over all keywords.
    """
    # Lower-case once instead of once per keyword (loop-invariant work). The
    # text is still lowered so existing lower-case patterns behave as before.
    lowered = text.lower()
    # re.IGNORECASE lets keywords that contain capitals (e.g. "COVID") match
    # the lowered text; without it such keywords could never match anything.
    return sum(
        len(re.findall(keyword, lowered, flags=re.IGNORECASE))
        for keyword in keywords
    )

In [9]:
def count_keywords(keywords, data):
    """Count keyword matches in the title and abstract of every row of ``data``.

    Parameters
    ----------
    keywords : iterable of str
        Patterns forwarded unchanged to ``find_keyword``.
    data : pandas.DataFrame
        Must contain the columns ``"title"`` and ``"abstract"``.

    Returns
    -------
    list of int
        One entry per row, in row order: matches in the title plus matches in
        the abstract. Values that are not strings (e.g. missing entries)
        contribute 0.
    """
    counts = []
    # Walk both columns in lock-step. The previous row-wise data.iloc[i][...]
    # lookups (up to four per row) dominate the runtime on large frames.
    for title, abstract in zip(data["title"], data["abstract"]):
        count = 0
        for field in (title, abstract):
            # Only real strings are searched; anything else is skipped.
            if isinstance(field, str):
                count += find_keyword(keywords, field)
        counts.append(count)
    return counts

In [10]:
def abstract_word_counts(data, n):
    """Return the ``n`` most frequent words across the abstracts in ``data``.

    Abstracts are tokenised into single words by ``CountVectorizer`` with its
    built-in English stop-word list; rows with a missing abstract are ignored.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``"abstract"`` column.
    n : int or None
        Passed to ``Counter.most_common``.

    Returns
    -------
    list of (str, int)
        ``(word, total count)`` pairs, most frequent first. An empty list is
        returned when ``data`` holds no non-missing abstract.
    """
    abstracts = data["abstract"].dropna()
    if abstracts.empty:
        # CountVectorizer raises ValueError on an empty corpus; having no
        # abstracts simply means there are no words to report.
        return []
    count_vect = CountVectorizer(stop_words="english", analyzer="word", ngram_range=(1, 1))
    counts = count_vect.fit_transform(abstracts)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on older installations.
    if hasattr(count_vect, "get_feature_names_out"):
        vocab = count_vect.get_feature_names_out()
    else:
        vocab = count_vect.get_feature_names()
    # Column sums give the corpus-wide count per vocabulary entry. asarray +
    # ravel flattens the result without relying on the np.matrix-only ``.A1``,
    # and tolist() yields plain Python ints.
    totals = np.asarray(counts.sum(axis=0)).ravel().tolist()
    counter = Counter(dict(zip(vocab, totals)))
    return counter.most_common(n)

In [17]:
def show_title_abstract(data, show_abstract=True):
    """Print the index label and title of every row, one row after another.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to print; needs a ``"title"`` column, and an ``"abstract"``
        column when ``show_abstract`` is true.
    show_abstract : bool, default True
        When true, the abstract is printed on the line after the title.

    Each row is followed by an empty line. Nothing is returned.
    """
    shown_columns = ("title", "abstract") if show_abstract else ("title",)
    for label, record in data.iterrows():
        print(label)
        for column in shown_columns:
            print(record[column])
        # Blank separator line between rows.
        print("")

In [12]:
# First keyword set: count matches in each record's title and abstract, store
# the per-record count as a new column on `metadata`, and keep the records
# with at least one match.
keywords_1 = ["non-pharmaceutical intervention"]
metadata["keywords_1"] = count_keywords(keywords_1, metadata)
data_1 = metadata[metadata["keywords_1"] > 0]
# Number of matching records.
data_1.shape[0]

29

In [13]:
# 100 most frequent abstract words among the records matched by keywords_1.
print(abstract_word_counts(data_1, 100))

[('interventions', 82), ('influenza', 68), ('pharmaceutical', 63), ('non', 58), ('pandemic', 57), ('cases', 39), ('health', 38), ('infection', 32), ('disease', 31), ('transmission', 31), ('control', 29), ('epidemic', 27), ('npi', 27), ('public', 27), ('19', 26), ('npis', 26), ('results', 26), ('hand', 25), ('respiratory', 25), ('measures', 24), ('outbreak', 24), ('covid', 22), ('study', 22), ('contact', 21), ('effective', 21), ('based', 20), ('community', 20), ('model', 20), ('spread', 20), ('2009', 18), ('background', 18), ('case', 18), ('early', 18), ('effectiveness', 18), ('risk', 17), ('use', 17), ('cost', 16), ('h1n1', 16), ('household', 16), ('infectious', 16), ('methods', 16), ('number', 16), ('strategies', 16), ('policy', 15), ('delay', 14), ('evidence', 14), ('intervention', 14), ('laboratory', 14), ('reported', 14), ('secondary', 14), ('social', 14), ('time', 14), ('95', 13), ('data', 13), ('households', 13), ('hygiene', 13), ('infections', 13), ('travel', 13), ('age', 12), (

In [14]:
# Second keyword set: same procedure as for keywords_1, with the per-record
# match count stored in its own column on `metadata`.
keywords_2 = ["school closure", "travel ban", "social distancing"]
metadata["keywords_2"] = count_keywords(keywords_2, metadata)
data_2 = metadata[metadata["keywords_2"] > 0]
# Number of matching records.
data_2.shape[0]

113

In [15]:
# 100 most frequent abstract words among the records matched by keywords_2.
print(abstract_word_counts(data_2, 100))

[('social', 153), ('influenza', 141), ('pandemic', 132), ('epidemic', 130), ('transmission', 124), ('school', 122), ('distancing', 118), ('health', 118), ('measures', 118), ('disease', 110), ('data', 94), ('control', 90), ('spread', 88), ('number', 87), ('19', 86), ('outbreak', 84), ('public', 84), ('cases', 83), ('model', 83), ('covid', 82), ('china', 78), ('infection', 68), ('impact', 66), ('interventions', 65), ('results', 60), ('respiratory', 59), ('models', 58), ('closure', 57), ('contact', 57), ('effective', 53), ('population', 53), ('travel', 51), ('virus', 51), ('based', 50), ('case', 47), ('infectious', 47), ('risk', 47), ('outbreaks', 46), ('time', 46), ('countries', 44), ('using', 44), ('methods', 43), ('age', 42), ('used', 41), ('reduce', 40), ('strategies', 40), ('infected', 39), ('quarantine', 39), ('study', 38), ('wuhan', 38), ('contacts', 36), ('patients', 36), ('people', 36), ('effect', 35), ('h1n1', 35), ('studies', 35), ('non', 34), ('2019', 33), ('information', 33),

In [18]:
# List index label and title (no abstract) of every record matched by keywords_1.
show_title_abstract(data_1, show_abstract=False)

5927
Timely identification of optimal control strategies for emerging infectious diseases

5951
The impacts of simultaneous disease intervention decisions on epidemic outcomes

10868
The impact of non-pharmaceutical interventions for 2009 H1N1 influenza on travel intentions: A model of goal-directed behavior

13513
Certainties and Uncertainties Facing Emerging Respiratory Infectious Diseases: Lessons from SARS

16331
Closure of schools during an influenza pandemic

16539
Non-pharmaceutical interventions for the prevention of respiratory tract infections during Hajj pilgrimage

19615
Time variations in the transmissibility of pandemic influenza in Prussia, Germany, from 1918–19

19629
Influenza pandemic intervention planning using InfluSim: pharmaceutical and non- pharmaceutical interventions

20037
An optimal control theory approach to non-pharmaceutical interventions

20038
Dynamics and Control of Diseases in Networks with Community Structure

20207
Response to the challenges of pande