In [1]:
import pandas as pd
import numpy as np
import json
import re
from collections import Counter

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Root folder of the dataset. The trailing slash matters: file paths in this
# notebook are built by plain string concatenation (e.g. DIR+"metadata.csv").
DIR = "/hackathon/covid-19/data/"

In [3]:
# Load the paper-level metadata table (one row per paper).
metadata = pd.read_csv(DIR+"metadata.csv")

In [4]:
# Preview the first rows and the available columns.
metadata.head()

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file
0,,Elsevier,Intrauterine virus infections and congenital h...,10.1016/0002-8703(72)90077-4,,4361535.0,els-covid,Abstract The etiologic basis for the vast majo...,1972-12-31,"Overall, James C.",American Heart Journal,,,False,custom_license
1,,Elsevier,Coronaviruses in Balkan nephritis,10.1016/0002-8703(80)90355-5,,6243850.0,els-covid,,1980-03-31,"Georgescu, Leonida; Diosi, Peter; Buţiu, Ioan;...",American Heart Journal,,,False,custom_license
2,,Elsevier,Cigarette smoking and coronary heart disease: ...,10.1016/0002-8703(80)90356-7,,7355701.0,els-covid,,1980-03-31,"Friedman, Gary D",American Heart Journal,,,False,custom_license
3,aecbc613ebdab36753235197ffb4f35734b5ca63,Elsevier,Clinical and immunologic studies in identical ...,10.1016/0002-9343(73)90176-9,,4579077.0,els-covid,"Abstract Middle-aged female identical twins, o...",1973-08-31,"Brunner, Carolyn M.; Horwitz, David A.; Shann,...",The American Journal of Medicine,,,True,custom_license
4,,Elsevier,Epidemiology of community-acquired respiratory...,10.1016/0002-9343(85)90361-4,,4014285.0,els-covid,Abstract Upper respiratory tract infections ar...,1985-06-28,"Garibaldi, Richard A.",The American Journal of Medicine,,,False,custom_license


In [5]:
# Number of rows per full-text folder; value_counts() leaves out rows where the column is missing.
metadata["full_text_file"].value_counts()

custom_license        20873
comm_use_subset        8803
noncomm_use_subset     2133
biorxiv_medrxiv        1020
Name: full_text_file, dtype: int64

In [6]:
# How many rows are flagged as having a full text versus not.
metadata["has_full_text"].value_counts()

True     28462
False    15758
Name: has_full_text, dtype: int64

In [7]:
# Number of rows without an abstract.
metadata["abstract"].isnull().sum()

8414

In [8]:
# Number of rows without a title.
metadata["title"].isnull().sum()

224

In [10]:
def find_keyword(keywords, text):
    """
    Counts how often any of the given keywords occurs in a piece of text.

    The text is lower-cased before matching; the keywords are used exactly as
    given, each one being handed to re.findall as the pattern.

    inputs:
      keywords: list of keywords (applied as regular-expression patterns)
      text: string of text

    output: total number of matches, summed over all keywords
    """
    return sum(len(re.findall(pattern, text.lower())) for pattern in keywords)

In [11]:
def count_keywords(keywords, data, must_contain_keywords=None):
    """
    Counts the number of times keywords appear in the title and abstract, if available.

    inputs:
      keywords: list of keywords
      data: metadata dataframe with "title" and "abstract" columns
      must_contain_keywords: optional list of keywords of which at least one
        must occur in the title or abstract; rows without any such match get
        a count of 0. None or an empty list disables this filter.

    output: list of keyword counts, in the same row order as the input dataframe
    """
    required = must_contain_keywords or []

    counts = []
    # Walk the two columns once instead of materialising a row with
    # data.iloc[i] for every single field access.
    for title, abstract in zip(data["title"], data["abstract"]):
        # Missing titles/abstracts are not strings (pandas stores them as NaN),
        # so only real strings are searched.
        texts = [field for field in (title, abstract) if isinstance(field, str)]

        if required and not any(find_keyword(required, text) for text in texts):
            counts.append(0)
            continue

        counts.append(sum(find_keyword(keywords, text) for text in texts))

    return counts

In [12]:
def abstract_word_counts(data, n):
    """
    Finds the most frequent words across all available abstracts.

    English stop words are removed and only single words (unigrams) are counted.

    inputs:
      data: metadata dataframe with an "abstract" column
      n: number of top words to return

    output: list of (word, count) tuples for the n most common words,
            or an empty list if there is no abstract to count
    """
    abstracts = data["abstract"].dropna()
    if abstracts.empty:
        # Nothing to vectorize; fitting on no documents would raise.
        return []

    count_vect = CountVectorizer(stop_words="english", analyzer="word", ngram_range=(1,1))
    counts = count_vect.fit_transform(abstracts)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer its replacement and fall back only on older installations.
    if hasattr(count_vect, "get_feature_names_out"):
        vocab = count_vect.get_feature_names_out()
    else:
        vocab = count_vect.get_feature_names()

    # Column sums give the corpus-wide count per vocabulary entry; flatten the
    # 1 x n_words result so it can be zipped with the vocabulary.
    totals = np.asarray(counts.sum(axis=0)).ravel()
    counter = Counter(dict(zip(vocab, totals)))
    return counter.most_common(n)

In [13]:
def show_title_abstract(data, show_abstract=True):
    """
    Prints the index label and title of every row, each followed by a blank line.

    inputs:
      data: dataframe with a "title" column (and "abstract" if it is shown)
      show_abstract: whether to also print the abstract after the title

    output: None (results are printed)
    """
    fields = ("title", "abstract") if show_abstract else ("title",)
    for label, record in data.iterrows():
        print(label)
        for field in fields:
            print(record[field])
        print("")

In [14]:
# Count mentions of "non-pharmaceutical intervention" in title/abstract,
# restricted to papers that also mention "covid" or "corona".
keywords_1 = ["non-pharmaceutical intervention"]
metadata["keywords_1"] = count_keywords(keywords_1, metadata, ["covid", "corona"])
# Keep only the papers with at least one hit and show how many there are.
data_1 = metadata[metadata["keywords_1"] > 0]
data_1.shape[0]

4

In [None]:
#print(abstract_word_counts(data_1, 100))

In [16]:
# Same search for three specific interventions, again restricted to papers
# that also mention "covid" or "corona".
keywords_2 = ["school closure", "travel ban", "social distancing"]
metadata["keywords_2"] = count_keywords(keywords_2, metadata, ["covid", "corona"])
# Keep only the papers with at least one hit and show how many there are.
data_2 = metadata[metadata["keywords_2"] > 0]
data_2.shape[0]

37

Each full-text JSON file has the following top-level keys:

dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])

In [17]:
def search_body_text(index, keywords, sentence_only):
    """
    Searches a single full length text for sentences/paragraphs which contain a list of keywords.

    The paper is looked up in the module-level `metadata` dataframe and its
    JSON file is read from below the module-level `DIR` folder.

    inputs:
      index: position (as used by metadata.iloc) of the paper to search in the metadata file
      keywords: list of keywords to search for
      sentence_only: whether or not to show sentence only or full paragraph

    output: list of sentences/paragraphs found containing keywords
    """

    #find text location
    row = metadata.iloc[index]
    sha = row["sha"]
    folder = row["full_text_file"]

    #open text file
    with open(DIR+folder+'/'+folder+'/'+sha+'.json') as f:
        paper = json.load(f)

    found = []
    for text_dict in paper["body_text"]:
        paragraph = text_dict["text"]

        if sentence_only:
            #split the paragraph into sentences and keep those containing a keyword
            for sentence in paragraph.split(". "):
                if find_keyword(keywords, sentence) > 0:
                    found.append(sentence)
        else:
            #otherwise keep the whole paragraph; search the paragraph itself
            #(this branch previously referenced an undefined name `text`)
            if find_keyword(keywords, paragraph) > 0:
                found.append(paragraph)

    return found


In [18]:
def automated_lit_search(metadata, keywords, sentence_only=True):
    """
    Builds a table of keyword findings, one row per matching text snippet.

    Only the rows flagged in the has_full_text column are searched.

    inputs:
      metadata: subset of metadata file to search
      keywords: list of keywords to search
      sentence_only: whether or not to show sentence only or full paragraph

    output: dataframe table of results with columns containing index, title, and text snippet
    """
    with_full_text = metadata[metadata['has_full_text']]
    rows = [
        [paper_index, metadata["title"][paper_index], snippet]
        for paper_index in with_full_text.index
        for snippet in search_body_text(paper_index, keywords, sentence_only)
    ]
    return pd.DataFrame(rows, columns=["index","title","text"])

In [19]:
# Epidemiological terms to look for in the full texts. find_keyword lower-cases
# the text before matching, so the keywords are written in lower case.
keywords_epi = ['r0', 'r 0', 'r_0', 'reproduction number', 'growth rate', 'doubling time', 'incubation period']
# Sentence-level search (sentence_only=True) over the papers selected in data_2.
results = automated_lit_search(data_2, keywords_epi, True)
#results.to_csv('test.csv', index=False)

In [20]:
# Preview the first matching snippets.
results.head()

Unnamed: 0,index,title,text
0,7956,COVID-19 and Italy: what next?,On the basis of the exponential curve predicti...
1,18237,Short-term Forecasts of the COVID-19 Epidemic ...,"In the absence of additional information, this..."
2,40109,Analysis of the epidemic growth of the early 2...,"With a mean serial interval of 7.5 days, the b..."
3,40109,Analysis of the epidemic growth of the early 2...,Using internationally exported cases from Wuha...
4,40109,Analysis of the epidemic growth of the early 2...,Other analyses using the series of new confirm...
