In [1]:
import spacy

In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import json

In [7]:
root_path = '../data/2020-03-13/'
metadata_path = f'{root_path}/all_sources_metadata_2020-03-13.csv'
meta_df = pd.read_csv(metadata_path, dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str, 
    'doi': str
})
meta_df.head()

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text
0,c630ebcdf30652f0422c3ec12a00b50241dc9bd9,CZI,Angiotensin-converting enzyme 2 (ACE2) as a SA...,10.1007/s00134-020-05985-9,,32125455.0,cc-by-nc,,2020,"Zhang, Haibo; Penninger, Josef M.; Li, Yimin; ...",Intensive Care Med,2002765492,#3252,True
1,53eccda7977a31e3d0f565c884da036b1e85438e,CZI,Comparative genetic analysis of the novel coro...,10.1038/s41421-020-0147-1,,,cc-by,,2020,"Cao, Yanan; Li, Lin; Feng, Zhimin; Wan, Shengq...",Cell Discovery,3003430844,#1861,True
2,210a892deb1c61577f6fba58505fd65356ce6636,CZI,Incubation Period and Other Epidemiological Ch...,10.3390/jcm9020538,,,cc-by,The geographic spread of 2019 novel coronaviru...,2020,"Linton, M. Natalie; Kobayashi, Tetsuro; Yang, ...",Journal of Clinical Medicine,3006065484,#1043,True
3,e3b40cc8e0e137c416b4a2273a4dca94ae8178cc,CZI,Characteristics of and Public Health Responses...,10.3390/jcm9020575,,32093211.0,cc-by,"In December 2019, cases of unidentified pneumo...",2020,"Deng, Sheng-Qun; Peng, Hong-Juan",J Clin Med,177663115,#1999,True
4,92c2c9839304b4f2bc1276d41b1aa885d8b364fd,CZI,Imaging changes in severe COVID-19 pneumonia,10.1007/s00134-020-05976-w,,32125453.0,cc-by-nc,,2020,"Zhang, Wei",Intensive Care Med,3006643024,#3242,False


In [8]:
all_json = glob.glob(f'{root_path}/**/*.json', recursive=True)

13202

In [9]:
class FileReader:
    def __init__(self, file_path):
        with open(file_path) as file:
            content = json.load(file)
            self.paper_id = content['paper_id']
            self.abstract = []
            self.body_text = []
            # Abstract
            for entry in content['abstract']:
                self.abstract.append(entry['text'])
            # Body text
            for entry in content['body_text']:
                self.body_text.append(entry['text'])
            self.abstract = '\n'.join(self.abstract)
            self.body_text = '\n'.join(self.body_text)
    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'
first_row = FileReader(all_json[0])
print(first_row)

8f8eb4f004c2002face0723f2f58cc411954d36e: Bordetella bronchiseptica isolate KM22 has been used in experimental infections of swine as a model of clinical B. bronchiseptica infection and to study host-to-host transmission. The draft genome seq... 20-kb insert library preparation protocol (https://www.pacb.com/wp-content/uploads/ Procedure-Checklist-20-kb-Template-Preparation-Using-BluePippin-Size-Selection -System.pdf). The 20-kb library was s...


In [10]:
def get_breaks(content, length):
    data = ""
    words = content.split(' ')
    total_chars = 0

    # add break every length characters
    for i in range(len(words)):
        total_chars += len(words[i])
        if total_chars > length:
            data = data + "<br>" + words[i]
            total_chars = 0
        else:
            data = data + " " + words[i]
    return data

In [11]:
dict_ = {'paper_id': [], 'abstract': [], 'body_text': [], 'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}
for idx, entry in enumerate(all_json):
    if idx % (len(all_json) // 10) == 0:
        print(f'Processing index: {idx} of {len(all_json)}')
    content = FileReader(entry)
    
    # get metadata information
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    # no metadata, skip this paper
    if len(meta_data) == 0:
        continue
    
    dict_['paper_id'].append(content.paper_id)
    dict_['abstract'].append(content.abstract)
    dict_['body_text'].append(content.body_text)
    
    # also create a column for the summary of abstract to be used in a plot
    if len(content.abstract) == 0: 
        # no abstract provided
        dict_['abstract_summary'].append("Not provided.")
    elif len(content.abstract.split(' ')) > 100:
        # abstract provided is too long for plot, take first 300 words append with ...
        info = content.abstract.split(' ')[:100]
        summary = get_breaks(' '.join(info), 40)
        dict_['abstract_summary'].append(summary + "...")
    else:
        # abstract is short enough
        summary = get_breaks(content.abstract, 40)
        dict_['abstract_summary'].append(summary)
        
    # get metadata information
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    
    try:
        # if more than one author
        authors = meta_data['authors'].values[0].split(';')
        if len(authors) > 2:
            # more than 2 authors, may be problem when plotting, so take first 2 append with ...
            dict_['authors'].append(". ".join(authors[:2]) + "...")
        else:
            # authors will fit in plot
            dict_['authors'].append(". ".join(authors))
    except Exception as e:
        # if only one author - or Null valie
        dict_['authors'].append(meta_data['authors'].values[0])
    
    # add the title information, add breaks when needed
    try:
        title = get_breaks(meta_data['title'].values[0], 40)
        dict_['title'].append(title)
    # if title was not provided
    except Exception as e:
        dict_['title'].append(meta_data['title'].values[0])
    
    # add the journal information
    dict_['journal'].append(meta_data['journal'].values[0])
    
df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary'])
df_covid.head()

Processing index: 0 of 13202
Processing index: 1320 of 13202
Processing index: 2640 of 13202
Processing index: 3960 of 13202
Processing index: 5280 of 13202
Processing index: 6600 of 13202
Processing index: 7920 of 13202
Processing index: 9240 of 13202
Processing index: 10560 of 13202
Processing index: 11880 of 13202
Processing index: 13200 of 13202


Unnamed: 0,paper_id,abstract,body_text,authors,title,journal,abstract_summary
0,8f8eb4f004c2002face0723f2f58cc411954d36e,Bordetella bronchiseptica isolate KM22 has bee...,20-kb insert library preparation protocol (htt...,"['Nicholson, Tracy L.', 'Bayles, Darrell O.', ...",Complete Genome Sequence of Bordetella<br>bro...,Microbiol Resour Announc,Bordetella bronchiseptica isolate KM22 has<br...
1,63f7049d200896290b38b38711113054f7ea1b50,,I nfectious diseases have been an ever-present...,"['Fauci, Anthony S.', 'Touchette, Nancy A.', '...",Emerging Infectious Diseases: a 10-Year<br>Pe...,Emerg Infect Dis,Not provided.
2,4df45b8404d9de0b376a8ae3c282a517df36fe51,The influenza A nucleoprotein (NP) is an attra...,The transmission of a pathogenic avian H5N1 vi...,"['Cheung, Ying-Kit', 'Cheng, Samuel Chak-Sum',...",Two novel HLA-A*0201 T-cell epitopes in avian...,Vet Res,The influenza A nucleoprotein (NP) is an<br>a...
3,e0737ee93afe7b0bf06b1e3f9adf21d541dd10f0,The outbreak of severe acute respiratory syndr...,acute lung injury; inflammatory response; neut...,"['Han, Bing', 'Ma, Xuezhong', 'Zhang, Jianhua'...",Protective Effects of Long Pentraxin PTX3 on<...,Lab Invest,The outbreak of severe acute respiratory<br>s...
4,3c3572ba243d61e7631725669c8f88347fdbd5bc,The prevalence of feline herpesvirus-1 (FHV-1)...,Feline herpesvirus type 1 (FHV-1) is the most ...,"['Kang, Byeong-Teck', 'Park, Hee-Myung']","Prevalence of feline herpesvirus 1, feline<br...",J Vet Sci,The prevalence of feline herpesvirus-1<br>(FH...


In [12]:
import spacy
nlp_vecs = spacy.load('en_core_sci_md')
nlp = spacy.load('en_ner_bc5cdr_md')

In [15]:
parsed_abstracts = [d for d in nlp.pipe(df_covid.abstract)]

In [42]:
# make an inventory of all entities
from collections import defaultdict
ent_count = defaultdict(int)
for d in parsed_abstracts:
    for e in d.ents:
        ent_count[(e.text, e.label_)]+=1
ent_inv = list(set([k[0] for k in ent_count]))
# max len
max_len = max([len(e.split()) for e in ent_inv])

In [None]:
# count of entities
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(vocabulary=ent_inv, ngram_range=(1, max_len))
ent_vecs = cv.fit_transform(df_covid.abstract)

In [29]:
#import gzip
#import pickle
#with gzip.open('parsed_abstracts.pkl.gz', 'wb') as f:
#    pickle.dump(parsed_abstracts, f)

In [25]:
# make an inventory of all entities
from collections import defaultdict
ent_count = defaultdict(int)
for d in parsed_abstracts:
    for e in d.ents:
        ent_count[(e.text, e.label_)]+=1

In [27]:
sorted(ent_count.items(), key=lambda x: x[1], reverse=True)

[(('infection', 'DISEASE'), 3607),
 (('SARS', 'DISEASE'), 1264),
 (('infections', 'DISEASE'), 892),
 (('pneumonia', 'DISEASE'), 825),
 (('diarrhea', 'DISEASE'), 746),
 (('infectious diseases', 'DISEASE'), 711),
 (('death', 'DISEASE'), 696),
 (('MERS', 'DISEASE'), 586),
 (('viral infection', 'DISEASE'), 568),
 (('fever', 'DISEASE'), 515),
 (('respiratory syndrome', 'DISEASE'), 487),
 (('infectious disease', 'DISEASE'), 464),
 (('cancer', 'DISEASE'), 454),
 (('viral infections', 'DISEASE'), 447),
 (('inflammation', 'DISEASE'), 439),
 (('amino acid', 'CHEMICAL'), 434),
 (('nucleotide', 'CHEMICAL'), 431),
 (('tumor', 'DISEASE'), 387),
 (('deaths', 'DISEASE'), 322),
 (('acute respiratory syndrome', 'DISEASE'), 315),
 (('NP', 'CHEMICAL'), 310),
 (('TB', 'DISEASE'), 308),
 (('virus infection', 'DISEASE'), 299),
 (('ILI', 'DISEASE'), 291),
 (('respiratory infections', 'DISEASE'), 288),
 (('nucleic acid', 'CHEMICAL'), 266),
 (('SARS-CoV-2', 'CHEMICAL'), 262),
 (('ARI', 'DISEASE'), 250),
 (('ast