In [35]:
#pip install scispacy

In [36]:
#pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz

In [None]:
#pip install swifter

In [38]:
import glob
import json
import re
import pandas as pd
import numpy as np
from sys import getsizeof # Get size of a variable in bytes
import swifter
import pickle

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Spacy
import scispacy
import spacy
import en_core_sci_lg

# Plotting tools
import matplotlib.pyplot as plt

%matplotlib inline

**Read data**

In [4]:
# Root directory of the CORD-19 dataset and the metadata CSV located under it.
root_path = '/kaggle/input/CORD-19-research-challenge/'
metadata_path = f'{root_path}/metadata.csv'

# Read the identifier-like columns as strings instead of letting pandas infer a dtype.
string_columns = ['pubmed_id', 'Microsoft Academic Paper ID', 'doi']
meta_df = pd.read_csv(metadata_path, dtype=dict.fromkeys(string_columns, str))
meta_df.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,xqhn0vbp,1e1286db212100993d03cc22374b624f7caee956,PMC,Airborne rhinovirus detection and effect of ul...,10.1186/1471-2458-3-5,PMC140314,12525263,no-cc,"BACKGROUND: Rhinovirus, the most common cause ...",2003-01-13,"Myatt, Theodore A; Johnston, Sebastian L; Rudn...",BMC Public Health,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
1,gi6uaa83,8ae137c8da1607b3a8e4c946c07ca8bda67f88ac,PMC,Discovering human history from stomach bacteria,10.1186/gb-2003-4-5-213,PMC156578,12734001,no-cc,Recent analyses of human pathogens have reveal...,2003-04-28,"Disotell, Todd R",Genome Biol,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
2,le0ogx1s,,PMC,A new recruit for the army of the men of death,10.1186/gb-2003-4-7-113,PMC193621,12844350,no-cc,"The army of the men of death, in John Bunyan's...",2003-06-27,"Petsko, Gregory A",Genome Biol,,,False,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
3,fy4w7xz8,0104f6ceccf92ae8567a0102f89cbb976969a774,PMC,Association of HLA class I with severe acute r...,10.1186/1471-2350-4-9,PMC212558,12969506,no-cc,BACKGROUND: The human leukocyte antigen (HLA) ...,2003-09-12,"Lin, Marie; Tseng, Hsiang-Kuang; Trejaut, Jean...",BMC Med Genet,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...
4,0qaoam29,5b68a553a7cbbea13472721cd1ad617d42b40c26,PMC,A double epidemic model for the SARS propagation,10.1186/1471-2334-3-19,PMC222908,12964944,no-cc,BACKGROUND: An epidemic of a Severe Acute Resp...,2003-09-10,"Ng, Tuen Wai; Turinici, Gabriel; Danchin, Antoine",BMC Infect Dis,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...


In [5]:
# Recursively collect every JSON file below the dataset root.
# glob returns paths in arbitrary, filesystem-dependent order; sorting keeps the
# row order of everything derived from this list reproducible between runs.
all_json = sorted(glob.glob(f'{root_path}/**/*.json', recursive=True))
len(all_json), type(all_json)

(60596, list)

In [6]:
class FileReader:
    """Load one paper JSON file and expose its text as plain strings.

    Attributes:
        paper_id: value of the ``paper_id`` key of the JSON document.
        abstract: ``text`` of every ``abstract`` entry, joined with newlines.
        body_text: ``text`` of every ``body_text`` entry, joined with newlines.

    Raises:
        OSError: if ``file_path`` cannot be opened.
        ValueError: if the file does not contain valid JSON.
        KeyError: if ``paper_id``, ``abstract`` or ``body_text`` is missing, or
            an entry has no ``text`` key.
    """

    def __init__(self, file_path):
        # Decode explicitly as UTF-8 rather than with the platform's default
        # encoding, and release the file handle as soon as parsing is done.
        with open(file_path, encoding='utf-8') as file:
            content = json.load(file)

        self.paper_id = content['paper_id']
        self.abstract = '\n'.join(entry['text'] for entry in content['abstract'])
        self.body_text = '\n'.join(entry['text'] for entry in content['body_text'])

    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'
# first_row = FileReader(all_json[0])
# print(first_row)

In [7]:
def get_breaks(content, length):
    """Insert ``<br>`` line breaks into ``content`` for display purposes.

    The text is split on single spaces. Word lengths are accumulated (spaces
    are not counted); when the running total exceeds ``length`` the current
    word is preceded by ``<br>`` instead of a space and the counter restarts
    at zero.

    Args:
        content: text to wrap; must provide ``str.split``.
        length: character budget after which a break is inserted.

    Returns:
        The wrapped text. No separator is emitted before the first word, so the
        result never starts with a stray space or ``<br>``.
    """
    pieces = []
    total_chars = 0

    for position, word in enumerate(content.split(' ')):
        total_chars += len(word)
        if total_chars > length:
            separator = "<br>"
            total_chars = 0
        else:
            separator = " "
        # a separator only makes sense between two words
        if position > 0:
            pieces.append(separator)
        pieces.append(word)

    # one join instead of repeated string concatenation
    return ''.join(pieces)

In [8]:
def getPapersAsDataframe(json_list=None, max_summary_words=100, line_width=40):
    """Build a DataFrame of paper text and metadata from paper JSON files.

    Each path is parsed with ``FileReader``. A paper is skipped when parsing
    raises, or when no row of the module-level ``meta_df`` has a ``sha`` equal
    to the paper's ``paper_id``. When several rows match, the first is used.

    Args:
        json_list: sized iterable of JSON file paths; ``None`` is treated as
            an empty list.
        max_summary_words: abstracts with more space-separated words than this
            are cut to that many words and suffixed with ``...`` in the
            ``abstract_summary`` column.
        line_width: ``length`` argument passed to ``get_breaks``.

    Returns:
        pandas.DataFrame with the columns ``paper_id``, ``doi``, ``abstract``,
        ``body_text``, ``authors``, ``title``, ``journal`` and
        ``abstract_summary``; one row per retained paper.
    """
    columns = ['paper_id', 'doi', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary']
    dict_ = {column: [] for column in columns}

    if json_list is None:
        json_list = []

    # Report progress about ten times. The step is at least 1 so that a list
    # with fewer than ten entries does not trigger a modulo by zero.
    progress_step = max(1, len(json_list) // 10)

    for idx, entry in enumerate(json_list):
        if idx % progress_step == 0:
            print(f'Processing index: {idx} of {len(json_list)}')

        try:
            content = FileReader(entry)
        except Exception:
            continue  # invalid paper format, skip

        # metadata rows for this paper (looked up once and reused below)
        meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
        # no metadata, skip this paper
        if len(meta_data) == 0:
            continue

        dict_['abstract'].append(content.abstract)
        dict_['paper_id'].append(content.paper_id)
        dict_['body_text'].append(content.body_text)

        # also create a column for the summary of abstract to be used in a plot
        abstract_words = content.abstract.split(' ')
        if len(content.abstract) == 0:
            # no abstract provided
            summary = "Not provided."
        elif len(abstract_words) > max_summary_words:
            # abstract is too long for a plot: keep the first words, append ...
            summary = get_breaks(' '.join(abstract_words[:max_summary_words]), line_width) + "..."
        else:
            # abstract is short enough
            summary = get_breaks(content.abstract, line_width)
        dict_['abstract_summary'].append(summary)

        # authors: ';'-separated string, or a non-string missing value
        authors_raw = meta_data['authors'].values[0]
        if isinstance(authors_raw, str):
            authors = authors_raw.split(';')
            joined_authors = '. '.join(authors)
            if len(authors) > 2:
                # more than 2 authors may not fit in a plot, so add line breaks
                joined_authors = get_breaks(joined_authors, line_width)
            dict_['authors'].append(joined_authors)
        else:
            # missing value: store it unchanged
            dict_['authors'].append(authors_raw)

        # add the title information, add breaks when needed
        title_raw = meta_data['title'].values[0]
        if isinstance(title_raw, str):
            dict_['title'].append(get_breaks(title_raw, line_width))
        else:
            # title was not provided: store the missing value unchanged
            dict_['title'].append(title_raw)

        # add the journal information
        dict_['journal'].append(meta_data['journal'].values[0])

        # add doi
        dict_['doi'].append(meta_data['doi'].values[0])

    return pd.DataFrame(dict_, columns=columns)


In [9]:
# Parse all collected JSON files into a single DataFrame.
df_full = getPapersAsDataframe(all_json)

Processing index: 0 of 60596
Processing index: 6059 of 60596
Processing index: 12118 of 60596
Processing index: 18177 of 60596
Processing index: 24236 of 60596
Processing index: 30295 of 60596
Processing index: 36354 of 60596
Processing index: 42413 of 60596
Processing index: 48472 of 60596
Processing index: 54531 of 60596
Processing index: 60590 of 60596


In [12]:
# Combine abstract and body text.
# A newline separates the two parts so that the last word of the abstract is
# not fused with the first word of the body.
df_full['abstract_body_text'] = df_full['abstract'] + '\n' + df_full['body_text']

**Load SciSpacy model**

In [14]:
# Load the scispaCy pipeline by its package name.
nlp = spacy.load("en_core_sci_lg")
# Raise the pipeline's per-document character limit.
nlp.max_length = 2_000_000

In [33]:
# Tokenizer for lemmatized words derived from the entities given by scispacy.
def spacy_tokenizer_ent(sentence):
    """Return the filtered entity lemmas of ``sentence`` as one space-joined string.

    The text is run through the module-level ``nlp`` pipeline. The lemma of
    each entity is split on whitespace; words that are numeric or shorter than
    three characters are dropped.
    """
    kept_words = []
    for entity in nlp(sentence).ents:
        for word in entity.lemma_.split():
            if word.isnumeric() or len(word) < 3 or word == ' ':
                continue
            kept_words.append(word)
    return ' '.join(kept_words)


In [1]:
# Run the entity tokenizer over every abstract (via swifter's apply) and keep
# the result as a new column.
abstract_entities = df_full['abstract'].swifter.apply(spacy_tokenizer_ent)
df_full['abstract_ent'] = abstract_entities

In [43]:
# Persist the paper ids and their entity strings so later runs can reuse them.
entity_csv_path = '1.abstrac_entity_vectorization_abstract-entities.csv'
df_full[['paper_id', 'abstract_ent']].to_csv(entity_csv_path)

**Vectorize**

In [45]:
# Bag-of-words vectorizer over the entity strings.
# Float min_df / max_df are document-frequency proportions: terms appearing in
# fewer than 1% or more than 90% of the documents are left out of the vocabulary.
vectorizer = CountVectorizer(
    stop_words='english',
    max_df=0.9,
    min_df=0.01,
    tokenizer=None,  # keep scikit-learn's built-in tokenization
)

In [None]:
# Learn the vocabulary from the entity strings and build the document-term matrix.
entity_documents = df_full['abstract_ent']
full_vectorized = vectorizer.fit_transform(entity_documents)

In [None]:
# Check vectorized data
print(full_vectorized.shape, type(full_vectorized))
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# get_feature_names_out() when available and fall back on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
print(len(feature_names))

In [None]:
# Dump vectorizer and vectorized data for future use.
# The context managers close each binary file once it has been written.
with open('1.abstrac_entity_vectorization_vectorizer.pkl', "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
with open('1.abstrac_entity_vectorization_vectorized.pkl', "wb") as matrix_file:
    pickle.dump(full_vectorized, matrix_file)

In [None]:
# Most frequent words
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# get_feature_names_out() when available and fall back on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Column sums of the document-term matrix give the total count of each word.
word_count = pd.DataFrame({'word': feature_names, 'count': np.asarray(full_vectorized.sum(axis=0))[0]})

# Keep the 20 most frequent words, then sort ascending for the horizontal bar chart.
top_words = word_count.sort_values('count', ascending=False).set_index('word')[:20]
top_words.sort_values('count', ascending=True).plot(kind='barh')