In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import json

import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [None]:
# Root of the CORD-19 dataset as mounted in the Kaggle kernel.
root_path = '/kaggle/input/CORD-19-research-challenge/'
metadata_path = f'{root_path}/metadata.csv'

# Identifier-like columns are read as str instead of letting pandas infer a dtype.
id_columns = ['pubmed_id', 'Microsoft Academic Paper ID', 'doi']
meta_df = pd.read_csv(metadata_path, dtype=dict.fromkeys(id_columns, str))

In [None]:
# glob returns paths in arbitrary (filesystem-dependent) order. Sorting makes the
# paper order -- and therefore the row order and chunk numbering derived from
# it further down -- reproducible across kernel restarts.
all_json = sorted(glob.glob(f'{root_path}/**/*.json', recursive=True))
len(all_json)

60596

In [None]:
class FileReader:
    """Load one paper JSON file and expose its id, abstract and body text.

    Attributes:
        paper_id: value of the top-level ``paper_id`` key.
        abstract: the ``text`` of every ``abstract`` entry, joined with newlines.
        body_text: the ``text`` of every ``body_text`` entry, joined with newlines.

    Raises:
        KeyError: if ``paper_id``, ``abstract`` or ``body_text`` (or an entry's
            ``text``) is missing.
        ValueError: if the file is not valid JSON / not valid UTF-8.
        OSError: if the file cannot be opened.
    """

    def __init__(self, file_path):
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(file_path, encoding='utf-8') as file:
            content = json.load(file)
        # The file handle is no longer needed once the document is parsed.
        self.paper_id = content['paper_id']
        self.abstract = '\n'.join(entry['text'] for entry in content['abstract'])
        self.body_text = '\n'.join(entry['text'] for entry in content['body_text'])

    def __repr__(self):
        # Show only the first 200 characters of each text field.
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'

In [None]:
def get_breaks(content, length):
    """Insert ``<br>`` line breaks into ``content`` for display in a plot.

    The text is split on single spaces. Word lengths (separators excluded) are
    accumulated; once the running total exceeds ``length`` the current word is
    prefixed with ``<br>`` and the counter restarts at zero, otherwise the word
    is prefixed with a space. The result therefore always begins with a
    separator (a space or ``<br>``).
    """
    pieces = []
    running = 0
    for word in content.split(' '):
        running += len(word)
        if running > length:
            pieces.append("<br>")
            running = 0
        else:
            pieces.append(" ")
        pieces.append(word)
    return "".join(pieces)

In [None]:
# Column-wise accumulator: one list per output column, appended to in lock-step.
dict_ = {'paper_id': [], 'doi':[], 'abstract': [], 'body_text': [], 'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}

# Report progress roughly every 10%. max(1, ...) avoids a ZeroDivisionError
# when fewer than 10 files are found.
progress_step = max(1, len(all_json) // 10)

for idx, entry in enumerate(all_json):
    if idx % progress_step == 0:
        print(f'Processing index: {idx} of {len(all_json)}')

    try:
        content = FileReader(entry)
    except Exception:
        # Deliberate best effort: unreadable or unexpectedly shaped files are skipped.
        continue

    # Metadata rows whose 'sha' equals this paper's id (looked up once per paper).
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    # no metadata, skip this paper
    if len(meta_data) == 0:
        continue

    dict_['abstract'].append(content.abstract)
    dict_['paper_id'].append(content.paper_id)
    dict_['body_text'].append(content.body_text)

    # Short, line-broken version of the abstract for use in a plot.
    abstract_words = content.abstract.split(' ')
    if len(content.abstract) == 0:
        # no abstract provided
        dict_['abstract_summary'].append("Not provided.")
    elif len(abstract_words) > 100:
        # abstract is too long for the plot: keep the first 100 words, end with "..."
        summary = get_breaks(' '.join(abstract_words[:100]), 40)
        dict_['abstract_summary'].append(summary + "...")
    else:
        # abstract is short enough
        summary = get_breaks(content.abstract, 40)
        dict_['abstract_summary'].append(summary)

    # Authors: a ';'-separated string is split and re-joined with '. '.
    # Non-string values (e.g. a missing value) are stored unchanged.
    authors_raw = meta_data['authors'].values[0]
    if isinstance(authors_raw, str):
        authors = authors_raw.split(';')
        if len(authors) > 2:
            # more than 2 authors: add line breaks so the list fits in the plot
            dict_['authors'].append(get_breaks('. '.join(authors), 40))
        else:
            # authors will fit in plot
            dict_['authors'].append(". ".join(authors))
    else:
        dict_['authors'].append(authors_raw)

    # Title: add line breaks when it is a string, otherwise store the raw value.
    title_raw = meta_data['title'].values[0]
    if isinstance(title_raw, str):
        dict_['title'].append(get_breaks(title_raw, 40))
    else:
        dict_['title'].append(title_raw)

    # add the journal information
    dict_['journal'].append(meta_data['journal'].values[0])

    # add doi
    dict_['doi'].append(meta_data['doi'].values[0])

df_covid = pd.DataFrame(dict_, columns=['paper_id', 'doi', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary'])

Processing index: 0 of 60596
Processing index: 6059 of 60596
Processing index: 12118 of 60596
Processing index: 18177 of 60596
Processing index: 24236 of 60596
Processing index: 30295 of 60596
Processing index: 36354 of 60596
Processing index: 42413 of 60596
Processing index: 48472 of 60596
Processing index: 54531 of 60596
Processing index: 60590 of 60596


In [None]:
# Single text field per paper: abstract, one space, then the body text.
abstract_col = df_covid['abstract']
body_col = df_covid['body_text']
df_covid['abstract_body_text'] = abstract_col + ' ' + body_col

In [None]:
# Every .txt file in the external dataset is treated as a keyword list,
# one keyword per line.
path = '/kaggle/input/externaldataset/*.txt'
keyword_files = glob.glob(path)

all_keyword = []
for f in keyword_files:
    # Context manager so each file handle is closed as soon as it has been read.
    with open(f, 'r') as file_object:
        keyword = file_object.read()
    keyword_arr = keyword.split('\n')
    # Normalise: lower-case, '/' -> space, and drop '^+^', ':' and '-'.
    keyword_arr_clean = [i.lower().replace('/', ' ').replace('^+^', '').replace(':','').replace('-','') for i in keyword_arr]
    all_keyword.append(keyword_arr_clean)

In [None]:
# Flatten the per-file lists into one normalised keyword list.
# The emptiness check runs AFTER strip(): a whitespace-only line (e.g. a stray
# '\r') would otherwise become '' and, being a substring of every text,
# match every document.
keywords = [
    kw
    for kw in (item.strip().lower() for sublist in all_keyword for item in sublist)
    if kw
]
keywords_uni = set(keywords)
len(keywords_uni)

15814

#new code

In [None]:
import re
def match_keyword(text, keywords=None):
    """Return the keywords that occur as substrings of ``text``.

    Matching is done against the lower-cased text, so keywords are expected
    to be lower case already.

    Args:
        text: the document text to search.
        keywords: iterable of keywords to look for. Defaults to the
            module-level ``keywords_uni`` set.

    Returns:
        A list of the matching keywords, in the iteration order of ``keywords``.
    """
    if keywords is None:
        keywords = keywords_uni
    # Lower-case once instead of once per keyword.
    lowered = text.lower()
    return [dis for dis in keywords if dis in lowered]

In [None]:
# !pip install swifter

In [None]:
import time
import swifter
n = 500  # chunk row size
# Split df_covid into consecutive chunks of n rows.
list_df = [df_covid[i:i+n] for i in range(0,df_covid.shape[0],n)]

# Index of the first chunk to process; chunks before it are skipped.
count = 3
for i in range(count, len(list_df)):
    print('processing..', i)
    # Work on a copy: adding a column to a slice of df_covid is chained
    # assignment, which pandas warns about and does not guarantee to apply.
    df = list_df[i].copy()
    df['keyword_match']=df.swifter.apply(lambda row: match_keyword(row['abstract_body_text']),axis=1)
    list_df[i] = df
    # One CSV per chunk, named after the chunk index.
    df.to_csv('df_'+str(i)+'.csv')
    count = i + 1

processing.. 3


AttributeError: 'int' object has no attribute 'swifter'

In [None]:
# Install the en_core_sci_lg model package, release v0.2.4, from the scispaCy release bucket.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz

In [None]:
!pip install scispacy

In [None]:
import scispacy
import spacy
import en_core_sci_lg
from spacy import displacy
from scispacy.abbreviation import AbbreviationDetector
from scispacy.umls_linking import UmlsEntityLinker

# Alternatives tried previously:
#   nlp = spacy.load("en_core_sci_sm")
#   nlp = en_core_sci_lg.load(disable=["tagger", "parser", "ner"])
# Load the large scispaCy pipeline by package name.
nlp = spacy.load("en_core_sci_lg")
# Raise the pipeline's maximum accepted text length to two million characters.
nlp.max_length = 2_000_000

In [None]:
# Sample passage for ad-hoc experiments; in the visible cells it is only
# referenced from commented-out code.
text = "Red mice was done. Calf diarrhea is a commonly reported disease in young animals, and still a major cause of productivity and economic loss to cattle producers worldwide. In the report of the 2007 National Animal Health Monitoring System for U.S. dairy, half of the deaths among unweaned calves was attributed to diarrhea. Multiple pathogens are known or postulated to cause or contribute to calf diarrhea development. Other factors including both the environment and management practices influence disease severity or outcomes. The multifactorial nature of calf diarrhea makes this disease hard to control effectively in modern cow-calf operations. The purpose of this review is to provide a better understanding of a) the ecology and pathogenesis of well-known and potential bovine enteric pathogens implicated in calf diarrhea, b) describe diagnostic tests used to detect various enteric pathogens along with their pros and cons, and c) propose improved intervention strategies for treating calf diarrhea."

In [None]:
# def my_tokenizer_ent(sentence):
#     return set([str(word).lower() for word in nlp(text).ents])

In [None]:
def _extract_entities(document_text):
    """Run the loaded pipeline on one text and return its entity spans (``Doc.ents``)."""
    return nlp(document_text).ents


# One pipeline call per paper; the entity spans are stored in a new column.
df_covid['abstract_body_text_nlp'] = df_covid['abstract_body_text'].apply(_extract_entities)

In [None]:
from fuzzywuzzy import process
from fuzzywuzzy import fuzz

def fuzzy_match(x, choices, scorer, cutoff):
    """Return the best fuzzy match for ``x['entities']`` among ``choices``.

    Args:
        x: mapping-like row providing an ``'entities'`` value to look up.
        choices: candidate strings to match against.
        scorer: scoring function handed to ``process.extractOne``.
        cutoff: value passed as ``score_cutoff`` to ``process.extractOne``.

    Returns:
        The matched choice, or ``None`` when ``extractOne`` returns no match.
    """
    best = process.extractOne(
        x['entities'],
        choices=choices,
        scorer=scorer,
        score_cutoff=cutoff,
    )
    return best[0] if best else None
    
def get_keyword_match(data_vect):
    """Apply ``fuzzy_match`` to every row of ``data_vect``.

    Each row is matched against the global keyword set using
    ``fuzz.token_set_ratio`` with a score cutoff of 100.
    Returns the per-row result of ``DataFrame.apply``.
    """
    candidate_keywords = list(keywords_uni)
    matcher_args = (candidate_keywords, fuzz.token_set_ratio, 100)
    return data_vect.apply(fuzzy_match, axis=1, args=matcher_args)

In [None]:
# Change how wide a column may be when a frame is displayed.
# NOTE(review): presumably meant to disable truncation of long text cells; pandas
# documents None for "unlimited" -- confirm that 0 behaves as intended on the installed version.
pd.set_option('display.max_colwidth', 0)

In [None]:
# Show only the rows that have a non-null 'FuzzyMatch' value.
# NOTE(review): neither `data_vect` nor its 'FuzzyMatch' column is created in any
# visible cell (the only `data_vect = ...` line is commented out), so this cell
# raises NameError on a fresh kernel. Presumably it was built from the extracted
# entities and get_keyword_match() -- confirm and restore that cell.
data_vect[pd.notnull(data_vect['FuzzyMatch'])]
# data_vect[pd.notnull(data_vect['FuzzyMatch'])].to_csv('entities_match2.csv')

In [None]:
def my_tokenizer(sentence):
    """Tokenise ``sentence`` with the loaded pipeline and return token lemmas.

    Tokens that look like numbers, are stop words, punctuation or whitespace,
    or are exactly one character long are left out.
    """
    lemmas = []
    for token in nlp(sentence):
        if token.like_num or token.is_stop or token.is_punct or token.is_space:
            continue
        if len(token) == 1:
            continue
        lemmas.append(token.lemma_)
    return lemmas

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts over the combined abstract + body text.
# The former `token_pattern` argument was removed: CountVectorizer ignores it
# whenever a custom `tokenizer` is supplied, so it had no effect.
vectorizer = CountVectorizer(analyzer='word',           # features are word tokens
                             stop_words='english',      # drop scikit-learn's English stop words
                             lowercase=True,            # lower-case the text before tokenising
                             tokenizer=my_tokenizer,    # lemmatising tokenizer defined in this notebook
                             min_df=2                   # keep terms that occur in at least 2 documents
                             )
data_vectorized = vectorizer.fit_transform(df_covid['abstract_body_text'])
# Vocabulary (term -> column index) without the purely numeric terms.
data_vectorized_mod = {k:v for k, v in (vectorizer.vocabulary_.items()) if not k.isdigit()}