In [1]:
!pip install rank_bm25 nltk

Collecting rank_bm25
  Downloading https://files.pythonhosted.org/packages/d2/e4/38d03d6d5e2deae8d2838b81d6ba2742475ced42045f5c46aeb00c5fb79c/rank_bm25-0.2.tar.gz
Collecting nltk
  Downloading https://files.pythonhosted.org/packages/f6/1d/d925cfb4f324ede997f6d47bea4d9babba51b49e87a767c170b77005889d/nltk-3.4.5.zip (1.5MB)
Building wheels for collected packages: rank-bm25, nltk
  Building wheel for rank-bm25 (setup.py): started
  Building wheel for rank-bm25 (setup.py): finished with status 'done'
  Created wheel for rank-bm25: filename=rank_bm25-0.2-cp37-none-any.whl size=4169 sha256=b745c59970644c9c064b22ae4af4c3afedc073f4cfcd47d84361e55fc10a2c06
  Stored in directory: C:\Users\Tiger\AppData\Local\pip\Cache\wheels\6f\0c\1f\78945dd6a5478bbcdb50d73ac96ae5af2ffcdfcd374fd9b1bf
  Building wheel for nltk (setup.py): started
  Building wheel for nltk (setup.py): finished with status 'done'
  Created wheel for nltk: filename=nltk-3.4.5-cp37-none-any.whl size=1449913 sha256=5fe698a0155b498aff1e

In [3]:
!pip install ipywidgets

Collecting ipywidgets
  Downloading https://files.pythonhosted.org/packages/56/a0/dbcf5881bb2f51e8db678211907f16ea0a182b232c591a6d6f276985ca95/ipywidgets-7.5.1-py2.py3-none-any.whl (121kB)
Collecting widgetsnbextension~=3.5.0
  Downloading https://files.pythonhosted.org/packages/6c/7b/7ac231c20d2d33c445eaacf8a433f4e22c60677eb9776c7c5262d7ddee2d/widgetsnbextension-3.5.1-py2.py3-none-any.whl (2.2MB)
Installing collected packages: widgetsnbextension, ipywidgets
Successfully installed ipywidgets-7.5.1 widgetsnbextension-3.5.1


In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path, PurePath
import pandas as pd
import requests
from requests.exceptions import HTTPError, ConnectionError
from ipywidgets import interact
import ipywidgets as widgets
from rank_bm25 import BM25Okapi
import nltk
from nltk.corpus import stopwords
nltk.download("punkt")
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tiger\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [5]:
from ipywidgets import interact
import ipywidgets as widgets
import pandas as pd

def set_column_width(ColumnWidth, MaxRows):
    pd.options.display.max_colwidth = ColumnWidth
    pd.options.display.max_rows = MaxRows
    print('Set pandas dataframe column width to', ColumnWidth, 'and max rows to', MaxRows)
    
interact(set_column_width, 
         ColumnWidth=widgets.IntSlider(min=50, max=400, step=50, value=200),
         MaxRows=widgets.IntSlider(min=50, max=500, step=100, value=100));

Set pandas dataframe column width to 200 and max rows to 100


In [7]:
input_dir = PurePath('../../../../COVID19')

list(Path(input_dir).glob('*'))

[WindowsPath('../../../../COVID19/biorxiv_medrxiv'),
 WindowsPath('../../../../COVID19/comm_use_subset'),
 WindowsPath('../../../../COVID19/COVID.DATA.LIC.AGMT.pdf'),
 WindowsPath('../../../../COVID19/custom_license'),
 WindowsPath('../../../../COVID19/json_schema.txt'),
 WindowsPath('../../../../COVID19/metadata.csv'),
 WindowsPath('../../../../COVID19/metadata.readme'),
 WindowsPath('../../../../COVID19/noncomm_use_subset')]

In [8]:
metadata_path = input_dir / 'metadata.csv'
metadata = pd.read_csv(metadata_path,
                               dtype={'Microsoft Academic Paper ID': str,
                                      'pubmed_id': str})

# Set the abstract to the paper title if it is null
metadata.abstract = metadata.abstract.fillna(metadata.title)

## Create Data Classes for the Research Dataset and Papers

In [11]:
def get(url, timeout=6):
    try:
        r = requests.get(url, timeout=timeout)
        return r.text
    except ConnectionError:
        print(f'Cannot connect to {url}')
        print(f'Remember to turn Internet ON in the Kaggle notebook settings')
    except HTTPError:
        print('Got http error', r.status, r.text)

# Convert the doi to a url
def doi_url(d): 
    return f'http://{d}' if d.startswith('doi.org') else f'http://doi.org/{d}'


class ResearchPapers:
    
    def __init__(self, metadata: pd.DataFrame):
        self.metadata = metadata
        
    def __getitem__(self, item):
        return Paper(self.metadata.iloc[item])
    
    def __len__(self):
        return len(self.metadata)
    
    def head(self, n):
        return ResearchPapers(self.metadata.head(n).copy().reset_index(drop=True))
    
    def tail(self, n):
        return ResearchPapers(self.metadata.tail(n).copy().reset_index(drop=True))
    
    def abstracts(self):
        return self.metadata.abstract.dropna()
    
    def titles(self):
        return self.metadata.title.dropna()
        
    def _repr_html_(self):
        return self.metadata._repr_html_()
    
class Paper:
    
    '''
    A single research paper
    '''
    def __init__(self, item):
        self.paper = item.to_frame().fillna('')
        self.paper.columns = ['Value']
    
    def doi(self):
        return self.paper.loc['doi'].values[0]
    
    def html(self):
        '''
        Load the paper from doi.org and display as HTML. Requires internet to be ON
        '''
        if self.doi():
            url = doi_url(self.doi()) 
            text = get(url)
            return widgets.HTML(text)
    
    def text(self):
        '''
        Load the paper from doi.org and display as text. Requires Internet to be ON
        '''
        text = get(self.doi())
        return text
    
    def abstract(self):
        return self.paper.loc['abstract'].values[0]
    
    def title(self):
        return self.paper.loc['title'].values[0]
    
    def authors(self, split=False):
        '''
        Get a list of authors
        '''
        authors = self.paper.loc['authors'].values[0]
        if not authors:
            return []
        if not split:
            return authors
        if authors.startswith('['):
            authors = authors.lstrip('[').rstrip(']')
            return [a.strip().replace("\'", "") for a in authors.split("\',")]
        
        # Todo: Handle cases where author names are separated by ","
        return [a.strip() for a in authors.split(';')]
        
    def _repr_html_(self):
        return self.paper._repr_html_()
    

papers = ResearchPapers(metadata)

## Creating a search index

In [17]:
from rank_bm25 import BM25Okapi
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tiger\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [19]:
english_stopwords = list(set(stopwords.words('english')))

def strip_characters(text):
    t = re.sub('\(|\)|:|,|;|\.|’|”|“|\?|%|>|<', '', text)
    t = re.sub('/', ' ', t)
    t = t.replace("'",'')
    return t

def clean(text):
    t = text.lower()
    t = strip_characters(t)
    return t

def tokenize(text):
    words = nltk.word_tokenize(text)
    return list(set([word for word in words 
                     if len(word) > 1
                     and not word in english_stopwords
                     and not (word.isnumeric() and len(word) is not 4)
                     and (not word.isnumeric() or word.isalpha())] )
               )

def preprocess(text):
    t = clean(text)
    tokens = tokenize(t)
    return tokens

class SearchResults:
    
    def __init__(self, 
                 data: pd.DataFrame,
                 columns = None):
        self.results = data
        if columns:
            self.results = self.results[columns]
            
    def __getitem__(self, item):
        return Paper(self.results.loc[item])
    
    def __len__(self):
        return len(self.results)
        
    def _repr_html_(self):
        return self.results._repr_html_()

SEARCH_DISPLAY_COLUMNS = ['title', 'abstract', 'doi', 'authors', 'journal']
    
class RankBM25Index:
    
    def __init__(self, corpus: pd.DataFrame, columns=SEARCH_DISPLAY_COLUMNS):
        self.corpus = corpus
        self.columns = columns
        raw_search_str = self.corpus.abstract.fillna('') + ' ' + self.corpus.title.fillna('')
        self.index = raw_search_str.apply(preprocess).to_frame()
        self.index.columns = ['terms']
        self.index.index = self.corpus.index
        self.bm25 = BM25Okapi(self.index.terms.tolist())
        
    def search(self, search_string, n=4):
        search_terms = preprocess(search_string)
        doc_scores = self.bm25.get_scores(search_terms)
        ind = np.argsort(doc_scores)[::-1][:n]
        results = self.corpus.iloc[ind][self.columns]
        results['Score'] = doc_scores[ind]
        results = results[results.Score > 0]
        return SearchResults(results.reset_index(), self.columns + ['Score'])
    
bm25_index = RankBM25Index(metadata.head(10000))

In [21]:
results = bm25_index.search('cruise ship')
results

Unnamed: 0,title,abstract,doi,authors,journal,Score
0,Chapter 34 Cruise Ship Travel,Chapter 34 Cruise Ship Travel,10.1016/B978-0-323-03453-1.10034-3,"Mitruka, Kiren; Wheeler, Robert E.",Travel Medicine,26.485593
1,40 Cruise Ship Travel,Abstract Cruise ships can be amplifiers of infectious diseases because of the close human proximity of semiclosed ship environments. The most common diagnoses of cruise passengers evaluated in cru...,10.1016/B978-0-323-54696-6.00040-9,"Hill, Carter D.",Travel Medicine,14.745112
2,Chapter 4 Cruise Geography,Chapter 4 Cruise Geography,10.1016/B978-0-7506-7835-3.50008-X,,Cruise Operations Management,13.044365
3,Chapter 1 Contemporary Cruise Operations,Chapter 1 Contemporary Cruise Operations,10.1016/B978-0-7506-7835-3.50005-4,,Cruise Operations Management,12.855523


In [22]:
results[3].title()

'Chapter 1 Contemporary Cruise Operations'

## Research Papers for each task

In [23]:
tasks = [('What is known about transmission, incubation, and environmental stability?', 
        'transmission incubation environment coronavirus'),
        ('What do we know about COVID-19 risk factors?', 'risk factors'),
        ('What do we know about virus genetics, origin, and evolution?', 'genetics origin evolution'),
        ('What has been published about ethical and social science considerations','ethics ethical social'),
        ('What do we know about diagnostics and surveillance?','diagnose diagnostic surveillance'),
        ('What has been published about medical care?', 'medical care'),
        ('What do we know about vaccines and therapeutics?', 'vaccines vaccine vaccinate therapeutic therapeutics')] 
tasks = pd.DataFrame(tasks, columns=['Task', 'Keywords'])


def show_task(Task):
    print(Task)
    keywords = tasks[tasks.Task == Task].Keywords.values[0]
    search_results = bm25_index.search(keywords, n=200)
    return search_results
    
results = interact(show_task, Task = tasks.Task.tolist());

What is known about transmission, incubation, and environmental stability?


Unnamed: 0,title,abstract,doi,authors,journal,Score
0,Chapter 22 Environmentally Transmitted Pathogens,"This chapter describes a variety of pathogens found in the environment that are capable of infecting humans and causing disease. Different classes of pathogens are discussed including bacteria, pa...",10.1016/B978-0-12-394626-3.00022-3,"Gerba, Charles P.",Environmental Microbiology,11.231821
1,MERS-CoV outbreak following a single patient exposure in an emergency room in South Korea: an epidemiological outbreak study,"Summary Background In 2015, a large outbreak of Middle East respiratory syndrome coronavirus (MERS-CoV) infection occurred following a single patient exposure in an emergency room at the Samsung M...",10.1016/S0140-6736(16)30623-7,"Cho, Sun Young; Kang, Ji-Man; Ha, Young Eun; Park, Ga Eun; Lee, Ji Yeon; Ko, Jae-Hoon; Lee, Ji Yong; Kim, Jong Min; Kang, Cheol-In; Jo, Ik Joon; Ryu, Jae Geum; Choi, Jong Rim; Kim, Seonwoo; Huh, H...",The Lancet,8.580750
2,Induction of lactogenic immunity to transmissible gastroenteritis virus of swine using an attenuated coronavirus mutant able to survive in the physicochemical environment of the digestive tract,Induction of lactogenic immunity to transmissible gastroenteritis virus of swine using an attenuated coronavirus mutant able to survive in the physicochemical environment of the digestive tract,10.1016/0264-410X(92)90273-M,"Aynaud, J.M.; Bernard, S.; Bottreau, E.; Lantier, I.; Salmon, H.; Vannier, Ph.",Vaccine,8.351886
3,Chapter 17 Meat Safety—I Foodborne Pathogens and Other Biological Issues,"Abstract This chapter presents information pertinent to foodborne pathogens (bacteria and bacterial toxins, viruses, parasites) and other biological issues (prions) with importance to the safety o...",10.1016/B978-0-08-100694-8.00017-0,"Lianou, Alexandra; Panagou, Efstathios Z.; Nychas, George-John E.",Lawrie´s Meat Science,7.678443
4,"Deactivated triple vaccine for abortus fever, Ibaraki disease and Akabane disease produced by incubation of the viruses in cell culture","Deactivated triple vaccine for abortus fever, Ibaraki disease and Akabane disease produced by incubation of the viruses in cell culture",10.1016/0264-410X(88)90167-3,,Vaccine,7.455350
...,...,...,...,...,...,...
195,Chapter 11 Nanotechnology and sialic acid biology,"Abstract Nanotechnology is the science of matter at size in a scale of 1/1,000,000,000 of a meter. In the last century, considerable progress has been made in the field of nanotechnology and its f...",10.1016/B978-0-12-816126-5.00011-1,"Ghosh, Shyamasree","Sialic Acids and Sialoglycoconjugates in the Biology of Life, Health and Disease",3.479238
196,Chapter 3 Replication and Expression Strategies of Viruses,"Summary Regardless of their genetic constitution, viral genomes are replicated, expressed, and assembled in association with living host cells. These entities do not undergo division, but rather g...",10.1016/B978-0-12-811257-1.00003-6,"Rampersad, Sephra; Tennant, Paula",Viruses,3.479238
197,"Chapter 36 Identifying, Understanding, and Managing Patient Safety and Clinical Risks in the Clinical Research Environment","Abstract Meticulous study design, exacting scientific review, scrupulous data management, rigorous human subjects' protection, and effective recruitment strategies are essential components of all ...",10.1016/B978-0-12-849905-4.00036-8,"Lee, Laura M.; Henderson, David K.",Principles and Practice of Clinical Research,3.479238
198,Antibody detection of SARS-CoV spike and nucleocapsid protein,Abstract Early detection and identification of SARS-CoV-infected patients and actions to prevent transmission are absolutely critical to prevent another SARS outbreak. Antibodies that specifically...,10.1016/j.bbrc.2003.12.195,"Chang, Mau-Sun; Lu, Yen-Ta; Ho, Shin-Tsung; Wu, Chao-Chih; Wei, Tsai-Yin; Chen, Chia-Ju; Hsu, Yun-Ting; Chu, Po-Chen; Chen, Ching-Hsin; Chu, Jien-Ming; Jan, Ya-Lin; Hung, Chia-Chien; Fan, Chi-Chen...",Biochemical and Biophysical Research Communications,3.472354


## Creating an Autocomplete Search bar

In [24]:
from IPython.display import display

def search_papers(SearchTerms: str):
    search_results = bm25_index.search(SearchTerms, n=10)
    if len(search_results) > 0:
        display(search_results) 
    return search_results

searchbar = widgets.interactive(search_papers, SearchTerms='cruise ship')
searchbar

Unnamed: 0,title,abstract,doi,authors,journal,Score
0,Chapter 34 Cruise Ship Travel,Chapter 34 Cruise Ship Travel,10.1016/B978-0-323-03453-1.10034-3,"Mitruka, Kiren; Wheeler, Robert E.",Travel Medicine,26.485593
1,40 Cruise Ship Travel,Abstract Cruise ships can be amplifiers of infectious diseases because of the close human proximity of semiclosed ship environments. The most common diagnoses of cruise passengers evaluated in cru...,10.1016/B978-0-323-54696-6.00040-9,"Hill, Carter D.",Travel Medicine,14.745112
2,Chapter 4 Cruise Geography,Chapter 4 Cruise Geography,10.1016/B978-0-7506-7835-3.50008-X,,Cruise Operations Management,13.044365
3,Chapter 1 Contemporary Cruise Operations,Chapter 1 Contemporary Cruise Operations,10.1016/B978-0-7506-7835-3.50005-4,,Cruise Operations Management,12.855523
4,59 Respiratory Infections,Abstract Respiratory tract infections (RTIs) are a common health problem of international travelers. Travelers may be at increased risk of RTIs due to travel itself (mingling and close quarters in...,10.1016/B978-0-323-54696-6.00059-8,"Saleri, Nuccia; Ryan, Edward T.",Travel Medicine,6.93343
5,Chapter 22 Cetacea,"Abstract This chapter presents the pathology of cetaceans, a diverse group of mammals restricted exclusively to aquatic habitats. The taxa include the largest mammals on earth, the baleen whales, ...",10.1016/B978-0-12-805306-5.00022-5,"St. Leger, Judy; Raverty, Stephen; Mena, Alexandria",Pathology of Wildlife and Zoo Animals,6.44533


## Access the search results

In [25]:
searchbar.result[0]

Unnamed: 0,Value
title,Chapter 34 Cruise Ship Travel
abstract,Chapter 34 Cruise Ship Travel
doi,10.1016/B978-0-323-03453-1.10034-3
authors,"Mitruka, Kiren; Wheeler, Robert E."
journal,Travel Medicine
Score,26.4856


## Parsing Json files

In [27]:
json_path = '../../../../COVID19/custom_license/custom_license/aecbc613ebdab36753235197ffb4f35734b5ca63.json'


import json
import collections
from functools import  partial

def load_json(json_file):
    with open(json_file, 'r') as f:
        return json.load(f)

paper_json = load_json(json_path)

In [29]:
def get_text(paper, text_key):
    body_dict = collections.defaultdict(list)
    for rec in paper[text_key]:
        body_dict[rec['section']].append(rec['text'])
    
    body = ''
    for section, text_sections in body_dict.items():
        body += section + '\n\n'
        for text in text_sections:
            body += text + '\n\n'
    return body

get_body = partial(get_text, text_key='body_text')
get_abstract = partial(get_text, text_key='abstract') 

In [30]:
body = get_body(paper_json)
print(body[:1000])



The patient (Fo, ) was a 58 year old mentally retarded white woman, born in a rural area of southwestern Virginia.

In July 1967 she was referred to the University of Virginia Hospital (UVH) because of edema of the legs and facial swelling of recent onset. She admitted having arthralgias but denied having recent rash, pleurisy or hair loss. Previous medical history revealed that a systolic heart murmur had been heard in 1962. A hemogram and urinalysis at that time were normal. In June 1963 she had been admitted to her local hospital with congestive heart failure attributed to mitral insufficiency.

Laboratory studies included a positive lupus erythematosus cell preparation, and she received a brief course of prednisone therapy. A pruritic rash of her neck and trunk was recorded in June 1964.

Family members included a healthy twin sister (Case 2), who was identical in appearance.

The sisters shared the phenotype Gm (3, 5, 13, 14) ; Inv(-l)* and common red blood cell groups (type 0, 

In [32]:
abstract = get_abstract(paper_json)
print(abstract[:1000])

Abstract

Middle-aged female identical twins, one of whom had systemic lupus erythematosus (SLE), were evaluated for immunologic reactivity to previous antigenic challenges, including primary immunization with a foreign antigen, keyhole limpet hemocyanin (KLH). These two women had lived together for all of their 58 years and neither was receiving anti-inflammatory or immunosuppressive drugs at the time of these studies. Both twins demonstrated comparable 7s and 19s humoral antibody response to KLH, as well as similar viral antibody titers. However, the twin with SLE was anergic to common antigens, streptokinase-streptodornase, Trichophyton and Candida; furthermore delayed hypersensitivity to KLH did not develop after immunization. This observed discrepancy between humoral and cellular immunity in genetically similar subjects may be significant in the pathogenesis of SLE.

Reports of an increased incidence of systemic lupus erythematosus (SLE), other connective tissue diseases, and sero

In [33]:
def author_name(author_json):
    first = author_json.get('first')
    middle = "".join(author_json.get('middle'))
    last = author_json.get('last')
    if middle:
        return ' '.join([first, middle, last])
    return ' '.join([first, last])

def get_affiliation(author_json):
    affiliation = author_json['affiliation']
    institution = affiliation.get('institution', '')
    location = affiliation.get('location')
    if location:
        location = ' '.join(location.values())
    return f'{institution}, {location}'

def get_authors(paper, include_affiliation=False):
    if include_affiliation:
        return [f'{author_name(a)}, {get_affiliation(a)}'
                   for a in paper['metadata']['authors']]
    else:
        return [author_name(a) for a in paper['metadata']['authors']]
    
authors = get_authors(paper_json)
authors

['Carolyn M Brunner',
 'A David',
 ' Horwitz',
 'K Mary',
 ' Shann',
 'Benjamin A Sturgill',
 'S John',
 ' Davis',
 'Virginia Charlottesville']

## The CORD Research Paper Search Engine