This code is completaly based on https://www.kaggle.com/dgunning/browsing-research-papers-with-a-bm25-search-engine

Installing rank_bm25 library

In [1]:
!pip install rank_bm25 nltk

Collecting rank_bm25
  Downloading https://files.pythonhosted.org/packages/d2/e4/38d03d6d5e2deae8d2838b81d6ba2742475ced42045f5c46aeb00c5fb79c/rank_bm25-0.2.tar.gz
Collecting nltk
  Downloading https://files.pythonhosted.org/packages/f6/1d/d925cfb4f324ede997f6d47bea4d9babba51b49e87a767c170b77005889d/nltk-3.4.5.zip (1.5MB)
Building wheels for collected packages: rank-bm25, nltk
  Building wheel for rank-bm25 (setup.py): started
  Building wheel for rank-bm25 (setup.py): finished with status 'done'
  Stored in directory: C:\Users\joseh\AppData\Local\pip\Cache\wheels\6f\0c\1f\78945dd6a5478bbcdb50d73ac96ae5af2ffcdfcd374fd9b1bf
  Building wheel for nltk (setup.py): started
  Building wheel for nltk (setup.py): finished with status 'done'
  Stored in directory: C:\Users\joseh\AppData\Local\pip\Cache\wheels\96\86\f6\68ab24c23f207c0077381a5e3904b2815136b879538a24b483
Successfully built rank-bm25 nltk
Installing collected packages: rank-bm25, nltk
Successfully installed nltk-3.4.5 rank-bm25-0.2


In [3]:
!pip install ipywidgets

Collecting ipywidgets
  Downloading https://files.pythonhosted.org/packages/56/a0/dbcf5881bb2f51e8db678211907f16ea0a182b232c591a6d6f276985ca95/ipywidgets-7.5.1-py2.py3-none-any.whl (121kB)
Collecting widgetsnbextension~=3.5.0 (from ipywidgets)
  Downloading https://files.pythonhosted.org/packages/6c/7b/7ac231c20d2d33c445eaacf8a433f4e22c60677eb9776c7c5262d7ddee2d/widgetsnbextension-3.5.1-py2.py3-none-any.whl (2.2MB)
Installing collected packages: widgetsnbextension, ipywidgets
Successfully installed ipywidgets-7.5.1 widgetsnbextension-3.5.1


# Importing some libraries

In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path, PurePath
import pandas as pd
import requests
from requests.exceptions import HTTPError, ConnectionError
from ipywidgets import interact
import ipywidgets as widgets
from rank_bm25 import BM25Okapi
import nltk
from nltk.corpus import stopwords
nltk.download("punkt")
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joseh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


Some adjustments in display

In [5]:
from ipywidgets import interact
import ipywidgets as widgets
import pandas as pd

def set_column_width(ColumnWidth, MaxRows):
    pd.options.display.max_colwidth = ColumnWidth
    pd.options.display.max_rows = MaxRows
    print('Set pandas dataframe column width to', ColumnWidth, 'and max rows to', MaxRows)
    
interact(set_column_width, 
         ColumnWidth=widgets.IntSlider(min=50, max=400, step=50, value=200),
         MaxRows=widgets.IntSlider(min=50, max=500, step=100, value=100));

Set pandas dataframe column width to 200 and max rows to 100


# 1. Load metadata

In [16]:
# Where are all the files located
input_dir = PurePath('2020-03-13')

# The all sources metadata file
metadata = pd.read_csv(input_dir / 'all_sources_metadata_2020-03-13.csv',
                       dtype={'Microsoft Academic Paper ID': str,
                             'pubmed_id': str})

# Convert the doi to a url
def doi_url(d): return f'http://{d}' if d.startswith('doi.org') else f'http://doi.org/{d}'
metadata.doi = metadata.doi.fillna('').apply(doi_url)

# Set the abstract to the paper title if it is null
metadata.abstract = metadata.abstract.fillna(metadata.title)

Total number of papers on metadata

In [19]:
len(metadata)

29500

### Dropping duplicate papers

In [20]:
# Some papers are duplicated since they were collected from separate sources. Thanks Joerg Rings
duplicate_paper = ~(metadata.title.isnull() | metadata.abstract.isnull()) & (metadata.duplicated(subset=['title', 'abstract']))
metadata = metadata[~duplicate_paper].reset_index(drop=True)

Results after deleting duplicates

In [21]:
len(metadata)

25133

# 2. Class to process the Research Dataset

A class with function that help handle papers information

In [22]:
def get(url, timeout=6):
    try:
        r = requests.get(url, timeout=timeout)
        return r.text
    except ConnectionError:
        print(f'Cannot connect to {url}')
        print(f'Remember to turn Internet ON in the Kaggle notebook settings')
    except HTTPError:
        print('Got http error', r.status, r.text)

class DataHolder:
    '''
    A wrapper for a dataframe with useful functions for notebooks
    '''
    def __init__(self, data: pd.DataFrame):
        self.data = data
        
    def __len__(self): return len(self.data)
    def __getitem__(self, item): return self.data.loc[item]
    def head(self, n:int): return DataHolder(self.data.head(n).copy())
    def tail(self, n:int): return DataHolder(self.data.tail(n).copy())
    def _repr_html_(self): return self.data._repr_html_()
    def __repr__(self): return self.data.__repr__()


class ResearchPapers:
    
    def __init__(self, metadata: pd.DataFrame):
        self.metadata = metadata
        
    def __getitem__(self, item):
        return Paper(self.metadata.iloc[item])
    
    def __len__(self):
        return len(self.metadata)
    
    def head(self, n):
        return ResearchPapers(self.metadata.head(n).copy().reset_index(drop=True))
    
    def tail(self, n):
        return ResearchPapers(self.metadata.tail(n).copy().reset_index(drop=True))
    
    def abstracts(self):
        return self.metadata.abstract.dropna()
    
    def titles(self):
        return self.metadata.title.dropna()
        
    def _repr_html_(self):
        return self.metadata._repr_html_()
    
class Paper:
    
    '''
    A single research paper
    '''
    def __init__(self, item):
        self.paper = item.to_frame().fillna('')
        self.paper.columns = ['Value']
    
    def doi(self):
        return self.paper.loc['doi'].values[0]
    
    def html(self):
        '''
        Load the paper from doi.org and display as HTML. Requires internet to be ON
        '''
        text = get(self.doi())
        return widgets.HTML(text)
    
    def text(self):
        '''
        Load the paper from doi.org and display as text. Requires Internet to be ON
        '''
        text = get(self.doi())
        return text
    
    def abstract(self):
        return self.paper.loc['abstract'].values[0]
    
    def title(self):
        return self.paper.loc['title'].values[0]
    
    def authors(self, split=False):
        '''
        Get a list of authors
        '''
        authors = self.paper.loc['authors'].values[0]
        if not authors:
            return []
        if not split:
            return authors
        if authors.startswith('['):
            authors = authors.lstrip('[').rstrip(']')
            return [a.strip().replace("\'", "") for a in authors.split("\',")]
        
        # Todo: Handle cases where author names are separated by ","
        return [a.strip() for a in authors.split(';')]
        
    def _repr_html_(self):
        return self.paper._repr_html_()
    

papers = ResearchPapers(metadata)

# Search index

Search index for matching tokens in document

In [38]:
from rank_bm25 import BM25Okapi

Text Preprocessing
To prepare the text for the search index we perform the following steps

1. Remove punctuations and special characters
2. Convert to lowercase
3. Tokenize into individual tokens (words mostly)
4. Remove stopwords like (and, to))

You can tweak the code below to improve the search results

In [41]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joseh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [43]:
english_stopwords = list(set(stopwords.words('english')))

def strip_characters(text):
    t = re.sub('\(|\)|:|,|;|\.|’|”|“|\?|%|>|<', '', text)
    t = re.sub('/', ' ', t)
    t = t.replace("'",'')
    return t

def clean(text):
    t = text.lower()
    t = strip_characters(t)
    return t

def tokenize(text):
    words = nltk.word_tokenize(text)
    return list(set([word for word in words 
                     if len(word) > 1
                     and not word in english_stopwords
                     and not (word.isnumeric() and len(word) is not 4)
                     and (not word.isnumeric() or word.isalpha())] )
               )

def preprocess(text):
    t = clean(text)
    tokens = tokenize(t)
    return tokens

class SearchResults:
    
    def __init__(self, 
                 data: pd.DataFrame,
                 columns = None):
        self.results = data
        if columns:
            self.results = self.results[columns]
            
    def __getitem__(self, item):
        return Paper(self.results.loc[item])
    
    def __len__(self):
        return len(self.results)
        
    def _repr_html_(self):
        return self.results._repr_html_()

SEARCH_DISPLAY_COLUMNS = ['title', 'abstract', 'doi', 'authors', 'journal']

class WordTokenIndex:
    
    def __init__(self, 
                 corpus: pd.DataFrame, 
                 columns=SEARCH_DISPLAY_COLUMNS):
        self.corpus = corpus
        raw_search_str = self.corpus.abstract.fillna('') + ' ' + self.corpus.title.fillna('')
        self.index = raw_search_str.apply(preprocess).to_frame()
        self.index.columns = ['terms']
        self.index.index = self.corpus.index
        self.columns = columns
    
    def search(self, search_string):
        search_terms = preprocess(search_string)
        result_index = self.index.terms.apply(lambda terms: any(i in terms for i in search_terms))
        results = self.corpus[result_index].copy().reset_index().rename(columns={'index':'paper'})
        return SearchResults(results, self.columns + ['paper'])

# Using rankBM25 search index

In [53]:
class RankBM25Index(WordTokenIndex):
    
    def __init__(self, corpus: pd.DataFrame, columns=SEARCH_DISPLAY_COLUMNS):
        super().__init__(corpus, columns)
        self.bm25 = BM25Okapi(self.index.terms.tolist())
        
    def search(self, search_string, n=10):
        search_terms = preprocess(search_string)
        doc_scores = self.bm25.get_scores(search_terms)
        ind = np.argsort(doc_scores)[::-1][:n]
        results = self.corpus.iloc[ind][self.columns]
        results['Score'] = doc_scores[ind]
        results = results[results.Score > 0]
        return SearchResults(results.reset_index(), self.columns + ['Score'])
    
bm25 = RankBM25Index(metadata.head(100))

In [71]:
bm25_index = RankBM25Index(metadata.head(len(metadata)))

# 5. Creating autocomplete text bar

In [82]:
from IPython.display import display

def search_papers(SearchTerms: str):
    search_results = bm25_index.search(SearchTerms, n=100)
    if len(search_results) > 0:
        display(search_results) 
    return search_results

searchbar = widgets.interactive(search_papers, SearchTerms='mexico')
display(searchbar)

Unnamed: 0,title,abstract,doi,authors,journal,Score
0,"Novel Bat Coronaviruses, Brazil and Mexico","Novel Bat Coronaviruses, Brazil and Mexico",http://doi.org/http://dx.doi.org/10.3201/eid1910.130525,"['Góes, Luiz Gustavo Bentim', 'Ruvalcaba, Sicilene Gonzalez', 'Campos, Angélica Almeida', 'Queiroz, Luzia Helena', 'de Carvalho, Cristiano', 'Jerez, José Antonio', 'Durigon, Edison Luiz', 'Dávalos...",Emerg Infect Dis,9.983204
1,Clinical Characteristics of Asthmatic Patients With Influenza-Like Illness and Risk for Severe Exacerbations in Mexico,Clinical Characteristics of Asthmatic Patients With Influenza-Like Illness and Risk for Severe Exacerbations in Mexico,http://doi.org/http://dx.doi.org/10.1016/j.anai.2016.03.007,"['Paulin-Prado, Paulina', 'Nishimura, Katherine', 'Freimanis-Hance, Laura', 'Hunsberger, Sally', 'Beigel, John', 'Fraga, Arturo Galindo', 'Hernandez, Ana A Ortiz', 'Llamosas-Gallardo, Beatriz', 'M...",,9.547902
2,Genome Sequence of a Bovine Rhinitis B Virus Identified in Cattle in Sweden,"A bovine rhinitis B virus, identified in a calf from Sweden, was genetically characterized. The complete polyprotein was recovered, and phylogenetic analysis showed that this virus has the highest...",http://doi.org/http://dx.doi.org/10.1128/genomeA.00172-17,"['Blomström, Anne-Lie', 'Oma, Veslemøy', 'Khatri, Mamata', 'Hansen, Hanne H.', 'Stokstad, Maria', 'Berg, Mikael', 'Myrmel, Mette']",Genome Announc,8.712165
3,Complete Genome Sequence of Human Coronavirus OC43 Isolated from Mexico,"We report the complete genome sequence of the first Mexican human coronavirus (HCoV) OC43, obtained by new-generation sequencing and a metagenomic approach, isolated from a child hospitalized with...",http://doi.org/http://dx.doi.org/10.1128/genomeA.01256-16,"['Taboada, B. T.', 'Isa, P.', 'Espinoza, M. A.', 'Aponte, F. E.', 'Arias-Ortiz, M. A.', 'Monge-Martínez, J.', 'Rodríguez-Vázquez, R.', 'Díaz-Hernández, F.', 'Zárate-Vidal, F.', 'Wong-Chew, R. M.',...",Genome Announc,8.315164
4,Identification of co-infection by rotavirus and parvovirus in dogs with gastroenteritis in Mexico,This is the first report on circulating canine rotavirus in Mexico. Fifty samples from dogs with gastroenteritis were analyzed used polymerase chain reaction and reverse transcription polymerase c...,http://doi.org/http://dx.doi.org/10.1016/j.bjm.2017.03.008,"['Ortega, Ariadna Flores', 'Martínez-Castañeda, José Simón', 'Bautista-Gómez, Linda G.', 'Muñoz, Raúl Fajardo', 'Hernández, Israel Quijano']",Braz J Microbiol,8.190751
5,Complete Genome Sequences of Four Novel Human Coronavirus OC43 Isolates Associated with Severe Acute Respiratory Infection,We report here the complete genome sequences of four human coronavirus (HCoV) OC43 isolates generated using targeted viral nucleic acid capture and next-generation sequencing; the isolates were co...,http://doi.org/http://dx.doi.org/10.1128/genomeA.00452-18,"['Dinwiddie, Darrell L.', 'Hardin, Olga', 'Denson, Jesse L.', 'Kincaid, John C.', 'Schwalm, Kurt C.', 'Stoner, Ashley N.', 'Abramo, Thomas J.', 'Thompson, Tonya M.', 'Putt, Claire M.', 'Young, Ste...",Genome Announc,7.674056
6,RNA electropherotypes of human rotaviruses from North and South America,"Between April 1979 and December 1982, viral agents were found in 231 of 695 children admitted to the Texas Children's Hospital with gastroenteritis. Electron microscopic analysis showed that rotav...",http://doi.org/,"['Dimitrov, D. H.', 'Graham, D. Y.', 'Lopez, J.', 'Muchinik, G.', 'Velasco, G.', 'Stenback, W. A.', 'Estes, M. K.']",,7.218682
7,Entry and exit screening of airline travellers during the A(H1N1) 2009 pandemic: a retrospective evaluation,OBJECTIVE: To evaluate the screening measures that would have been required to assess all travellers at risk of transporting A(H1N1)pdm09 out of Mexico by air at the start of the 2009 pandemic. ME...,http://doi.org/http://dx.doi.org/10.2471/BLT.12.114777,"['Khan, Kamran', 'Eckhardt, Rose', 'Brownstein, John S', 'Naqvi, Raza', 'Hu, Wei', 'Kossowsky, David', 'Scales, David', 'Arino, Julien', 'MacDonald, Michael', 'Wang, Jun', 'Sears, Jennifer', 'Cetr...",,6.569013
8,Dengue virus in Mexican bats,"Individuals belonging to five families, 12 genera, and 19 different species of bats from dengue endemic areas in the Gulf and Pacific coasts of Mexico were examined by ELISA, RT–PCR, and for the p...",http://doi.org/http://dx.doi.org/10.1017/S0950268808000460,"['AGUILAR-SETIÉN, Á.', 'ROMERO-ALMARAZ, M.\xa0L.', 'SÁNCHEZ-HERNÁNDEZ, C.', 'FIGUEROA, R.', 'JUÁREZ-PALMA, L.\xa0P.', 'GARCÍA-FLORES, M.\xa0M.', 'VÁZQUEZ-SALINAS, C.', 'SALAS-ROJAS, M.', 'HIDALGO-...",,6.569013
9,Estimating the incidence reporting rates of new influenza pandemics at an early stage using travel data from the source country,"During the surveillance of influenza pandemics, underreported data are a public health challenge that complicates the understanding of pandemic threats and can undermine mitigation efforts. We pro...",http://doi.org/http://dx.doi.org/10.1017/S0950268813002550,"['CHONG, K. C.', 'FONG, H. F.', 'ZEE, C. Y.']",Epidemiol Infect,6.232466


In [79]:
searchbar.result[0]

Unnamed: 0,Value
title,Infectious Diseases and Maternal Morbidity and Mortality
abstract,Infectious Diseases and Maternal Morbidity and Mortality
doi,http://doi.org/http://dx.doi.org/10.3201/eid1011.040624_05
authors,"['Finnegan, Loretta P.', 'Sheffield, Jeanne', 'Sanghvi, Harshad', 'Anker, Martha']"
journal,Emerg Infect Dis
Score,4.79877
