This code is completaly based on https://www.kaggle.com/dgunning/browsing-research-papers-with-a-bm25-search-engine

Installing rank_bm25 library

# Importing some libraries

In [31]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path, PurePath
import pandas as pd
import requests
from requests.exceptions import HTTPError, ConnectionError
from ipywidgets import interact
import ipywidgets as widgets
from rank_bm25 import BM25Okapi
import nltk
from nltk.corpus import stopwords
nltk.download("punkt")
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joseh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Some adjustments in display

In [32]:
from ipywidgets import interact
import ipywidgets as widgets
import pandas as pd

def set_column_width(ColumnWidth, MaxRows):
    pd.options.display.max_colwidth = ColumnWidth
    pd.options.display.max_rows = MaxRows
    print('Set pandas dataframe column width to', ColumnWidth, 'and max rows to', MaxRows)
    
interact(set_column_width, 
         ColumnWidth=widgets.IntSlider(min=50, max=400, step=50, value=200),
         MaxRows=widgets.IntSlider(min=50, max=500, step=100, value=100));

interactive(children=(IntSlider(value=200, description='ColumnWidth', max=400, min=50, step=50), IntSlider(val…

# 1. Load metadata

In [4]:
# Where are all the files located
input_dir = PurePath('2020-03-13')

# The all sources metadata file
metadata = pd.read_csv(input_dir / 'all_sources_metadata_2020-03-13.csv',
                       dtype={'Microsoft Academic Paper ID': str,
                             'pubmed_id': str})

# Convert the doi to a url
def doi_url(d): return f'http://{d}' if d.startswith('doi.org') else f'http://doi.org/{d}'
metadata.doi = metadata.doi.fillna('').apply(doi_url)

# Set the abstract to the paper title if it is null
metadata.abstract = metadata.abstract.fillna(metadata.title)

Total number of papers on metadata

### Dropping duplicate papers

In [7]:
# Some papers are duplicated since they were collected from separate sources. Thanks Joerg Rings
duplicate_paper = ~(metadata.title.isnull() | metadata.abstract.isnull()) & (metadata.duplicated(subset=['title', 'abstract']))
metadata = metadata[~duplicate_paper].reset_index(drop=True)

Results after deleting duplicates

# 2. Class to process the Research Dataset

A class with function that help handle papers information

In [8]:
def get(url, timeout=6):
    try:
        r = requests.get(url, timeout=timeout)
        return r.text
    except ConnectionError:
        print(f'Cannot connect to {url}')
        print(f'Remember to turn Internet ON in the Kaggle notebook settings')
    except HTTPError:
        print('Got http error', r.status, r.text)

class DataHolder:
    '''
    A wrapper for a dataframe with useful functions for notebooks
    '''
    def __init__(self, data: pd.DataFrame):
        self.data = data
        
    def __len__(self): return len(self.data)
    def __getitem__(self, item): return self.data.loc[item]
    def head(self, n:int): return DataHolder(self.data.head(n).copy())
    def tail(self, n:int): return DataHolder(self.data.tail(n).copy())
    def _repr_html_(self): return self.data._repr_html_()
    def __repr__(self): return self.data.__repr__()


class ResearchPapers:
    
    def __init__(self, metadata: pd.DataFrame):
        self.metadata = metadata
        
    def __getitem__(self, item):
        return Paper(self.metadata.iloc[item])
    
    def __len__(self):
        return len(self.metadata)
    
    def head(self, n):
        return ResearchPapers(self.metadata.head(n).copy().reset_index(drop=True))
    
    def tail(self, n):
        return ResearchPapers(self.metadata.tail(n).copy().reset_index(drop=True))
    
    def abstracts(self):
        return self.metadata.abstract.dropna()
    
    def titles(self):
        return self.metadata.title.dropna()
        
    def _repr_html_(self):
        return self.metadata._repr_html_()
    
class Paper:
    
    '''
    A single research paper
    '''
    def __init__(self, item):
        self.paper = item.to_frame().fillna('')
        self.paper.columns = ['Value']
    
    def doi(self):
        return self.paper.loc['doi'].values[0]
    
    def html(self):
        '''
        Load the paper from doi.org and display as HTML. Requires internet to be ON
        '''
        text = get(self.doi())
        return widgets.HTML(text)
    
    def text(self):
        '''
        Load the paper from doi.org and display as text. Requires Internet to be ON
        '''
        text = get(self.doi())
        return text
    
    def abstract(self):
        return self.paper.loc['abstract'].values[0]
    
    def title(self):
        return self.paper.loc['title'].values[0]
    
    def authors(self, split=False):
        '''
        Get a list of authors
        '''
        authors = self.paper.loc['authors'].values[0]
        if not authors:
            return []
        if not split:
            return authors
        if authors.startswith('['):
            authors = authors.lstrip('[').rstrip(']')
            return [a.strip().replace("\'", "") for a in authors.split("\',")]
        
        # Todo: Handle cases where author names are separated by ","
        return [a.strip() for a in authors.split(';')]
        
    def _repr_html_(self):
        return self.paper._repr_html_()
    

papers = ResearchPapers(metadata)

# Search index

Search index for matching tokens in document

In [9]:
from rank_bm25 import BM25Okapi

Text Preprocessing
To prepare the text for the search index we perform the following steps

1. Remove punctuations and special characters
2. Convert to lowercase
3. Tokenize into individual tokens (words mostly)
4. Remove stopwords like (and, to))

You can tweak the code below to improve the search results

In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joseh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
english_stopwords = list(set(stopwords.words('english')))

def strip_characters(text):
    t = re.sub('\(|\)|:|,|;|\.|’|”|“|\?|%|>|<', '', text)
    t = re.sub('/', ' ', t)
    t = t.replace("'",'')
    return t

def clean(text):
    t = text.lower()
    t = strip_characters(t)
    return t

def tokenize(text):
    words = nltk.word_tokenize(text)
    return list(set([word for word in words 
                     if len(word) > 1
                     and not word in english_stopwords
                     and not (word.isnumeric() and len(word) is not 4)
                     and (not word.isnumeric() or word.isalpha())] )
               )

def preprocess(text):
    t = clean(text)
    tokens = tokenize(t)
    return tokens

class SearchResults:
    
    def __init__(self, 
                 data: pd.DataFrame,
                 columns = None):
        self.results = data
        if columns:
            self.results = self.results[columns]
            
    def __getitem__(self, item):
        return Paper(self.results.loc[item])
    
    def __len__(self):
        return len(self.results)
        
    def _repr_html_(self):
        return self.results._repr_html_()

SEARCH_DISPLAY_COLUMNS = ['title', 'abstract', 'doi', 'authors', 'journal']

class WordTokenIndex:
    
    def __init__(self, 
                 corpus: pd.DataFrame, 
                 columns=SEARCH_DISPLAY_COLUMNS):
        self.corpus = corpus
        raw_search_str = self.corpus.abstract.fillna('') + ' ' + self.corpus.title.fillna('')
        self.index = raw_search_str.apply(preprocess).to_frame()
        self.index.columns = ['terms']
        self.index.index = self.corpus.index
        self.columns = columns
    
    def search(self, search_string):
        search_terms = preprocess(search_string)
        result_index = self.index.terms.apply(lambda terms: any(i in terms for i in search_terms))
        results = self.corpus[result_index].copy().reset_index().rename(columns={'index':'paper'})
        return SearchResults(results, self.columns + ['paper'])

# Using rankBM25 search index

In [12]:
class RankBM25Index(WordTokenIndex):
    
    def __init__(self, corpus: pd.DataFrame, columns=SEARCH_DISPLAY_COLUMNS):
        super().__init__(corpus, columns)
        self.bm25 = BM25Okapi(self.index.terms.tolist())
        
    def search(self, search_string, n=10):
        search_terms = preprocess(search_string)
        doc_scores = self.bm25.get_scores(search_terms)
        ind = np.argsort(doc_scores)[::-1][:n]
        results = self.corpus.iloc[ind][self.columns]
        results['Score'] = doc_scores[ind]
        results = results[results.Score > 0]
        return SearchResults(results.reset_index(), self.columns + ['Score'])

In [13]:
bm25_index = RankBM25Index(metadata.head(len(metadata)))

In [46]:
print(repr(bm25_index))

<__main__.RankBM25Index object at 0x000001CC661569B0>


In [47]:
import pickle

with open('mypickle.pickle', 'wb') as f:
    pickle.dump(bm25_index, f)

# 5. Creating autocomplete text bar

In [53]:
with open('mypickle.pickle', 'rb') as f:
    loaded = pickle.load(f)

In [54]:
from IPython.display import display

def search_papers(SearchTerms: str):
    search_results = loaded.search(SearchTerms, n=100)
    if len(search_results) > 0:
        display(search_results) 
    return search_results

searchbar = widgets.interactive(search_papers, SearchTerms='mexico')
display(searchbar)

interactive(children=(Text(value='mexico', description='SearchTerms'), Output()), _dom_classes=('widget-intera…

In [65]:
search_results = bm25_index.search('Panama', n=100)
type(search_results[0])

__main__.Paper

In [87]:
search_results[0].doi()

'http://doi.org/'

In [82]:
class MyEncoder(JSONEncoder):
        def default(self, o):
            return o.__dict__    

In [83]:
MyEncoder().encode(search_results[0])

'{"paper": {"_is_copy": null, "_data": {}, "_item_cache": {}}}'