# Takeda Part 2: Iterative Keyword Search

## Part I: Search (keyword -> papers)

In [137]:
from Bio import Entrez
from nltk import word_tokenize
import re
import yake
import pandas as pd

# Settings shared by the Entrez calls in search() and fetch_details().
# Assigned to Entrez.email before every request; empty in this notebook.
# NOTE(review): presumably needs a real contact address before running - confirm.
_email = ''
# Entrez database that is searched and fetched from.
_db = 'pubmed'
# Response format requested from esearch/efetch.
_retmode = 'xml'
# Ordering passed to esearch.
_sort = 'relevance'
# Maximum number of ids requested from a single esearch call.
_retmax = 500

In [141]:
# Config Keywords
# Base YAKE settings shared by both extractors below; the two extractors
# differ only in 'top', the number of keywords returned per call.
config = {
    'lan': 'en',
    'n': 3,
    'dedupLim': 0.9,
    'dedupFunc': 'seqm',
    'windowsSize': 1,
    'top': 100,
    'features': None
}

# Extractor returning up to 100 keywords.
config_100 = {**config}
kwe_100 = yake.KeywordExtractor(**config_100)

# Extractor returning up to 20 keywords. It must be built from config_20:
# building it from the base config would silently yield a 100-keyword
# extractor again.
config_20 = {**config, 'top': 20}
kwe_20 = yake.KeywordExtractor(**config_20)

In [122]:
def search(query):
    """Run an Entrez ESearch for ``query`` and return the parsed response.

    Args:
        query: Search term, e.g.
            ``'(PFS[Title/Abstract]) AND (CLINICAL TRIAL[Title/Abstract])'``.

    Returns:
        The structure produced by ``Entrez.read``. Callers in this notebook
        read the matching ids from its ``'IdList'`` entry. The request uses
        the module-level ``_db``, ``_sort``, ``_retmax`` and ``_retmode``.
    """
    Entrez.email = _email
    handle = Entrez.esearch(db=_db,
                            sort=_sort,
                            retmax=_retmax,
                            retmode=_retmode,
                            term=query)
    try:
        return Entrez.read(handle)
    finally:
        # Release the response handle even when parsing fails.
        handle.close()

def fetch_details(id_list, kw_extractor=None):
    """Fetch records for ``id_list`` and collect title, abstract and keywords.

    Args:
        id_list: Sequence of id strings, e.g. the ``'IdList'`` entry of the
            value returned by ``search``.
        kw_extractor: Object exposing ``extract_keywords(text)`` whose
            entries carry the keyword string at index 0. Defaults to the
            module-level ``kwe_20`` extractor.

    Returns:
        A list with one dict per fetched ``'PubmedArticle'`` record, holding
        the keys ``'id'``, ``'title'``, ``'abstract'`` and ``'keywords'``.
        ``'keywords'`` has at most 15 entries: the keywords listed on the
        record (de-duplicated and sorted) followed by keywords extracted
        from the tokenized, lower-cased title and abstract.
    """
    if not id_list:
        # Nothing to look up; avoid sending a request with an empty id list.
        return []
    if kw_extractor is None:
        kw_extractor = kwe_20

    Entrez.email = _email
    handle = Entrez.efetch(db=_db,
                           retmode=_retmode,
                           id=','.join(id_list))
    try:
        papers_details = Entrez.read(handle)
    finally:
        # Release the response handle even when parsing fails.
        handle.close()

    papers = []
    for paper in papers_details['PubmedArticle']:
        citation = paper['MedlineCitation']
        article = citation['Article']
        title = str(article['ArticleTitle'])

        # Some records carry no abstract; treat that as an empty abstract
        # instead of failing the whole batch with a KeyError.
        abstract_parts = article.get('Abstract', {}).get('AbstractText', [])
        abstract_text = ' '.join(abstract_parts)

        # Listed Keywords
        kw_lists = citation.get('KeywordList', [])
        kws = sorted({str(kw) for kw_list in kw_lists for kw in kw_list})

        # Get Keywords from Title and Abstract
        all_text = re.sub(r'\s+', ' ', title + ' ' + abstract_text)
        all_text = ' '.join(word_tokenize(all_text)).lower()
        kws.extend(entry[0] for entry in kw_extractor.extract_keywords(all_text))

        papers.append({
            # Use the id stored on the record itself rather than the
            # position in id_list, so a response that omits or reorders
            # records cannot attach the wrong id to a paper.
            'id': str(citation['PMID']),
            'title': title,
            'abstract': abstract_text,
            'keywords': kws[:15]
        })

    return papers

In [123]:
# Example query
# Search for records matching both terms in the title/abstract field, then
# fetch the details of every returned id.
query = '(PFS[Title/Abstract]) AND (CLINICAL TRIAL[Title/Abstract])'
res = search(query)
id_list = res['IdList']
papers = fetch_details(id_list)

In [125]:
# Number of paper dicts returned by fetch_details for the example query.
len(papers)

500

## Part II: Keyword Extraction (papers -> keywords)

#### Current Approach
PubMed entries come with a list of keywords for each paper. For now, only those keywords will be utilized during the search.

#### TODO:
* Expand keyword instances by utilizing RAKE/YAKE

## Part III: Filter Algorithm

In [6]:
!conda install -c conda-forge ipywidgets -y

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [92]:
pip install sympy

Collecting sympy
  Downloading sympy-1.9-py3-none-any.whl (6.2 MB)
[K     |████████████████████████████████| 6.2 MB 6.6 MB/s eta 0:00:01
[?25hCollecting mpmath>=0.19
  Downloading mpmath-1.2.1-py3-none-any.whl (532 kB)
[K     |████████████████████████████████| 532 kB 60.3 MB/s eta 0:00:01
[?25hInstalling collected packages: mpmath, sympy
Successfully installed mpmath-1.2.1 sympy-1.9
Note: you may need to restart the kernel to use updated packages.


In [100]:
import random
from sympy import *
import numpy as np
import sympy
import re

In [114]:
class Document:
    """A single paper: its id, title, abstract and keywords."""

    def __init__(self, id, title, abstract, keywords):
        # The parameter is named `id` (shadowing the builtin) because
        # instances are created with Document(**paper_dict), whose key is 'id'.
        self.id = id
        self.abstract = abstract
        self.title = title
        self.keywords = keywords

    def get_document_text(self):
        """Return title and abstract joined by a space.

        Every run of whitespace is collapsed to a single space and leading
        and trailing whitespace is removed.
        """
        # Raw string: '\s' in a normal string literal is an invalid escape
        # sequence and triggers a warning on current Python versions.
        return re.sub(r'\s+', ' ', self.title + ' ' + self.abstract).strip()

    def __repr__(self):
        # Only the first 50 characters of the title are shown.
        return f'<Document id:{self.id}\ttitle: {self.title[:50]}>'

In [222]:
class QueryBuilder:
    """Collects documents with their decisions and extracts keywords from them."""

    def __init__(self, kw_extractor=None):
        """Create an empty builder.

        Args:
            kw_extractor: Optional object exposing ``extract_keywords(text)``
                whose entries carry the keyword string at index 0. When
                omitted, the module-level ``kwe_100`` extractor is used at
                query time.
        """
        self.documents = []
        self.document_decisions = []
        self.kw_extractor = kw_extractor

    def add_document(self, document, decision):
        """Store ``document`` together with its ``decision``.

        Both lists are appended in lockstep, so ``documents[i]`` belongs to
        ``document_decisions[i]``.
        """
        # Save document
        self.document_decisions.append(decision)
        self.documents.append(document)

    def create_query_v2(self):
        """Extract keywords from the text of all stored documents and print them.

        The texts of all documents are joined with single spaces and passed
        to the keyword extractor in one call. The stored decisions are not
        used here, and nothing is returned: the keyword list is only printed.
        """
        # Resolve the default lazily so that constructing a builder does not
        # require the module-level extractor to exist yet.
        extractor = self.kw_extractor
        if extractor is None:
            extractor = kwe_100

        all_text = ' '.join(d.get_document_text() for d in self.documents)
        keywords = [entry[0] for entry in extractor.extract_keywords(all_text)]

        print(keywords)

    def create_query(self):
        """Run the current query-construction strategy (``create_query_v2``)."""
        return self.create_query_v2()

In [102]:
# Wrap every fetched paper dict in a Document and register it with a builder.
qb = QueryBuilder()
docs = [Document(**kwargs) for kwargs in papers]
# Each document gets a random True/False decision.
# NOTE(review): presumably a stand-in for real reviewer decisions - confirm.
doc_decisions = [random.random() > 0.5 for _ in range(len(docs))]

for doc, dec in zip(docs, doc_decisions):
    qb.add_document(doc, dec)

### Proof of Concept Run

In [227]:
# Build one text per paper (title + abstract), concatenate all of them and
# extract keywords from the combined text with the kwe_100 extractor.
df = pd.DataFrame(papers)
texts = df['title'] + ' ' + df['abstract']
all_text = ' '.join(texts)
keywords_e = kwe_100.extract_keywords(all_text)

In [228]:
# Create a Karnaugh-map-style occurrence matrix of keywords in documents:
# kwo[i, j] is 1 when keyword j occurs as a (case-sensitive) substring of
# text i, and 0 otherwise.
# keyword_to_id maps each keyword string to its column index in kwo.
keyword_to_id = {k[0]:j for j, k in enumerate(keywords_e)}
kwo = np.zeros((len(texts), len(keywords_e)))
for t_idx, text in enumerate(texts):
    for k, k_idx in keyword_to_id.items():
        if k in text:
            kwo[t_idx, k_idx] = 1

In [229]:
# Keyword strings only (index 0 of each extracted entry), in the same order
# as keywords_e.
keywords = np.array([k[0] for k in keywords_e])

In [265]:
# For the first run, take the OR of the most prominent keywords
# The query is kept in disjunctive normal form: a list of clauses, each a
# tuple of terms. Here every clause holds a single keyword, taken from the
# first 10 entries of `keywords`.
dnf = [(k,) for k in keywords[:10]]

In [266]:
def construct_query(dnf):
    """Render a DNF keyword structure as a Title/Abstract query string.

    Args:
        dnf: Iterable of clauses, each clause an iterable of terms.

    Returns:
        A string in which every term is written as
        ``(term[Title/Abstract])``, the terms of one clause are joined with
        `` AND ``, each clause is wrapped in parentheses, and the clauses are
        joined with `` OR ``. An empty ``dnf`` yields an empty string.
    """
    def render_clause(terms):
        # Conjunction of the field-tagged terms of a single clause.
        return ' AND '.join(f'({term}[Title/Abstract])' for term in terms)

    return ' OR '.join(f'({render_clause(terms)})' for terms in dnf)

In [None]:
# Turn the DNF into a query string, search with it and fetch the details of
# the returned ids. This rebinds the notebook-level query, res and id_list.
query = construct_query(dnf)
res = search(query)
id_list = res['IdList']
papers_new = fetch_details(id_list)

In [None]:
print('DONE')