In [1]:
from Bio import Entrez
import os
import re
import os.path as osp
import logging
import traceback
import pandas as pd
from bs4 import BeautifulSoup
# Execute the project's helper scripts inside this kernel's namespace.
# NOTE(review): `logger` and `IMPORT_DATA_DIR_02` are used further down but are never
# defined in this notebook, so they presumably come from these two scripts — confirm.
%run src/logging.py
%run env.py
# Contact address that Biopython attaches to the E-utilities requests made below.
#Entrez.email = 'eric@hammerlab.org'
Entrez.email = 'eczech52@gmail.com'

In [2]:
def search(query, retstart=0, retmax=1000):
    """Run a relevance-sorted ESearch against the PMC database.

    Args:
        query: Entrez search term.
        retstart: Zero-based offset of the first result to return.
        retmax: Maximum number of results to return in this page.

    Returns:
        The parsed ESearch response as produced by ``Entrez.read``; callers in
        this notebook read the ``'IdList'`` entry from it.
    """
    handle = Entrez.esearch(db='pmc',
                            sort='relevance',
                            retmax=str(retmax),
                            retstart=str(retstart),
                            retmode='xml',
                            term=query)
    try:
        return Entrez.read(handle)
    finally:
        # Always release the underlying HTTP response, even when parsing fails;
        # this function is called once per batch in a long-running loop.
        handle.close()

def fetch_details(id_list):
    """Fetch the full XML records for a batch of PMC ids.

    Args:
        id_list: Iterable of PMC ids (strings or integers).

    Returns:
        The raw EFetch response body, exactly as returned by the handle's
        ``read()``.
    """
    # str() each id so integer ids work as well as the strings ESearch returns.
    ids = ','.join(str(i) for i in id_list)
    handle = Entrez.efetch(db='pmc', rettype="full", retmode='xml', id=ids)
    try:
        return handle.read()
    finally:
        # Release the HTTP response even if reading fails part-way through.
        handle.close()

def parse(soup):
    """Extract summary fields from a single parsed ``<article>`` element.

    Args:
        soup: Parsed article element supporting ``find``/``find_all``
            (``to_df`` passes BeautifulSoup tags).

    Returns:
        dict with keys, in this order:
            ``id``: text of the ``article-id`` tag whose ``pub-id-type`` is ``pmc``.
            ``title``: text of ``title-group/article-title``.
            ``xml``: ``str(soup)``.
            ``abstract``: abstract text, or None when there is no ``<abstract>``.
            ``has_text``: True when ``<body>`` exists and has non-blank text.
            ``date``: timestamp of the ``received`` history date, or None when
                it is missing or cannot be parsed.

    Raises:
        AttributeError: if ``article-meta`` or the title elements are missing.
        IndexError: if no ``article-id`` has ``pub-id-type == 'pmc'``.
    """
    res = {}
    # Required fields: a failure here is deliberately not swallowed.
    # .get() skips article-id tags that carry no pub-id-type instead of raising KeyError.
    pmc_ids = [
        t for t in soup.find('article-meta').find_all('article-id')
        if t.get('pub-id-type') == 'pmc'
    ]
    res['id'] = pmc_ids[0].text
    res['title'] = soup.find('title-group').find('article-title').text
    res['xml'] = str(soup)

    # Optional fields: absence is expected, so test for it explicitly.
    abstract = soup.find('abstract')
    res['abstract'] = abstract.text if abstract is not None else None

    body = soup.find('body')
    res['has_text'] = body is not None and len(body.text.strip()) > 0

    try:
        history = soup.find('history')
        date = [t for t in history.find_all('date') if t.get('date-type') == 'received'][0]
        month = date.find('month')
        day = date.find('day')
        # Fall back to '01' for a missing month/day. The previous '00' fallback is
        # never a valid date component, so such dates always failed to parse and
        # were silently dropped.
        res['date'] = pd.to_datetime('{}-{}-{}'.format(
            date.find('year').text,
            month.text if month is not None else '01',
            day.text if day is not None else '01'
        ))
    except Exception:
        # Best effort: no history, no received date, no year, or unparseable values.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
        res['date'] = None
    return res

def to_df(dets):
    """Turn a raw EFetch XML payload into a DataFrame with one row per article.

    Args:
        dets: XML text containing zero or more ``<article>`` elements.

    Returns:
        pandas.DataFrame built from the dict that ``parse`` returns for each
        ``<article>`` element, in document order.
    """
    document = BeautifulSoup(dets, 'xml')
    records = []
    for article in document.find_all('article'):
        records.append(parse(article))
    return pd.DataFrame(records)

def collect(query, output_file, start_index=0, batch_size=100, batch_limit=None, max_failures=5):
    """Page through PMC search results and append the parsed articles to a CSV.

    Each iteration searches for ``batch_size`` ids starting at the current
    offset, fetches and parses those articles, and appends them to
    ``output_file``. The header row is written only when the file does not
    exist yet. A failed batch is retried at the same offset.

    Args:
        query: Entrez search term.
        output_file: CSV path to append to; its parent directory is created
            when needed.
        start_index: Offset of the first search result to process.
        batch_size: Number of ids requested per batch.
        batch_limit: Stop after this many successful batches (None or 0 means
            no limit).
        max_failures: Number of failed batches tolerated over the whole run;
            the count is cumulative, not per batch.

    Raises:
        ValueError: once more than ``max_failures`` batches have failed.
    """
    i = start_index
    ct = 0
    failures = 0
    # dirname is '' for a bare filename and os.makedirs('') raises, so only
    # create a directory when the path actually names one.
    output_dir = osp.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    while True:
        try:
            ids = search(query, retstart=i, retmax=batch_size)['IdList']
            if len(ids) == 0:
                # Ran past the last search result.
                break
            logger.info('Processing batch at start index {} (num ids = {})'.format(i, len(ids)))
            dets = fetch_details(ids)
            df = to_df(dets)
            df.to_csv(output_file, index=False, header=not osp.exists(output_file), mode='a')
            ct += 1
            i += batch_size
            if batch_limit and ct >= batch_limit:
                break
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C (KeyboardInterrupt)
            # stops the run instead of being counted as a retryable failure.
            traceback.print_exc()
            failures += 1
            if failures > max_failures:
                raise ValueError('Max failure threshold ({}) exceeded'.format(max_failures))
            # `i` was not advanced, so the next iteration retries the same batch.
            logger.warning('Error occurred at index {}. Will retry up to {} times'.format(i, max_failures))
    logger.info('Collection complete (num failures = {})'.format(failures))
            

In [7]:
# Paging parameters for the collection run: ids requested per batch, and the
# number of successful batches after which collect() stops.
batch_size = 25
batch_limit = 816
# The collected articles are appended to this CSV.
output_file = osp.join(IMPORT_DATA_DIR_02, 'data.csv')
output_file

'/Users/eczech/data/research/hammer/nlp/20190311-pubmed-tcell-relation/articles/import/20190501/data.csv'

In [1]:
# EFetch documentation: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
# Example request for one document's details: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=212403
# 124720 results as of 2019-03-11
# To test with a single article, use its id as the query:
#query = '5360497'

In [3]:
# Entrez search term, one AND-ed clause per line; adjacent literals are
# concatenated into a single string.
query = (
    '(human) '
    'AND ((t cell) OR (t lymphocyte)) '
    'AND (cytokine) '
    'AND ((differentiate) OR (differentiation) OR (differentiated)) '
    'AND ((polarization) OR (polarize) OR (induce) OR (induction))'
)

In [None]:
# Start from a clean file: collect() appends to output_file, so a file left over
# from an earlier run would be extended rather than replaced.
# Done in Python rather than `![ -e $output_file ] && rm $output_file`, because the
# unquoted shell expansion breaks on paths containing spaces and needs a POSIX shell.
if osp.exists(output_file):
    os.remove(output_file)

# max_failures is set very high, which makes retries effectively unlimited.
collect(
    query, output_file,
    batch_size=batch_size, batch_limit=batch_limit,
    start_index=0, max_failures=1000000
)

2019-05-01 13:15:19,128:INFO:root: Processing batch at start index 0 (num ids = 25)
2019-05-01 13:15:29,249:INFO:root: Processing batch at start index 25 (num ids = 25)
2019-05-01 13:15:34,418:INFO:root: Processing batch at start index 50 (num ids = 25)
2019-05-01 13:15:43,201:INFO:root: Processing batch at start index 75 (num ids = 25)
2019-05-01 13:15:56,086:INFO:root: Processing batch at start index 100 (num ids = 25)
2019-05-01 13:16:04,217:INFO:root: Processing batch at start index 125 (num ids = 25)
2019-05-01 13:16:15,270:INFO:root: Processing batch at start index 150 (num ids = 25)
2019-05-01 13:16:27,020:INFO:root: Processing batch at start index 175 (num ids = 25)
2019-05-01 13:16:34,642:INFO:root: Processing batch at start index 200 (num ids = 25)
2019-05-01 13:16:45,715:INFO:root: Processing batch at start index 225 (num ids = 25)
2019-05-01 13:16:54,426:INFO:root: Processing batch at start index 250 (num ids = 25)
2019-05-01 13:17:01,656:INFO:root: Processing batch at star

In [4]:
# Read the collected CSV back in and summarise its columns (dtypes and non-null counts).
df = pd.read_csv(output_file)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20450 entries, 0 to 20449
Data columns (total 6 columns):
abstract    20334 non-null object
date        13014 non-null object
has_text    20450 non-null bool
id          20450 non-null int64
title       20450 non-null object
xml         20450 non-null object
dtypes: bool(1), int64(1), object(4)
memory usage: 818.9+ KB


In [5]:
# Count articles with and without body text (has_text is the flag set by parse()).
df['has_text'].value_counts()

False    10540
True      9910
Name: has_text, dtype: int64

In [6]:
# Sanity check: no article id occurs more than once in the collected data.
assert df['id'].value_counts().max() == 1

1