In [2]:
from Bio import Entrez
import os
import re
import os.path as osp
import logging
import traceback
import pandas as pd
from bs4 import BeautifulSoup
# Execute helper scripts in this kernel's namespace.
# NOTE(review): `logger` (used by collect) and `DATA_DIR` (used when building the
# output path) are not defined anywhere in this notebook, so they presumably come
# from these two scripts - confirm.
%run utils/logging.py
%run env.py
# Contact address sent to NCBI with every Entrez request.
# NOTE(review): consider reading this from an environment variable rather than
# committing a personal address to the notebook.
#Entrez.email = 'eric@hammerlab.org'
Entrez.email = 'eczech52@gmail.com'

In [3]:
def search(query, retstart=0, retmax=1000):
    """Run an Entrez ESearch against the PMC database, sorted by relevance.

    Args:
        query: Entrez search term.
        retstart: Index of the first result to return (sent as ESearch ``retstart``).
        retmax: Maximum number of results to return (sent as ESearch ``retmax``).

    Returns:
        The object produced by ``Entrez.read`` for the ESearch response;
        ``collect`` reads its ``'IdList'`` entry.
    """
    handle = Entrez.esearch(db='pmc',
                            sort='relevance',
                            retmax=str(retmax),
                            retstart=str(retstart),
                            retmode='xml',
                            term=query)
    try:
        return Entrez.read(handle)
    finally:
        # Release the underlying response even when parsing raises, so repeated
        # calls from the collection loop do not leak open handles.
        handle.close()

def fetch_details(id_list):
    """Fetch the full XML payload for a batch of PMC ids via Entrez EFetch.

    Args:
        id_list: Iterable of PMC ids; each item is converted with ``str`` so
            integer ids are accepted as well as strings.

    Returns:
        Whatever ``handle.read()`` yields for the EFetch response (the raw XML
        document covering all requested ids).
    """
    ids = ','.join(str(pmc_id) for pmc_id in id_list)
    handle = Entrez.efetch(db='pmc', rettype="full", retmode='xml', id=ids)
    try:
        return handle.read()
    finally:
        # Close the response even if reading fails part-way through.
        handle.close()

def parse(soup):
    res = {}
    res['id'] = [t for t in soup.find('article-meta').find_all('article-id') if t['pub-id-type'] == 'pmc'][0].text
    res['title'] = soup.find('title-group').find('article-title').text
    res['xml'] = str(soup)
    
    try:
        res['abstract'] = soup.find('abstract').text
    except:
        res['abstract'] = None
    
    try:
        res['has_text'] = len(soup.find('body').text.strip()) > 0
    except:
        res['has_text'] = False
        
    try:
        date = [t for t in soup.find('history').find_all('date') if t['date-type'] == 'received'][0]
        res['date'] = pd.to_datetime('{}-{}-{}'.format(
            date.find('year').text,
            date.find('month').text if date.find('month') else '00',
            date.find('day').text if date.find('day') else '00'
        ))
    except:
        res['date'] = None
    return res

def to_df(dets):
    """Convert an EFetch XML payload into a DataFrame with one row per ``<article>``.

    Args:
        dets: XML markup as returned by ``fetch_details``.

    Returns:
        ``pd.DataFrame`` built from the ``parse`` record of every ``article``
        element found in the document.
    """
    document = BeautifulSoup(dets, 'xml')
    records = []
    for node in document.find_all('article'):
        records.append(parse(node))
    return pd.DataFrame(records)

def collect(query, output_file, start_index=0, batch_size=100, batch_limit=None, max_failures=5):
    """Page through PMC search results and append the parsed articles to a CSV.

    Each iteration searches for ``batch_size`` ids starting at the current
    offset, fetches and parses them, and appends the rows to ``output_file``
    (the header is written only when the file does not exist yet). The loop
    ends when a search returns no ids or ``batch_limit`` batches were written.

    Args:
        query: Entrez search term.
        output_file: CSV path to append to; its parent directory is created if needed.
        start_index: Search offset of the first batch.
        batch_size: Number of ids requested per batch.
        batch_limit: Optional cap on the number of successfully written batches.
        max_failures: Number of failed iterations tolerated over the whole run.
            The counter is never reset after a success.

    Raises:
        ValueError: once the number of failed iterations exceeds ``max_failures``.
    """
    i = start_index
    ct = 0
    failures = 0
    # dirname() is '' for a bare filename; os.makedirs('') would raise, so only
    # create a directory when there is one to create.
    output_dir = osp.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    while True:
        try:
            ids = search(query, retstart=i, retmax=batch_size)['IdList']
            if len(ids) == 0:
                break
            logger.info('Processing batch at start index {} (num ids = {})'.format(i, len(ids)))
            dets = fetch_details(ids)
            df = to_df(dets)
            df.to_csv(output_file, index=False, header=not osp.exists(output_file), mode='a')
            ct += 1
            i += batch_size
            if batch_limit and ct >= batch_limit:
                break
        except Exception:
            # `except Exception` (not a bare except) so KeyboardInterrupt/SystemExit
            # still stop the loop; otherwise a large max_failures makes it uninterruptible.
            traceback.print_exc()
            failures += 1
            if failures > max_failures:
                raise ValueError('Max failure threshold ({}) exceeded'.format(max_failures))
            # `i` was not advanced, so the next iteration retries the same batch.
            logger.warning('Error occurred at index {}. Will retry up to {} times'.format(i, max_failures))
    logger.info('Collection complete (num failures = {})'.format(failures))
            

In [4]:
# # https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=6089340
# dets = fetch_details(['6089340', '212403'])

In [5]:
# from bs4 import BeautifulSoup
# soup = BeautifulSoup(dets, 'xml')

In [12]:
output_file = osp.join(DATA_DIR, 'articles', 'data.csv')
# collect() appends to the file, so remove any previous output to start clean.
# Done in Python instead of `![ -e $output_file ] && rm $output_file`: the shell
# form interpolates the path unquoted (breaks on spaces/metacharacters) and
# requires a POSIX shell.
if osp.isfile(output_file):
    os.remove(output_file)

# Efetch docs: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
# Get doc details: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=212403
# 124720 results as of 2019-03-11
# Test single article: 
#query = '5360497'
query = '(human) AND ((t cell) OR (t lymphocyte)) AND (cytokine) AND ((differentiate) OR (differentiation) OR (differentiated)) AND ((polarization) OR (polarize) OR (induce) OR (induction))'
# batch_size=10 with batch_limit=350 caps the run at 3500 articles.
# max_failures is set very high so transient network errors never abort the run.
collect(
    query, output_file, 
    start_index=0, batch_size=10, max_failures=1000000,
    batch_limit=350
)

2019-03-14 10:43:48,565:INFO:root: Processing batch at start index 0 (num ids = 10)
2019-03-14 10:43:51,818:INFO:root: Processing batch at start index 10 (num ids = 10)
2019-03-14 10:43:56,353:INFO:root: Processing batch at start index 20 (num ids = 10)
2019-03-14 10:44:01,841:INFO:root: Processing batch at start index 30 (num ids = 10)
2019-03-14 10:44:04,812:INFO:root: Processing batch at start index 40 (num ids = 10)
2019-03-14 10:44:12,635:INFO:root: Processing batch at start index 50 (num ids = 10)
2019-03-14 10:44:18,281:INFO:root: Processing batch at start index 60 (num ids = 10)
2019-03-14 10:44:22,010:INFO:root: Processing batch at start index 70 (num ids = 10)
2019-03-14 10:44:25,185:INFO:root: Processing batch at start index 80 (num ids = 10)
2019-03-14 10:44:31,267:INFO:root: Processing batch at start index 90 (num ids = 10)
2019-03-14 10:44:39,867:INFO:root: Processing batch at start index 100 (num ids = 10)
2019-03-14 10:44:42,943:INFO:root: Processing batch at start inde

2019-03-14 10:48:50,003:INFO:root: Processing batch at start index 280 (num ids = 10)
Traceback (most recent call last):
  File "<ipython-input-3-6c5b1272821b>", line 59, in collect
    dets = fetch_details(ids)
  File "<ipython-input-3-6c5b1272821b>", line 13, in fetch_details
    handle = Entrez.efetch(db='pmc', rettype="full", retmode='xml', id=ids)
  File "/Users/eczech/anaconda3/envs/pubmed-nlp/lib/python3.6/site-packages/Bio/Entrez/__init__.py", line 184, in efetch
    return _open(cgi, variables, post=post)
  File "/Users/eczech/anaconda3/envs/pubmed-nlp/lib/python3.6/site-packages/Bio/Entrez/__init__.py", line 545, in _open
    raise exception
  File "/Users/eczech/anaconda3/envs/pubmed-nlp/lib/python3.6/site-packages/Bio/Entrez/__init__.py", line 543, in _open
    handle = _urlopen(cgi)
  File "/Users/eczech/anaconda3/envs/pubmed-nlp/lib/python3.6/urllib/request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "/Users/eczech/anaconda3/envs/pubmed-nlp

2019-03-14 10:52:22,399:INFO:root: Processing batch at start index 390 (num ids = 10)
2019-03-14 10:52:27,307:INFO:root: Processing batch at start index 400 (num ids = 10)
2019-03-14 10:52:30,136:INFO:root: Processing batch at start index 410 (num ids = 10)
2019-03-14 10:52:32,703:INFO:root: Processing batch at start index 420 (num ids = 10)
2019-03-14 10:52:40,149:INFO:root: Processing batch at start index 430 (num ids = 10)
2019-03-14 10:52:50,718:INFO:root: Processing batch at start index 440 (num ids = 10)
2019-03-14 10:52:53,791:INFO:root: Processing batch at start index 450 (num ids = 10)
2019-03-14 10:53:01,470:INFO:root: Processing batch at start index 460 (num ids = 10)
2019-03-14 10:53:03,927:INFO:root: Processing batch at start index 470 (num ids = 10)
2019-03-14 10:53:10,993:INFO:root: Processing batch at start index 480 (num ids = 10)
2019-03-14 10:53:13,758:INFO:root: Processing batch at start index 490 (num ids = 10)
2019-03-14 10:53:20,822:INFO:root: Processing batch at

2019-03-14 10:55:59,170:INFO:root: Processing batch at start index 600 (num ids = 10)
2019-03-14 10:56:06,404:INFO:root: Processing batch at start index 610 (num ids = 10)
2019-03-14 10:56:09,955:INFO:root: Processing batch at start index 620 (num ids = 10)
2019-03-14 10:56:15,140:INFO:root: Processing batch at start index 630 (num ids = 10)
2019-03-14 10:56:18,079:INFO:root: Processing batch at start index 640 (num ids = 10)
2019-03-14 10:56:24,396:INFO:root: Processing batch at start index 650 (num ids = 10)
2019-03-14 10:56:26,988:INFO:root: Processing batch at start index 660 (num ids = 10)
2019-03-14 10:56:31,051:INFO:root: Processing batch at start index 670 (num ids = 10)
2019-03-14 10:56:35,898:INFO:root: Processing batch at start index 680 (num ids = 10)
2019-03-14 10:56:39,496:INFO:root: Processing batch at start index 690 (num ids = 10)
2019-03-14 10:56:45,905:INFO:root: Processing batch at start index 700 (num ids = 10)
2019-03-14 10:56:49,668:INFO:root: Processing batch at

2019-03-14 10:59:27,684:INFO:root: Processing batch at start index 930 (num ids = 10)
2019-03-14 10:59:30,082:INFO:root: Processing batch at start index 940 (num ids = 10)
2019-03-14 10:59:34,266:INFO:root: Processing batch at start index 950 (num ids = 10)
2019-03-14 10:59:38,991:INFO:root: Processing batch at start index 960 (num ids = 10)
2019-03-14 10:59:41,958:INFO:root: Processing batch at start index 970 (num ids = 10)
2019-03-14 10:59:49,131:INFO:root: Processing batch at start index 980 (num ids = 10)
2019-03-14 10:59:51,888:INFO:root: Processing batch at start index 990 (num ids = 10)
2019-03-14 10:59:59,507:INFO:root: Processing batch at start index 1000 (num ids = 10)
2019-03-14 11:00:03,825:INFO:root: Processing batch at start index 1010 (num ids = 10)
2019-03-14 11:00:10,324:INFO:root: Processing batch at start index 1020 (num ids = 10)
2019-03-14 11:00:12,877:INFO:root: Processing batch at start index 1030 (num ids = 10)
2019-03-14 11:00:19,932:INFO:root: Processing batc

2019-03-14 11:04:13,630:INFO:root: Processing batch at start index 1470 (num ids = 10)
2019-03-14 11:04:17,250:INFO:root: Processing batch at start index 1480 (num ids = 10)
2019-03-14 11:04:21,741:INFO:root: Processing batch at start index 1490 (num ids = 10)
2019-03-14 11:04:24,999:INFO:root: Processing batch at start index 1500 (num ids = 10)
2019-03-14 11:04:28,388:INFO:root: Processing batch at start index 1510 (num ids = 10)
2019-03-14 11:04:34,694:INFO:root: Processing batch at start index 1520 (num ids = 10)
2019-03-14 11:04:38,329:INFO:root: Processing batch at start index 1530 (num ids = 10)
2019-03-14 11:04:45,708:INFO:root: Processing batch at start index 1540 (num ids = 10)
2019-03-14 11:04:48,960:INFO:root: Processing batch at start index 1550 (num ids = 10)
2019-03-14 11:04:56,329:INFO:root: Processing batch at start index 1560 (num ids = 10)
2019-03-14 11:04:59,401:INFO:root: Processing batch at start index 1570 (num ids = 10)
2019-03-14 11:05:05,545:INFO:root: Processi

2019-03-14 11:09:14,073:INFO:root: Processing batch at start index 2060 (num ids = 10)
2019-03-14 11:09:21,445:INFO:root: Processing batch at start index 2070 (num ids = 10)
2019-03-14 11:09:29,798:INFO:root: Processing batch at start index 2080 (num ids = 10)
2019-03-14 11:09:31,822:INFO:root: Processing batch at start index 2090 (num ids = 10)
2019-03-14 11:09:34,348:INFO:root: Processing batch at start index 2100 (num ids = 10)
2019-03-14 11:09:41,413:INFO:root: Processing batch at start index 2110 (num ids = 10)
2019-03-14 11:09:44,483:INFO:root: Processing batch at start index 2120 (num ids = 10)
2019-03-14 11:09:51,714:INFO:root: Processing batch at start index 2130 (num ids = 10)
2019-03-14 11:09:54,269:INFO:root: Processing batch at start index 2140 (num ids = 10)
2019-03-14 11:10:01,073:INFO:root: Processing batch at start index 2150 (num ids = 10)
2019-03-14 11:10:04,235:INFO:root: Processing batch at start index 2160 (num ids = 10)
2019-03-14 11:10:14,620:INFO:root: Processi

2019-03-14 11:14:36,020:INFO:root: Processing batch at start index 2620 (num ids = 10)
2019-03-14 11:14:38,784:INFO:root: Processing batch at start index 2630 (num ids = 10)
2019-03-14 11:14:41,222:INFO:root: Processing batch at start index 2640 (num ids = 10)
2019-03-14 11:14:48,004:INFO:root: Processing batch at start index 2650 (num ids = 10)
2019-03-14 11:14:50,289:INFO:root: Processing batch at start index 2660 (num ids = 10)
2019-03-14 11:14:57,823:INFO:root: Processing batch at start index 2670 (num ids = 10)
2019-03-14 11:15:01,569:INFO:root: Processing batch at start index 2680 (num ids = 10)
Traceback (most recent call last):
  File "/Users/eczech/anaconda3/envs/pubmed-nlp/lib/python3.6/urllib/request.py", line 1318, in do_open
    encode_chunked=req.has_header('Transfer-encoding'))
  File "/Users/eczech/anaconda3/envs/pubmed-nlp/lib/python3.6/http/client.py", line 1239, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "/Users/eczech/anacon

2019-03-14 11:17:18,099:INFO:root: Processing batch at start index 2950 (num ids = 10)
2019-03-14 11:17:23,700:INFO:root: Processing batch at start index 2960 (num ids = 10)
2019-03-14 11:17:29,895:INFO:root: Processing batch at start index 2970 (num ids = 10)
2019-03-14 11:17:32,545:INFO:root: Processing batch at start index 2980 (num ids = 10)
2019-03-14 11:17:38,498:INFO:root: Processing batch at start index 2990 (num ids = 10)
2019-03-14 11:17:41,228:INFO:root: Processing batch at start index 3000 (num ids = 10)
2019-03-14 11:17:48,389:INFO:root: Processing batch at start index 3010 (num ids = 10)
2019-03-14 11:17:51,558:INFO:root: Processing batch at start index 3020 (num ids = 10)
2019-03-14 11:17:54,797:INFO:root: Processing batch at start index 3030 (num ids = 10)
2019-03-14 11:18:00,922:INFO:root: Processing batch at start index 3040 (num ids = 10)
2019-03-14 11:18:03,383:INFO:root: Processing batch at start index 3050 (num ids = 10)
2019-03-14 11:18:10,755:INFO:root: Processi

In [13]:
# Reload the collected articles from the CSV written by collect() and print a
# per-column summary (row count, non-null counts, dtypes) as a sanity check.
df = pd.read_csv(output_file)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3500 entries, 0 to 3499
Data columns (total 6 columns):
abstract    3481 non-null object
date        2155 non-null object
has_text    3500 non-null bool
id          3500 non-null int64
title       3500 non-null object
xml         3500 non-null object
dtypes: bool(1), int64(1), object(4)
memory usage: 140.2+ KB


In [14]:
# Count articles with vs. without body text; parse() sets has_text to True only
# when a <body> element with non-blank text is present.
df['has_text'].value_counts()

False    1766
True     1734
Name: has_text, dtype: int64