# Query the Entrez eutils API for articles and retrieve/parse each record

In [63]:
import itertools
import time
import gzip

import xml.etree.ElementTree as ET
import requests
import pandas
import numpy

## Functions for querying pubmed

In [13]:
def esearch_query(payload, retmax=10000, sleep=3):
    """
    Query the esearch E-utility and return every matching ID.

    Results are requested page by page (``retmax`` IDs per request) until
    ``retstart`` reaches the total ``Count`` reported in the response.

    Parameters
    ----------
    payload : dict
        Query parameters for esearch (for example ``db`` and ``term``).
        The dict is copied, so the caller's object is left unmodified.
    retmax : int
        Number of IDs requested per page. Must be positive.
    sleep : float
        Seconds to wait after every request, including the last one, so
        that back-to-back calls stay rate limited.

    Returns
    -------
    list of str
        Text of every ``IdList/Id`` element, in the order returned.

    Raises
    ------
    ValueError
        If ``retmax`` is not positive, or a response has no ``Count``
        element (for example an error document).
    requests.HTTPError
        If a request comes back with an HTTP error status.
    """
    # NOTE(review): the E-utilities documentation states that ESearch only
    # serves the first 10,000 PubMed records of a query -- confirm whether
    # paging past that point still works for large result sets.
    if retmax <= 0:
        # A non-positive page size would never advance retstart.
        raise ValueError('retmax must be a positive integer')
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    # Work on a copy so paging state does not leak into the caller's dict.
    params = dict(payload, retmax=retmax, retstart=0)
    ids = list()
    count = 1  # placeholder so the first request is always issued
    while params['retstart'] < count:
        response = requests.get(url, params=params)
        response.raise_for_status()
        xml = ET.fromstring(response.content)
        count_text = xml.findtext('Count')
        if count_text is None:
            raise ValueError(
                'esearch response has no Count element (ERROR: {})'.format(
                    xml.findtext('ERROR')))
        count = int(count_text)
        ids.extend(xml_id.text for xml_id in xml.findall('IdList/Id'))
        params['retstart'] += retmax
        time.sleep(sleep)
    return ids

In [3]:
def esummary_query(ids, retmax=100, sleep=2):
    """
    Query the esummary E-utility for a list of PubMed IDs.

    The IDs are sent in batches of ``retmax`` per request, and the child
    elements of every response root are collected into one flat list.

    Parameters
    ----------
    ids : sequence
        IDs to look up; each one is converted with ``str``.
    retmax : int
        Maximum number of IDs sent per request.
    sleep : float
        Seconds to wait after every request.

    Returns
    -------
    list of xml.etree.ElementTree.Element
        Direct children of each response's root element, in request order.
        Empty when ``ids`` is empty (no request is made).

    Raises
    ------
    requests.HTTPError
        If a request comes back with an HTTP error status.
    """
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    elements = list()
    for start in range(0, len(ids), retmax):
        id_string = ','.join(map(str, ids[start:start + retmax]))
        payload = {'db': 'pubmed', 'id': id_string}
        response = requests.get(url, params=payload)
        response.raise_for_status()
        # Parse the raw bytes so the XML declaration decides the encoding
        # instead of the charset requests guesses for ``response.text``.
        xml = ET.fromstring(response.content)
        # Iterating an Element yields its direct children; this replaces
        # Element.getchildren(), which was removed in Python 3.9.
        elements.extend(xml)
        time.sleep(sleep)
    return elements

In [4]:
def parse_doc_summary(elem):
    """
    Flatten one document summary element into a dictionary.

    Each output key holds the text found at a fixed path below ``elem``
    (``None`` when the path matches nothing). ``pubtype`` is the text of
    all ``PubType`` items joined with ``|``.
    """
    # (output key, path relative to elem), in output order
    text_fields = (
        ('pubmed_id', "Id"),
        ('pubdate', "Item[@Name='PubDate']"),
        ('epub_date', "Item[@Name='EPubDate']"),
        ('journal_abbrev', "Item[@Name='Source']"),
        ('date_received', "Item[@Name='History']/Item[@Name='received']"),
        ('date_accepted', "Item[@Name='History']/Item[@Name='accepted']"),
        ('date_epublish', "Item[@Name='History']/Item[@Name='epublish']"),
        ('date_pubmed', "Item[@Name='History']/Item[@Name='pubmed']"),
        ('date_medline', "Item[@Name='History']/Item[@Name='medline']"),
        ('doi', "Item[@Name='DOI']"),
        ('journal', "Item[@Name='FullJournalName']"),
    )
    doc = {key: elem.findtext(path) for key, path in text_fields}
    pubtype_items = elem.findall("Item[@Name='PubTypeList']/Item[@Name='PubType']")
    doc['pubtype'] = '|'.join([item.text for item in pubtype_items])
    return doc

## Retrieve pubmed information for all PLOS publications

In [5]:
# Journal names, each searched separately with the [journal] field tag
journals = [
    'PLoS Biol', 'PLoS Comput Biol', 'PLoS Genet', 'PLoS Med',
    'PLoS Negl Trop Dis', 'PLoS Pathog', 'PloS One',
]

In [6]:
# Collect the PubMed IDs of every article, one journal at a time
pubmed_ids = []
for journal in journals:
    print(journal)
    payload = dict(db='pubmed', term='{}[journal]'.format(journal))
    pubmed_ids.extend(esearch_query(payload))

len(pubmed_ids)

PLoS Biol
PLoS Comput Biol
PLoS Genet
PLoS Med
PLoS Negl Trop Dis
PLoS Pathog
PloS One


151728

In [7]:
# Fetch the summaries in batches of 500 and tabulate one row per record
docs = esummary_query(pubmed_ids, retmax=500)
pubmed_df = pandas.DataFrame([parse_doc_summary(doc) for doc in docs])

In [8]:
# Save the PLOS table as a gzip-compressed, tab-separated file
with gzip.open('data/pubmed-plos.tsv.gz', mode='wt') as write_file:
    pubmed_df.to_csv(write_file, sep='\t', index=False)

In [9]:
# Preview the first rows of the PLOS summary table
pubmed_df.head()

Unnamed: 0,date_accepted,date_epublish,date_medline,date_pubmed,date_received,doi,epub_date,journal,journal_abbrev,pubdate,pubmed_id,pubtype
0,,2015/06/23 00:00,2015/06/24 06:00,2015/06/24 06:00,,10.1371/journal.pbio.1002185,2015 Jun 23,PLoS biology,PLoS Biol,2015 Jun,26102073,Journal Article
1,,2015/06/22 00:00,2015/06/23 06:00,2015/06/23 06:00,,10.1371/journal.pbio.1002182,2015 Jun 22,PLoS biology,PLoS Biol,2015 Jun,26098912,Journal Article
2,2015/05/12 00:00,2015/06/22 00:00,2015/06/23 06:00,2015/06/23 06:00,2015/01/23 00:00,10.1371/journal.pbio.1002181,2015 Jun 22,PLoS biology,PLoS Biol,2015 Jun,26098891,Journal Article
3,2015/05/12 00:00,2015/06/22 00:00,2015/06/23 06:00,2015/06/23 06:00,2014/10/22 00:00,10.1371/journal.pbio.1002180,2015 Jun 22,PLoS biology,PLoS Biol,2015 Jun,26098873,Journal Article
4,,2015/06/22 00:00,2015/06/23 06:00,2015/06/23 06:00,,10.1371/journal.pbio.1002179,2015 Jun 22,PLoS biology,PLoS Biol,2015 Jun,26098828,Journal Article


## Retrieve pubmed information for all publications since 2014

In [15]:
# Search PubMed with only a date range (2014 through 2015), no search term
payload = dict(db='pubmed', mindate='2014', maxdate='2015')
recent_ids = esearch_query(payload)

In [16]:
# Number of PubMed IDs returned by the date-range search (2014 to 2015)
len(recent_ids)

1572538

In [18]:
# Fetch summaries for the recent IDs (450 per request, 2.1 s pause),
# tabulate them and save the table as a gzip-compressed TSV
docs = esummary_query(recent_ids, sleep=2.1, retmax=450)
recent_df = pandas.DataFrame([parse_doc_summary(doc) for doc in docs])
with gzip.open('data/pubmed-since-2014.tsv.gz', mode='wt') as write_file:
    recent_df.to_csv(write_file, sep='\t', index=False)