In [1]:
from Bio import Entrez
import logging
import os
import os.path as osp
import pandas as pd
# Console logging on the root logger, plus the directory that receives the CSV.
LOG_FORMAT = '%(asctime)s:%(levelname)s:%(name)s: %(message)s'
logger = logging.getLogger()
logger.setLevel('INFO')
# Re-running this cell must not stack another handler on the root logger
# (every extra handler repeats each log line), so reuse the one tagged by name.
console = next((h for h in logger.handlers if h.get_name() == 'pubmed_console'), None)
if console is None:
    console = logging.StreamHandler()
    console.set_name('pubmed_console')
    logger.addHandler(console)
console.setFormatter(logging.Formatter(LOG_FORMAT))
# The default keeps the original location; set PUBMED_OUTPUT_DIR to write elsewhere.
output_dir = os.environ.get('PUBMED_OUTPUT_DIR', '/Users/eczech/tmp/nlp/data')

In [44]:
import traceback
def search(query, retstart=0, retmax=1000):
    """Run a relevance-sorted ESearch against the ``pubmed`` database.

    Args:
        query: Search expression, sent as the ``term`` parameter.
        retstart: Offset of the first result to return (sent as ``retstart``).
        retmax: Maximum number of results to return (sent as ``retmax``).

    Returns:
        The object produced by ``Entrez.read`` for the response; callers in
        this notebook read its ``'IdList'`` entry.
    """
    Entrez.email = 'eric@hammerlab.org'
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax=str(retmax),
                            retstart=str(retstart),
                            retmode='xml',
                            term=query)
    try:
        return Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()

def fetch_details(id_list):
    """Fetch the XML records for a batch of ids from the ``pubmed`` database.

    Args:
        id_list: Iterable of ids; each is converted with ``str`` and the
            batch is sent as one comma-separated ``id`` parameter.

    Returns:
        The object produced by ``Entrez.read`` for the response; ``to_df``
        in this notebook reads its ``'PubmedArticle'`` entry.
    """
    ids = ','.join(str(pmid) for pmid in id_list)
    # Use the same contact address as search() rather than overwriting it
    # with a placeholder between the two calls.
    Entrez.email = 'eric@hammerlab.org'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    try:
        return Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()

def parse(dets):
    """Flatten one fetched article record into a dict of plain fields.

    Args:
        dets: Mapping with a ``'MedlineCitation'`` entry holding ``'PMID'``,
            ``'Article'`` and, optionally, ``'MeshHeadingList'``.

    Returns:
        dict with keys ``id``, ``title``, ``abstract``, ``date`` and
        ``terms``. ``abstract``, ``date`` and ``terms`` are ``None`` when the
        corresponding data is missing or not shaped as expected.

    Raises:
        KeyError: if ``MedlineCitation``, ``PMID``, ``Article`` or
            ``ArticleTitle`` is missing (these are treated as required).
    """
    # Only these errors mean "optional field absent or malformed"; anything
    # else (KeyboardInterrupt, MemoryError, ...) must not be swallowed.
    missing = (KeyError, IndexError, TypeError)

    def get_mesh_terms(v):
        # One heading rendered as "Descriptor/Qualifier1/Qualifier2".
        return '/'.join([v['DescriptorName']] + list(v['QualifierName']))

    res = {}
    dets = dets['MedlineCitation']
    res['id'] = str(dets['PMID'])
    res['title'] = dets['Article']['ArticleTitle']

    try:
        sections = dets['Article']['Abstract']['AbstractText']
        if isinstance(sections, str):
            sections = [sections]
        # Keep every section: taking only element 0 silently dropped the
        # rest of any abstract that is split into several parts.
        res['abstract'] = ' '.join(str(s) for s in sections) if len(sections) > 0 else None
    except missing:
        res['abstract'] = None

    try:
        date = dets['Article']['ArticleDate'][0]
        res['date'] = '{}-{}-{}'.format(date['Year'], date['Month'], date['Day'])
    except missing:
        res['date'] = None

    try:
        res['terms'] = '|'.join([get_mesh_terms(v) for v in dets['MeshHeadingList']])
    except missing:
        res['terms'] = None

    return res

def to_df(dets):
    """Return a DataFrame with one ``parse``d row per item of ``dets['PubmedArticle']``."""
    records = []
    for article in dets['PubmedArticle']:
        records.append(parse(article))
    return pd.DataFrame(records)

def collect(query, output_file, start_index=0, batch_size=100, batch_limit=None, max_failures=5):
    """Page through the search results for ``query`` and append them to a CSV.

    Each iteration searches for ``batch_size`` ids starting at the current
    index, fetches and parses them, and appends the rows to ``output_file``
    (the header is written only when the file does not exist yet). The loop
    ends when a search returns no ids or ``batch_limit`` batches were written.

    Args:
        query: Search expression passed to ``search``.
        output_file: Path of the CSV to append to.
        start_index: Result offset of the first batch.
        batch_size: Number of ids requested per batch.
        batch_limit: Stop after this many successful batches; falsy means
            no limit.
        max_failures: Total number of failed attempts tolerated across the
            whole run (the counter is not reset after a success).

    Raises:
        ValueError: when more than ``max_failures`` attempts have failed.
    """
    i = start_index
    ct = 0
    failures = 0
    while True:
        try:
            ids = search(query, retstart=i, retmax=batch_size)['IdList']
            if len(ids) == 0:
                break
            logger.info('Processing batch at start index {} (num ids = {})'.format(i, len(ids)))
            dets = fetch_details(ids)
            df = to_df(dets)
            df.to_csv(output_file, index=False, header=not osp.exists(output_file), mode='a')
            ct += 1
            i += batch_size
            if batch_limit and ct >= batch_limit:
                break
        except Exception:
            # Deliberately not a bare ``except``: KeyboardInterrupt/SystemExit
            # must stop the run instead of being counted as a retryable failure.
            traceback.print_exc()
            failures += 1
            if failures > max_failures:
                raise ValueError('Max failure threshold ({}) exceeded'.format(max_failures))
            # The index was not advanced, so the next iteration retries this batch.
            logger.warning('Error occurred at index {}. Will retry up to {} times'.format(i, max_failures))
    logger.info('Collection complete (num failures = {})'.format(failures))
            

In [45]:
# collect() appends to its output file, so remove any CSV left by a previous
# run first; otherwise re-running this cell would duplicate every row.
output_file = osp.join(output_dir, 'pubmed_abstracts.csv')
if osp.exists(output_file):
    os.remove(output_file)

# Broader free-text alternative, kept for reference:
#query = 'T cells OR T lymphocytes'
query = '"humans"[MeSH Terms] AND "t lymphocyte subsets/immunology"[MeSH Terms]'
collect(
    query, output_file,
    start_index=0, batch_size=1000, batch_limit=None, max_failures=10
)

2019-01-29 17:02:15,743:INFO:root: Processing batch at start index 0 (num ids = 1000)
2019-01-29 17:02:34,145:INFO:root: Processing batch at start index 1000 (num ids = 1000)
2019-01-29 17:02:50,968:INFO:root: Processing batch at start index 2000 (num ids = 1000)
2019-01-29 17:03:08,668:INFO:root: Processing batch at start index 3000 (num ids = 1000)
2019-01-29 17:03:26,944:INFO:root: Processing batch at start index 4000 (num ids = 1000)
2019-01-29 17:03:46,505:INFO:root: Processing batch at start index 5000 (num ids = 1000)
2019-01-29 17:04:00,996:INFO:root: Processing batch at start index 6000 (num ids = 1000)
2019-01-29 17:04:17,830:INFO:root: Processing batch at start index 7000 (num ids = 1000)
2019-01-29 17:04:32,039:INFO:root: Processing batch at start index 8000 (num ids = 1000)
2019-01-29 17:04:48,625:INFO:root: Processing batch at start index 9000 (num ids = 1000)
2019-01-29 17:05:04,142:INFO:root: Processing batch at start index 10000 (num ids = 1000)
2019-01-29 17:05:21,247

In [46]:
# Read the collected CSV back in and preview its first rows.
df = pd.read_csv(
    osp.join(output_dir, 'pubmed_abstracts.csv')
)
df.head(n=5)

Unnamed: 0,abstract,date,id,terms,title
0,Sodium chloride (NaCl) has been proposed as a ...,,30554495,"Animals|Arthritis, Experimental/immunology/pat...",Sodium Chloride Aggravates Arthritis via Th17 ...
1,The severity of cryptococcosis in lupus from v...,2018-11-19,30456753,Animals|Cryptococcosis/etiology/genetics/immun...,Increased susceptibility against Cryptococcus ...
2,For efficacy of peptide vaccination immunother...,2018-12-01,30375705,Amino Acid Sequence|Antigen Presentation/immun...,Development of a T-cell receptor multimer with...
3,Bronchopulmonary dysplasia (BPD) is one of the...,2018-10-15,30324231,Biomarkers/blood|Bronchopulmonary Dysplasia/bl...,Increased serum Th2 chemokine levels are assoc...
4,Breast cancer remains one of the leading cause...,2018-10-03,30283982,Adult|Aged|Breast Neoplasms/immunology/patholo...,An autologous dendritic cell vaccine polarizes...


In [47]:
# Largest number of rows sharing a single id; 1 means every id occurs once.
df.loc[:, 'id'].value_counts().max()

1