In [1]:
from Bio import Entrez
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
import numpy as np
import os
import pandas as pd
import time
import sys

In [2]:
output_notebook()

In [3]:
term = '"respiratory sinus arrhythmia"'
retmax = 2000

Entrez.email = os.environ['EMAIL']  # replace this line with your email, or set an environment variable

# search with term
handle = Entrez.esearch(db="pmc", term=term, retmax=retmax)
record = Entrez.read(handle)
handle.close()

# display the number of entries
print(record['Count'])

1825


In [4]:
records = []      # list to store records
t0 = time.time()  # start time 
counter = 0       # records processed

def get_link_ids(pmcid, linkname='pmc_pcm_cites'):
    '''
    For cited articles: linkname = 'pmc_pcm_cites'
    For articles citing this article: linkname = 'pmc_pmc_citedby'
    '''
    h = Entrez.elink(db='pmc', dbfrom='pmc', id=pmcid, linkname=linkname)
    r = Entrez.read(h)
    ids = [i['Id'] for i in r[0]['LinkSetDb'][0]['Link']]
    return ids

# loop through the entries
for i in record["IdList"]:
    
    counter += 1
    
    # get the summary for this record
    h = Entrez.esummary(db="pmc", id=i)
    rec, = Entrez.read(h)
    h.close()
    
    # update master list
    records.append(rec)
    
    # display progress
    dt = time.time() - t0
    line = '%d of %s. %.2f seconds' % (counter, record['Count'], dt)
    sys.stdout.write('\r%s' % line)

1825 of 1825. 715.42 seconds

In [5]:
# convert to data frame
record_table = pd.DataFrame(records)

# save to pickle file
record_table.to_pickle('rsa_pubmed.pkl')