In [1]:
from Bio import Entrez
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
import numpy as np
import os
import pandas as pd
import time
import sys

In [2]:
output_notebook()

In [4]:
Entrez.email = os.environ['EMAIL']  # replace this line with your email, or set an environment variable

# search with term
handle = Entrez.esearch(db="pubmed", term="ptsd combat veteran", retmax=int(1e6))
record = Entrez.read(handle)
handle.close()

# display the number of entries
print(record['Count'])

2592


In [5]:
records = []      # list to store records
t0 = time.time()  # start time 
counter = 0       # records processed

# loop through the entries
for i in record["IdList"]:
    
    counter += 1
    
    # get the summary for this record
    h = Entrez.esummary(db="pubmed", id=i)
    rec, = Entrez.read(h)
    h.close()
    
    # update master list
    records.append(rec)
    
    # display progress
    dt = time.time() - t0
    line = '%d of %s. %.2f seconds' % (counter, record['Count'], dt)
    sys.stdout.write('\r%s' % line)

2592 of 2592. 1135.42 seconds

In [6]:
# convert to data frame
record_table = pd.DataFrame(records)

# save to pickle file
record_table.to_pickle('ptsd_pubmed.pkl')

In [7]:
# compute the publication year (first 4 characters or PubDate field)
record_table['PubYear'] = record_table['PubDate'].apply(lambda s: int(s[:4]))

In [9]:
# count the number of records per year
by_year = record_table.groupby('PubYear')
year_counts = by_year['ArticleIds'].count()

In [10]:
x = year_counts.index
y = year_counts.values

# plot number of publications per year
fig = figure(width=900, height=400)
fig.line(x, y)
fig.circle(x, y)
show(fig)