To start, we need to import the modules we'll need later.

In [1]:
import pandas as pd
import numpy as np
# pyplot (not the top-level matplotlib package) provides plot/subplots/show.
import matplotlib.pyplot as plt
from Bio import Entrez

Next, we decide what we want to search for and use the Entrez API to access PubMed data. ESearch finds the records that match our search term, so that we can retrieve them in full afterwards.  Below, define the keyword you want to search for.  It should be formatted 'like+this'; a search term for alcohol addiction has been given as an example.  You must also provide a valid email address to use with Entrez.

In [None]:
# Search term for PubMed; join the words with '+' rather than spaces.
keyword = "alcohol+addiction"

In [None]:
# Entrez needs a valid contact email address; set it before making any request.
Entrez.email = "kalawson@vassar.edu"

In [None]:
# Ask ESearch for the PubMed records whose title matches `keyword`.
# The parsed result holds record IDs (under 'IdList'), not the records themselves.
handle = Entrez.esearch(
    db='pubmed',
    term=keyword,
    field='title',  # restrict the match to the title field
    retmode='xml',
    idtype='acc',
)
try:
    data = Entrez.read(handle)
finally:
    # Release the connection once the XML has been parsed (or parsing failed).
    handle.close()

ESearch gives us a lot of information, but it's in the form of ID numbers, not titles, authors, or any other relevant information.  To make it easier to read, we'll use ESummary to search PubMed using the ID numbers we got from ESearch.

In [None]:
# IDs of the records that ESearch matched.
UID = data['IdList']

if UID:
    # ESummary accepts a comma-separated list of IDs, so every match is
    # summarised in a single request.
    record = Entrez.esummary(db='pubmed', id=','.join(UID))
    try:
        attributes_list = Entrez.read(record)
    finally:
        record.close()
else:
    # The search matched nothing, so there is nothing to summarise.
    attributes_list = []

# Print every field of every summary, one "key value" pair per line.
for attributes in attributes_list:
    for key, value in attributes.items():
        print(key, value)

In [None]:
# Searching PubMed for a specific author: publications per year.
# header=0 takes the column names from the first row of the file
# (read_csv rejects a boolean header).
data_author = pd.read_csv('citation_data.csv', header=0)

# Assumes that we only have one author in our data.
# Count the distinct titles within each year; the result has one row per
# year, with the count stored in the 'nunique_title' column.
data_author_pub = (
    data_author[['year', 'title']]
    .groupby('year')['title']
    .nunique()
    .reset_index(name='nunique_title')
)

# Axis labels and the title live on the Axes object and are set by calling
# its setter methods.
fig_author, author_pub = plt.subplots()
author_pub.plot(data_author_pub["year"], data_author_pub["nunique_title"])
author_pub.set_xlabel("Year")
author_pub.set_ylabel("Number of publications")
author_pub.set_title("Author's publications by year")
plt.show()


# Searching PubMed by keyword: mentions per year.
# header=0 takes the column names from the first row of the file
# (read_csv rejects a boolean header).
data_keyword = pd.read_csv('keywords.csv', header=0)

# Assumes that we only have the one keyword in our data.
# Count the distinct titles within each year; the result has one row per
# year, with the count stored in the 'nunique_title' column.
data_keyword_mentions = (
    data_keyword[['year', 'title']]
    .groupby('year')['title']
    .nunique()
    .reset_index(name='nunique_title')
)

# Axis labels and the title live on the Axes object and are set by calling
# its setter methods.
fig_keyword, keyword_mentions = plt.subplots()
keyword_mentions.plot(data_keyword_mentions["year"], data_keyword_mentions["nunique_title"])
keyword_mentions.set_xlabel("Year")
keyword_mentions.set_ylabel("Number of mentions")
keyword_mentions.set_title("Keyword mentions by year")
plt.show()


# If we can get both datasets to coexist: author publications on the top
# panel, keyword mentions on the bottom one.
# The axes get their own name so that the ESearch result held in `data`
# is not overwritten.
fig, axes = plt.subplots(2, 1)
# Both tables keep their per-year counts in the 'nunique_title' column.
axes[0].plot(data_author_pub["year"], data_author_pub["nunique_title"], color='b')
axes[1].plot(data_keyword_mentions["year"], data_keyword_mentions["nunique_title"], color='r')
axes[0].set_ylabel("Number of Publications")
axes[1].set_ylabel("Number of Mentions")
axes[1].set_xlabel("Year")
plt.show()