## Scraping Pubmed (NCBI database) for abstract data

Using Biopython library via Python 3 to generate a data frame (.csv) from available medline data of a pre-defined search string

In [15]:
# import libraries
import Bio
import matplotlib
import pandas
from Bio import Entrez
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb
import pandas as pd

# Check version
print('Pandas version: %s' %pd.__version__)
print('Biopython version: %s' %Bio.__version__)
print('Matplotlib version: %s' %matplotlib.__version__)

Pandas version: 0.20.3
Biopython version: 1.66
Matplotlib version: 1.5.1+1795.g515ba4b


## Total number of Pubmed entries

In [52]:
Entrez.email = "jiajun.liu@adelaide.edu.au"
handle = Entrez.egquery(term="glucocor* AND immune")
record = Entrez.read(handle)

# number of entries
for row in record["eGQueryResult"]:
    numberentries = 0 
    if row["DbName"]=="pubmed":
        print(row["Count"])
        numberentries = row["Count"]   
        
numberentries        

8502


0

## Get list of PubMed  IDs

In [17]:
#esearch to generate list of PMID 
handle= Entrez.esearch(db="pubmed",term="glucocor* AND immune", usehistory = 'y', retmax = 8494 )
searchrecord= Entrez.read(handle)

idlist= searchrecord["IdList"]
count = len(idlist)
print("Found %i results" % count)
#print(idlist)

#Store Session cookie and QueryKey
webenv= searchrecord["WebEnv"]
query_key = searchrecord["QueryKey"]


print (query_key)
print (webenv)

Found 10 results
1
NCID_1_151528049_130.14.18.34_9001_1501575572_617495126_0MetA0_S_MegaStore_F_1


Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is A.N.Other@example.com, you can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.


In [4]:
## WRITE ID LIST TO FILE

IDLIST = list(idlist)
#IDLIST = pd.Series(IDLIST)
#IDLIST.to_csv('IDLIST.csv')

## Get Medline file of searched PMIDs

In [18]:
## Import Medline

import time

output = open("data.txt", "w")
# Determine Batch
batch_size = 10
# Fetch data in batches


for start in range(0, count, batch_size):
    end = min(count, start+batch_size)
    print("going to download record %i to %i" % (start+1, end));
    fetch_handle = Entrez.efetch(db = "pubmed", rettype = "medline", retmode = "text", 
                                 retstart= start, retmax = end, webenv=webenv, query_key=query_key, post = True)
    data = fetch_handle.read()
    output.write(data)
    time.sleep(3) # make sure to sleep between each batch
output.close()
    

going to download record 1 to 10


In [None]:
## DEPLOY ONLY IF USE HISTORY FAILS
#import time
#output = open("data.txt", "w")
# Determine Batch
#batch_size = 10
## Fetch data in batches

"""
for start in range(0, count, batch_size):
    end = min(count, start+batch_size)
    print("going to download record %i to %i" % (start+1, end))
    fetch_handle = Entrez.efetch(db = "pubmed", rettype = "medline", retmode = "text", 
                                 id = idlist, retstart= start, retmax = end, post = True)
    data = fetch_handle.read()
    output.write(data)
    time.sleep(3) # make sure to sleep between each batch
output.close()
"""    

## Generate DataFrame

In [80]:
from Bio import Medline
dataset = open("data.txt", "r") 

# Parse medline data
data = Medline.parse(dataset)  

In [81]:
medlinefile = list(data)

#reader = pd.read_csv('alldata.csv', error_bad_lines = False, nrows = 1000000, low_memory=False) ## Debug: Memory issue

In [82]:
# set up dataframe
d = pd.DataFrame(medlinefile)

In [None]:
# d['PMID'].value_counts() # Check How many duplicates there are
# d.loc[d.PMID.isin(['%s'])] %TOP_DUPLICATE_VALUE # Look at duplicate rows- check if they make sense

In [76]:
#drop duplicates
x= d.drop_duplicates(subset = ['PMID'])

19404519    128
19418731    128
19404521    128
19424046    128
19426224    128
19419738    128
19409827    128
21425972    128
19432889    128
19399500    128
19384873    127
19387765    127
19376397    127
19387032    127
19373457    127
19440116    127
19398001    127
19467321    127
19435796    127
19433626    127
19464360    127
19494313    127
19398916    127
19386083    127
19371813    127
19489103    127
19435479    127
25384870    127
21426128    127
19390148    127
           ... 
26767526      1
26799851      1
11054347      1
26749950      1
11024552      1
26801639      1
11061532      1
26756637      1
11007144      1
26762089      1
11016271      1
26745276      1
26748887      1
26746234      1
11054981      1
11056663      1
26801103      1
11024533      1
26810223      1
11067955      1
11060520      1
11019530      1
26790188      1
26807874      1
11060527      1
26812334      1
26745435      1
11042469      1
26778776      1
11044214      1
dtype: int64

In [96]:
# Write dataframe to csv, LOAD THIS FILE FOR ANALYSIS INSTEAD
x.to_csv('dataframe.csv', header = True, index = False)