## PubMed Data Mining Algorithm

The following code is a data mining algorithm that scrapes article citation information from PubMed (NCBI). The data is organized into pandas data frames, and natural language processing is applied to the research affiliation data to pinpoint the location of the research.

In [1]:
# Libraries
from Bio import Entrez
from Bio import Medline
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline  
import numpy as np
import pandas as pd
import sqlite3
import timeit
from collections import defaultdict
from geopy.geocoders import Nominatim
from mpl_toolkits.basemap import Basemap

In [2]:
###########################
### Functions: Methods: ###

# setting up with your email
class setup():
    """Interactive prompts used to configure a PubMed query session."""

    # setup the email
    def email(self):
        """Prompt for an email address, echo it back, and return it."""
        email = input("Please enter your email: ")
        print ("You enter: ", email)
        return email

    # returns list of keywords you are interested in
    def keyword(self, numbers):
        """Prompt for keywords until `numbers` of them have been collected.

        Parameters
        ----------
        numbers : int
            How many keywords to ask for. Zero or a negative value asks
            for nothing and returns an empty list.

        Returns
        -------
        list of str
            The keywords in the order they were typed.
        """
        keyword_lst = []
        # `<` rather than `!=`: with `!=` a negative or fractional count can
        # never be matched by len(), so the prompt would repeat forever.
        while len(keyword_lst) < numbers:
            keyword_lst.append(input("Input keyword: "))
        return keyword_lst
    
# retrieve the data
class retrieve():
    """Interactive helper that collects the keywords to retrieve."""

    # returns list of keywords you are interested in
    def keyword(self, numbers):
        """Prompt for keywords until `numbers` of them have been collected.

        Parameters
        ----------
        numbers : int
            How many keywords to ask for. Zero or a negative value asks
            for nothing and returns an empty list.

        Returns
        -------
        list of str
            The keywords in the order they were typed.
        """
        keyword_lst = []
        # `<` rather than `!=`: with `!=` a negative or fractional count can
        # never be matched by len(), so the prompt would repeat forever.
        while len(keyword_lst) < numbers:
            keyword_lst.append(input("Input keyword: "))
        return keyword_lst

In [3]:
# Getting data from Medline: journals, authors, location
def record_fetch(pmid):
    """Fetch MEDLINE records for the given PubMed IDs and tabulate them.

    Parameters
    ----------
    pmid : sequence of str
        PubMed IDs passed to ``Entrez.efetch``.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``authors`` (AU),
        ``firstauthor`` (first entry of AU), ``journals`` (JT),
        ``datepublication`` (DP), ``place`` (PL), ``mesh`` (MH), ``pmid``
        (PMID) and ``affliation`` (AD). Rows with any missing field are
        dropped. An empty `pmid` yields an empty frame with the same columns.

    Timing information for each stage is printed as a side effect.
    """
    columns = ['authors', 'firstauthor', 'journals', 'datepublication',
               'place', 'mesh', 'pmid', 'affliation']

    # length of pmid - test sake
    print ("The length of pmid is", len(pmid))

    # nothing to look up: skip the network round-trip entirely
    if len(pmid) == 0:
        return pd.DataFrame(columns=columns)

    # fetching pubmed articles using pmid ids
    start_time_1 = timeit.default_timer()
    fetch_handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline",retmode="text")
    print ("The handle fetch program time is ", timeit.default_timer() - start_time_1)

    # parsing Medline; the handle is released even if parsing fails
    start_time_2 = timeit.default_timer()
    try:
        records = list(Medline.parse(fetch_handle))
    finally:
        fetch_handle.close()
    print ("The Medline parse program time is ", timeit.default_timer() - start_time_2)

    # one dict per record; .get() leaves None for absent fields so that
    # dropna() below can discard incomplete rows
    start_time_3 = timeit.default_timer()
    rows = []
    for record in records:
        authors = record.get("AU")
        rows.append({
            'authors': authors,
            # guard both a missing and an empty author list
            'firstauthor': authors[0] if authors else None,
            'journals': record.get("JT"),
            'datepublication': record.get("DP"),
            'place': record.get("PL"),
            'mesh': record.get("MH"),
            'pmid': record.get("PMID"),
            'affliation': record.get("AD"),
        })
    print ("The author program time is ", timeit.default_timer() - start_time_3)

    # data frame
    data = pd.DataFrame(rows, columns=columns)

    # dropping miss data (None types)
    data = data.dropna()

    return data

In [4]:
# matching, analysis, and graph algorithms
def analysis(keywords, date):
    """Download PubMed citation data for every keyword.

    For each keyword the function first asks PubMed how many articles match,
    derives a sample size from that count, searches again for that many IDs,
    and hands the sorted IDs to ``record_fetch``.

    Parameters
    ----------
    keywords : sequence of str
        Search terms, queried one at a time.
    date : int
        Currently unused: the search window is fixed to 2015/01 - 2016/01
        by the ``mindate``/``maxdate`` arguments below.
        NOTE(review): an earlier, commented-out version of the search passed
        a relative-date argument instead - confirm which is wanted.

    Returns
    -------
    list
        One ``record_fetch`` result per keyword, in keyword order.

    Timing information for each stage is printed as a side effect.
    """
    article_lst = []
    start_time_final = timeit.default_timer()
    for keyword in keywords:
        print(keyword)

        # to find keyword population with count
        start_time = timeit.default_timer()
        count = int(_pubmed_search(keyword, 1)['Count'])

        # up to 10000 matches: request them all; beyond that request 30%
        if count <= 10000:
            sample = 10000
        else:
            sample = int(count * 0.30)

        search_results = _pubmed_search(keyword, sample)
        print ("The initial handle program time: ", timeit.default_timer() - start_time)

        print ("Sample size is ", sample)

        # the IDs are strings, so this ordering is lexicographic
        start_time_2 = timeit.default_timer()
        pmid = sorted(search_results["IdList"])
        print ("The sort program time ", timeit.default_timer() - start_time_2)

        article_lst.append(record_fetch(pmid))
    print ("The final program time: ", timeit.default_timer() - start_time_final)
    return article_lst


def _pubmed_search(term, retmax):
    """Run one PubMed ESearch for `term` and return the parsed result."""
    search_handle = Entrez.esearch(db="pubmed",term=term,retmax=retmax, mindate='2015/01', maxdate='2016/01', usehistory="y")
    try:
        return Entrez.read(search_handle)
    finally:
        # release the response once it has been parsed (or parsing failed)
        search_handle.close()

# Making csv files for each keyword in the data frame
def make_csv(df, keywords=None):
    """Write each data frame in `df` to ``<keyword>.csv``.

    Parameters
    ----------
    df : sequence of pandas.DataFrame
        Frames to save, in the same order as the keywords.
    keywords : sequence of str, optional
        File-name stems, one per frame. When omitted, the module-level
        ``keywords`` list is used, as in the original one-argument form.

    The files are written to the current working directory with pandas'
    default settings, so the frame index is saved as the first column.
    """
    if keywords is None:
        # fall back to the notebook-level keyword list
        keywords = globals()['keywords']
    for idx, val in enumerate(df):
        val.to_csv('%s.csv' % keywords[idx], sep=',')
        
# Make each of the csv files into a listed dataframe
def make_df(keywords):
    """Load ``<keyword>.csv`` for every keyword.

    Parameters
    ----------
    keywords : sequence of str
        File-name stems; each is read from the current working directory.

    Returns
    -------
    list of pandas.DataFrame
        One frame per keyword, in keyword order.
    """
    return [pd.read_csv('%s.csv' % name) for name in keywords]

# random sample of 1000 for each data frame of keywords
def make_sampled(df):
    """Replace every frame in `df` with a random sample of at most 1000 rows.

    Frames with fewer than 1000 rows keep all their rows, in shuffled order.
    The list is modified in place and also returned.
    """
    for position in range(len(df)):
        frame = df[position]
        df[position] = frame.sample(n=min(len(frame), 1000))
    return df

# matching articles
def matching(df, keywords):
    """Count the PubMed IDs shared between the keywords' result sets.

    Parameters
    ----------
    df : sequence of mappings/frames, or a single frame
        Each item must expose its IDs under ``item['pmid']``. If iterating
        `df` that way raises TypeError, ``df['pmid']`` is used instead and
        each of its elements is treated as one ID collection.
    keywords : sequence of str
        Labels for the ID collections, in the same order.

    Returns
    -------
    tuple
        ``(weights, combos, weighted_el_, weighted_el_lst)`` where

        * ``weights`` - number of shared IDs for each matching pair;
        * ``combos`` - the matching ``(keyword_a, keyword_b)`` pairs;
        * ``weighted_el_`` - one ``zip(combos, weights)`` object per match
          (under Python 3 these are lazy and all refer to the final lists);
        * ``weighted_el_lst`` - ``(keyword_a, keyword_b, weight)`` tuples.

    Notes
    -----
    Every collection except the last is also compared with itself, so the
    result contains ``(k, k)`` pairs weighted by the number of distinct IDs
    of ``k``.
    NOTE(review): confirm whether these self-pairs are wanted.
    """
    try:
        articles = [item['pmid'] for item in df]
    except TypeError:
        articles = list(df['pmid'])

    # build each ID set once instead of once per comparison
    pmid_sets = [set(pmids) for pmids in articles]

    weights = []
    combos = []
    weighted_el_ = []
    weighted_el_lst = []

    # The last collection is never used as the left-hand side. Stopping the
    # range early is a value comparison; an identity test (`is not`) on ints
    # stops working once the index leaves CPython's small-int cache.
    for article in range(len(articles) - 1):
        for other in range(article, len(articles)):
            shared = pmid_sets[article] & pmid_sets[other]
            if not shared:
                continue
            weight = len(shared)

            combos.append((keywords[article], keywords[other]))
            weights.append(weight)
            weighted_el_.append(zip(combos, weights))

            # (node, node, weight) form for weighted edges
            weighted_el_lst.append((keywords[article], keywords[other], weight))

    return (weights, combos, weighted_el_, weighted_el_lst)


# make a weighted graph of the keywords with the keyword of interest
def target_graph(weighted_el_lst):
    """Draw the weighted keyword graph restricted to one keyword.

    The user is prompted for a keyword; only the edges that contain it are
    drawn, each labelled with its integer weight.

    Parameters
    ----------
    weighted_el_lst : iterable of (str, str, number)
        Weighted edges as ``(keyword_a, keyword_b, weight)`` tuples.
    """
    G = nx.Graph()

    # specify the keyword that you're interested in
    interested_keyword = input("Please enter the keyword you are interested in: ")
    print ("You enter: ", interested_keyword)

    # Use the edge list that was passed in rather than a notebook global,
    # keeping only the combos that include the interested keyword.
    graph_of_interest = [edge for edge in weighted_el_lst if interested_keyword in edge]

    G.add_weighted_edges_from(graph_of_interest)
    pos = nx.spring_layout(G)

    edge_weight = {(u, v): int(d['weight']) for u, v, d in G.edges(data=True)}

    nx.draw_networkx_edge_labels(G,pos,edge_labels=edge_weight)
    nx.draw_networkx_nodes(G,pos)
    nx.draw_networkx_edges(G,pos)
    nx.draw_networkx_labels(G,pos)
    plt.show()

# make a weighted graph of the keywords
def all_graph(weighted_el_lst):
    """Draw the full weighted keyword graph.

    Parameters
    ----------
    weighted_el_lst : iterable of (str, str, number)
        Weighted edges as ``(keyword_a, keyword_b, weight)`` tuples.

    Every edge is drawn and labelled with its integer weight.
    """
    G = nx.Graph()

    # Build the graph from the edge list that was passed in rather than a
    # notebook global. No link-prediction scores are computed here: nothing
    # in this function reads them, and requesting them for node pairs on a
    # still-empty graph can fail.
    G.add_weighted_edges_from(weighted_el_lst)
    pos = nx.spring_layout(G)

    edge_weight = {(u, v): int(d['weight']) for u, v, d in G.edges(data=True)}

    nx.draw_networkx_edge_labels(G,pos,edge_labels=edge_weight)
    nx.draw_networkx_nodes(G,pos)
    nx.draw_networkx_edges(G,pos)
    nx.draw_networkx_labels(G,pos)
    plt.show()


# mapping  countries
def map_countries(data):
    """Geocode the country names and plot them on a world map.

    Parameters
    ----------
    data
        Currently unused: the names are read from the module-level
        ``unique_countries``.
        NOTE(review): confirm whether callers mean `data` to be that list.

    Names the geocoder cannot resolve are skipped when plotting. The raw
    geocoding results are printed as a side effect.
    """
    geolocator = Nominatim()
    location = [geolocator.geocode(country) for country in unique_countries]
    #institution = [geolocator.geocode(affliation) for affliation in unique_affliations]
    print(location)
    # make sure the value of resolution is a lowercase L,
    #  for 'low', not a numeral 1
    my_map = Basemap(projection='robin', lat_0=0, lon_0=-100, resolution='l', area_thresh=1000.0)

    my_map.drawcoastlines()
    my_map.drawcountries()
    my_map.fillcontinents(color='coral')
    my_map.drawmapboundary()

    my_map.drawmeridians(np.arange(0, 360, 30))
    my_map.drawparallels(np.arange(-90, 90, 30))

    # geocode() gives None for a name it cannot resolve; indexing None
    # would raise, so only resolved places are plotted
    resolved = [place for place in location if place is not None]
    lons = [place[1][1] for place in resolved]
    lats = [place[1][0] for place in resolved]
    x,y = my_map(lons, lats)
    my_map.plot(x, y, 'bo', markersize=10)
    plt.show()

In [5]:
### 1: Setup

# Must run this and enter your email for NCBI.
# The address typed at the prompt is stored on Entrez.email, so it has to be
# set before any Entrez query below is made.
init = setup()
Entrez.email = init.email()

Please enter your email: kjchoi10@gmail.com
You enter:  kjchoi10@gmail.com


In [6]:
# Example keyword list: one PubMed search is run per entry
keywords = [
    "Robotic Surgery",
    "Cardiac Surgery",
    "Colorectal Surgery",
    "General Surgery",
    "Gynecologic Surgery",
    "Head & Neck Surgery",
    "Thoracic Surgery",
    "Urologic Surgery",
]

# Number of days in the past you want to look at (5 years of 365 days)
date = 5 * 365

In [None]:
# Analysis: returns a list with one pandas data frame per keyword
# (in keyword order); the printed output shows the keyword, sample size,
# and timing for each keyword
df = analysis(keywords, date)

Robotic Surgery
The initial handle program time:  1.0315892290018382
Sample size is  10000
The sort program time  0.00013467100143316202
The length of pmid is 1620
The handle fetch program time is  0.723740966997866
The Medline parse program time is  26.098299224002403
The author program time is  0.005180276999453781
Cardiac Surgery
The initial handle program time:  1.715174503999151
Sample size is  5697
The sort program time  0.0011230220006837044
The length of pmid is 5697
The handle fetch program time is 

In [17]:
# Give each column of interest an explicit name: every variable below is a
# list holding that column from each keyword's data frame.
# Note that `date` is rebound here from the look-back day count to the
# list of publication-date columns.
articles, place, date, affliation = (
    [frame[column] for frame in df]
    for column in ('pmid', 'place', 'datepublication', 'affliation')
)

In [3]:
import sqlite3

In [18]:
# Finds all matching articles between keywords.
# `match` is the 4-tuple returned by matching():
# (weights, combos, weighted_el_, weighted_el_lst)
match = matching(df, keywords)

In [19]:
#### plotting graphs:
# Place for first keyword, upper-cased to avoid duplicates of country names.
# The vectorised .str.upper() is used because map() is a lazy iterator under
# Python 3 and so is not a usable column value.
place = df[0]['place'].str.upper().tolist()
df[0]['place'] = place
unique_countries = df[0]['place'].unique() # getting the unique country names

In [24]:
#### plotting the graphs:
# Affiliation for first keyword, upper-cased so the same institution written
# in different cases counts once. The vectorised .str.upper() is used because
# map() is a lazy iterator under Python 3 and so is not a usable column value.
df[0]['affliation'] = df[0]['affliation'].str.upper()
affliation = df[0]['affliation']
unique_affliations = df[0]['affliation'].unique() # getting the unique affiliation strings

In [28]:
# Inspect one raw affiliation string from the second keyword's frame
# (the entry keyed 51)
df[1]['affliation'][51]

'Divisions of Cardiovascular Surgery and Cardiology, Peter Munk Cardiac Centre, Toronto General Hospital, University of Toronto, Toronto, Ontario, Canada. Electronic address: tirone.david@uhn.ca. Divisions of Cardiovascular Surgery and Cardiology, Peter Munk Cardiac Centre, Toronto General Hospital, University of Toronto, Toronto, Ontario, Canada. Divisions of Cardiovascular Surgery and Cardiology, Peter Munk Cardiac Centre, Toronto General Hospital, University of Toronto, Toronto, Ontario, Canada. Divisions of Cardiovascular Surgery and Cardiology, Peter Munk Cardiac Centre, Toronto General Hospital, University of Toronto, Toronto, Ontario, Canada. Divisions of Cardiovascular Surgery and Cardiology, Peter Munk Cardiac Centre, Toronto General Hospital, University of Toronto, Toronto, Ontario, Canada. Divisions of Cardiovascular Surgery and Cardiology, Peter Munk Cardiac Centre, Toronto General Hospital, University of Toronto, Toronto, Ontario, Canada.'

In [115]:
# Testing affiliation geolocation: pick one affiliation string from the
# second keyword's frame (the entry keyed 45) as the sample for the cells below
import nltk
test = df[1]['affliation'][45]

In [116]:
# Display the sample affiliation string
test

'From the Department of Radiology and Research Institute of Radiology, University of Ulsan College of Medicine, Asan Medical Center, Seoul, Korea (HX, HJK, SL, JWL, HNL, MYK); Department of Radiology, The First Affiliated Hospital of Nanjing Medical University, Nanjing, Jiangsu Province, China (HX); Department of Thoracic and Cardiovascular Surgery (DKK); and Pathology, University of Ulsan College of Medicine, Asan Medical Center, Seoul, Korea (JSS).'

In [117]:
# Plain whitespace split of the sample; punctuation stays attached to words
token = test.split()

In [118]:
# Tokenise the sample with NLTK so that punctuation becomes separate tokens
# (see the displayed list: ',', '(', ');' appear as their own entries)
from nltk.tokenize import wordpunct_tokenize 
tokened = wordpunct_tokenize(test)
tokened

['From',
 'the',
 'Department',
 'of',
 'Radiology',
 'and',
 'Research',
 'Institute',
 'of',
 'Radiology',
 ',',
 'University',
 'of',
 'Ulsan',
 'College',
 'of',
 'Medicine',
 ',',
 'Asan',
 'Medical',
 'Center',
 ',',
 'Seoul',
 ',',
 'Korea',
 '(',
 'HX',
 ',',
 'HJK',
 ',',
 'SL',
 ',',
 'JWL',
 ',',
 'HNL',
 ',',
 'MYK',
 ');',
 'Department',
 'of',
 'Radiology',
 ',',
 'The',
 'First',
 'Affiliated',
 'Hospital',
 'of',
 'Nanjing',
 'Medical',
 'University',
 ',',
 'Nanjing',
 ',',
 'Jiangsu',
 'Province',
 ',',
 'China',
 '(',
 'HX',
 ');',
 'Department',
 'of',
 'Thoracic',
 'and',
 'Cardiovascular',
 'Surgery',
 '(',
 'DKK',
 ');',
 'and',
 'Pathology',
 ',',
 'University',
 'of',
 'Ulsan',
 'College',
 'of',
 'Medicine',
 ',',
 'Asan',
 'Medical',
 'Center',
 ',',
 'Seoul',
 ',',
 'Korea',
 '(',
 'JSS',
 ').']

In [119]:
# how to get the location of the research
import string
# Drop every ASCII punctuation character from each token, then discard the
# tokens that end up empty (i.e. those that were punctuation only)
tokened = [
    cleaned
    for cleaned in (
        ''.join(filter(lambda ch: ch not in string.punctuation, piece))
        for piece in tokened
    )
    if cleaned
]

In [122]:
# get rid of noise_list_words
def clean_phrase(data):
    """Tokenise an affiliation string and drop the noise words.

    Parameters
    ----------
    data : str
        Affiliation text to clean. An empty or missing value yields ``[]``.

    Returns
    -------
    list of str
        The tokens of `data` (as produced by ``wordpunct_tokenize``, so
        punctuation runs are tokens too) whose lower-cased form is not a
        noise word. Original casing is kept.
    """
    if not data:
        return []
    # entries are lower-case because tokens are lower-cased before the lookup
    noise_words_set = {'of', 'the', 'in', 'for', 'at', 'and', 'from', 'com', 'org', 'md'}
    tokened = wordpunct_tokenize(data)
    return [phrase for phrase in tokened if phrase.lower() not in noise_words_set]

# working
# Every entry is lower-case because tokens are lower-cased before the lookup
# (an upper-case 'MD' entry could never match).
noise_words_set = ['of', 'the', 'in', 'for', 'at', 'and', 'from', 'com', 'org', 'md']
stuff = [phrase for phrase in tokened if phrase.lower() not in noise_words_set]
print(stuff)
# Most frequent remaining token. When several tokens tie for the highest
# count, the winner depends on set iteration order.
location_set = max(set(stuff), key=stuff.count)
location_set
# use a database of city names to match the list

['Department', 'Radiology', 'Research', 'Institute', 'Radiology', 'University', 'Ulsan', 'College', 'Medicine', 'Asan', 'Medical', 'Center', 'Seoul', 'Korea', 'HX', 'HJK', 'SL', 'JWL', 'HNL', 'MYK', 'Department', 'Radiology', 'First', 'Affiliated', 'Hospital', 'Nanjing', 'Medical', 'University', 'Nanjing', 'Jiangsu', 'Province', 'China', 'HX', 'Department', 'Thoracic', 'Cardiovascular', 'Surgery', 'DKK', 'Pathology', 'University', 'Ulsan', 'College', 'Medicine', 'Asan', 'Medical', 'Center', 'Seoul', 'Korea', 'JSS']


'Radiology'

In [78]:
# Geocode the most frequent token found above.
# NOTE: for the sample affiliation that token is 'Radiology', which is not a
# place name, so this lookup is only a placeholder for a real location match.
geolocator = Nominatim()
location = geolocator.geocode(location_set)
location

In [123]:
# Display the sample affiliation string again for reference
test

'From the Department of Radiology and Research Institute of Radiology, University of Ulsan College of Medicine, Asan Medical Center, Seoul, Korea (HX, HJK, SL, JWL, HNL, MYK); Department of Radiology, The First Affiliated Hospital of Nanjing Medical University, Nanjing, Jiangsu Province, China (HX); Department of Thoracic and Cardiovascular Surgery (DKK); and Pathology, University of Ulsan College of Medicine, Asan Medical Center, Seoul, Korea (JSS).'

2
