In [2]:
import pandas as pd
import numpy as np
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [3]:
url = "http://med.stanford.edu/school/directory.html#faculty-labs"
html = urlopen(url)

In [5]:
soup = BeautifulSoup(html, 'lxml')

In [120]:
all_links = soup.find_all('a', href = True)

In [132]:
import re
results = []
for link in all_links:
    text = BeautifulSoup(str(link), "lxml").get_text()
    url = link.get("href")
    if re.match('http:.*',url):
        results.append([text, url])

In [133]:
results

[['Adult-care doctor',
  'http://stanfordhealthcare.org/content/shc/en/search-results.doctors.html'],
 ['Pediatrician or pediatric specialist',
  'http://www.stanfordchildrens.org/en/search/default?tab=doctors'],
 ['Obstetrician',
  'http://www.stanfordchildrens.org/en/search/default?tab=doctors&search-term=Obstetrics'],
 ['Adult care',
  'http://stanfordhealthcare.org/content/shc/en/search-results.clinics.html'],
 ['Pediatric care', 'http://www.stanfordchildrens.org/en/service/index?'],
 ['Obstetrics', 'http://www.stanfordchildrens.org/en/service/index?'],
 ['Stanford Health Care',
  'http://stanfordhealthcare.org/content/shc/en/search-results.clinics.html'],
 ["Stanford Children's Health",
  'http://www.stanfordchildrens.org/en/locations-directions'],
 ['Emergency Department',
  'http://stanfordhealthcare.org/medical-clinics/emergency-department.html'],
 ['Lane Library', 'http://lane.stanford.edu/index.html'],
 ['Make a gift online', 'http://pgnet.stanford.edu/goto/medgift'],
 ['Ways

# Pubmed

In [134]:
import requests

In [339]:
url_base = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
params = dict(db = 'pubmed', term = 'stanford[ad]Environment+and+Public+Health[mh]', retmode = 'json', retmax = '30')
rsp = requests.get(url_base, params=params)


In [340]:
print(rsp.url)

https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=stanford%5Bad%5DEnvironment%2Band%2BPublic%2BHealth%5Bmh%5D&retmode=json&retmax=30


In [341]:
len(rsp.json()['esearchresult']['idlist'])

30

In [342]:
id_list = rsp.json()['esearchresult']['idlist']

In [343]:
','.join(id_list)

'30683880,30670697,30667499,30586832,30586505,30586170,30575872,30572165,30562241,30559176,30558033,30547806,30545871,30544793,30522922,30537958,30523329,30531929,30505424,30504286,30501481,30513076,30510157,30509974,30497546,30497450,30496171,30487548,30482205,30481383'

In [348]:
# fetch one paper
paper_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id="+','.join(id_list)
paper_rsp = requests.get(paper_url)


In [350]:
print(paper_url)

http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=30683880,30670697,30667499,30586832,30586505,30586170,30575872,30572165,30562241,30559176,30558033,30547806,30545871,30544793,30522922,30537958,30523329,30531929,30505424,30504286,30501481,30513076,30510157,30509974,30497546,30497450,30496171,30487548,30482205,30481383


In [352]:
print(paper_rsp)

<Response [200]>


In [360]:
soup_paper = BeautifulSoup(paper_rsp, 'lxml')
print(soup_paper.find_all('Author'))

TypeError: object of type 'Response' has no len()

In [76]:
from Bio import Entrez
import Bio; print(Bio.__version__)
import pandas as pd
import numpy as np
import re
import time
import math

1.73


In [95]:
def search(query, retmax):
    Entrez.email = 'sehjk@stanford.edu'
    handle = Entrez.esearch(db='pubmed', 
                            retmax=retmax,
                            retmode='xml', 
                            term=query)
    results = Entrez.read(handle)
    return results

In [96]:
def fetch_details(idl):
    if(type(idl) == list):
        ids = ','.join(idl)
    else:
        ids = idl
    Entrez.email = 'sehjk@stanford.edu'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    results = Entrez.read(handle)
    return results

In [98]:
meshTerms = ['Environment and Public Health',
            'Health Care Economics and Organizations',
            'Health Care Facilities, Manpower, and Services',
            'Health Care Quality, Access, and Evaluation',
            'Health Services Administration',
            'Population Characteristics']

authors = [] 
pmids = set()
    
for meshTerm in meshTerms:
    
    meshString = meshTerm.replace(" ", "+")
    searchString = 'stanford[ad]'+ meshString + '[mh]'
    print('Submitting {} search to Pubmed'.format(searchString))
    results = search(searchString, retmax = 50000)
    ids = results['IdList']
    
    print('Search returned {} ids for {}'.format(len(ids), meshTerm))
    
    if len(ids) != 0:
        
        if len(ids) >= 9999:
            id_list = np.array_split(np.array(ids), math.ceil(len(ids)/9999))
            
            for mini_list in id_list:
                print('Fetching {} Ids from Pubmed...'.format(len(mini_list)))
                papers = fetch_details(list(mini_list))
                print('Fetched details. Adding to set..')
                for i, paper in enumerate(papers['PubmedArticle']):
                    pmid = str(paper['MedlineCitation']['PMID'])
                    if pmid not in pmids:
                        pmids.add(pmid)
                        author_list = paper['MedlineCitation']['Article']['AuthorList']
                        for author in author_list:
                            if author['AffiliationInfo']: # otherwise typically collectives without affiliation
                                affiliation = author['AffiliationInfo'][0]['Affiliation']
                                if re.match(r'.*\bstanford.*', affiliation, re.IGNORECASE):
                                    #identifier = author['Identifier']
                                    if 'CollectiveName' in author:
                                        name = author['CollectiveName']
                                    else:
                                        lastname = author.get('LastName','NA')
                                        firstname = author.get('ForeName','NA')
                                        initials = author.get('Initials','NA')
                                        name = firstname + " " + lastname
                                    authors.append([pmid, name, affiliation])
                print('Added. Papers analyzed: {0}. {1} Authors detected'.format(len(pmids), len(authors)))
        else:
            id_list = ids
            print('Fetching {} Ids from Pubmed...'.format(len(id_list)))
            papers = fetch_details(list(id_list))
            print('Fetched details. Adding to set..')
            for i, paper in enumerate(papers['PubmedArticle']):
                pmid = str(paper['MedlineCitation']['PMID'])
                if pmid not in pmids:
                    pmids.add(pmid)
                    author_list = paper['MedlineCitation']['Article']['AuthorList']
                    for author in author_list:
                        if author['AffiliationInfo']: # otherwise typically collectives without affiliation
                            affiliation = author['AffiliationInfo'][0]['Affiliation']
                            if re.match(r'.*\bstanford.*', affiliation, re.IGNORECASE):
                                #identifier = author['Identifier']
                                if 'CollectiveName' in author:
                                    name = author['CollectiveName']
                                else:
                                    lastname = author.get('LastName','NA')
                                    firstname = author.get('ForeName','NA')
                                    initials = author.get('Initials','NA')
                                    name = firstname + " " + lastname
                                authors.append([pmid, name, affiliation])
            print('Added. Papers analyzed: {0}. {1} Authors detected'.format(len(pmids), len(authors)))

Submitting stanford[ad]Environment+and+Public+Health[mh] search to Pubmed
Search returned 27934 ids for Environment and Public Health
Fetching 9312 Ids from Pubmed...
Fetched details. Adding to set..
Added. Papers analyzed: 9312. 25574 Authors detected
Fetching 9311 Ids from Pubmed...
Fetched details. Adding to set..
Added. Papers analyzed: 18623. 39522 Authors detected
Fetching 9311 Ids from Pubmed...
Fetched details. Adding to set..
Added. Papers analyzed: 27934. 48840 Authors detected
Submitting stanford[ad]Health+Care Economics+and+Organizations[mh] search to Pubmed
Search returned 4380 ids for Health Care Economics and Organizations
Fetching 4380 Ids from Pubmed...
Fetched details. Adding to set..
Added. Papers analyzed: 29698. 51320 Authors detected
Submitting stanford[ad]Health+Care Facilities,+Manpower,+and+Services[mh] search to Pubmed
Search returned 7020 ids for Health Care Facilities, Manpower, and Services
Fetching 7020 Ids from Pubmed...
Fetched details. Adding to set..
A

In [99]:
authors_df = pd.DataFrame(authors)
authors_df.to_csv("authors.csv")