In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
req = requests.get("http://medicine.iu.edu/departments/surgery/faculty/")
soup = BeautifulSoup(req.text, "lxml")

In [3]:
faculty_list = []
for child in soup.h4.children:
    faculty_list.append(child)

### Find all the faculty sub-urls and add the appropriate root URL.

In [15]:
sub_links = soup.select('h4 a')

full_links = []
names = []
for sl in sub_links:
    full_links.append('http://medicine.iu.edu/departments/surgery/faculty/' + sl['href'])
    names.append(sl.get_text())
print('# Names: ' + str(len(names)) + ', # Links: ' + str(len(full_links)))
print('Last Name: ' + names[-1])
print('Last link: ' + full_links[-1])

# Names: 113, # Links: 113
Last Name: Nicholas J. Zyromski, MD
Last link: http://medicine.iu.edu/departments/surgery/faculty/17324/zyromski-nicholas/


### Setup a dataframe

In [25]:
faculty = pd.DataFrame({
    "name": names,
    "webpage": full_links
})
faculty['name'][0]
faculty['webpage'][0]

'Joshua M. Adkinson, MD'

#### Progress bar code from https://github.com/alexanderkuk/log-progress

In [31]:
def log_progress(sequence, every=None, size=None, name='Items'):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

### Request faculty pages

In [32]:
page_reqs = []

for link in log_progress(full_links, every=1):
    page_reqs.append(requests.get(link))

### Parse all the requests

In [33]:
soups = []

for req in log_progress(page_reqs, every=1):
    soups.append(BeautifulSoup(req.text, 'html.parser'))

### Extract pubmed links

In [39]:
pubmed_links = []
for i in range(len(soups)):
    soup = soups[i]
    pubmed_links.append([])
    possible_pubmed_links = soup.select('#region-main div div div a')
    for l in possible_pubmed_links:       
        if 'ncbi.nlm.nih.gov/pubmed' in l['href']:
            for c in l.children:
                if c.name == 'img':
                    pubmed_links[i].append(l['href'])
faculty['pubmed'] = pubmed_links

# PB links: 113


### Request Pubmed Pages

In [83]:
pubmed_reqs = []
num_valid = 0
for i in log_progress(range(len(faculty)), every=1):
    if pubmed_links[i] == []:
        pubmed_reqs.append(None)
    else:
        pubmed_reqs.append(requests.get(pubmed_links[i][0]))
        num_valid += 1

### Parse Pubmed Requests

In [84]:
soups = []

for req in log_progress(pubmed_reqs, every=1):
    if req != None:
        soups.append(BeautifulSoup(req.text, 'html.parser'))
    else:
        soups.append(None)

### Extract number of search results

In [99]:
test = soups[2].find_all('h3', class_='result_count')
a = test[0].get_text()
a = 'hello 32'
b = [int(s) for s in a.split() if s.isdigit()]
b

[32]

In [121]:
num_publications = []
for i in log_progress(range(len(soups)), every=1):
    if soups[i] != None:
        result_count = soups[i].find_all('h3', class_='result_count')
        # If it didn't find anything, that means it was a link to a paper. So the count is one.
        if result_count == []:
            num_publications.append(1)
            continue
        result_count_str = result_count[0].get_text()
        parsed_publications = [int(s) for s in result_count_str.split() if s.isdigit()]
        # Last one is the final count
        num_publications.append(parsed_publications[-1])
    else:
        num_publications.append(None)
faculty['pubs'] = num_publications

### Cleanup and Save the DataFrame

In [158]:
cleaned_pubmed = []
for i in range(len(pubmed_links)):
    if len(pubmed_links[i])>0 :
        cleaned_pubmed.append(pubmed_links[i][0])
    else:
        cleaned_pubmed.append(None)
faculty['pubmed'] = cleaned_pubmed
faculty

Unnamed: 0,name,webpage,pubmed,pubs
0,"Joshua M. Adkinson, MD",http://medicine.iu.edu/departments/surgery/fac...,https://www.ncbi.nlm.nih.gov/pubmed/?term=Adki...,4.0
1,"Ambar Banerjee, MD",http://medicine.iu.edu/departments/surgery/fac...,https://www.ncbi.nlm.nih.gov/pubmed/?term=Bane...,22.0
2,"Christopher M. Bearden, MD",http://medicine.iu.edu/departments/surgery/fac...,https://www.ncbi.nlm.nih.gov/pubmed/?term=Bear...,6.0
3,"Daniel J. Beckman, MD",http://medicine.iu.edu/departments/surgery/fac...,https://www.ncbi.nlm.nih.gov/pubmed/?term=Beck...,10.0
4,"Teresa M. Bell, PhD",http://medicine.iu.edu/departments/surgery/fac...,,
5,"Deborah F. Billmire, MD",http://medicine.iu.edu/departments/surgery/fac...,https://www.ncbi.nlm.nih.gov/pubmed/?term=Bill...,17.0
6,"Thomas J. Birdas, MD",http://medicine.iu.edu/departments/surgery/fac...,https://www.ncbi.nlm.nih.gov/pubmed/?term=Bird...,16.0
7,"Andrea Bonetto, PhD",http://medicine.iu.edu/departments/surgery/fac...,https://www.ncbi.nlm.nih.gov/pubmed/?term=bone...,24.0
8,"Brian L. Brewer, MD",http://medicine.iu.edu/departments/surgery/fac...,https://www.ncbi.nlm.nih.gov/pubmed/?term=Brew...,4.0
9,"John W. Brown, MD",http://medicine.iu.edu/departments/surgery/fac...,https://www.ncbi.nlm.nih.gov/pubmed/?term=Brow...,215.0


In [159]:
writer = pd.ExcelWriter('IndianaSurgeryFacultyInfo.xlsx')
faculty.to_excel(writer,'Sheet1')
writer.save()