In [34]:
from bs4 import BeautifulSoup
import requests

import pandas as pd
import os
import time

# Listing page of the faculty-expertise search; '&page=N' is appended to this
# URL to request the later result pages.
URL_BASE = 'http://vcresearch.berkeley.edu/faculty-expertise?name=+&expertise_area=&term_node_tid_depth='

# Directory that scraped_file_loc() joins output file names onto.
SCRAPED_DATA_FILE_LOCATION = 'scraped_data'

# Number of listing pages to fetch: the URL above plus pages 1..N-1.
N_PAGES_TO_SCRAPE = 49 #hard coded by looking at how many pages

In [31]:
def scraped_file_loc(file_name):
    """Return the path of the CSV called `file_name` in the scraped-data directory.

    `file_name` is given without extension; '.csv' is appended to the joined path.
    """
    path_without_extension = os.path.join(SCRAPED_DATA_FILE_LOCATION, file_name)
    return path_without_extension + '.csv'

In [32]:
# Module-level list holding the rows scraped from the listing pages
# (one [name, profile url, expertise, department] list per faculty member).
all_rows = []


def get_rows_from_soup(soup):
    """Extract one row per faculty member from a parsed listing page.

    Reads every <tr> in the first <tbody> of the first table whose class is
    'views-table cols-3'.

    Returns:
        A list of [faculty_name, faculty_profile_url, expertise_str, department]
        lists. expertise_str is the comma-joined text of the links in the
        second cell (empty string when that cell has no links).

    Raises:
        IndexError: if the table, its tbody, a cell, the name link or the
            department link is missing.
    """
    listing_table = soup.find_all('table', {'class': 'views-table cols-3'})[0]
    table_body = listing_table.find_all('tbody')[0]

    def parse_row(tr):
        # Cells are, in order: name (linked to the profile), expertise links, department link.
        cells = tr.find_all('td')
        profile_link = cells[0].find_all('a')[0]
        name = profile_link.text
        profile_url = profile_link['href']
        expertise = ','.join(link.text for link in cells[1].find_all('a'))
        department_name = cells[2].find_all('a')[0].text
        return [name, profile_url, expertise, department_name]

    return [parse_row(tr) for tr in table_body.find_all('tr')]


In [42]:
# Column names for the listing rows, in the order get_rows_from_soup builds them;
# 'l_expertise' holds the comma-joined expertise string.
headers = ['faculty_name', 'faculty_profile_url', 'l_expertise', 'department']

In [36]:
# Fetch every listing page and collect its table rows.
# The list is rebuilt from scratch here so that re-running this cell does not
# append a second copy of every row to a list left over from an earlier run.
all_rows = []

# The first page is the bare listing URL; later pages add a '&page=N' parameter.
page_urls = [URL_BASE] + ['{}&page={}'.format(URL_BASE, page_n)
                          for page_n in range(1, N_PAGES_TO_SCRAPE)]

for page_url in page_urls:
    r = requests.get(page_url)
    # Stop with a clear HTTP error instead of an IndexError from parsing an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.content)
    all_rows.extend(get_rows_from_soup(soup))
    time.sleep(0.5) #500ms sleep


1
2
3
4
5
6
7
8
9
9
10
11
12
13
14
15
16
17
18
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
44
45
46
47
48


In [49]:
# One row per faculty member collected from the listing pages.
df = pd.DataFrame(all_rows, columns=headers)

In [50]:
df.head()

Unnamed: 0,faculty_name,faculty_profile_url,l_expertise,department
0,David A. Aaker,/faculty/david-aaker,"business,marketing,branding",Haas School of Business
1,Pieter Abbeel,/faculty/pieter-abbeel,"robotics,machine learning",Division of Computer Science/EECS
2,Elizabeth Abel,/faculty/elizabeth-abel,"feminist theory,psychoanalysis,Virginia Woolf,...",Department of English
3,Dor Abrahamson,/faculty/dor-abrahamson,"mathematical cognition,design-based research,m...",Graduate School of Education
4,Norman Abrahamson,/faculty/norman-abrahamson,"civil and environmental engineering,earthquake...",Department of Civil and Environmental Engineering


In [103]:
FACULTY_PROFILE_BASE_URL = 'http://vcresearch.berkeley.edu'

def scrape_profile(faculty_profile_url):
    """Scrape one faculty profile page into a flat row of 26 values.

    Args:
        faculty_profile_url: site-relative profile path as found in the listing
            table, e.g. '/faculty/nir-yosef'.

    Returns:
        list: [title_name, faculty_site_url, lab_url, faculty_email, description,
        description_links] followed by PAST_N_RESEARCH (5) groups of
        [article_date, title_of_news, link_to_news, description_teaser].
        Profile fields that are absent from the page are None
        (description_links is '' instead); news groups beyond the number of
        news rows on the page are filled with None.

    Raises:
        IndexError: if the page has no 'panel-panel panel-col' div, or a news
            row has fewer than four 'views-field' divs or no link in its title.

    Side effects: prints the profile path and sleeps 0.1 s after each page.
    Failed requests are retried indefinitely, with a growing pause in between.
    """
    # Join base and path with exactly one slash; the listing hrefs start with
    # '/', which previously produced 'http://host//faculty/...'.
    url = '{}/{}'.format(FACULTY_PROFILE_BASE_URL.rstrip('/'),
                         faculty_profile_url.lstrip('/'))

    retry_delay = 1.0
    while True:
        try:
            # The timeout turns a stalled connection into a retry instead of a hang.
            r = requests.get(url, timeout=30)
            break
        except requests.exceptions.RequestException:
            # Only network/HTTP-layer failures are retried, so Ctrl-C and
            # programming errors still propagate.
            print('{} request failed. Trying again.'.format(faculty_profile_url))
            time.sleep(retry_delay)
            retry_delay = min(retry_delay * 2, 60.0)
    soup = BeautifulSoup(r.content)
    info_div = soup.find_all('div', {'class':'panel-panel panel-col'})[0]

    def get_field_text(class_name):
        """Text of the first 'field-item' inside the first div with `class_name`, or None."""
        try:
            return info_div.find_all('div', {'class':class_name})[0] \
                    .find_all('div', {'class':'field-item'})[0].text
        except IndexError:
            # The field is not present on this profile.
            return None

    def get_field_links(class_name):
        """hrefs of the links in that same 'field-item', or None if it cannot be read."""
        try:
            return [a['href'] for a in info_div.find_all('div', {'class':class_name})[0] \
                    .find_all('div', {'class':'field-item'})[0].find_all('a')]
        except (IndexError, KeyError):
            # Field missing, or an anchor without an href attribute.
            return None

    title_name = get_field_text('panel-pane pane-entity-field pane-node-field-title')
    faculty_site_url = get_field_text('field field-name-field-faculty-url field-type-link-field field-label-above')
    lab_url = get_field_text('panel-pane pane-entity-field pane-node-field-laboratory-url')
    faculty_email = get_field_text('panel-pane pane-entity-field pane-node-field-email')
    description = get_field_text('field field-name-body field-type-text-with-summary field-label-hidden')
    l_description_links = get_field_links('field field-name-body field-type-text-with-summary field-label-hidden')
    if l_description_links is None:
        description_links = ''
    else:
        description_links = ','.join(l_description_links)

    # News items attached to the profile; stays an empty list when the pane is absent.
    research_rows = info_div.find_all('div',
                                      {'class':'panel-pane pane-views-panes pane-views-for-news-with-faculty-panel-pane-1'})
    if len(research_rows) != 0:
        research_rows = research_rows[0] \
            .find_all('div', {'class':'views-row'})

    # Always emit exactly PAST_N_RESEARCH groups so every row has the same width.
    PAST_N_RESEARCH = 5
    research_row = []
    for i in range(PAST_N_RESEARCH):
        if i >= len(research_rows):
            research_row.extend([None, None, None, None])
        else:
            # Fields 1-3 hold date, linked title and teaser; field 0 is not used.
            research_fields = research_rows[i].find_all('div', {'class':'views-field'})
            article_date = research_fields[1].text
            title_of_news = research_fields[2].text
            link_to_news = research_fields[2].find_all('a')[0]['href']
            description_teaser = research_fields[3].text
            research_row.extend([article_date, title_of_news, link_to_news, description_teaser])

    row = [title_name, faculty_site_url, lab_url, faculty_email, description, description_links]
    row.extend(research_row)

    # Progress log plus a short pause between profile requests.
    print(faculty_profile_url)
    time.sleep(0.1)
    return row
    
    
# scrape_profile('/faculty/nir-yosef')

In [104]:
# Scrape every profile page in listing order: one HTTP request per faculty member.
# The result is a Series whose values are the row lists returned by scrape_profile.
profile_specific_rows = df['faculty_profile_url'].apply(scrape_profile)

/faculty/david-aaker
/faculty/pieter-abbeel
/faculty/elizabeth-abel
/faculty/dor-abrahamson
/faculty/norman-abrahamson
/faculty/richard-abrams
/faculty/barbara-abrams
/faculty/kathryn-abrams
/faculty/charisma-acey
/faculty/david-ackerly
/faculty/daniel-acland
/faculty/paul-adams
/faculty/anthony-adams
/faculty/john-w-addison-jr
/faculty/ilan-adler
/faculty/mina-aganagic
/faculty/sabrina-c-agarwal
/faculty/vinod-k-aggarwal
/faculty/alice-m-agogino
/faculty/adrian-aguilera
/faculty/jennifer-ahern
/faculty/shahwali-ahmadi
/faculty/asad-q-ahmed
/faculty/david-ahn
/faculty/george-akerlof
/faculty/m-reza-alam
/faculty/catherine-albiston
/faculty/david-aldous
/faculty/ronelle-alexander
/faculty/paul-alivisatos
/faculty/richard-allen
/faculty/barbara-allen-diaz
/faculty/rodrigo-almeida
/faculty/elad-alon
/faculty/nezar-alsayyad
/faculty/robert-alter
/faculty/miguel-altieri
/faculty/charles-altieri
/faculty/joel-altman
/faculty/lisa-alvarez-cohen
/faculty/bruce-n-ames
/faculty/genevieve-ames
/f

In [110]:
# Column names for the rows returned by scrape_profile: six profile fields,
# then four news fields for each of the five news slots (suffix _1 .. _5).
profile_headers = [
    'title_name',
    'faculty_site_url',
    'lab_url',
    'faculty_email',
    'description',
    'description_links',
] + list(
    '{}_{}'.format(news_field, slot)
    for slot in range(1, 6)
    for news_field in ('article_date', 'title_of_news', 'link_to_news', 'description_teaser')
)

In [123]:
# Expand the Series of per-profile row lists into a table with one named column per field.
profile_df = pd.DataFrame(data=list(profile_specific_rows.values), columns=profile_headers)

In [125]:
profile_df.head()

Unnamed: 0,title_name,faculty_site_url,lab_url,faculty_email,description,description_links,article_date_1,title_of_news_1,link_to_news_1,description_teaser_1,...,link_to_news_3,description_teaser_3,article_date_4,title_of_news_4,link_to_news_4,description_teaser_4,article_date_5,title_of_news_5,link_to_news_5,description_teaser_5
0,Professor of Marketing and Public Policy,http://www.haas.berkeley.edu/faculty/aaker.html,http://groups.haas.berkeley.edu/marketing/,aaker@haas.berkeley.edu,,,,,,,...,,,,,,,,,,
1,Professor,http://www.cs.berkeley.edu/~pabbeel,,pabbeel@cs.berkeley.edu,Robotics and Machine Learning.,,"February 22, 2016",“Deep Learning”: A Giant Step for Robots,/bakarfellows/profile/pieter_abbeel,Bakar Fellow Pieter Abbeel studies deep lear...,...,/news/new-deep-learning-technique-enables-robo...,UC Berkeley researchers have developed algor...,"December 17, 2012",Big NSF grant funds research into training ro...,/news/big-nsf-grant-funds-research-training-ro...,"What if robots and humans, working together,...","August 23, 2011",UC Berkeley robotics expert named among world...,/news/uc-berkeley-robotics-expert-named-among-...,"Pieter Abbeel, a UC Berkeley, professor know..."
2,Professor of English,http://english.berkeley.edu/profiles/5,,eabel@uclink.berkeley.edu,Elizabeth Abel's general research interest is...,,"February 15, 2011","Jim Crow signs as symbols of subjugation, tro...",/news/jim-crow-signs-symbols-subjugation-troph...,"In the mid 1960s, landmark laws brought an o...",...,,,,,,,,,,
3,Associate Professor of Cognition and Development,http://gse.berkeley.edu/people/dor-abrahamson,http://edrl.berkeley.edu/,dor@berkeley.edu,Dor Abrahamson studies the process of mathema...,,,,,,...,,,,,,,,,,
4,Adjunct Professor of Civil and Environmental E...,http://www.ce.berkeley.edu/faculty/faculty.php...,,naa3@earthlink.net,,,,,,,...,,,,,,,,,,


In [133]:
# Work on a copy so the listing-only frame `df` is left unchanged.
combined_df = df.copy()

In [134]:
# Add every profile column to the combined frame. Column assignment aligns on
# the index; both frames were built from plain lists and so carry the default
# 0..n-1 index, which makes the rows match by position.
for col in profile_df.columns:
    combined_df[col] = profile_df[col]

In [135]:
combined_df.head()

Unnamed: 0,faculty_name,faculty_profile_url,l_expertise,department,title_name,faculty_site_url,lab_url,faculty_email,description,description_links,...,link_to_news_3,description_teaser_3,article_date_4,title_of_news_4,link_to_news_4,description_teaser_4,article_date_5,title_of_news_5,link_to_news_5,description_teaser_5
0,David A. Aaker,/faculty/david-aaker,"business,marketing,branding",Haas School of Business,Professor of Marketing and Public Policy,http://www.haas.berkeley.edu/faculty/aaker.html,http://groups.haas.berkeley.edu/marketing/,aaker@haas.berkeley.edu,,,...,,,,,,,,,,
1,Pieter Abbeel,/faculty/pieter-abbeel,"robotics,machine learning",Division of Computer Science/EECS,Professor,http://www.cs.berkeley.edu/~pabbeel,,pabbeel@cs.berkeley.edu,Robotics and Machine Learning.,,...,/news/new-deep-learning-technique-enables-robo...,UC Berkeley researchers have developed algor...,"December 17, 2012",Big NSF grant funds research into training ro...,/news/big-nsf-grant-funds-research-training-ro...,"What if robots and humans, working together,...","August 23, 2011",UC Berkeley robotics expert named among world...,/news/uc-berkeley-robotics-expert-named-among-...,"Pieter Abbeel, a UC Berkeley, professor know..."
2,Elizabeth Abel,/faculty/elizabeth-abel,"feminist theory,psychoanalysis,Virginia Woolf,...",Department of English,Professor of English,http://english.berkeley.edu/profiles/5,,eabel@uclink.berkeley.edu,Elizabeth Abel's general research interest is...,,...,,,,,,,,,,
3,Dor Abrahamson,/faculty/dor-abrahamson,"mathematical cognition,design-based research,m...",Graduate School of Education,Associate Professor of Cognition and Development,http://gse.berkeley.edu/people/dor-abrahamson,http://edrl.berkeley.edu/,dor@berkeley.edu,Dor Abrahamson studies the process of mathema...,,...,,,,,,,,,,
4,Norman Abrahamson,/faculty/norman-abrahamson,"civil and environmental engineering,earthquake...",Department of Civil and Environmental Engineering,Adjunct Professor of Civil and Environmental E...,http://www.ce.berkeley.edu/faculty/faculty.php...,,naa3@earthlink.net,,,...,,,,,,,,,,


In [136]:
# Write the combined table to <SCRAPED_DATA_FILE_LOCATION>/faculty_profiles.csv.
# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing (e.g. on a fresh checkout).
output_path = scraped_file_loc('faculty_profiles')
output_dir = os.path.dirname(output_path)
if output_dir and not os.path.isdir(output_dir):
    os.makedirs(output_dir)

# '~' is the field separator; values that themselves contain '~' (such as
# '.../~pabbeel' URLs) are quoted by to_csv's default minimal quoting.
combined_df.to_csv(output_path, sep='~', index=False, encoding='utf-8')

In [137]:
# Read the file back with the same '~' separator; presumably a check that the
# export parses again.
pd.read_csv(scraped_file_loc('faculty_profiles'), sep='~')

Unnamed: 0,faculty_name,faculty_profile_url,l_expertise,department,title_name,faculty_site_url,lab_url,faculty_email,description,description_links,...,link_to_news_3,description_teaser_3,article_date_4,title_of_news_4,link_to_news_4,description_teaser_4,article_date_5,title_of_news_5,link_to_news_5,description_teaser_5
0,David A. Aaker,/faculty/david-aaker,"business,marketing,branding",Haas School of Business,Professor of Marketing and Public Policy,http://www.haas.berkeley.edu/faculty/aaker.html,http://groups.haas.berkeley.edu/marketing/,aaker@haas.berkeley.edu,,,...,,,,,,,,,,
1,Pieter Abbeel,/faculty/pieter-abbeel,"robotics,machine learning",Division of Computer Science/EECS,Professor,http://www.cs.berkeley.edu/~pabbeel,,pabbeel@cs.berkeley.edu,Robotics and Machine Learning.,,...,/news/new-deep-learning-technique-enables-robo...,UC Berkeley researchers have developed algor...,"December 17, 2012",Big NSF grant funds research into training ro...,/news/big-nsf-grant-funds-research-training-ro...,"What if robots and humans, working together,...","August 23, 2011",UC Berkeley robotics expert named among world...,/news/uc-berkeley-robotics-expert-named-among-...,"Pieter Abbeel, a UC Berkeley, professor know..."
2,Elizabeth Abel,/faculty/elizabeth-abel,"feminist theory,psychoanalysis,Virginia Woolf,...",Department of English,Professor of English,http://english.berkeley.edu/profiles/5,,eabel@uclink.berkeley.edu,Elizabeth Abel's general research interest is...,,...,,,,,,,,,,
3,Dor Abrahamson,/faculty/dor-abrahamson,"mathematical cognition,design-based research,m...",Graduate School of Education,Associate Professor of Cognition and Development,http://gse.berkeley.edu/people/dor-abrahamson,http://edrl.berkeley.edu/,dor@berkeley.edu,Dor Abrahamson studies the process of mathema...,,...,,,,,,,,,,
4,Norman Abrahamson,/faculty/norman-abrahamson,"civil and environmental engineering,earthquake...",Department of Civil and Environmental Engineering,Adjunct Professor of Civil and Environmental E...,http://www.ce.berkeley.edu/faculty/faculty.php...,,naa3@earthlink.net,,,...,,,,,,,,,,
5,Richard Abrams,/faculty/richard-abrams,"politics,recent U.S. history: business foreign...",Department of History,"Professor of the Graduate School, Department o...",http://history.berkeley.edu/people/richard-m-a...,,abramsr@berkeley.edu,"In addition to recent U.S. history, I lecture...",,...,,,,,,,,,,
6,Barbara Abrams,/faculty/barbara-abrams,"obesity,maternal and child health,epidemiology...",School of Public Health,Professor,http://sph.berkeley.edu/barbara-abrams-drph-rd,,babrams@berkeley.edu,My work focuses on the inter-relationships be...,,...,/news/two-uc-berkeley-faculty-named-institute-...,"Barbara Abrams, professor of epidemiology an...",,,,,,,,
7,Kathryn Abrams,/faculty/kathryn-abrams,"law,feminist jurisprudence,voting rights,const...",Boalt Hall School of Law,Professor of Law,http://www.law.berkeley.edu/faculty/profiles/f...,,krabrams@law.berkeley.edu,"Before entering academia, Kathy Abrams clerke...",,...,,,,,,,,,,
8,Charisma Acey,/faculty/charisma-acey,"water,sanitation,basic services delivery,pover...",Department of City & Regional Planning,Assistant Professor,http://ced.berkeley.edu/ced/faculty-staff/char...,,charisma.acey@berkeley.edu,,,...,,,,,,,,,,
9,David Ackerly,/faculty/david-ackerly,"california biodiversity,climate change,adaptation",Department of Integrative Biology,Professor of Integrative Biology,,http://ib.berkeley.edu/labs/ackerly/,dackerly@berkeley.edu,,,...,/news/warmer-drier-climate-altering-forests-st...,Historical California vegetation data that m...,"July 18, 2014",Scientists enlist big data to guide conservat...,/news/scientists-enlist-big-data-guide-conserv...,Despite a deluge of new information about th...,"July 28, 2011",Warming climate could give exotic grasses edg...,/news/warming-climate-could-give-exotic-grasse...,"With rising temperatures around the globe, C..."


In [138]:
combined_df

Unnamed: 0,faculty_name,faculty_profile_url,l_expertise,department,title_name,faculty_site_url,lab_url,faculty_email,description,description_links,...,link_to_news_3,description_teaser_3,article_date_4,title_of_news_4,link_to_news_4,description_teaser_4,article_date_5,title_of_news_5,link_to_news_5,description_teaser_5
0,David A. Aaker,/faculty/david-aaker,"business,marketing,branding",Haas School of Business,Professor of Marketing and Public Policy,http://www.haas.berkeley.edu/faculty/aaker.html,http://groups.haas.berkeley.edu/marketing/,aaker@haas.berkeley.edu,,,...,,,,,,,,,,
1,Pieter Abbeel,/faculty/pieter-abbeel,"robotics,machine learning",Division of Computer Science/EECS,Professor,http://www.cs.berkeley.edu/~pabbeel,,pabbeel@cs.berkeley.edu,Robotics and Machine Learning.,,...,/news/new-deep-learning-technique-enables-robo...,UC Berkeley researchers have developed algor...,"December 17, 2012",Big NSF grant funds research into training ro...,/news/big-nsf-grant-funds-research-training-ro...,"What if robots and humans, working together,...","August 23, 2011",UC Berkeley robotics expert named among world...,/news/uc-berkeley-robotics-expert-named-among-...,"Pieter Abbeel, a UC Berkeley, professor know..."
2,Elizabeth Abel,/faculty/elizabeth-abel,"feminist theory,psychoanalysis,Virginia Woolf,...",Department of English,Professor of English,http://english.berkeley.edu/profiles/5,,eabel@uclink.berkeley.edu,Elizabeth Abel's general research interest is...,,...,,,,,,,,,,
3,Dor Abrahamson,/faculty/dor-abrahamson,"mathematical cognition,design-based research,m...",Graduate School of Education,Associate Professor of Cognition and Development,http://gse.berkeley.edu/people/dor-abrahamson,http://edrl.berkeley.edu/,dor@berkeley.edu,Dor Abrahamson studies the process of mathema...,,...,,,,,,,,,,
4,Norman Abrahamson,/faculty/norman-abrahamson,"civil and environmental engineering,earthquake...",Department of Civil and Environmental Engineering,Adjunct Professor of Civil and Environmental E...,http://www.ce.berkeley.edu/faculty/faculty.php...,,naa3@earthlink.net,,,...,,,,,,,,,,
5,Richard Abrams,/faculty/richard-abrams,"politics,recent U.S. history: business foreign...",Department of History,"Professor of the Graduate School, Department o...",http://history.berkeley.edu/people/richard-m-a...,,abramsr@berkeley.edu,"In addition to recent U.S. history, I lecture...",,...,,,,,,,,,,
6,Barbara Abrams,/faculty/barbara-abrams,"obesity,maternal and child health,epidemiology...",School of Public Health,Professor,http://sph.berkeley.edu/barbara-abrams-drph-rd,,babrams@berkeley.edu,My work focuses on the inter-relationships be...,,...,/news/two-uc-berkeley-faculty-named-institute-...,"Barbara Abrams, professor of epidemiology an...",,,,,,,,
7,Kathryn Abrams,/faculty/kathryn-abrams,"law,feminist jurisprudence,voting rights,const...",Boalt Hall School of Law,Professor of Law,http://www.law.berkeley.edu/faculty/profiles/f...,,krabrams@law.berkeley.edu,"Before entering academia, Kathy Abrams clerke...",,...,,,,,,,,,,
8,Charisma Acey,/faculty/charisma-acey,"water,sanitation,basic services delivery,pover...",Department of City & Regional Planning,Assistant Professor,http://ced.berkeley.edu/ced/faculty-staff/char...,,charisma.acey@berkeley.edu,,,...,,,,,,,,,,
9,David Ackerly,/faculty/david-ackerly,"california biodiversity,climate change,adaptation",Department of Integrative Biology,Professor of Integrative Biology,,http://ib.berkeley.edu/labs/ackerly/,dackerly@berkeley.edu,,,...,/news/warmer-drier-climate-altering-forests-st...,Historical California vegetation data that m...,"July 18, 2014",Scientists enlist big data to guide conservat...,/news/scientists-enlist-big-data-guide-conserv...,Despite a deluge of new information about th...,"July 28, 2011",Warming climate could give exotic grasses edg...,/news/warming-climate-could-give-exotic-grasse...,"With rising temperatures around the globe, C..."


In [108]:
# Scratch cell: generates the 20 per-news-item column names (four fields for
# each of the slots 1..5) that also appear in profile_headers.
['{}_{}'.format(header, i) for i in range(1,6) for header in ['article_date', 'title_of_news', 'link_to_news', 'description_teaser']]

['article_date_1',
 'title_of_news_1',
 'link_to_news_1',
 'description_teaser_1',
 'article_date_2',
 'title_of_news_2',
 'link_to_news_2',
 'description_teaser_2',
 'article_date_3',
 'title_of_news_3',
 'link_to_news_3',
 'description_teaser_3',
 'article_date_4',
 'title_of_news_4',
 'link_to_news_4',
 'description_teaser_4',
 'article_date_5',
 'title_of_news_5',
 'link_to_news_5',
 'description_teaser_5']