## UC Berkeley CS Faculty Web Scraping

- URL: https://www2.eecs.berkeley.edu/Faculty/Lists/CS/faculty.html
- The following is a web scraping of the UC Berkelkey CS Professor's: Name, Position, Education, and Research
- Websites Used to Learn: https://www.dataquest.io/blog/web-scraping-beautifulsoup/, https://realpython.com/python-web-scraping-practical-introduction/
- To Do: Clean Education Columns

In [1]:
from requests.exceptions import RequestException
from requests import get
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd

**Code here for future reference when building automation for web scraping professor/grad students info**
- Website: https://realpython.com/python-web-scraping-practical-introduction/

In [2]:
def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)

In [3]:
raw_html = simple_get('https://www2.eecs.berkeley.edu/Faculty/Lists/CS/faculty.html')
type(raw_html) #bytes

bytes

**Code for Pulling From UC Berkeley EECS Website**

In [13]:
from requests import get
response = get('https://www2.eecs.berkeley.edu/Faculty/Lists/CS/faculty.html')
html = BeautifulSoup(response.text, 'html.parser')
type(html) #bs4.BeautifulSoup
html

<!DOCTYPE html>

<html dir="ltr" version="XHTML+RDFa 1.0" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
<head profile="http://www.w3.org/1999/xhtml/vocab">
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<link href="https://eecs.berkeley.edu/sites/all/themes/eecs/kalastatic/build/images/favicon.ico" rel="shortcut icon" type="image/vnd.microsoft.icon"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<link href="https://eecs.berkeley.edu/sites/all/themes/eecs/favicon.ico" rel="shortcut icon" type="image/vnd.microsoft.icon"/>
<link href="https://eecs.berkeley.edu/sites/all/themes/eecs/kalastatic/build/images/touch-icon.png" rel="apple-touch-icon"/>
<link href="https://eecs.berkeley.edu/sites/all/themes/eecs/kalastatic/build/images/favicon.png" rel="icon"/>
<meta content="/Assets/touch-icon.png" name="msapplication-TileImage"/>
<meta content="#003262" name="msapplicat

In [5]:
def get_education(professor):
    """
    Gets BS/BA, MS, and PhD degrees of a Professor
    """
       
        
    edu_lst = professor.find("strong", text="Education:").next_sibling.replace('.', '').lower().split(';')
    empty_bs_ba, empty_ms_ma, empty_phd = (0 for i in range(3))
    test1, test2, test3 = ([] for i in range(3))


    for edu_single in edu_lst: 

        if any(s in edu_single for s in ['ba', 'bs']):
            test1.append(edu_single) 
            empty_bs_ba += 1

        if any(s in edu_single for s in ['ms', 'ma']):
            test2.append(edu_single)
            empty_ms_ma += 1

        if ('phd')  in edu_single:
            test3.append(edu_single)
            empty_phd += 1


    if empty_bs_ba == 0:
        test1.append('NA')

    if empty_ms_ma == 0:
        test2.append('NA')

    if empty_phd == 0:
        test3.append('NA')



    return test1, test2, test3



In [10]:
def getInfo(name_lst, position_lst, research_lst, bs_ba_lst, ms_lst, phd_lst, html_bs4):
    
    """
    Generates DF with the columns above for each professor. If column value not available for a professor on website,
    automatically fills with NA
    """
    
    for professor in professors:
        
        name = professor.select_one('a[href*=Faculty]').text
        name_lst.append(name)
        
        position = professor.find('strong').text
        position_lst.append(position)
        
        research = [topic.text for topic in professor.select('a[href*=Research]')]
        research_lst.append(research)
        

        if professor.find("strong", text="Education:"): 
            sub1, sub2, sub3 = get_education(professor)
            bs_ba_lst.append(sub1)
            ms_lst.append(sub2)
            phd_lst.append(sub3)
        else:
            bs_ba_lst.append('N/A')
            ms_lst.append('N/A')
            phd_lst.append('N/A')

        

    df = pd.DataFrame(
            {'Name': name_lst,
             'Position': position_lst,
             'Research': research_lst,
             'BS/BA': bs_ba_lst,
             'MS': ms_lst,
             'PhD': phd_lst
            })
        
    
    return df
        
professors = html.findAll("div", {"class": "media-body"})
df = getInfo(name_lst=[], position_lst=[], research_lst=[], bs_ba_lst=[], ms_lst=[], phd_lst=[], html_bs4=professors)
df.head()

Unnamed: 0,Name,Position,Research,BS/BA,MS,PhD
0,Pieter Abbeel,Professor,"[Artificial Intelligence (AI), Control, Intell...",[NA],"[ 2000, ms, electrical engineering, ku leuven,...","[ \n 2008, phd, computer sc..."
1,Krste AsanoviÄ,Professor,"[Computer Architecture & Engineering (ARC), In...","[ 1987, ba, electrical and information science...","[ 1987, ba, electrical and information science...","[ \n 1998, phd, computer sc..."
2,Babak Ayazifar,Teaching Professor,"[Education (EDUC), Signal Processing (SP)]","[ 1989, bs, electrical engineering, caltech\n ...","[ \n 2003, phd, electrical ...","[ \n 2003, phd, electrical ..."
3,Jonathan Bachrach,Adjunct Assistant Professor,"[Programming Systems (PS), Computer Architectu...","[ 1985, bs, computer engineering and cognitive...","[ \n 1992, phd, computer sc...","[ \n 1992, phd, computer sc..."
4,Ruzena Bajcsy,Professor,"[Artificial Intelligence (AI), Biosystems & Co...",[NA],"[ 1957, ms, electrical engineering, slovak tec...","[ \n 1972, phd, computer sc..."


In [12]:
df.to_csv('UCBfaculty.csv')

In [7]:
df['BS/BA'] = df['BS/BA'].replace('\n  ', '')#can split by \n and then it makes it into separate parts of list

In [8]:
df["new_column"] = df['MS'].str.replace('[^\w\s]','')