# Penn Courses Analysis


In [1]:
from bs4 import BeautifulSoup

def _extract_dept_code(label):
    """Return the department code from a catalog link label.

    Labels are formatted like "Statistics (STAT)". The code is taken from the
    last parenthesised group and stripped of surrounding whitespace; codes are
    not always four letters (e.g. "CIS", "BE").

    Returns None when the label has no non-empty "(...)" group.
    """
    _, open_paren, tail = label.rpartition('(')
    code, close_paren, _ = tail.partition(')')
    code = code.strip()
    if not (open_paren and close_paren and code):
        return None
    return code


# Open the saved catalog page that lists all department codes. The file is read
# as bytes so that BeautifulSoup detects the document encoding itself instead of
# relying on the platform's default text encoding.
with open("penn-course-catalog.html", "rb") as file:
    soup = BeautifulSoup(file, "html.parser")

# The department lists live inside the right-hand column of the page.
catalog_div = soup.find("div", {"id": "right-col"})
if catalog_div is None:
    raise ValueError(
        'Could not find <div id="right-col"> in penn-course-catalog.html'
    )

# Find all department codes: every <ul> that comes after a letter-navigation
# heading holds one <li> per department.
dept_codes = list()
for dept_code_ul in catalog_div.find_all(
    lambda x: x.name == "ul" and x.find_previous("h2", class_="letternav-head")
):
    for dept_code_li in dept_code_ul.find_all("li"):
        dept_link = dept_code_li.find("a")
        if dept_link is None:
            # list item without a link: not a department entry
            continue
        dept_code_str = dept_link.text
        dept_code = _extract_dept_code(dept_code_str)
        if dept_code is None:
            # report the entry instead of failing the whole cell on it
            print('Skipping catalog entry without a department code: ' + dept_code_str)
            continue
        dept_codes.append(dept_code)

print('All department codes:')
print(dept_codes)

All department codes:
['ACFD', 'ACCT', 'AFRC', 'ASLD', 'AMHR', 'ANAT', 'ANCH', 'ANEL', 'ANTH', 'AMCS', 'APOP', 'ARAB', 'ARCH', 'AAMW', 'ARTH', 'ASAM', 'ALAN', 'ASTR', 'BAAS', 'BDS', 'BENG', 'BENF', 'BMB', 'BCHE', 'BE', 'BIOE', 'BIOL', 'BIOM', 'BMIN', 'BSTA', 'BIOT', 'BCS', 'BEPP', 'CAMB', 'CBE', 'CHEM', 'CHIC', 'CHIN', 'CINM', 'CIMS', 'CPLN', 'CLST', 'CLSC', 'CLCH', 'COGS', 'COLL', 'COMM', 'COML', 'CIS', 'CIT', 'CRWR', 'CRIM', 'CZCH', 'DATA', 'DATS', 'DEMG', 'DCOH', 'DENT', 'DADE', 'GEND', 'GOMD', 'GORT', 'GPED', 'GPRD', 'GPRS', 'DOMD', 'DOSP', 'DRST', 'DSGN', 'DIGC', 'DTCH', 'EESC', 'EALC', 'ECON', 'EDUC', 'EDEN', 'EDHE', 'EDPR', 'EDME', 'EDMC', 'EDCL', 'EDSC', 'EDSL', 'EDTC', 'EDTF', 'ESE', 'ENMG', 'EAS', 'ENGR', 'ENM', 'ENGL', 'ENLT', 'ENVS', 'EPID', 'ETHC', 'FILP', 'FNCE', 'FNAR', 'FRSM', 'FREN', 'GSWS', 'GENC', 'GCB', 'GRMN', 'GMPA', 'GLBS', 'GAFL', 'GAS', 'GREK', 'GUJR', 'HSOC', 'HCIN', 'HCMG', 'HPR', 'HQS', 'HEBR', 'HIND', 'HSPV', 'HSSC', 'HIST', 'HUNG', 'IGBO', 'IMUN', 'INDO', 

In [3]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

# Seconds to pause after the very first page load. The original code tracks
# this with a flag named `login`, so this is presumably time for a manual
# sign-in in the opened browser window -- confirm.
LOGIN_WAIT_SECONDS = 20
# Seconds to pause after every later page load before reading the page source.
PAGE_LOAD_WAIT_SECONDS = 4
# A row must have at least this many cells to carry all four ratings
# (code, name, course quality, instructor quality, difficulty, work required).
MIN_CELLS_WITH_REVIEWS = 6

options = webdriver.ChromeOptions()
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)

all_courses = list()
login = True

try:
    for dept in dept_codes:
        dept_url = f'https://penncoursereview.com/department/{dept}'
        driver.get(dept_url)
        if login:
            time.sleep(LOGIN_WAIT_SECONDS)
            login = False
        else:
            time.sleep(PAGE_LOAD_WAIT_SECONDS)

        # Parse whatever the browser has rendered after the wait.
        dept_page = driver.page_source
        dept_soup = BeautifulSoup(dept_page, 'html.parser')
        for course_row in dept_soup.find_all('div', class_='rt-tr-group'):
            course_cells = course_row.find_all('div', class_='rt-td')
            if not course_cells:
                # row group without any cells: nothing to read, not even a code
                continue
            course_code = course_cells[0].text
            if len(course_cells) < MIN_CELLS_WITH_REVIEWS:
                # no course review data available
                print('Course review data not available for ' + course_code)
                continue
            course_dict = {
                'code': course_code,
                'name': course_cells[1].text,
                'course_quality': course_cells[2].text,
                'instructor_quality': course_cells[3].text,
                'difficulty': course_cells[4].text,
                'work_required': course_cells[5].text,
            }
            all_courses.append(course_dict)
            print(f'Data scraped for {course_code}: ', course_dict)
finally:
    # Always close the browser, even if scraping fails or is interrupted,
    # so no Chrome/chromedriver process is left running.
    driver.quit()

Course review data not available for ACFD-6000
Course review data not available for ACFD-6010
Course review data not available for ACFD-6020
Course review data not available for ACFD-6030
Data scraped for ACCT-021:  {'code': 'ACCT-021', 'name': 'Intermed Financial Acct', 'course_quality': '2.62', 'instructor_quality': '2.81', 'difficulty': '2.62', 'work_required': '2.71'}
Data scraped for ACCT-022:  {'code': 'ACCT-022', 'name': 'Intermed Financial Acct', 'course_quality': '3.08', 'instructor_quality': '3.13', 'difficulty': '2.97', 'work_required': '3.06'}
Data scraped for ACCT-028:  {'code': 'ACCT-028', 'name': 'Fed Tax Acct/Tax Plan', 'course_quality': '2.85', 'instructor_quality': '2.92', 'difficulty': '3.38', 'work_required': '3.23'}
Data scraped for ACCT-1010:  {'code': 'ACCT-1010', 'name': 'Acct & Financial Report', 'course_quality': '2.39', 'instructor_quality': '2.64', 'difficulty': '2.79', 'work_required': '2.29'}
Data scraped for ACCT-1020:  {'code': 'ACCT-1020', 'name': 'Stra

In [5]:
# Report how many course records the scraping cell collected.
scraped_count = len(all_courses)
print('Number of courses scraped: ', scraped_count)

Number of courses scraped:  14696


In [18]:
import pandas as pd

# Build the table from the scraped records. The columns are listed explicitly
# (in the order the scraper fills them) so the frame keeps the same schema even
# when all_courses is empty; for non-empty input the result is unchanged.
courses_df = pd.DataFrame(
    all_courses,
    columns=[
        'code',
        'name',
        'course_quality',
        'instructor_quality',
        'difficulty',
        'work_required',
    ],
)

In [19]:
# Preview the first 20 scraped rows (before any cleaning).
courses_df.head(n=20)

Unnamed: 0,code,name,course_quality,instructor_quality,difficulty,work_required
0,ACCT-021,Intermed Financial Acct,2.62,2.81,2.62,2.71
1,ACCT-022,Intermed Financial Acct,3.08,3.13,2.97,3.06
2,ACCT-028,Fed Tax Acct/Tax Plan,2.85,2.92,3.38,3.23
3,ACCT-1010,Acct & Financial Report,2.39,2.64,2.79,2.29
4,ACCT-1020,Strategic Cost Analysis,2.15,2.6,2.6,2.11
5,ACCT-201,Financial Accounting I,2.77,3.01,2.91,2.44
6,ACCT-202,Financial Accounting II,2.88,3.09,2.88,2.4
7,ACCT-203,Cost Accounting,2.24,2.77,2.74,2.26
8,ACCT-205,Tax Planning & Administr,3.01,3.2,2.59,1.86
9,ACCT-208,Auditing,2.6,2.86,1.69,1.55


In [20]:
import numpy as np

# Turn cells holding the literal string 'N/A' into NaN, then keep only the rows
# in which no column is missing.
courses_df = courses_df.replace(to_replace='N/A', value=np.nan)
courses_df_clean = courses_df.dropna(axis=0, how='any')

cleaned_count = len(courses_df_clean)
print('Cleaned number of courses: ', cleaned_count)

Cleaned number of courses:  10007


In [21]:
# Preview the first 20 rows that survived cleaning.
courses_df_clean.head(n=20)

Unnamed: 0,code,name,course_quality,instructor_quality,difficulty,work_required
0,ACCT-021,Intermed Financial Acct,2.62,2.81,2.62,2.71
1,ACCT-022,Intermed Financial Acct,3.08,3.13,2.97,3.06
2,ACCT-028,Fed Tax Acct/Tax Plan,2.85,2.92,3.38,3.23
3,ACCT-1010,Acct & Financial Report,2.39,2.64,2.79,2.29
4,ACCT-1020,Strategic Cost Analysis,2.15,2.6,2.6,2.11
5,ACCT-201,Financial Accounting I,2.77,3.01,2.91,2.44
6,ACCT-202,Financial Accounting II,2.88,3.09,2.88,2.4
7,ACCT-203,Cost Accounting,2.24,2.77,2.74,2.26
8,ACCT-205,Tax Planning & Administr,3.01,3.2,2.59,1.86
9,ACCT-208,Auditing,2.6,2.86,1.69,1.55


In [22]:
# Save the cleaned table as CSV; the row index is not written to the file.
courses_df_clean.to_csv(path_or_buf='penn-course-data.csv', index=False)