In [116]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
import os
import numpy as np
import pickle

YEAR = 21
terms = [f'FA{YEAR-1}', f'WI{YEAR}', F'SP{YEAR}', F'S1{YEAR}', f'S2{YEAR}', f'S3{YEAR}']

CAPEURL = 'https://cape.ucsd.edu/responses/Results.aspx'
CAPETITLE = 'Course And Professor Evaluations (CAPE)'

with open('all_depts.pick', 'rb') as f:
    all_depts = pickle.load(f)

# taken from BetterCapes
# https://github.com/andportnoy/smartercapes.com/blob/master/tools.py
def get_raw_cape_dataframe(dept:str):

    options = webdriver.ChromeOptions()
    options.add_argument("user-data-dir=C:/Users/kalki/AppData/Local/Google/Chrome/User Data")
    options.add_argument("profile-directory=Profile 7")
    driver = webdriver.Chrome(options=options)
    
    # driver.get(f'https://cape.ucsd.edu/responses/Results.aspx?Name={course[0]}+{course[1]}&CourseNumber=')
    driver.get(f'https://cape.ucsd.edu/responses/Results.aspx?Name=&CourseNumber={dept}')
    wait = WebDriverWait(driver, 60)
    element = wait.until(expected_conditions.title_contains(CAPETITLE))
    # read in the dataset from the html file
    df = pd.read_html(driver.page_source)[0]
    driver.quit()

    return df

def clean_df(df, dept):
    df = (
        df[
            ['Instructor', 'Course', 'Term', 'Rcmnd Class',
            'Rcmnd Instr', 'Avg Grade Expected',
            'Avg Grade Received', 'Enroll']
        ]
        .dropna()
        .assign(Course = df.get('Course').str.split(' - ').apply(lambda x: x[0]))
    )

    yearly = df[df.Term.isin(terms)].groupby('Course').sum()
    def get_yearly_students(course):
        if course not in yearly.index:
            return 0
        return yearly.get('Enroll').loc[course]

    df = (df
        .assign(
            GPA=(df.get('Avg Grade Received')
                .str.split('(')
                .apply(lambda x : x[-1])
                .str.rstrip(")")
                .astype('float'))
        )
    )
    df = df.assign(total_grade_points = df.get('GPA')*df.get('Enroll'))
    df = df.groupby('Course').sum().reset_index()
    df = (df
        .assign(yearly_num=df.get('Course').apply(get_yearly_students))
        .assign(GPA = df.get('total_grade_points')/df.get('Enroll'))
        .drop(columns=['total_grade_points', 'Enroll'])
        .assign(dept=df.get('Course').str.split(' ').apply(lambda x: x[0]))
        .assign(num=df.get('Course').str.split(' ').apply(lambda x: x[1]))
    )

    df = df[df.get('dept') == dept]

    def remove_str(s:str):
        if s[-1].isalpha():
            return remove_str(s[:-1])
        return s
    df = df.assign(num = df.get('num').apply(remove_str).astype("int"))
    df = df.sort_values(by=['num', 'Course']).set_index('Course').reset_index()
    df = df.assign(index=df.index).set_index('Course')
    return df

In [121]:
dfs = []
for dept in all_depts:
    if os.path.exists(f'depts/{dept}.csv'):
        clean = pd.read_csv(f'depts/{dept}.csv')
    else:
        raw = get_raw_cape_dataframe(dept)
        clean = clean_df(raw, dept)
        clean.to_csv(f'depts/{dept}.csv')
        
    dfs.append(clean)

data = pd.concat(dfs)
data.to_csv('data.csv')

In [122]:
data

Unnamed: 0,Course,GPA,yearly_num,dept,num,index
0,AAS 10,3.651734,199,AAS,10,0
1,AAS 190,4.000000,83,AAS,190,1
0,ANAR 100,3.798909,0,ANAR,100,0
1,ANAR 111,3.293224,0,ANAR,111,1
2,ANAR 114,3.760000,0,ANAR,114,2
...,...,...,...,...,...,...
123,VIS 185,3.558923,0,VIS,185,123
124,VIS 194S,3.057370,0,VIS,194,124
0,WCWP 10A,3.229711,846,WCWP,10,0
1,WCWP 10B,3.329744,895,WCWP,10,1


In [113]:
df = clean_df(get_raw_cape_dataframe('ENG'), 'ENG')

  yearly = df[df.Term.isin(terms)].groupby('Course').sum()
  df = df.groupby('Course').sum().reset_index()


In [114]:
df

Unnamed: 0_level_0,GPA,yearly_num,dept,num,index
Course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENG 10,3.884957,26,ENG,10,0
ENG 20,3.860385,24,ENG,20,1
ENG 100,3.573858,0,ENG,100,2
ENG 100A,3.640548,0,ENG,100,3
ENG 100B,3.943299,0,ENG,100,4
ENG 100D,3.74108,206,ENG,100,5
ENG 100L,3.915177,0,ENG,100,6
