<a href="https://colab.research.google.com/github/kellianneyang/grades-project/blob/main/preprocessing_and_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Grades Project: Preprocessing and Modeling**

# Preliminary Steps

In [1]:
# import libraries

import warnings
warnings.filterwarnings('ignore')

# general
import numpy as np
import pandas as pd

# preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

# modeling
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, \
GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# evaluation
from sklearn.metrics import r2_score, mean_absolute_error, \
mean_squared_error, mean_absolute_percentage_error

# tuning
from sklearn.model_selection import GridSearchCV

# feature engineering
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn import set_config
set_config(display="text")
from kneed import KneeLocator

In [2]:
# load data
path = 'all_grades_data_cleaned.csv'
df = pd.read_csv(path, index_col = 0)

In [3]:
# inspect
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49012 entries, 12 to 64185
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   section_type       49012 non-null  object 
 1   instructor_id      49012 non-null  object 
 2   facility_code      45521 non-null  object 
 3   start_time         49012 non-null  float64
 4   mon                49012 non-null  bool   
 5   tues               49012 non-null  bool   
 6   wed                49012 non-null  bool   
 7   thurs              49012 non-null  bool   
 8   fri                49012 non-null  bool   
 9   subject_name       49012 non-null  object 
 10  course_name        49012 non-null  object 
 11  a_proportion       49012 non-null  float64
 12  f_proportion       49012 non-null  float64
 13  avg_grade          49012 non-null  float64
 14  year               49012 non-null  int64  
 15  term               49012 non-null  object 
 16  class_length       49

In [4]:
# check for duplicates
df.duplicated().sum()

0

In [5]:
# check for missing values
df.isna().sum()

# 'facility_code' is the only column with missing values; will need to impute

section_type            0
instructor_id           0
facility_code        3491
start_time              0
mon                     0
tues                    0
wed                     0
thurs                   0
fri                     0
subject_name            0
course_name             0
a_proportion            0
f_proportion            0
avg_grade               0
year                    0
term                    0
class_length            0
total_time              0
weekend                 0
course_difficulty       0
dtype: int64

In [6]:
# binary-encode days of week columns

binary_cols = ['mon', 'tues', 'wed', 'thurs', 'fri', 'weekend']

for col in binary_cols:
    df[col].replace({True: 1, False: 0}, inplace = True)
    df[col].replace({True: 1, False: 0}, inplace = True)

# check
for col in binary_cols:
    print(df[col].value_counts())
    print(df[col].value_counts())

0    26559
1    22453
Name: mon, dtype: int64
0    26559
1    22453
Name: mon, dtype: int64
0    27148
1    21864
Name: tues, dtype: int64
0    27148
1    21864
Name: tues, dtype: int64
0    27180
1    21832
Name: wed, dtype: int64
0    27180
1    21832
Name: wed, dtype: int64
0    28022
1    20990
Name: thurs, dtype: int64
0    28022
1    20990
Name: thurs, dtype: int64
0    36491
1    12521
Name: fri, dtype: int64
0    36491
1    12521
Name: fri, dtype: int64
0    48747
1      265
Name: weekend, dtype: int64
0    48747
1      265
Name: weekend, dtype: int64


# Preprocessing

## Target: avg_grade

Here I assign variables to the three possible target columns in the dataset: 'a_proportion', 'f_proportion', and 'avg_grade'. I will leave aside 'a_proportion' and 'f_proportion' for now, and focus on 'avg_grade' for modeling.

In [7]:
# assign X and y
target_a = 'a_proportion'
target_f = 'f_proportion'
target_avg = 'avg_grade'

y_a = df[target_a]
y_f = df[target_f]
y_avg = df[target_avg]

X_a = df.drop(columns = [target_a, target_f, target_avg])
X_f = X_a.copy()
X_avg = X_a.copy()

# check
print(f"y_a: \n{y_a}")
print(f"y_f: \n{y_f}")
print(f"y_avg: \n{y_avg}")
print(f"X_a: \n{X_a}")
print(f"X_f: \n{X_f}")
print(f"X_avg: \n{X_avg}")

y_a: 
12       0.250000
13       0.230769
14       0.153846
15       0.500000
16       0.363636
           ...   
64181    0.111111
64182    0.209877
64183    0.261364
64184    0.225000
64185    0.611111
Name: a_proportion, Length: 49012, dtype: float64
y_f: 
12       0.000000
13       0.000000
14       0.000000
15       0.000000
16       0.000000
           ...   
64181    0.012346
64182    0.000000
64183    0.000000
64184    0.012500
64185    0.055556
Name: f_proportion, Length: 49012, dtype: float64
y_avg: 
12       3.625000
13       3.346154
14       3.500000
15       3.750000
16       3.454545
           ...   
64181    3.000000
64182    3.308642
64183    3.210227
64184    3.087500
64185    3.583333
Name: avg_grade, Length: 49012, dtype: float64
X_a: 
      section_type instructor_id facility_code  start_time  mon  tues  wed  \
12             lec         other           NaN        -1.0    0     0    0   
13             lec         other          0545       660.0    0     1    0   

In [8]:
# validate model with train/test split
X_a_train, X_a_test, y_a_train, y_a_test = train_test_split(X_a, y_a, random_state = 42)
X_f_train, X_f_test, y_f_train, y_f_test = train_test_split(X_f, y_f, random_state = 42)
X_avg_train, X_avg_test, y_avg_train, y_avg_test = train_test_split(X_avg, y_avg, random_state = 42)

# check
print(f"X_a_train shape: {X_a_train.shape}")
print(f"X_a_test shape: {X_a_test.shape}")
print(f"y_a_train shape: {y_a_train.shape}")
print(f"y_a_test shape: {y_a_test.shape}")

print(f"X_f_train shape: {X_f_train.shape}")
print(f"X_f_test shape: {X_f_test.shape}")
print(f"y_f_train shape: {y_f_train.shape}")
print(f"y_f_test shape: {y_f_test.shape}")

print(f"X_avg_train shape: {X_avg_train.shape}")
print(f"X_avg_test shape: {X_avg_test.shape}")
print(f"y_avg_train shape: {y_avg_train.shape}")
print(f"y_avg_test shape: {y_avg_test.shape}")

X_a_train shape: (36759, 17)
X_a_test shape: (12253, 17)
y_a_train shape: (36759,)
y_a_test shape: (12253,)
X_f_train shape: (36759, 17)
X_f_test shape: (12253, 17)
y_f_train shape: (36759,)
y_f_test shape: (12253,)
X_avg_train shape: (36759, 17)
X_avg_test shape: (12253, 17)
y_avg_train shape: (36759,)
y_avg_test shape: (12253,)


In [9]:
# one-hot encode:
# 'section_type', 'instructor_id', 'subject_name', 'course_name', 'term',
# 'course_difficulty'

ohe_cols = ['section_type', 'instructor_id', 'subject_name', 'course_name',
            'term', 'course_difficulty', 'year']

ohe = OneHotEncoder(handle_unknown = 'ignore',
                   sparse = False)

ohe_tuple = (ohe, ohe_cols)

In [10]:
# one-hot encode and impute constant 'missing':
# 'facility_code'

missing_imputer = SimpleImputer(strategy = 'constant', fill_value = 'missing')

ohe = OneHotEncoder(handle_unknown = 'ignore')

facility_code_pipe = make_pipeline(missing_imputer, ohe)

facility_code_tuple = (facility_code_pipe, ['facility_code'])

In [11]:
# scale:
# 'start_time', 'class_length', 'total_time'

scaler = StandardScaler()

scale_cols = ['start_time', 'class_length', 'total_time']

scale_tuple = (scaler, scale_cols)

In [12]:
# binary encoded ('passthrough' in preprocessor):
# 'mon', 'tues', 'wed', 'thurs', 'fri', 'weekend'

# create preprocessor
preprocessor = make_column_transformer(ohe_tuple, 
                                       facility_code_tuple, 
                                       scale_tuple,
                                       remainder = 'passthrough')

In [13]:
# # check transformation by preprocessor
# X_avg_train_processed = preprocessor.fit_transform(X_avg_train)
# X_avg_test_processed = preprocessor.fit_transform(X_avg_test)

# # check for missing values
# print(np.isnan(X_avg_train_processed).sum().sum(), 'missing values in train')
# print(np.isnan(X_avg_test_processed).sum().sum(), 'missing values in test')

# # check that all data is numeric
# print('All data in train is ', X_avg_train_processed.dtype)
# print('All data in test is ', X_avg_test_processed.dtype)

# # check shape of data to infer that categorical columns were one hot encoded
# print('Shape of X_axg_train_processed data is', X_avg_train_processed.shape)
# print('Shape of X_avg_test_processed data is', X_avg_test_processed.shape)

# # check arrays to see that numeric data was scaled
# print(X_avg_train_processed)
# print(X_avg_test_processed)

# Modeling Functions

In [14]:
# create dataframe to save metrics
metrics_df = pd.DataFrame()

# check
metrics_df.shape

(0, 0)

In [15]:
# define function that will print regression metrics and store metrics in a 
# dataframe for easy viewing

def get_metrics(model_pipe, X_train, X_test, y_train, y_test, name):
  
    # calculate predictions
    train_pred = model_pipe.predict(X_train)
    test_pred = model_pipe.predict(X_test)

    # store scores
    metrics_df.at[name, 'Train R2'] = \
        r2_score(y_train, train_pred) 
    
    metrics_df.at[name, 'Test R2'] = \
        r2_score(y_test, test_pred)
        
    metrics_df.at[name, 'Train MAE'] = \
        mean_absolute_error(y_train, train_pred)
    
    metrics_df.at[name, 'Test MAE'] = \
        mean_absolute_error(y_test, test_pred)
    
    metrics_df.at[name, 'Train MAPE'] = \
        mean_absolute_percentage_error(y_train, train_pred)
    
    metrics_df.at[name, 'Test MAPE'] = \
        mean_absolute_percentage_error(y_test, test_pred)
    
    metrics_df.at[name, 'Train RMSE'] = \
        np.sqrt(mean_squared_error(y_train, train_pred))
    
    metrics_df.at[name, 'Test RMSE'] = \
        np.sqrt(mean_squared_error(y_test, test_pred))
  
    # show scores for this model only (can call metrics_df to see all scores)
    print(metrics_df.loc[name, :])

# Dummy Model

In [16]:
%%time

# time: 172 milliseconds

# instantiate
dummy_regressor = DummyRegressor(strategy = 'mean')

# combine with preprocessor
dummy_pipe = make_pipeline(preprocessor, dummy_regressor)

# fit on training data
dummy_pipe.fit(X_avg_train, y_avg_train)

# calculate, show, and store metrics
get_metrics(dummy_pipe, 
            X_avg_train, 
            X_avg_test, 
            y_avg_train, 
            y_avg_test, 
            "Dummy")

Train R2      0.000000e+00
Test R2      -1.234698e-04
Train MAE     3.083861e-01
Test MAE      3.092590e-01
Train MAPE    4.307289e+11
Test MAPE     9.236957e-02
Train RMSE    3.742595e-01
Test RMSE     3.747394e-01
Name: Dummy, dtype: float64
CPU times: total: 641 ms
Wall time: 631 ms


In [17]:
# check metrics_df
metrics_df

Unnamed: 0,Train R2,Test R2,Train MAE,Test MAE,Train MAPE,Test MAPE,Train RMSE,Test RMSE
Dummy,0.0,-0.000123,0.308386,0.309259,430728900000.0,0.09237,0.37426,0.374739


# Model 1: Linear Regression

In [18]:
%%time

# # time: 3 seconds

# instantiate
lr = LinearRegression()

# pipeline
lr_pipe = make_pipeline(preprocessor, lr)

# fit
lr_pipe.fit(X_avg_train, y_avg_train)

# evaluate
get_metrics(lr_pipe, 
            X_avg_train, 
            X_avg_test, 
            y_avg_train, 
            y_avg_test, 
            "Linear")

Train R2      6.340206e-01
Test R2       6.170476e-01
Train MAE     1.704005e-01
Test MAE      1.761343e-01
Train MAPE    4.674987e+11
Test MAPE     5.221236e-02
Train RMSE    2.264129e-01
Test RMSE     2.318862e-01
Name: Linear, dtype: float64
CPU times: total: 3.09 s
Wall time: 586 ms


# Model 2: Decision Tree Regressor

## Default

In [19]:
%%time

# time: 4 seconds

# instantiate
dt_def = DecisionTreeRegressor(random_state = 42)

# pipeline
dt_def_pipe = make_pipeline(preprocessor, dt_def)

# fit
dt_def_pipe.fit(X_avg_train, y_avg_train)

# evaluate
get_metrics(dt_def_pipe, 
            X_avg_train, 
            X_avg_test, 
            y_avg_train, 
            y_avg_test, 
            "Def DT")

Train R2      0.978962
Test R2       0.409189
Train MAE     0.012814
Test MAE      0.207481
Train MAPE    0.003885
Test MAPE     0.061587
Train RMSE    0.054284
Test RMSE     0.288023
Name: Def DT, dtype: float64
CPU times: total: 4.73 s
Wall time: 4.35 s


## Tuned on 'max_depth'

In [20]:
# get depth from default tree where max_depth = None
def_depth = dt_def_pipe['decisiontreeregressor'].get_depth()
def_depth

# this is a very deep tree, which is overfitting by a lot; for the
# tuned model, I will only try up to half this depth

110

In [21]:
# %%time

# # time: 6 minutes 27 seconds

# # instantiate
# dt_tun = DecisionTreeRegressor(random_state = 42)

# # pipeline
# dt_tun_pipe = make_pipeline(preprocessor, dt_tun)

# # params
# dt_params = {'decisiontreeregressor__max_depth': range(1, 56)}

# # gridsearch
# dt_gs = GridSearchCV(dt_tun_pipe, dt_params, scoring = 'r2')

# # fit
# dt_gs.fit(X_avg_train, y_avg_train)

# # see best parameters
# print(dt_gs.best_params_) # max_depth: 23

# # see best score
# print(dt_gs.best_score_) # 0.4709

In [22]:
# instantiate tuned model
# dt_tun_pipe = dt_gs.best_estimator_

# # print and store metrics
# get_metrics(dt_tun_pipe, 
#             X_avg_train, 
#             X_avg_test, 
#             y_avg_train, 
#             y_avg_test, 
#             "Tun DT")

In [23]:
# hard coding results into metrics_df so I don't have to
# re-run the notebook cells that take a long time to run
# every time I open it up

metrics_df.at['Tun DT', 'Train R2'] = 0.6771564
metrics_df.at['Tun DT', 'Test R2'] = 0.4805888
metrics_df.at['Tun DT', 'Train MAE'] = 0.1434812
metrics_df.at['Tun DT', 'Test MAE'] = 0.2005607
metrics_df.at['Tun DT', 'Train MAPE'] = 0.4537716
metrics_df.at['Tun DT', 'Test MAPE'] = 0.5964366
metrics_df.at['Tun DT', 'Train RMSE'] = 0.2126517
metrics_df.at['Tun DT', 'Test RMSE'] = 0.2700587

# Model 3: Bagged Tree Regressor

## Default

In [24]:
%%time

# time: 32 seconds

# instantiate
br_def = BaggingRegressor(random_state = 42)

# pipeline
br_def_pipe = make_pipeline(preprocessor, br_def)

# fit
br_def_pipe.fit(X_avg_train, y_avg_train)

# evaluate
get_metrics(br_def_pipe, 
            X_avg_train, 
            X_avg_test, 
            y_avg_train, 
            y_avg_test, 
            "Def BR")

KeyboardInterrupt: 

## Tuned on 'n_estimators'

In [25]:
# %%time
# # time: 32 minutes

# # instantiate
# br_tun = BaggingRegressor(random_state = 42)

# # pipeline
# br_tun_pipe = make_pipeline(preprocessor, br_tun)

# # params
# br_params = {'baggingregressor__n_estimators': [10, 101, 20]}

# # grid search
# br_gs = GridSearchCV(br_tun_pipe, br_params, scoring = 'r2')

# # fit
# br_gs.fit(X_avg_train, y_avg_train)

# # see best parameters
# print(br_gs.best_params_) # n_estimators: 100

# # see best score
# print(br_gs.best_score_) # .6283

In [26]:
# # instantiate tuned model
# br_tun_pipe = br_gs.best_estimator_

# # print and store metrics
# get_metrics(br_tun_pipe, 
#             X_avg_train, 
#             X_avg_test, 
#             y_avg_train, 
#             y_avg_test, 
#             "Tun BR")

In [27]:
# hard coding results into hard_metrics_df so I don't have to
# re-run the notebook cells that take a long time to run
# every time I open it up

metrics_df.at['Tun BR', 'Train R2'] = 0.9351598
metrics_df.at['Tun BR', 'Test R2'] = 0.6450013
metrics_df.at['Tun BR', 'Train MAE'] = 0.6603890
metrics_df.at['Tun BR', 'Test MAE'] = 0.1630311
metrics_df.at['Tun BR', 'Train MAPE'] = 0.1967819
metrics_df.at['Tun BR', 'Test MAPE'] = 0.4849256
metrics_df.at['Tun BR', 'Train RMSE'] = 0.9530044
metrics_df.at['Tun BR', 'Test RMSE'] = 0.2232626

# Model 4: Random Forest Regressor

## Default

In [28]:
# %%time

# # time: 5 minutes

# # instantiate
# rf_def = RandomForestRegressor(random_state = 42)

# # pipeline
# rf_def_pipe = make_pipeline(preprocessor, rf_def)

# # fit
# rf_def_pipe.fit(X_avg_train, y_avg_train)

# # evaluate
# get_metrics(rf_def_pipe, 
#             X_avg_train, 
#             X_avg_test, 
#             y_avg_train, 
#             y_avg_test, 
#             "Def RF")

In [29]:
# hard coding results into hard_metrics_df so I don't have to
# re-run the notebook cells that take a long time to run
# every time I open it up

metrics_df.at['Def RF', 'Train R2'] = 0.9351265
metrics_df.at['Def RF', 'Test R2'] = 0.6453246
metrics_df.at['Def RF', 'Train MAE'] = 0.6602899
metrics_df.at['Def RF', 'Test MAE'] = 0.1629727
metrics_df.at['Def RF', 'Train MAPE'] = 0.1943255
metrics_df.at['Def RF', 'Test MAPE'] = 0.4847181
metrics_df.at['Def RF', 'Train RMSE'] = 0.9532490
metrics_df.at['Def RF', 'Test RMSE'] = 0.2231609

## Tuned on 'max_depth' and 'max_features'

In [30]:
# # get depth from max of estimator depths in the default random forest model
# est_depths = [estimator.get_depth() for estimator 
#               in rf_def_pipe['randomforestregressor'].estimators_]
# max_depth_rf = max(est_depths)
# max_depth_rf # 183

In [31]:
# %%time
# # time: 5 hours, 20 minutes

# # instantiate
# rf_tun = RandomForestRegressor(random_state = 42, n_jobs = -1)

# # pipeline
# rf_tun_pipe = make_pipeline(preprocessor, rf_tun)

# # params
# rf_params = {}

# # max_depth (range between 1 and 20)
# rf_params['randomforestregressor__max_depth'] = range(1, 20, 2)

# # max_features (default 1; range over middle 50% of number of columns without 
# # dummies); 17 feature columns, so range over about 4-12
# rf_params['randomforestregressor__max_features'] = range(4, 12)

# # gridsearchcv
# rf_gs = GridSearchCV(rf_tun_pipe, 
#                      rf_params,
#                      scoring = 'r2')

# rf_gs.fit(X_avg_train, y_avg_train)

In [32]:
# # see best score from randomizedsearchcv
# rf_gs.best_score_

In [33]:
# # see the best parameters from the tuned model
# rf_gs.best_params_

In [34]:
# # instantiate tuned model
# rf_tun_pipe = rf_gs.best_estimator_

# # print and store metrics
# get_metrics(rf_tun_pipe, 
#             X_avg_train, 
#             X_avg_test, 
#             y_avg_train, 
#             y_avg_test, 
#             "Tun RF")

In [35]:
# hard coding results into hard_metrics_df so I don't have to
# re-run the notebook cells that take a long time to run
# every time I open it up

metrics_df.at['Tun RF', 'Train R2'] = 0.9318880
metrics_df.at['Tun RF', 'Test R2'] = 0.6335185
metrics_df.at['Tun RF', 'Train MAE'] = 0.7003219
metrics_df.at['Tun RF', 'Test MAE'] = 0.1707381
metrics_df.at['Tun RF', 'Train MAPE'] = 0.1941178
metrics_df.at['Tun RF', 'Test MAPE'] = 0.5076128
metrics_df.at['Tun RF', 'Train RMSE'] = 0.9767525
metrics_df.at['Tun RF', 'Test RMSE'] = 0.2268446

# Model 5: K-Nearest Neighbors Regressor

## Default

In [36]:
# %%time

# # 8 minutes, 24 seconds

# # instantiate
# kn_def = KNeighborsRegressor()

# # pipeline
# kn_def_pipe = make_pipeline(preprocessor, kn_def)

# # fit
# kn_def_pipe.fit(X_avg_train, y_avg_train)

# # evaluate
# get_metrics(kn_def_pipe, 
#             X_avg_train, 
#             X_avg_test, 
#             y_avg_train, 
#             y_avg_test, 
#             "Def KNN")

In [37]:
# hard coding results into hard_metrics_df so I don't have to
# re-run the notebook cells that take a long time to run
# every time I open it up

metrics_df.at['Def KNN', 'Train R2'] = 0.7435497
metrics_df.at['Def KNN', 'Test R2'] = 0.6182448
metrics_df.at['Def KNN', 'Train MAE'] = 0.1390834
metrics_df.at['Def KNN', 'Test MAE'] = 0.1711105
metrics_df.at['Def KNN', 'Train MAPE'] = 0.3920542
metrics_df.at['Def KNN', 'Test MAPE'] = 0.5073286
metrics_df.at['Def KNN', 'Train RMSE'] = 0.1895285
metrics_df.at['Def KNN', 'Test RMSE'] = 0.2315235

## Tuned on 'n_neighbors'

In [38]:
# %%time

# # 15 minutes

# # instantiate
# kn_tun = KNeighborsRegressor()

# # pipeline
# kn_tun_pipe = make_pipeline(preprocessor, kn_tun)

# # params
# kn_params = {}

# # tune n_neighbors
# kn_params['kneighborsregressor__n_neighbors'] = [10, 50, 100]

# # randomizedsearchcv
# kn_gs = GridSearchCV(kn_tun_pipe, 
#                      kn_params,
#                      scoring = 'r2')

# kn_gs.fit(X_avg_train, y_avg_train)

In [39]:
# # see best score from randomizedsearchcv
# kn_gs.best_score_

In [40]:
# # see the best parameters from the tuned model
# kn_gs.best_params_

In [41]:
# # instantiate tuned model
# kn_tun_pipe = kn_gs.best_estimator_

# # print and store metrics
# get_metrics(kn_tun_pipe, 
#             X_avg_train, 
#             X_avg_test, 
#             y_avg_train, 
#             y_avg_test, 
#             "Tun KNN")

In [42]:
# hard coding results into hard_metrics_df so I don't have to
# re-run the notebook cells that take a long time to run
# every time I open it up

metrics_df.at['Tun KNN', 'Train R2'] = 0.6868856
metrics_df.at['Tun KNN', 'Test R2'] = 0.6185721
metrics_df.at['Tun KNN', 'Train MAE'] = 0.1566485
metrics_df.at['Tun KNN', 'Test MAE'] = 0.1734479
metrics_df.at['Tun KNN', 'Train MAPE'] = 0.4410609
metrics_df.at['Tun KNN', 'Test MAPE'] = 0.5141256
metrics_df.at['Tun KNN', 'Train RMSE'] = 0.2094230
metrics_df.at['Tun KNN', 'Test RMSE'] = 0.2314242

# Model 6: Extreme Gradient Boosting Regressor

## Default

In [19]:
%%time

# 20 seconds

# instantiate
xgb_def = XGBRegressor()

# pipeline
xgb_def_pipe = make_pipeline(preprocessor, xgb_def)

# fit
xgb_def_pipe.fit(X_avg_train, y_avg_train)

# evaluate
get_metrics(xgb_def_pipe, 
            X_avg_train, 
            X_avg_test, 
            y_avg_train, 
            y_avg_test, 
            "Def XGB")

Train R2      6.434595e-01
Test R2       6.049974e-01
Train MAE     1.723396e-01
Test MAE      1.814656e-01
Train MAPE    3.277792e+11
Test MAPE     5.393175e-02
Train RMSE    2.234741e-01
Test RMSE     2.355063e-01
Name: Def XGB, dtype: float64
CPU times: total: 3min 51s
Wall time: 20.1 s


## Tuned on 'max_depth' and 'n_estimators'

In [None]:
# %%time

# # CPU time total: 10 minutes 28 seconds
# # wall time: 1 minute

# # instantiate
# xgb_tun = XGBRegressor()

# # pipeline
# xgb_tun_pipe = make_pipeline(preprocessor, xgb_tun)

# # params
# xgb_params = {}

# # tune max_depth and n_estimators
# xgb_params['xgbregressor__max_depth'] = [5, 10, 15, 20]
# xgb_params['xgbregressor__n_estimators'] = [10, 20, 30, 40, 50]

# # gridsearchcv
# xgb_gs = GridSearchCV(xgb_tun_pipe, 
#                      xgb_params,
#                      scoring = 'r2')

# xgb_gs.fit(X_avg_train, y_avg_train)

In [98]:
# # see best score from gridizedsearchcv
# xgb_gs.best_score_ # .6309

0.6309286723972292

In [99]:
# # see the best parameters from the tuned model
# xgb_gs.best_params_ # max_depth: 20, n_estimators: 50

{'xgbregressor__max_depth': 20, 'xgbregressor__n_estimators': 50}

In [47]:
# # instantiate tuned model
# xgb_tun_pipe = xgb_gs.best_estimator_

# # print and store metrics
# get_metrics(xgb_tun_pipe, 
#             X_avg_train, 
#             X_avg_test, 
#             y_avg_train, 
#             y_avg_test, 
#             "Tun XGB")

In [18]:
xgb_tun = XGBRegressor(max_depth = 20, n_estimators = 50)

In [20]:
# hard coding results into hard_metrics_df so I don't have to
# re-run the notebook cells that take a long time to run
# every time I open it up

metrics_df.at['Tun XGB', 'Train R2'] = 0.8335741
metrics_df.at['Tun XGB', 'Test R2'] = 0.6578420
metrics_df.at['Tun XGB', 'Train MAE'] = 0.1128172
metrics_df.at['Tun XGB', 'Test MAE'] = 0.1632414
metrics_df.at['Tun XGB', 'Train MAPE'] = 0.1857071
metrics_df.at['Tun XGB', 'Test MAPE'] = 0.4852992
metrics_df.at['Tun XGB', 'Train RMSE'] = 0.1526804
metrics_df.at['Tun XGB', 'Test RMSE'] = 0.2191876

# Model 7: Light Gradient Boosting Machine Regressor

## Default

In [49]:
%%time

# 2 seconds

# instantiate
lgbm_def = LGBMRegressor()

# pipeline
lgbm_def_pipe = make_pipeline(preprocessor, lgbm_def)

# fit
lgbm_def_pipe.fit(X_avg_train, y_avg_train)

# evaluate
get_metrics(lgbm_def_pipe, 
            X_avg_train, 
            X_avg_test, 
            y_avg_train, 
            y_avg_test, 
            "Def LGBM")

Train R2      6.047285e-01
Test R2       5.888200e-01
Train MAE     1.808934e-01
Test MAE      1.850531e-01
Train MAPE    4.574953e+11
Test MAPE     5.500878e-02
Train RMSE    2.352993e-01
Test RMSE     2.402805e-01
Name: Def LGBM, dtype: float64
CPU times: total: 4.19 s
Wall time: 436 ms


## Tuned on 'max_depth' and 'n_estimators'

In [50]:
# %%time

# # CPU time total: 3 minutes 42 seconds
# # wall time: 19 seconds

# # instantiate
# lgbm_tun = LGBMRegressor()

# # pipeline
# lgbm_tun_pipe = make_pipeline(preprocessor, lgbm_tun)

# # params
# lgbm_params = {}

# # tune n_neighbors
# lgbm_params['lgbmregressor__max_depth'] = [5, 10, 15, 20]
# lgbm_params['lgbmregressor__n_estimators'] = [10, 20, 30, 40, 50]

# # gridsearchcv
# lgbm_gs = GridSearchCV(lgbm_tun_pipe, 
#                        lgbm_params,
#                        scoring = 'r2')

# lgbm_gs.fit(X_avg_train, y_avg_train)

In [51]:
# # see best score from gridsearchcv
# lgbm_gs.best_score_

In [52]:
# # see the best parameters from the tuned model
# lgbm_gs.best_params_

# # max_depth: 20
# # n_estimators: 50

In [53]:
# # instantiate tuned model
# lgbm_tun_pipe = lgbm_gs.best_estimator_

# # print and store metrics
# get_metrics(lgbm_tun_pipe, 
#             X_avg_train, 
#             X_avg_test, 
#             y_avg_train, 
#             y_avg_test, 
#             "Tun LGBM")

In [54]:
# hard coding results into hard_metrics_df so I don't have to
# re-run the notebook cells that take a long time to run
# every time I open it up

metrics_df.at['Tun LGBM', 'Train R2'] = 0.5682030
metrics_df.at['Tun LGBM', 'Test R2'] = 0.5611349
metrics_df.at['Tun LGBM', 'Train MAE'] = 0.1914345
metrics_df.at['Tun LGBM', 'Test MAE'] = 0.1937340
metrics_df.at['Tun LGBM', 'Train MAPE'] = 0.4527508
metrics_df.at['Tun LGBM', 'Test MAPE'] = 0.5760116
metrics_df.at['Tun LGBM', 'Train RMSE'] = 0.2459307
metrics_df.at['Tun LGBM', 'Test RMSE'] = 0.2482379

# Model 8: Gradient Boosting Regressor

## Default

In [55]:
%%time

# 5 seconds

# instantiate
gbr_def = GradientBoostingRegressor()

# pipeline
gbr_def_pipe = make_pipeline(preprocessor, gbr_def)

# fit
gbr_def_pipe.fit(X_avg_train, y_avg_train)

# evaluate
get_metrics(gbr_def_pipe, 
            X_avg_train, 
            X_avg_test, 
            y_avg_train, 
            y_avg_test, 
            "Def GBR")

Train R2      4.528303e-01
Test R2       4.527176e-01
Train MAE     2.212064e-01
Test MAE      2.220065e-01
Train MAPE    4.547432e+11
Test MAPE     6.611475e-02
Train RMSE    2.768432e-01
Test RMSE     2.772096e-01
Name: Def GBR, dtype: float64
CPU times: total: 4.09 s
Wall time: 3.98 s


## Tuned on 'max_depth' and 'n_estimators'

In [56]:
# %%time

# # 22 minutes

# # instantiate
# gbr_tun = GradientBoostingRegressor()

# # pipeline
# gbr_tun_pipe = make_pipeline(preprocessor, gbr_tun)

# # params
# gbr_params = {}

# # tune n_neighbors
# gbr_params['gradientboostingregressor__max_depth'] = [5, 10, 15, 20]
# gbr_params['gradientboostingregressor__n_estimators'] = [10, 20, 30, 40, 50]

# # gridsearchcv
# gbr_gs = GridSearchCV(gbr_tun_pipe, 
#                      gbr_params,
#                      scoring = 'r2')

# gbr_gs.fit(X_avg_train, y_avg_train)

In [57]:
# # see best score from gridsearchcv
# gbr_gs.best_score_

In [58]:
# # see the best parameters from the tuned model
# gbr_gs.best_params_

# # max_depth: 20
# # n_estimators: 50

In [59]:
# # instantiate tuned model
# gbr_tun_pipe = gbr_gs.best_estimator_

# # print and store metrics
# get_metrics(gbr_tun_pipe, 
#             X_avg_train, 
#             X_avg_test, 
#             y_avg_train, 
#             y_avg_test, 
#             "Tun GBR")

In [60]:
# hard coding results into hard_metrics_df so I don't have to
# re-run the notebook cells that take a long time to run
# every time I open it up

metrics_df.at['Tun GBR', 'Train R2'] = 0.8786262
metrics_df.at['Tun GBR', 'Test R2'] = 0.6341327
metrics_df.at['Tun GBR', 'Train MAE'] = 0.9380641
metrics_df.at['Tun GBR', 'Test MAE'] = 0.1679331
metrics_df.at['Tun GBR', 'Train MAPE'] = 0.1161788
metrics_df.at['Tun GBR', 'Test MAPE'] = 0.4995710
metrics_df.at['Tun GBR', 'Train RMSE'] = 0.1303873
metrics_df.at['Tun GBR', 'Test RMSE'] = 0.2266545

# Feature Engineering

## Feature Selection and Transformation

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49012 entries, 12 to 64185
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   section_type       49012 non-null  object 
 1   instructor_id      49012 non-null  object 
 2   facility_code      45521 non-null  object 
 3   start_time         49012 non-null  float64
 4   mon                49012 non-null  int64  
 5   tues               49012 non-null  int64  
 6   wed                49012 non-null  int64  
 7   thurs              49012 non-null  int64  
 8   fri                49012 non-null  int64  
 9   subject_name       49012 non-null  object 
 10  course_name        49012 non-null  object 
 11  a_proportion       49012 non-null  float64
 12  f_proportion       49012 non-null  float64
 13  avg_grade          49012 non-null  float64
 14  year               49012 non-null  int64  
 15  term               49012 non-null  object 
 16  class_length       49

### Schedule

The schedule variables take up 6 columns for days of the week (Saturday and Sunday are combined in 'weekend'). However this doesn't capture the schedules of classes; there is a high probability of a class that is held on Monday also being held on Wednesday, for example. 

So, to reduce this collinearity, we will make new schedule-related columns related to the actual schedules of the courses.

In [25]:
df_fe = df.copy()

df_fe.sample(10)

Unnamed: 0,section_type,instructor_id,facility_code,start_time,mon,tues,wed,thurs,fri,subject_name,course_name,a_proportion,f_proportion,avg_grade,year,term,class_length,total_time,weekend,course_difficulty
6202,lec,other,,900.0,0,1,0,0,0,biology,special topics,1.0,0.0,4.0,7,fall,60.0,60,0,advanced
50826,lec,other,153.0,865.0,1,0,0,0,0,curriculum and instruction,teaching mathematics,1.0,0.0,4.0,14,fall,180.0,180,0,intermediate
17248,lec,other,482.0,595.0,1,0,1,0,1,spanish (spanish and portuguese),intermediate language practice with emphasis o...,0.35,0.0,3.475,11,fall,50.0,150,0,elementary
46381,lab,other,408.0,800.0,0,0,0,0,1,chemical and biological engineering,transport phenomena lab,0.333333,0.0,3.583333,15,spring,245.0,245,0,intermediate
22295,lec,225453.0,140.0,570.0,1,0,1,0,0,general business,professional communication,0.875,0.0,3.9375,12,fall,75.0,150,0,intermediate
22343,lec,671145.0,140.0,570.0,1,0,1,0,0,general business,professional communication,0.653846,0.0,3.730769,17,spring,75.0,150,0,intermediate
31288,lec,other,155.0,870.0,1,0,1,0,0,history,explorations in american history (h),0.25,0.0,3.291667,18,fall,75.0,150,0,elementary
33969,lec,271216.0,48.0,800.0,1,0,1,0,1,english,other,0.870968,0.0,3.919355,10,spring,50.0,150,0,intermediate
5071,lec,other,408.0,720.0,0,1,0,0,0,chemical and biological engineering,special topics in chemical engineering,0.714286,0.0,3.857143,16,fall,50.0,50,0,advanced
38345,lec,other,469.0,595.0,1,0,1,0,0,afro-american studies,other,0.395604,0.021978,3.241758,11,spring,50.0,100,0,elementary


In [26]:
# new column: how many days per week the course is held (numeric, to be 
# one-hot encoded); weekend counts as one day (can go back to original
# data and change if needed)

df_fe['days_per_week'] = df_fe['mon'] + df_fe['tues'] + df_fe['wed'] + \
                         df_fe['thurs'] + df_fe['fri'] + df_fe['weekend']

# check
df_fe.sample(10)

Unnamed: 0,section_type,instructor_id,facility_code,start_time,mon,tues,wed,thurs,fri,subject_name,...,a_proportion,f_proportion,avg_grade,year,term,class_length,total_time,weekend,course_difficulty,days_per_week
31487,lec,other,482.0,530.0,1,0,1,0,1,spanish (spanish and portuguese),...,0.294118,0.0,3.558824,11,fall,50.0,150,0,intermediate,3
19793,lec,other,482.0,570.0,0,1,0,1,0,french (french and italian),...,0.230769,0.0,3.269231,12,fall,75.0,150,0,elementary,2
31673,lec,other,56.0,660.0,1,0,1,0,0,gender and women’s studies,...,0.253406,0.008174,3.307902,8,fall,50.0,100,0,elementary,2
2010,lec,4387544.0,129.0,390.0,1,1,1,1,1,naval science,...,1.0,0.0,4.0,9,fall,90.0,450,0,elementary,5
19790,lec,other,482.0,870.0,0,1,0,1,0,french (french and italian),...,0.304348,0.0,3.565217,8,fall,75.0,150,0,elementary,2
46735,lec,other,46.0,595.0,1,0,1,0,0,environmental studies - gaylord nelson institute,...,0.20614,0.004386,3.300439,8,fall,50.0,100,0,elementary,2
52835,lab,319547.0,25.0,465.0,1,0,1,0,0,physical educ activity progm,...,0.411765,0.117647,3.205882,8,fall,50.0,100,0,elementary,2
33217,lec,other,521.0,660.0,1,0,1,0,1,geoscience,...,0.335294,0.029412,2.923529,14,spring,50.0,150,0,elementary,3
15953,lec,other,18.0,595.0,1,0,1,0,1,english,...,0.842105,0.0,3.842105,7,spring,50.0,150,0,elementary,3
7000,ind,4397528.0,,-1.0,0,0,0,0,0,communication sciences and disorders,...,1.0,0.0,4.0,10,fall,0.0,0,0,advanced,0


In [27]:
# change 1s and 0s in days of week columns to abbreviations 
# (M, T, W, R, F, E)

df_fe['mon'].replace({1: 'M', 0: ''}, inplace = True)
df_fe['tues'].replace({1: 'T', 0: ''}, inplace = True)
df_fe['wed'].replace({1: 'W', 0: ''}, inplace = True)
df_fe['thurs'].replace({1: 'R', 0: ''}, inplace = True)
df_fe['fri'].replace({1: 'F', 0: ''}, inplace = True)
df_fe['weekend'].replace({1: 'E', 0: ''}, inplace = True)

# check
df_fe.sample(10)

Unnamed: 0,section_type,instructor_id,facility_code,start_time,mon,tues,wed,thurs,fri,subject_name,...,a_proportion,f_proportion,avg_grade,year,term,class_length,total_time,weekend,course_difficulty,days_per_week
50969,lec,other,18,660.0,M,,,,,english,...,1.0,0.0,4.0,9,spring,115.0,115,,intermediate,1
33975,lec,other,482,800.0,M,,W,,F,hebrew,...,0.222222,0.0,3.111111,7,fall,50.0,150,,intermediate,3
62957,lec,other,106,725.0,,,W,,F,food science,...,0.315789,0.0,3.473684,8,fall,50.0,100,,intermediate,2
22606,lec,280437.0,76,570.0,,T,,R,,life sciences communication,...,0.55,0.0,3.675,16,fall,75.0,150,,elementary,2
45905,lec,other,469,865.0,,T,,R,,communication arts,...,0.145455,0.012121,3.115152,16,spring,50.0,100,,elementary,2
31340,lec,3216300.0,47,660.0,M,,W,,F,chemistry,...,0.240664,0.024896,2.751037,10,fall,50.0,150,,intermediate,3
52454,lec,other,482,530.0,M,T,W,R,,languages and cultures of asia - languages,...,0.769231,0.0,3.807692,15,fall,50.0,200,,intermediate,4
39690,lab,921047.0,31,530.0,,T,,R,,physical educ activity progm,...,1.0,0.0,4.0,9,spring,50.0,100,,elementary,2
31144,lec,2601318.0,46,660.0,,,,R,,latin (classics),...,0.394737,0.013158,3.309211,8,spring,50.0,50,,elementary,1
35629,lec,4384631.0,545,570.0,,T,,R,,communication arts,...,0.733333,0.0,3.766667,8,fall,75.0,150,,advanced,2


In [28]:
# concatenate days of week columns into one

df_fe['schedule_days'] = df_fe['mon'] + df_fe['tues'] + df_fe['wed'] + df_fe['thurs'] + df_fe['fri'] + df_fe['weekend']

# check
df_fe['schedule_days'].value_counts()

TR        15434
MWF        9089
MW         7137
           3576
T          2414
W          2268
M          1948
MTWR       1813
R          1492
MTWRF      1151
MTRF        870
F           869
E           245
MF          205
WF          162
MWR          88
MTWF         69
MTR          31
TRF          31
TWRF         26
MR           21
FE           18
WR           10
MWRF          9
MTF           7
MT            5
MRF           5
TF            4
RF            4
MTW           3
TWR           3
TW            2
ME            1
MTWRFE        1
WRF           1
Name: schedule_days, dtype: int64

In [29]:
# replace '' with 'none'
df_fe['schedule_days'] = df_fe['schedule_days'].apply(lambda x: 'none' if x == '' else x)

# check
df_fe['schedule_days'].value_counts()

TR        15434
MWF        9089
MW         7137
none       3576
T          2414
W          2268
M          1948
MTWR       1813
R          1492
MTWRF      1151
MTRF        870
F           869
E           245
MF          205
WF          162
MWR          88
MTWF         69
MTR          31
TRF          31
TWRF         26
MR           21
FE           18
WR           10
MWRF          9
MTF           7
MT            5
MRF           5
TF            4
RF            4
MTW           3
TWR           3
TW            2
ME            1
MTWRFE        1
WRF           1
Name: schedule_days, dtype: int64

In [30]:
# bin all schedules with fewer than 100 values into 'other'

keep_schedules = ['TR', 'MWF', 'MW', 'none', 'T', 'W', 'M', 'MTWR', 
                 'R', 'MTWRF', 'MTRF', 'F', 'E', 'MF', 'WF']

df_fe['schedule_days'] = df_fe['schedule_days'].apply(lambda x: x if x in keep_schedules else 'other')

# check
df_fe['schedule_days'].value_counts()

TR       15434
MWF       9089
MW        7137
none      3576
T         2414
W         2268
M         1948
MTWR      1813
R         1492
MTWRF     1151
MTRF       870
F          869
other      339
E          245
MF         205
WF         162
Name: schedule_days, dtype: int64

In [31]:
# drop original columns
df_fe.drop(columns = ['mon', 'tues', 'wed', 'thurs', 'fri', 'weekend'], 
           inplace = True)

# check
df_fe.sample(10)

Unnamed: 0,section_type,instructor_id,facility_code,start_time,subject_name,course_name,a_proportion,f_proportion,avg_grade,year,term,class_length,total_time,course_difficulty,days_per_week,schedule_days
1007,lec,other,545,1140.0,communication arts,introduction to speech composition,0.083333,0.0,3.25,12,fall,50.0,150,elementary,3,other
30628,lec,other,482,725.0,slavic (slavic languages),other,1.0,0.0,4.0,8,fall,50.0,150,intermediate,3,MWF
21460,lec,367808.0,140,870.0,communication arts,other,0.583333,0.0,3.666667,7,spring,180.0,180,advanced,1,W
3742,lec,815179.0,140,1050.0,environmental studies - gaylord nelson institute,special topics: social perspectives in environ...,0.333333,0.0,3.5,17,spring,165.0,165,intermediate,1,M
41923,lec,other,402,595.0,communication arts,other,0.183673,0.0,3.316327,7,spring,50.0,100,intermediate,2,MW
54508,lab,4841162.0,469,660.0,art department,life drawing i,0.363636,0.0,3.318182,18,fall,150.0,300,elementary,2,MW
22549,lec,659531.0,140,570.0,accounting and information systems,financial statement analysis,0.348837,0.0,3.5,15,fall,75.0,150,advanced,2,MW
63024,lec,other,76,530.0,life sciences communication,other,0.619048,0.0,3.761905,15,spring,150.0,150,advanced,1,F
32114,lec,other,47,660.0,chemistry,other,0.526316,0.052632,3.421053,11,spring,50.0,150,advanced,3,MWF
22032,lec,other,140,660.0,marketing,marketing strategy,0.321429,0.0,3.553571,18,fall,75.0,150,intermediate,2,TR


### Total time

In [32]:
# drop 'total_time', since its information is encoded in class_length and
# 'schedule_days'/'days_per_week'

df_fe.drop(columns = 'total_time', inplace = True)

# check
df_fe.columns

Index(['section_type', 'instructor_id', 'facility_code', 'start_time',
       'subject_name', 'course_name', 'a_proportion', 'f_proportion',
       'avg_grade', 'year', 'term', 'class_length', 'course_difficulty',
       'days_per_week', 'schedule_days'],
      dtype='object')

### Course name

In [33]:
# course_name is a finer distinction of 'subject_name'; let's get rid of it

df_fe.drop(columns = 'course_name', inplace = True)

# check
df_fe.columns

Index(['section_type', 'instructor_id', 'facility_code', 'start_time',
       'subject_name', 'a_proportion', 'f_proportion', 'avg_grade', 'year',
       'term', 'class_length', 'course_difficulty', 'days_per_week',
       'schedule_days'],
      dtype='object')

### Start time

In [34]:
# check range of start times
df_fe['start_time'].describe()

count    49012.000000
mean       676.647494
std        241.312605
min         -1.000000
25%        570.000000
50%        660.000000
75%        865.000000
max       1260.000000
Name: start_time, dtype: float64

In [35]:
# check start times without -1 (no start time)
df_fe[df_fe['start_time'] != -1].describe()

Unnamed: 0,start_time,a_proportion,f_proportion,avg_grade,year,class_length,days_per_week
count,45414.0,45414.0,45414.0,45414.0,45414.0,45414.0,45414.0
mean,730.335249,0.490533,0.008711,3.489751,12.188818,79.89948,2.19952
std,153.560322,0.282317,0.024151,0.3697,3.396662,45.183048,0.917262
min,390.0,0.0,0.0,0.0,7.0,40.0,0.0
25%,595.0,0.250253,0.0,3.244444,9.0,50.0,2.0
50%,725.0,0.4375,0.0,3.529412,12.0,75.0,2.0
75%,865.0,0.714286,0.0,3.790323,15.0,75.0,3.0
max,1260.0,1.0,1.0,4.0,18.0,600.0,6.0


In [36]:
# start_time is given in minutes, so it is a numerical variable; courses
# without an assigned start_time are coded with -1
# start_time shouldn't be treated as a numeric variable since there is not a
# natural size order to times; for example, it's possible that courses in the
# afternoon give out higher grades than both courses in the mornings and
# evenings

# we will bin courses by start_time to morning, afternoon, evening, and none
# then we will be able to one-hot encode this variable

# times:
# -1 = 'none'
# earliest start time is 6:30am (390)
# 390 - 479 = early morning (6:30-7:59am) # 1.5 hours
# 480 - 599 = mid-morning (8-9:59am) # 2 hours
# 600 - 719 = late morning (10-11:59am) # 2 hours
# 720 - 839 = early afternoon (noon-1:59pm) # 2 hours
# 840 - 959 = mid-afternoon (2-3:59pm) # 2 hours
# 960 - 1079 = late afternoon (4-5:59pm) # 2 hours
# 1080 - 1260 = evening (6-9pm) # 3 hours
# last start time is 9pm (1260)

for index in df_fe['start_time'].index:
    if df_fe.at[index, 'start_time'] == -1:
        df_fe.at[index, 'start_time'] = 'none'
    elif df_fe.at[index, 'start_time'] < 480:
        df_fe.at[index, 'start_time'] = 'early morning'
    elif df_fe.at[index, 'start_time'] < 600:
        df_fe.at[index, 'start_time'] = 'mid-morning'
    elif df_fe.at[index, 'start_time'] < 720:
        df_fe.at[index, 'start_time'] = 'late morning'
    elif df_fe.at[index, 'start_time'] < 840:
        df_fe.at[index, 'start_time'] = 'early afternoon'
    elif df_fe.at[index, 'start_time'] < 960:
        df_fe.at[index, 'start_time'] = 'mid-afternoon'
    elif df_fe.at[index, 'start_time'] < 1080:
        df_fe.at[index, 'start_time'] = 'late afternoon'
    else:
        df_fe.at[index, 'start_time'] = 'evening'
        
# check
df_fe['start_time'].value_counts()

mid-morning        12654
early afternoon    11369
late morning        8020
mid-afternoon       8006
late afternoon      3914
none                3598
early morning        755
evening              696
Name: start_time, dtype: int64

### Class length

In [37]:
# bin 'class_length'
df_fe['class_length'].describe()

count    49012.000000
mean        74.034012
std         48.227477
min          0.000000
25%         50.000000
50%         75.000000
75%         75.000000
max        600.000000
Name: class_length, dtype: float64

In [38]:
df_fe['class_length'].value_counts()

50.0     19284
75.0     15419
0.0       3598
150.0     3371
120.0     1991
         ...  
340.0        1
325.0        1
113.0        1
40.0         1
520.0        1
Name: class_length, Length: 61, dtype: int64

In [39]:
# inspect without 0-length classes
class_length_no_zero = df_fe[df_fe['class_length'] != 0]
class_length_no_zero['class_length'].describe()

count    45414.000000
mean        79.899480
std         45.183048
min         40.000000
25%         50.000000
50%         75.000000
75%         75.000000
max        600.000000
Name: class_length, dtype: float64

In [40]:
# bin into less than 75 and 75 or more
df_fe['class_length'] = df_fe['class_length'].apply(lambda x: 'shorter' 
                                                    if x < 75 else 'longer')

# check
df_fe['class_length'].value_counts()

longer     25590
shorter    23422
Name: class_length, dtype: int64

## Create new preprocessor

In [41]:
df_fe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49012 entries, 12 to 64185
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   section_type       49012 non-null  object 
 1   instructor_id      49012 non-null  object 
 2   facility_code      45521 non-null  object 
 3   start_time         49012 non-null  object 
 4   subject_name       49012 non-null  object 
 5   a_proportion       49012 non-null  float64
 6   f_proportion       49012 non-null  float64
 7   avg_grade          49012 non-null  float64
 8   year               49012 non-null  int64  
 9   term               49012 non-null  object 
 10  class_length       49012 non-null  object 
 11  course_difficulty  49012 non-null  object 
 12  days_per_week      49012 non-null  int64  
 13  schedule_days      49012 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 6.6+ MB


In [42]:
df_fe.sample(10)

Unnamed: 0,section_type,instructor_id,facility_code,start_time,subject_name,a_proportion,f_proportion,avg_grade,year,term,class_length,course_difficulty,days_per_week,schedule_days
32657,lec,other,0469,late morning,music,0.347826,0.043478,3.391304,9,fall,shorter,elementary,3,MWF
5127,lec,other,0407,early afternoon,mechanical engineering,0.583333,0.0,3.666667,10,fall,shorter,advanced,3,MWF
16540,lec,other,0469,early afternoon,philosophy,0.444444,0.0,3.666667,11,fall,shorter,intermediate,4,MTWR
18739,lec,3213526.0,0140,early afternoon,literature in translation,0.421053,0.0,3.578947,9,fall,longer,intermediate,2,TR
6211,lec,other,,mid-afternoon,medical sciences - veterinary medicine,1.0,0.0,4.0,12,fall,longer,advanced,5,MTWRF
46417,sem,other,0451c,late afternoon,neuroscience training program,0.941176,0.0,3.941176,16,fall,shorter,advanced,1,W
5359,lec,other,0407,mid-afternoon,mechanical engineering,1.0,0.0,4.0,16,spring,shorter,advanced,3,MWF
23549,lec,other,0407,mid-afternoon,german,0.833333,0.0,3.916667,11,spring,longer,intermediate,2,TR
45439,lab,807999.0,0060,mid-morning,microbiology,0.394737,0.0,3.315789,8,fall,longer,intermediate,2,TR
20962,lec,other,0400,early afternoon,counseling psychology,0.75,0.125,3.4375,12,fall,longer,elementary,1,T


In [43]:
# binary encode 'term'
df_fe['term'].replace({'fall': 0, 'spring': 1}, inplace = True)

# rename column
df_fe.rename(columns = {'term': 'is_spring'}, inplace = True)

# check
df_fe.sample(10)

Unnamed: 0,section_type,instructor_id,facility_code,start_time,subject_name,a_proportion,f_proportion,avg_grade,year,is_spring,class_length,course_difficulty,days_per_week,schedule_days
11283,lec,1112569.0,online,none,electrical and computer engineering,0.555556,0.0,3.5,17,1,shorter,intermediate,0,none
12925,lec,other,off campus,mid-morning,nursing,0.956522,0.0,3.978261,10,0,longer,intermediate,2,MW
49882,lec,other,0482,late morning,italian (french and italian),0.421053,0.0,3.263158,15,0,shorter,elementary,4,MTWR
525,lec,other,0545,early afternoon,communication arts,0.076923,0.0,3.269231,17,0,shorter,elementary,3,MWF
19006,lec,3437415.0,0482,early afternoon,spanish (spanish and portuguese),0.764706,0.0,3.705882,16,0,shorter,intermediate,3,MWF
36159,lec,747510.0,0140,mid-morning,"finance, investment and banking",0.324324,0.0,3.445946,9,1,longer,intermediate,2,MW
46078,lec,other,0482,mid-morning,languages and cultures of asia - languages,0.642857,0.0,3.571429,7,0,shorter,intermediate,4,MTWR
47333,lec,other,0047,mid-afternoon,integrated liberal studies,0.285714,0.0,3.380952,8,1,longer,elementary,2,TR
15262,sem,other,0053,early afternoon,environmental studies - gaylord nelson institute,1.0,0.0,4.0,13,1,longer,advanced,1,M
29120,lec,3125250.0,0032,late morning,dance,0.923077,0.0,3.923077,14,1,longer,intermediate,2,TR


In [44]:
# binary enocde 'class_length'
df_fe['class_length'].replace({'shorter': 0, 'longer': 1}, inplace = True)

# rename column
df_fe.rename(columns = {'class_length': 'is_longer'}, inplace = True)

# check
df_fe.sample(10)

Unnamed: 0,section_type,instructor_id,facility_code,start_time,subject_name,a_proportion,f_proportion,avg_grade,year,is_spring,is_longer,course_difficulty,days_per_week,schedule_days
2484,lec,464620.0,140.0,mid-morning,general business,0.19708,0.0,3.051095,10,1,0,intermediate,2,MW
21216,lec,other,46.0,late morning,consumer science,0.794118,0.0,3.897059,9,1,1,advanced,2,TR
19554,lec,other,56.0,late morning,american indian studies,0.6,0.0,3.66,17,0,1,elementary,2,TR
11359,ind,other,,none,integrated liberal studies,0.666667,0.0,3.5,10,0,0,intermediate,0,none
23101,lec,other,452.0,early afternoon,english,0.62963,0.0,3.814815,13,1,1,advanced,2,TR
19344,sem,3437385.0,545.0,mid-afternoon,communication arts,0.666667,0.0,3.666667,12,0,1,advanced,1,F
34496,lec,other,57.0,mid-morning,physics,0.375,0.03125,3.265625,14,1,0,intermediate,3,MWF
42464,lec,other,84.0,early afternoon,communication sciences and disorders,0.372093,0.0,3.366279,14,1,1,elementary,2,TR
17665,lec,other,46.0,mid-morning,spanish (spanish and portuguese),1.0,0.0,4.0,14,0,0,intermediate,3,MWF
23257,lec,other,48.0,mid-morning,mathematics,0.15,0.0,3.075,13,0,0,elementary,2,TR


In [45]:
# ohe 

# instantiate onehotencoder
ohe = OneHotEncoder(sparse = False, 
                    handle_unknown = 'ignore')

# grab columns
ohe_cols = ['section_type', 
            'instructor_id', 
            'facility_code', 
            'start_time',
            'subject_name', 
            'year',  
            'course_difficulty', 
            'schedule_days', 
            'days_per_week']

# make tuple
ohe_tuple = (ohe, ohe_cols)

In [46]:
# already binary (passthrough): 'is_spring', 'is_longer'

preprocessor2 = make_column_transformer(ohe_tuple,
                                       remainder = 'passthrough')

# add scaler to make sure all numbers are scaled
preprocessor2_and_scaler = make_pipeline(preprocessor2, 
                                         StandardScaler(with_mean = False))

# check
preprocessor2_and_scaler

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  ['section_type',
                                                   'instructor_id',
                                                   'facility_code',
                                                   'start_time', 'subject_name',
                                                   'year', 'course_difficulty',
                                                   'schedule_days',
                                                   'days_per_week'])])),
                ('standardscaler', StandardScaler(with_mean=False))])

## Run models with new features

The highest-performing models with our original features are below:

In [47]:
metrics_df.sort_values(by = 'Test R2', ascending = False).head()

Unnamed: 0,Train R2,Test R2,Train MAE,Test MAE,Train MAPE,Test MAPE,Train RMSE,Test RMSE
Dummy,0.0,-0.000123,0.308386,0.309259,430728900000.0,0.09237,0.37426,0.374739


In [48]:
# let's test the top 5 with our feature-engineered dataset to see if we 
# can improve any performances

# split df_fe into X, y and then train/test split

# split df_fe into X and y
target = 'avg_grade'
df_fe_y = df_fe[target]
df_fe_X = df_fe.drop(columns = target)

# validate model with train/test split
df_fe_X_train, df_fe_X_test, df_fe_y_train, df_fe_y_test = \
train_test_split(df_fe_X, df_fe_y, random_state = 42)

In [49]:
# instantiate, make pipeline, fit, evaluate

# Tuned XGBRegressor
fe_tun_xgb = XGBRegressor(max_depth = 20, n_estimators = 50)
fe_tun_xgb_pipe = make_pipeline(preprocessor2, fe_tun_xgb)
fe_tun_xgb_pipe.fit(df_fe_X_train, df_fe_y_train)
get_metrics(fe_tun_xgb_pipe, 
            df_fe_X_train, 
            df_fe_X_test, 
            df_fe_y_train, 
            df_fe_y_test, 
            'FE Tun XGB')

Train R2      9.736062e-01
Test R2       9.106307e-01
Train MAE     4.073753e-02
Test MAE      7.565216e-02
Train MAPE    7.472473e+09
Test MAPE     2.316157e-02
Train RMSE    6.080286e-02
Test RMSE     1.120203e-01
Name: FE Tun XGB, dtype: float64


In [50]:
# Default RandomForest
fe_def_rf = RandomForestRegressor()
fe_def_rf_pipe = make_pipeline(preprocessor2, fe_def_rf)
fe_def_rf_pipe.fit(df_fe_X_train, df_fe_y_train)
get_metrics(fe_def_rf_pipe, 
            df_fe_X_train, 
            df_fe_X_test, 
            df_fe_y_train, 
            df_fe_y_test, 
            'FE Def RF')

Train R2      9.860187e-01
Test R2       9.005035e-01
Train MAE     2.943609e-02
Test MAE      7.982169e-02
Train MAPE    1.070962e+11
Test MAPE     2.448971e-02
Train RMSE    4.425344e-02
Test RMSE     1.181970e-01
Name: FE Def RF, dtype: float64


In [None]:
# Tuned BaggingRegressor
fe_tun_br = BaggingRegressor(n_estimators = 100)
fe_tun_br_pipe = make_pipeline(preprocessor2, fe_tun_br)
fe_tun_br_pipe.fit(df_fe_X_train, df_fe_y_train)
get_metrics(fe_tun_br_pipe, 
            df_fe_X_train, 
            df_fe_X_test, 
            df_fe_y_train, 
            df_fe_y_test, 
            'FE Tun BR')

In [None]:
# Tuned GradientBoostingRegressor
fe_tun_gbr = GradientBoostingRegressor(max_depth = 20, n_estimators = 50)
fe_tun_gbr_pipe = make_pipeline(preprocessor2, fe_tun_gbr)
fe_tun_gbr_pipe.fit(df_fe_X_train, df_fe_y_train)
get_metrics(fe_tun_gbr_pipe, 
            df_fe_X_train, 
            df_fe_X_test, 
            df_fe_y_train, 
            df_fe_y_test, 
            'FE Tun GBR')

In [None]:
metrics_df.sort_values(by = 'Test R2', ascending = False)

It looks like feature selection and transformation didn't help our models!

## Feature Extraction with PCA

In [None]:
df_fe.info()

In [None]:
# drop 'a_proportion' and 'f_proportion' to concentrate on 'avg_grade'
df_fe.drop(columns = ['a_proportion', 'f_proportion'], inplace = True)

# check
df_fe.columns

In [None]:
preprocessor2_and_scaler

In [None]:
# fit and transform preprocessor2_and_scaler on df_fe
preprocessed_and_scaled_df_fe = preprocessor2_and_scaler.fit_transform(df_fe)

In [None]:
# instantiate pca
pca = PCA()

# fit pca on preprocessed_and_scaled_df_fe
pca.fit(preprocessed_and_scaled_df_fe)

In [None]:
# determine number of pcs to use
# plot explained variance ratios of first 30 pcs
plt.plot(range(1, 31), pca.explained_variance_ratio_[:30], marker = '.')
plt.xticks(ticks = range(1, 31), fontsize = 8)
plt.xlabel('number of pcs')
plt.ylabel('proportion of explained variance')

# find the knee point using the KneeLocator function
knee_locator = KneeLocator(range(1, 31), 
                           pca.explained_variance_ratio_[:30], 
                           curve = 'convex', 
                           direction = 'decreasing',
                           interp_method = 'polynomial')
knee = knee_locator.knee
plt.axvline(x = knee, color = 'red')

print('Knee point:', knee)

The plot above indicates that 7 would be a good possible number of principle components. Visually interpreting the plot, I would suggest either 6 or possibly 9 principle components. 

We will see how much of the given variance these PCA models can predict with some of our top models.

In [None]:
# split df_fe into X and y
target = 'avg_grade'
df_fe_y = df_fe[target]
df_fe_X = df_fe.drop(columns = target)

In [None]:
# validate model with train/test split
df_fe_X_train, df_fe_X_test, df_fe_y_train, df_fe_y_test = \
train_test_split(df_fe_X, df_fe_y, random_state = 42)

In [None]:
# try 
for num in range(6, 31):

    # create a pipeline with PCA model
    transformer = make_pipeline(preprocessor, 
                                PCA(n_components = num))

    # instantiate our best model so far (Tuned XGB), make pipeline, and fit
    xgb_tun_pca_pipe = make_pipeline(transformer, xgb_tun)
    xgb_tun_pca_pipe.fit(X_avg_train, y_avg_train)
    
    xgb_tun_pca_fe_pipe = make_pipeline(transformer, xgb_tun)
    xgb_tun_pca_fe_pipe.fit(df_fe_X_train, df_fe_y_train)

    # evaluate on r2
    print(f'Number of PCs: {num}')
    print('Training R2 on df: ', xgb_tun_pca_pipe.score(X_avg_train, y_avg_train))
    print('Testing R2 on df: ', xgb_tun_pca_pipe.score(X_avg_test, y_avg_test))
    print('Training R2 on df_fe: ', xgb_tun_pca_fe_pipe.score(X_avg_train, y_avg_train))
    print('Testing R2 on df_fe: ', xgb_tun_pca_fe_pipe.score(X_avg_test, y_avg_test))
    print()

# Model Decision

In [None]:
metrics_df.sort_values(by = 'Test R2', ascending = False)

# Next Steps

1. Try to predict other targets (a_proportion, f_proportion)

2. Convert to a classification problem (average grade targets 3.0-4.0, 2.0-3.0, 1.0-2.0, < 1.0, for example), to show ROC visualizations