In [125]:
# display a sound when cell is done running
# source: https://forums.fast.ai/t/sound-alerts-in-jupyter-for-code-completion-and-exceptions/4614

## Import up sound alert dependencies
from IPython.display import Audio, display

def allDone():
    display(Audio('f1-radio-notification-By-tuna.voicemod.net.mp3', autoplay=True))

allDone()

<a href="https://colab.research.google.com/github/kellianneyang/grades-project/blob/main/preprocessing_and_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **University Courses Project,  Grade Prediction Regression: Preprocessing,  Modeling, Feature Engineering, and Model Selection**

# Preview of Notebook

**Target column: avg_letter_grade**

1. Create preprocessor object on original data.
2. Create dummy and linear regression models.
3. Create default and tuned versions of the following models: 
    - Decision Tree Regressor
    - Bagged Tree Regressor
    - Random Forest Regressor 
    - K-Nearest Neighbors Regressor
    - Extreme Gradient Boosting Regressor
    - Light Gradient Boosting Machine Regressor
    - Gradient Boosting Regressor
4. Feature engineering: select and transform columns in dataset.
5. Run the top-performing models on the feature-engineered data.
6. Compare performance of all models and select best model.

# Preliminary Steps

In [126]:
# import libraries

import warnings
warnings.filterwarnings('ignore')

# general
import numpy as np
import pandas as pd

# preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn import set_config
set_config(display="diagram")

# modeling
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, \
GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# evaluation
from sklearn.metrics import r2_score, mean_absolute_error, \
mean_squared_error, mean_absolute_percentage_error

# tuning
from sklearn.model_selection import GridSearchCV

In [127]:
# load data
path = 'Data/all_grades_data_cleaned.csv'
df = pd.read_csv(path, index_col = 0)

In [128]:
# inspect
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49975 entries, 0 to 50205
Data columns (total 51 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   section_type       49975 non-null  object 
 1   instructor_id      49975 non-null  object 
 2   facility_code      46160 non-null  object 
 3   start_time         49975 non-null  float64
 4   mon                49975 non-null  bool   
 5   tues               49975 non-null  bool   
 6   wed                49975 non-null  bool   
 7   thurs              49975 non-null  bool   
 8   fri                49975 non-null  bool   
 9   sat                49975 non-null  bool   
 10  sun                49975 non-null  bool   
 11  subject_name       49975 non-null  object 
 12  course_name        49975 non-null  object 
 13  a_count            49975 non-null  int64  
 14  ab_count           49975 non-null  int64  
 15  b_count            49975 non-null  int64  
 16  bc_count           499

In [129]:
# check for duplicates
df.duplicated().sum()

0

In [130]:
# check for missing values
df.isna().sum()

# 'facility_code' is the only feature column with missing values; will need 
# to impute
# 'avg_letter_grade' missing values; will drop rows when using that as target

section_type            0
instructor_id           0
facility_code        3815
start_time              0
mon                     0
tues                    0
wed                     0
thurs                   0
fri                     0
sat                     0
sun                     0
subject_name            0
course_name             0
a_count                 0
ab_count                0
b_count                 0
bc_count                0
c_count                 0
d_count                 0
f_count                 0
s_count                 0
u_count                 0
cr_count                0
n_count                 0
p_count                 0
i_count                 0
nw_count                0
nr_count                0
other_count             0
num_all_grades          0
a_proportion            0
ab_proportion           0
b_proportion            0
bc_proportion           0
c_proportion            0
d_proportion            0
f_proportion            0
s_proportion            0
u_proportion

# Preprocessing

In [131]:
# make a copy of the df to work with
df1 = df.copy()

In [132]:
# drop columns not being used as target columns in this notebook
other_targets = ['a_count', 'ab_count', 'b_count', 'bc_count', 'c_count', 
                'd_count', 'f_count', 's_count', 'u_count', 'cr_count', 
                'n_count', 'p_count', 'i_count', 'nw_count', 'nr_count', 
                'other_count', 'a_proportion', 'ab_proportion', 
                'b_proportion', 'bc_proportion', 'c_proportion', 
                'd_proportion', 'f_proportion', 's_proportion', 
                'u_proportion', 'cr_proportion', 'n_proportion', 
                'p_proportion', 'i_proportion', 'nw_proportion', 
                'nr_proportion', 'other_proportion']
df1.drop(columns = other_targets, inplace = True)

# check
col_in_df1 = 0

for col in other_targets:
    if col in df1.columns:
        col_in_df1 += 1

if col_in_df1 > 0:
    print("Try again")
else:
    print("All columns dropped from df1.")
    # print(f"col {col} in df1.columns: {col in df1.columns}")

All columns dropped from df1.


In [133]:
# delete rows without target variable
df1 = df1[~df1['avg_letter_grade'].isna()]

# check
df1.isna().sum()

section_type            0
instructor_id           0
facility_code        3678
start_time              0
mon                     0
tues                    0
wed                     0
thurs                   0
fri                     0
sat                     0
sun                     0
subject_name            0
course_name             0
num_all_grades          0
avg_letter_grade        0
year                    0
term                    0
class_length            0
course_difficulty       0
dtype: int64

In [134]:
# binary-encode days of week columns

binary_cols = ['mon', 'tues', 'wed', 'thurs', 'fri', 'sat', 'sun']

for col in binary_cols:
    df1[col].replace({True: 1, False: 0}, inplace = True)

# check
for col in binary_cols:
    print(df1[col].value_counts())

0    26800
1    22486
Name: mon, dtype: int64
0    27408
1    21878
Name: tues, dtype: int64
0    27420
1    21866
Name: wed, dtype: int64
0    28263
1    21023
Name: thurs, dtype: int64
0    36741
1    12545
Name: fri, dtype: int64
0    49058
1      228
Name: sat, dtype: int64
0    49245
1       41
Name: sun, dtype: int64


In [135]:
# assign X and y
target = 'avg_letter_grade'
y1 = df1[target]
X1 = df1.drop(columns = target)

# check
print(f"y1: \n{y1}")
print(f"X1: \n{X1}")

y1: 
0        3.625000
1        3.346154
2        3.500000
3        3.750000
4        3.454545
           ...   
50201    3.000000
50202    3.308642
50203    3.210227
50204    3.087500
50205    3.583333
Name: avg_letter_grade, Length: 49286, dtype: float64
X1: 
      section_type instructor_id facility_code  start_time  mon  tues  wed  \
0              lec         other           NaN        -1.0    0     0    0   
1              lec         other          0545       660.0    0     1    0   
2              lec         other          0545       660.0    0     1    0   
3              lec         other          0545       595.0    1     0    1   
4              lec         other          0545       595.0    1     0    1   
...            ...           ...           ...         ...  ...   ...  ...   
50201          lec         other          0093       800.0    1     1    0   
50202          lec         other          0093       800.0    1     1    0   
50203          lec         other    

In [136]:
# validate model with train/test split
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, random_state = 42)

# check
print(f"X1_train shape: {X1_train.shape}")
print(f"X1_test shape: {X1_test.shape}")
print(f"y1_train shape: {y1_train.shape}")
print(f"y1_test shape: {y1_test.shape}")

X1_train shape: (36964, 18)
X1_test shape: (12322, 18)
y1_train shape: (36964,)
y1_test shape: (12322,)


In [137]:
# one-hot encode:
# 'section_type', 'instructor_id', 'subject_name', 'course_name', 'term',
# 'course_difficulty', 'year'

ohe_cols = ['section_type', 'instructor_id', 'subject_name', 'course_name',
            'term', 'course_difficulty', 'year']

ohe = OneHotEncoder(handle_unknown = 'ignore',
                   sparse = False)

ohe_tuple = (ohe, ohe_cols)

In [138]:
# one-hot encode and impute constant 'missing':
# 'facility_code'

imp_ohe_cols = ['facility_code']

missing_imputer = SimpleImputer(strategy = 'constant', 
                                fill_value = 'missing')

ohe = OneHotEncoder(handle_unknown = 'ignore',
                   sparse = False)

imp_ohe_pipe = make_pipeline(missing_imputer, ohe)

imp_ohe_tuple = (imp_ohe_pipe, imp_ohe_cols)

In [139]:
# scale:
# 'start_time', 'class_length', 'num_all_grades'

scaler = StandardScaler()

scale_cols = ['start_time', 'class_length', 'num_all_grades']

scale_tuple = (scaler, scale_cols)

In [140]:
# binary encoded ('passthrough' in preprocessor):
# 'mon', 'tues', 'wed', 'thurs', 'fri', 'sat', 'sun'

# create preprocessor
preprocessor = make_column_transformer(ohe_tuple, 
                                       imp_ohe_tuple, 
                                       scale_tuple,
                                       remainder = 'passthrough')

# Modeling Functions

In [17]:
# create dataframe to save metrics
metrics_df = pd.DataFrame()

# check
metrics_df.shape

(0, 0)

In [18]:
# define function that will print regression metrics and store metrics in a 
# dataframe for easy viewing

def get_metrics(model_pipe, X_train, X_test, y_train, y_test, name):
  
    # calculate predictions
    train_pred = model_pipe.predict(X_train)
    test_pred = model_pipe.predict(X_test)

    # store scores
    metrics_df.at[name, 'Train R2'] = \
        r2_score(y_train, train_pred) 
    
    metrics_df.at[name, 'Test R2'] = \
        r2_score(y_test, test_pred)
        
    metrics_df.at[name, 'Train MAE'] = \
        mean_absolute_error(y_train, train_pred)
    
    metrics_df.at[name, 'Test MAE'] = \
        mean_absolute_error(y_test, test_pred)
    
    metrics_df.at[name, 'Train MAPE'] = \
        mean_absolute_percentage_error(y_train, train_pred)
    
    metrics_df.at[name, 'Test MAPE'] = \
        mean_absolute_percentage_error(y_test, test_pred)
    
    metrics_df.at[name, 'Train RMSE'] = \
        np.sqrt(mean_squared_error(y_train, train_pred))
    
    metrics_df.at[name, 'Test RMSE'] = \
        np.sqrt(mean_squared_error(y_test, test_pred))
  
    # show scores for this model only (can call metrics_df to see all scores)
    print(metrics_df.loc[name, :])

# Preliminary Models

## Dummy Model

In [19]:
%%time
# wall time: ~300 milliseconds

# instantiate
dummy_regressor = DummyRegressor(strategy = 'mean')

# combine with preprocessor
dummy_pipe = make_pipeline(preprocessor, dummy_regressor)

# fit on training data
dummy_pipe.fit(X1_train, y1_train)

# calculate, show, and store metrics
get_metrics(dummy_pipe, 
            X1_train, 
            X1_test, 
            y1_train, 
            y1_test, 
            "Dummy")

Train R2      0.000000e+00
Test R2      -1.160561e-06
Train MAE     3.094009e-01
Test MAE      3.096759e-01
Train MAPE    9.228735e-02
Test MAPE     1.285574e+12
Train RMSE    3.746891e-01
Test RMSE     3.763282e-01
Name: Dummy, dtype: float64
CPU times: total: 328 ms
Wall time: 319 ms


In [20]:
# check metrics_df
metrics_df

Unnamed: 0,Train R2,Test R2,Train MAE,Test MAE,Train MAPE,Test MAPE,Train RMSE,Test RMSE
Dummy,0.0,-1e-06,0.309401,0.309676,0.092287,1285574000000.0,0.374689,0.376328


## Model 1: Linear Regression

In [21]:
%%time
# time: ~1 second

# instantiate
lr = LinearRegression()

# pipeline
lr_pipe = make_pipeline(preprocessor, lr)

# fit
lr_pipe.fit(X1_train, y1_train)

# evaluate
get_metrics(lr_pipe, 
            X1_train, 
            X1_test, 
            y1_train, 
            y1_test, 
            "Linear")

Train R2      5.560803e-01
Test R2      -6.775095e+18
Train MAE     1.911430e-01
Test MAE      1.356573e+07
Train MAPE    5.678754e-02
Test MAPE     1.458048e+12
Train RMSE    2.496452e-01
Test RMSE     9.795445e+08
Name: Linear, dtype: float64
CPU times: total: 4.52 s
Wall time: 1.13 s


## Model 2: Decision Tree Regressor

### Default

In [22]:
%%time
# time: ~3 seconds

# instantiate
dt_def = DecisionTreeRegressor(random_state = 42)

# pipeline
dt_def_pipe = make_pipeline(preprocessor, dt_def)

# fit
dt_def_pipe.fit(X1_train, y1_train)

# evaluate
get_metrics(dt_def_pipe, 
            X1_train, 
            X1_test, 
            y1_train, 
            y1_test, 
            "Def DT")

Train R2      9.927039e-01
Test R2       3.446065e-01
Train MAE     4.586759e-03
Test MAE      2.184527e-01
Train MAPE    1.354891e-03
Test MAPE     1.461970e+12
Train RMSE    3.200495e-02
Test RMSE     3.046615e-01
Name: Def DT, dtype: float64
CPU times: total: 3.88 s
Wall time: 3.03 s


### Tuned on 'max_depth'

In [23]:
# get depth from default tree where max_depth = None
def_depth = dt_def_pipe['decisiontreeregressor'].get_depth()
def_depth

# 119

# this is a very deep tree, which is overfitting by a lot; for the
# tuned model, I will only try up to half this depth

119

In [24]:
%%time
# time: ~8 minutes

# instantiate
dt_tun = DecisionTreeRegressor(random_state = 42)

# pipeline
dt_tun_pipe = make_pipeline(preprocessor, dt_tun)

# params
dt_params = {'decisiontreeregressor__max_depth': range(1, 60)}

# gridsearch
dt_gs = GridSearchCV(dt_tun_pipe, dt_params, scoring = 'r2')

# fit
dt_gs.fit(X1_train, y1_train)

# see best parameters
print(dt_gs.best_params_) # max_depth: 14

# see best score
print(dt_gs.best_score_) # 0.4759055078360543

{'decisiontreeregressor__max_depth': 14}
0.4759055078360543
CPU times: total: 8min 25s
Wall time: 8min 26s


In [25]:
# instantiate tuned model
dt_tun_pipe = dt_gs.best_estimator_

# print and store metrics
get_metrics(dt_tun_pipe, 
            X1_train, 
            X1_test, 
            y1_train, 
            y1_test, 
            "Tun DT")

Train R2      5.971072e-01
Test R2       4.969084e-01
Train MAE     1.762565e-01
Test MAE      2.004229e-01
Train MAPE    5.192103e-02
Test MAPE     1.339670e+12
Train RMSE    2.378295e-01
Test RMSE     2.669254e-01
Name: Tun DT, dtype: float64


In [26]:
# # hard coding results into metrics_df so I don't have to re-run the notebook 
# # cells that take a long time to run every time I re-run the notebook

# metrics_df.at['Tun DT', 'Train R2'] = .5971072
# metrics_df.at['Tun DT', 'Test R2'] = .4969084
# metrics_df.at['Tun DT', 'Train MAE'] = .1762565
# metrics_df.at['Tun DT', 'Test MAE'] = .2004229
# metrics_df.at['Tun DT', 'Train MAPE'] = .5192103
# metrics_df.at['Tun DT', 'Test MAPE'] = .1339670
# metrics_df.at['Tun DT', 'Train RMSE'] = .2378295
# metrics_df.at['Tun DT', 'Test RMSE'] = .2669254

## Model 3: Bagged Tree Regressor

### Default

In [27]:
%%time
# time: ~20 seconds

# instantiate
br_def = BaggingRegressor(random_state = 42)

# pipeline
br_def_pipe = make_pipeline(preprocessor, br_def)

# fit
br_def_pipe.fit(X1_train, y1_train)

# evaluate
get_metrics(br_def_pipe, 
            X1_train, 
            X1_test, 
            y1_train, 
            y1_test, 
            "Def BR")

Train R2      9.245259e-01
Test R2       5.958595e-01
Train MAE     7.040229e-02
Test MAE      1.750572e-01
Train MAPE    2.093136e-02
Test MAPE     1.461970e+12
Train RMSE    1.029367e-01
Test RMSE     2.392394e-01
Name: Def BR, dtype: float64
CPU times: total: 16.3 s
Wall time: 16.4 s


### Tuned on 'n_estimators'

In [28]:
%%time
# time: ~15 minutes

# instantiate
br_tun = BaggingRegressor(random_state = 42)

# pipeline
br_tun_pipe = make_pipeline(preprocessor, br_tun)

# params
br_params = {'baggingregressor__n_estimators': [10, 20, 30, 40, 50]}

# grid search
br_gs = GridSearchCV(br_tun_pipe, br_params, scoring = 'r2')

# fit
br_gs.fit(X1_train, y1_train)

# see best parameters
print(br_gs.best_params_) # n_estimators: 50

# see best score
print(br_gs.best_score_) # 0.6132658185649498

allDone()

{'baggingregressor__n_estimators': 50}
0.6132658185649498


CPU times: total: 14min 43s
Wall time: 14min 44s


In [29]:
# instantiate tuned model
br_tun_pipe = br_gs.best_estimator_

# print and store metrics
get_metrics(br_tun_pipe, 
            X1_train, 
            X1_test, 
            y1_train, 
            y1_test, 
            "Tun BR")

Train R2      9.408948e-01
Test R2       6.217327e-01
Train MAE     6.544406e-02
Test MAE      1.697671e-01
Train MAPE    1.946666e-02
Test MAPE     1.461970e+12
Train RMSE    9.109278e-02
Test RMSE     2.314546e-01
Name: Tun BR, dtype: float64


In [30]:
# # hard coding results into hard_metrics_df so I don't have to re-run the 
# # notebook cells that take a long time to run every time I open it up

# metrics_df.at['Tun BR', 'Train R2'] = 0.9408948
# metrics_df.at['Tun BR', 'Test R2'] = 0.6217327
# metrics_df.at['Tun BR', 'Train MAE'] = 0.6544406
# metrics_df.at['Tun BR', 'Test MAE'] = 0.1697671
# metrics_df.at['Tun BR', 'Train MAPE'] = 0.1946666
# metrics_df.at['Tun BR', 'Test MAPE'] = 0.1461970
# metrics_df.at['Tun BR', 'Train RMSE'] = 0.9109278
# metrics_df.at['Tun BR', 'Test RMSE'] = 0.2314546

## Model 4: Random Forest Regressor

### Default

In [31]:
%%time
# time: ~3 minutes

# instantiate
rf_def = RandomForestRegressor(random_state = 42)

# pipeline
rf_def_pipe = make_pipeline(preprocessor, rf_def)

# fit
rf_def_pipe.fit(X1_train, y1_train)

# evaluate
get_metrics(rf_def_pipe, 
            X1_train, 
            X1_test, 
            y1_train, 
            y1_test, 
            "Def RF")

allDone()

Train R2      9.429108e-01
Test R2       6.249257e-01
Train MAE     6.476298e-02
Test MAE      1.689894e-01
Train MAPE    1.926236e-02
Test MAPE     1.461970e+12
Train RMSE    8.952576e-02
Test RMSE     2.304757e-01
Name: Def RF, dtype: float64


CPU times: total: 2min 31s
Wall time: 2min 31s


In [32]:
# # hard coding results into hard_metrics_df so I don't have to re-run the 
# # notebook cells that take a long time to run every time I open it up

# metrics_df.at['Def RF', 'Train R2'] = 0.9429108
# metrics_df.at['Def RF', 'Test R2'] = 0.6249257
# metrics_df.at['Def RF', 'Train MAE'] = 0.6476298
# metrics_df.at['Def RF', 'Test MAE'] = 0.1689894
# metrics_df.at['Def RF', 'Train MAPE'] = 0.1926236
# metrics_df.at['Def RF', 'Test MAPE'] = 0.1461970
# metrics_df.at['Def RF', 'Train RMSE'] = 0.8952576
# metrics_df.at['Def RF', 'Test RMSE'] = 0.2304757

### Tuned on 'max_depth' and 'max_features'

In [33]:
# get depth from max of estimator depths in the default random forest model
est_depths = [estimator.get_depth() for estimator 
              in rf_def_pipe['randomforestregressor'].estimators_]
max_depth_rf = max(est_depths)
max_depth_rf # 129

129

In [34]:
%%time
# time: ~5 hours

# instantiate
rf_tun = RandomForestRegressor(random_state = 42, n_jobs = -1)

# pipeline
rf_tun_pipe = make_pipeline(preprocessor, rf_tun)

# params
rf_params = {}

# max_depth (range up to almost half of depth of default model)
rf_params['randomforestregressor__max_depth'] = range(10, 61, 10)

# max_features (default 1; range over middle several columns without 
# dummies); 18 feature columns
rf_params['randomforestregressor__max_features'] = range(7, 12)

# gridsearchcv
rf_gs = GridSearchCV(rf_tun_pipe, 
                     rf_params,
                     scoring = 'r2')

rf_gs.fit(X1_train, y1_train)

allDone()

CPU times: total: 2min 48s
Wall time: 6min 2s


In [35]:
# see best score from randomizedsearchcv
rf_gs.best_score_

0.6114289641513905

In [36]:
# see the best parameters from the tuned model
rf_gs.best_params_

# max_depth: 60
# max_features: 11

{'randomforestregressor__max_depth': 60,
 'randomforestregressor__max_features': 11}

In [37]:
# instantiate tuned model
rf_tun_pipe = rf_gs.best_estimator_

# print and store metrics
get_metrics(rf_tun_pipe, 
            X1_train, 
            X1_test, 
            y1_train, 
            y1_test, 
            "Tun RF")

Train R2      8.783117e-01
Test R2       6.185071e-01
Train MAE     9.791210e-02
Test MAE      1.756538e-01
Train MAPE    2.910196e-02
Test MAPE     1.442124e+12
Train RMSE    1.307060e-01
Test RMSE     2.324394e-01
Name: Tun RF, dtype: float64


In [38]:
# # hard coding results into hard_metrics_df so I don't have to re-run the 
# # notebook cells that take a long time to run every time I open it up

# metrics_df.at['Tun RF', 'Train R2'] = 0.8783117
# metrics_df.at['Tun RF', 'Test R2'] = 0.6185071
# metrics_df.at['Tun RF', 'Train MAE'] = 0.9791210
# metrics_df.at['Tun RF', 'Test MAE'] = 0.1756538
# metrics_df.at['Tun RF', 'Train MAPE'] = 0.2910196
# metrics_df.at['Tun RF', 'Test MAPE'] = 0.1442124
# metrics_df.at['Tun RF', 'Train RMSE'] = 0.1307060
# metrics_df.at['Tun RF', 'Test RMSE'] = 0.2324394

## Model 5: K-Nearest Neighbors Regressor

### Default

In [39]:
%%time
# time: ~10 seconds

# instantiate
kn_def = KNeighborsRegressor()

# pipeline
kn_def_pipe = make_pipeline(preprocessor, kn_def)

# fit
kn_def_pipe.fit(X1_train, y1_train)

# evaluate
get_metrics(kn_def_pipe, 
            X1_train, 
            X1_test, 
            y1_train, 
            y1_test, 
            "Def KNN")

allDone()

Train R2      7.314605e-01
Test R2       5.949229e-01
Train MAE     1.434579e-01
Test MAE      1.753353e-01
Train MAPE    4.258531e-02
Test MAPE     1.461970e+12
Train RMSE    1.941669e-01
Test RMSE     2.395164e-01
Name: Def KNN, dtype: float64


CPU times: total: 2min 2s
Wall time: 11.1 s


In [40]:
# # hard coding results into hard_metrics_df so I don't have to
# # re-run the notebook cells that take a long time to run
# # every time I open it up

# metrics_df.at['Def KNN', 'Train R2'] = 0.7314605
# metrics_df.at['Def KNN', 'Test R2'] = 0.5949229
# metrics_df.at['Def KNN', 'Train MAE'] = 0.1434579
# metrics_df.at['Def KNN', 'Test MAE'] = 0.1753353
# metrics_df.at['Def KNN', 'Train MAPE'] = 0.4258531
# metrics_df.at['Def KNN', 'Test MAPE'] = 0.1461970
# metrics_df.at['Def KNN', 'Train RMSE'] = 0.1941669
# metrics_df.at['Def KNN', 'Test RMSE'] = 0.2395164

### Tuned on 'n_neighbors'

In [41]:
%%time
# time: ~1 minute

# instantiate
kn_tun = KNeighborsRegressor()

# pipeline
kn_tun_pipe = make_pipeline(preprocessor, kn_tun)

# params
kn_params = {}

# tune n_neighbors
kn_params['kneighborsregressor__n_neighbors'] = range(10, 311, 50)

# randomizedsearchcv
kn_gs = GridSearchCV(kn_tun_pipe, 
                     kn_params,
                     scoring = 'r2')

kn_gs.fit(X1_train, y1_train)

allDone()

CPU times: total: 12min 32s
Wall time: 1min 7s


In [42]:
# see best score from randomizedsearchcv
kn_gs.best_score_

0.5874572079058891

0.5874572079058891

In [43]:
# see the best parameters from the tuned model
kn_gs.best_params_

# n_neighbors: 10

{'kneighborsregressor__n_neighbors': 10}

In [44]:
# instantiate tuned model
kn_tun_pipe = kn_gs.best_estimator_

# print and store metrics
get_metrics(kn_tun_pipe, 
            X1_train, 
            X1_test, 
            y1_train, 
            y1_test, 
            "Tun KNN")

Train R2      6.765076e-01
Test R2       6.069203e-01
Train MAE     1.598232e-01
Test MAE      1.754335e-01
Train MAPE    4.741026e-02
Test MAPE     1.461970e+12
Train RMSE    2.131096e-01
Test RMSE     2.359428e-01
Name: Tun KNN, dtype: float64


In [45]:
# # hard coding results into hard_metrics_df so I don't have to re-run the 
# # notebook cells that take a long time to run every time I open it up

# metrics_df.at['Tun KNN', 'Train R2'] = 0.6765076
# metrics_df.at['Tun KNN', 'Test R2'] = 0.6069203
# metrics_df.at['Tun KNN', 'Train MAE'] = 0.1598232
# metrics_df.at['Tun KNN', 'Test MAE'] = 0.1754335
# metrics_df.at['Tun KNN', 'Train MAPE'] = 0.4741026
# metrics_df.at['Tun KNN', 'Test MAPE'] = 0.1461970
# metrics_df.at['Tun KNN', 'Train RMSE'] = 0.2131096
# metrics_df.at['Tun KNN', 'Test RMSE'] = 0.2359428

## Model 6: Extreme Gradient Boosting Regressor

### Default

In [46]:
%%time
# time: ~10 seconds

# instantiate
xgb_def = XGBRegressor()

# pipeline
xgb_def_pipe = make_pipeline(preprocessor, xgb_def)

# fit
xgb_def_pipe.fit(X1_train, y1_train)

# evaluate
get_metrics(xgb_def_pipe, 
            X1_train, 
            X1_test, 
            y1_train, 
            y1_test, 
            "Def XGB")

allDone()

Train R2      6.627576e-01
Test R2       6.082175e-01
Train MAE     1.661186e-01
Test MAE      1.788502e-01
Train MAPE    4.933301e-02
Test MAPE     1.411779e+12
Train RMSE    2.175916e-01
Test RMSE     2.355532e-01
Name: Def XGB, dtype: float64


CPU times: total: 1min 56s
Wall time: 10.2 s


### Tuned on 'max_depth' and 'n_estimators'

In [47]:
%%time
# time: ~15 minutes

# instantiate
xgb_tun = XGBRegressor()

# pipeline
xgb_tun_pipe = make_pipeline(preprocessor, xgb_tun)

# params
xgb_params = {}

# tune max_depth and n_estimators
xgb_params['xgbregressor__max_depth'] = [20, 40, 60]
xgb_params['xgbregressor__n_estimators'] = [20, 40, 60]

# gridsearchcv
xgb_gs = GridSearchCV(xgb_tun_pipe, 
                     xgb_params,
                     scoring = 'r2')

xgb_gs.fit(X1_train, y1_train)

allDone()

CPU times: total: 2h 48min 53s
Wall time: 15min 2s


In [48]:
# see best score from gridsearchcv
xgb_gs.best_score_ # 0.6074436855706237

0.6074436855706237

In [49]:
# see the best parameters from the tuned model
xgb_gs.best_params_ 

# max_depth: 20
# n_estimators: 40

{'xgbregressor__max_depth': 20, 'xgbregressor__n_estimators': 40}

In [50]:
# instantiate tuned model
xgb_tun_pipe = xgb_gs.best_estimator_

# print and store metrics
get_metrics(xgb_tun_pipe, 
            X1_train, 
            X1_test, 
            y1_train, 
            y1_test, 
            "Tun XGB")

Train R2      8.496872e-01
Test R2       6.199189e-01
Train MAE     1.058602e-01
Test MAE      1.709640e-01
Train MAPE    3.126675e-02
Test MAPE     1.438637e+12
Train RMSE    1.452677e-01
Test RMSE     2.320089e-01
Name: Tun XGB, dtype: float64


In [51]:
# # hard coding results into hard_metrics_df so I don't have to re-run the 
# # notebook cells that take a long time to run every time I open it up

# metrics_df.at['Tun XGB', 'Train R2'] = 0.8496872
# metrics_df.at['Tun XGB', 'Test R2'] = 0.6199189
# metrics_df.at['Tun XGB', 'Train MAE'] = 0.1058602
# metrics_df.at['Tun XGB', 'Test MAE'] = 0.1709640
# metrics_df.at['Tun XGB', 'Train MAPE'] = 0.3126675
# metrics_df.at['Tun XGB', 'Test MAPE'] = 0.1438637
# metrics_df.at['Tun XGB', 'Train RMSE'] = 0.1452677
# metrics_df.at['Tun XGB', 'Test RMSE'] = 0.2320089

## Model 7: Light Gradient Boosting Machine Regressor

### Default

In [52]:
%%time
# time: ~1 second

# instantiate
lgbm_def = LGBMRegressor()

# pipeline
lgbm_def_pipe = make_pipeline(preprocessor, lgbm_def)

# fit
lgbm_def_pipe.fit(X1_train, y1_train)

# evaluate
get_metrics(lgbm_def_pipe, 
            X1_train, 
            X1_test, 
            y1_train, 
            y1_test, 
            "Def LGBM")

allDone()

Train R2      6.141563e-01
Test R2       5.922337e-01
Train MAE     1.785914e-01
Test MAE      1.835697e-01
Train MAPE    5.311389e-02
Test MAPE     1.411605e+12
Train RMSE    2.327431e-01
Test RMSE     2.403102e-01
Name: Def LGBM, dtype: float64


CPU times: total: 6.02 s
Wall time: 693 ms


### Tuned on 'max_depth' and 'n_estimators'

In [53]:
%%time
# time: ~30 seconds

# instantiate
lgbm_tun = LGBMRegressor()

# pipeline
lgbm_tun_pipe = make_pipeline(preprocessor, lgbm_tun)

# params
lgbm_params = {}

# tune n_neighbors
lgbm_params['lgbmregressor__max_depth'] = [5, 10, 15, 20]
lgbm_params['lgbmregressor__n_estimators'] = [10, 20, 30, 40, 50]

# gridsearchcv
lgbm_gs = GridSearchCV(lgbm_tun_pipe, 
                       lgbm_params,
                       scoring = 'r2')

lgbm_gs.fit(X1_train, y1_train)

allDone()

CPU times: total: 4min 46s
Wall time: 31.6 s


In [54]:
# see best score from gridsearchcv
lgbm_gs.best_score_ # 0.557453771379358

0.557453771379358

In [55]:
# see the best parameters from the tuned model
lgbm_gs.best_params_

# max_depth: 20
# n_estimators: 50

{'lgbmregressor__max_depth': 20, 'lgbmregressor__n_estimators': 50}

In [56]:
# instantiate tuned model
lgbm_tun_pipe = lgbm_gs.best_estimator_

# print and store metrics
get_metrics(lgbm_tun_pipe, 
            X1_train, 
            X1_test, 
            y1_train, 
            y1_test, 
            "Tun LGBM")

Train R2      5.731224e-01
Test R2       5.610517e-01
Train MAE     1.902449e-01
Test MAE      1.927721e-01
Train MAPE    5.657365e-02
Test MAPE     1.372656e+12
Train RMSE    2.448064e-01
Test RMSE     2.493292e-01
Name: Tun LGBM, dtype: float64


In [57]:
# # hard coding results so I don't have to re-run the notebook cells that take 
# # a long time to run every time I open it up

# metrics_df.at['Tun LGBM', 'Train R2'] = 0.5731224
# metrics_df.at['Tun LGBM', 'Test R2'] = 0.5610517
# metrics_df.at['Tun LGBM', 'Train MAE'] = 0.1902449
# metrics_df.at['Tun LGBM', 'Test MAE'] = 0.1927721
# metrics_df.at['Tun LGBM', 'Train MAPE'] = 0.5657365
# metrics_df.at['Tun LGBM', 'Test MAPE'] = 0.1372656
# metrics_df.at['Tun LGBM', 'Train RMSE'] = 0.2448064
# metrics_df.at['Tun LGBM', 'Test RMSE'] = 0.2493292

## Model 8: Gradient Boosting Regressor

### Default

In [58]:
%%time
# time: ~5 seconds

# instantiate
gbr_def = GradientBoostingRegressor()

# pipeline
gbr_def_pipe = make_pipeline(preprocessor, gbr_def)

# fit
gbr_def_pipe.fit(X1_train, y1_train)

# evaluate
get_metrics(gbr_def_pipe, 
            X1_train, 
            X1_test, 
            y1_train, 
            y1_test, 
            "Def GBR")

allDone()

Train R2      4.887404e-01
Test R2       4.883433e-01
Train MAE     2.118149e-01
Test MAE      2.117284e-01
Train MAPE    6.295529e-02
Test MAPE     1.380416e+12
Train RMSE    2.679118e-01
Test RMSE     2.691881e-01
Name: Def GBR, dtype: float64


CPU times: total: 1min 17s
Wall time: 1min 16s


### Tuned on 'max_depth' and 'n_estimators'

In [59]:
%%time
# time: ~1 hour 15 minutes

# instantiate
gbr_tun = GradientBoostingRegressor()

# pipeline
gbr_tun_pipe = make_pipeline(preprocessor, gbr_tun)

# params
gbr_params = {}

# tune n_neighbors
gbr_params['gradientboostingregressor__max_depth'] = [5, 10, 15, 20]
gbr_params['gradientboostingregressor__n_estimators'] = [10, 20, 30, 40, 50]

# gridsearchcv
gbr_gs = GridSearchCV(gbr_tun_pipe, 
                     gbr_params,
                     scoring = 'r2')

gbr_gs.fit(X1_train, y1_train)

allDone()

CPU times: total: 1h 13min 46s
Wall time: 1h 13min 51s


In [60]:
# see best score from gridsearchcv
gbr_gs.best_score_ # 0.601799080239464

0.601799080239464

In [61]:
# see the best parameters from the tuned model
gbr_gs.best_params_

# max_depth: 15
# n_estimators: 50

{'gradientboostingregressor__max_depth': 15,
 'gradientboostingregressor__n_estimators': 50}

In [62]:
# instantiate tuned model
gbr_tun_pipe = gbr_gs.best_estimator_

# print and store metrics
get_metrics(gbr_tun_pipe, 
            X1_train, 
            X1_test, 
            y1_train, 
            y1_test, 
            "Tun GBR")

Train R2      8.032706e-01
Test R2       6.166083e-01
Train MAE     1.260624e-01
Test MAE      1.747592e-01
Train MAPE    3.694152e-02
Test MAPE     1.430556e+12
Train RMSE    1.661903e-01
Test RMSE     2.330171e-01
Name: Tun GBR, dtype: float64


In [63]:
# # hard coding results so I don't have to re-run the notebook cells that take 
# # a long time to run every time I open it up

# metrics_df.at['Tun GBR', 'Train R2'] = 0.8786262
# metrics_df.at['Tun GBR', 'Test R2'] = 0.6341327
# metrics_df.at['Tun GBR', 'Train MAE'] = 0.9380641
# metrics_df.at['Tun GBR', 'Test MAE'] = 0.1679331
# metrics_df.at['Tun GBR', 'Train MAPE'] = 0.1161788
# metrics_df.at['Tun GBR', 'Test MAPE'] = 0.4995710
# metrics_df.at['Tun GBR', 'Train RMSE'] = 0.1303873
# metrics_df.at['Tun GBR', 'Test RMSE'] = 0.2266545

# Feature Engineering: Feature Selection

In [180]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49975 entries, 0 to 50205
Data columns (total 51 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   section_type       49975 non-null  object 
 1   instructor_id      49975 non-null  object 
 2   facility_code      46160 non-null  object 
 3   start_time         49975 non-null  float64
 4   mon                49975 non-null  bool   
 5   tues               49975 non-null  bool   
 6   wed                49975 non-null  bool   
 7   thurs              49975 non-null  bool   
 8   fri                49975 non-null  bool   
 9   sat                49975 non-null  bool   
 10  sun                49975 non-null  bool   
 11  subject_name       49975 non-null  object 
 12  course_name        49975 non-null  object 
 13  a_count            49975 non-null  int64  
 14  ab_count           49975 non-null  int64  
 15  b_count            49975 non-null  int64  
 16  bc_count           499

In [181]:
# make copy of df for feature selection
df_fe = df.copy()

In [182]:
# drop duplicates
df_fe.drop_duplicates(inplace = True)

# check
df_fe.duplicated().sum()

0

In [183]:
# check for missing values
df_fe.isna().sum()

section_type            0
instructor_id           0
facility_code        3815
start_time              0
mon                     0
tues                    0
wed                     0
thurs                   0
fri                     0
sat                     0
sun                     0
subject_name            0
course_name             0
a_count                 0
ab_count                0
b_count                 0
bc_count                0
c_count                 0
d_count                 0
f_count                 0
s_count                 0
u_count                 0
cr_count                0
n_count                 0
p_count                 0
i_count                 0
nw_count                0
nr_count                0
other_count             0
num_all_grades          0
a_proportion            0
ab_proportion           0
b_proportion            0
bc_proportion           0
c_proportion            0
d_proportion            0
f_proportion            0
s_proportion            0
u_proportion

In [184]:
# drop rows with missing values in target column 'avg_letter_grade'
df_fe = df_fe[~df_fe['avg_letter_grade'].isna()]

# check
df_fe.isna().sum()

section_type            0
instructor_id           0
facility_code        3678
start_time              0
mon                     0
tues                    0
wed                     0
thurs                   0
fri                     0
sat                     0
sun                     0
subject_name            0
course_name             0
a_count                 0
ab_count                0
b_count                 0
bc_count                0
c_count                 0
d_count                 0
f_count                 0
s_count                 0
u_count                 0
cr_count                0
n_count                 0
p_count                 0
i_count                 0
nw_count                0
nr_count                0
other_count             0
num_all_grades          0
a_proportion            0
ab_proportion           0
b_proportion            0
bc_proportion           0
c_proportion            0
d_proportion            0
f_proportion            0
s_proportion            0
u_proportion

In [185]:
# drop grade columns (except target column 'avg_letter_grade')

other_targets = ['a_count', 'ab_count', 'b_count', 'bc_count', 'c_count', 
                'd_count', 'f_count', 's_count', 'u_count', 'cr_count', 
                'n_count', 'p_count', 'i_count', 'nw_count', 'nr_count', 
                'other_count', 'a_proportion', 'ab_proportion', 
                'b_proportion', 'bc_proportion', 'c_proportion', 
                'd_proportion', 'f_proportion', 's_proportion', 
                'u_proportion', 'cr_proportion', 'n_proportion', 
                'p_proportion', 'i_proportion', 'nw_proportion', 
                'nr_proportion', 'other_proportion']

df_fe.drop(columns = other_targets, inplace = True)

# check
df_fe.columns

Index(['section_type', 'instructor_id', 'facility_code', 'start_time', 'mon',
       'tues', 'wed', 'thurs', 'fri', 'sat', 'sun', 'subject_name',
       'course_name', 'num_all_grades', 'avg_letter_grade', 'year', 'term',
       'class_length', 'course_difficulty'],
      dtype='object')

## Schedule

The schedule variables take up 7 columns for days of the week. However this doesn't capture the schedules of classes; there is a high probability of a class that is held on Monday also being held on Wednesday, for example. 

So, to reduce this collinearity, we will make new schedule-related columns related to the actual schedules of the courses.

In [186]:
df_fe[['mon', 'tues', 'wed', 'thurs', 'fri', 'sat', 'sun']]

Unnamed: 0,mon,tues,wed,thurs,fri,sat,sun
0,False,False,False,False,False,False,False
1,False,True,False,True,False,False,False
2,False,True,False,True,False,False,False
3,True,False,True,False,True,False,False
4,True,False,True,False,True,False,False
...,...,...,...,...,...,...,...
50201,True,True,False,False,True,False,False
50202,True,True,False,False,True,False,False
50203,True,True,False,False,True,False,False
50204,True,True,False,False,True,False,False


In [187]:
# change to 1s and 0s
df_fe['mon'].replace({True: 1, False: 0}, inplace = True)
df_fe['tues'].replace({True: 1, False: 0}, inplace = True)
df_fe['wed'].replace({True: 1, False: 0}, inplace = True)
df_fe['thurs'].replace({True: 1, False: 0}, inplace = True)
df_fe['fri'].replace({True: 1, False: 0}, inplace = True)
df_fe['sat'].replace({True: 1, False: 0}, inplace = True)
df_fe['sun'].replace({True: 1, False: 0}, inplace = True)

# check
df_fe[['mon', 'tues', 'wed', 'thurs', 'fri', 'sat', 'sun']]

Unnamed: 0,mon,tues,wed,thurs,fri,sat,sun
0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0
2,0,1,0,1,0,0,0
3,1,0,1,0,1,0,0
4,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...
50201,1,1,0,0,1,0,0
50202,1,1,0,0,1,0,0
50203,1,1,0,0,1,0,0
50204,1,1,0,0,1,0,0


In [188]:
# new column: how many days per week the course is held (numeric, to be 
# one-hot encoded); weekend counts as one day (can go back to original
# data and change if needed)

df_fe['days_per_week'] = df_fe['mon'] + df_fe['tues'] + df_fe['wed'] + \
                         df_fe['thurs'] + df_fe['fri'] + df_fe['sat'] + \
                         df_fe['sun']

# check
df_fe[['mon', 'tues', 'wed', 'thurs', 'fri', 'sat', 'sun', 'days_per_week']]

Unnamed: 0,mon,tues,wed,thurs,fri,sat,sun,days_per_week
0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,2
2,0,1,0,1,0,0,0,2
3,1,0,1,0,1,0,0,3
4,1,0,1,0,1,0,0,3
...,...,...,...,...,...,...,...,...
50201,1,1,0,0,1,0,0,3
50202,1,1,0,0,1,0,0,3
50203,1,1,0,0,1,0,0,3
50204,1,1,0,0,1,0,0,3


In [189]:
# change 1s and 0s in days of week columns to abbreviations 
# (M, T, W, R, F, A, U)

df_fe['mon'].replace({1: 'M', 0: ''}, inplace = True)
df_fe['tues'].replace({1: 'T', 0: ''}, inplace = True)
df_fe['wed'].replace({1: 'W', 0: ''}, inplace = True)
df_fe['thurs'].replace({1: 'R', 0: ''}, inplace = True)
df_fe['fri'].replace({1: 'F', 0: ''}, inplace = True)
df_fe['sat'].replace({1: 'A', 0: ''}, inplace = True)
df_fe['sun'].replace({1: 'U', 0: ''}, inplace = True)

# check
df_fe[['mon', 'tues', 'wed', 'thurs', 'fri', 'sat', 'sun', 'days_per_week']]

Unnamed: 0,mon,tues,wed,thurs,fri,sat,sun,days_per_week
0,,,,,,,,0
1,,T,,R,,,,2
2,,T,,R,,,,2
3,M,,W,,F,,,3
4,M,,W,,F,,,3
...,...,...,...,...,...,...,...,...
50201,M,T,,,F,,,3
50202,M,T,,,F,,,3
50203,M,T,,,F,,,3
50204,M,T,,,F,,,3


In [190]:
# concatenate days of week columns into one
df_fe['schedule_days'] = (df_fe['mon'] + df_fe['tues'] + df_fe['wed'] + 
                          df_fe['thurs'] + df_fe['fri'] + df_fe['sat'] + 
                          df_fe['sun'])

# check
df_fe['schedule_days'].value_counts()

TR         15425
MWF         9099
MW          7140
            3785
T           2421
W           2273
M           1951
MTWR        1816
R           1518
MTWRF       1164
MTRF         870
F            869
A            208
MF           206
WF           162
MWR           88
MTWF          69
U             40
MTR           31
TRF           31
TWRF          26
MR            21
FA            18
WR            10
MWRF           9
MTF            7
MT             5
MRF            5
TF             4
RF             4
MTW            3
TWR            3
TW             2
MA             1
MTWRFAU        1
WRF            1
Name: schedule_days, dtype: int64

In [191]:
# replace '' with 'none'
df_fe['schedule_days'] = df_fe['schedule_days'].apply(lambda x: 'none' 
                                                      if x == '' 
                                                      else x)

# check
df_fe['schedule_days'].value_counts()

TR         15425
MWF         9099
MW          7140
none        3785
T           2421
W           2273
M           1951
MTWR        1816
R           1518
MTWRF       1164
MTRF         870
F            869
A            208
MF           206
WF           162
MWR           88
MTWF          69
U             40
MTR           31
TRF           31
TWRF          26
MR            21
FA            18
WR            10
MWRF           9
MTF            7
MT             5
MRF            5
TF             4
RF             4
MTW            3
TWR            3
TW             2
MA             1
MTWRFAU        1
WRF            1
Name: schedule_days, dtype: int64

In [192]:
# bin all schedules with fewer than 100 values into 'other'

keep_schedules = ['TR', 'MWF', 'MW', 'none', 'T', 'W', 'M', 'MTWR', 
                 'R', 'MTWRF', 'MTRF', 'F', 'MF', 'WF']

df_fe['schedule_days'] = df_fe['schedule_days'].apply(lambda x: x 
                                                      if x in keep_schedules 
                                                      else 'other')

# check
df_fe['schedule_days'].value_counts()

TR       15425
MWF       9099
MW        7140
none      3785
T         2421
W         2273
M         1951
MTWR      1816
R         1518
MTWRF     1164
MTRF       870
F          869
other      587
MF         206
WF         162
Name: schedule_days, dtype: int64

In [193]:
# drop original columns
df_fe.drop(columns = ['mon', 'tues', 'wed', 'thurs', 'fri', 'sat', 'sun'], 
           inplace = True)

# check
for col in ['mon', 'tues', 'wed', 'thurs', 'fri', 'sat', 'sun']:
    print(f"{col} col in df_fe: {col in df_fe.columns}")

mon col in df_fe: False
tues col in df_fe: False
wed col in df_fe: False
thurs col in df_fe: False
fri col in df_fe: False
sat col in df_fe: False
sun col in df_fe: False


## Course name

In [194]:
# course_name is a finer distinction of 'subject_name' and a high-cardinality
# variable, so we will take it out to see if model performance can improve
df_fe.drop(columns = 'course_name', inplace = True)

# check
'course_name' in df_fe.columns

False

## Start time

In [195]:
# check range of start times
df_fe['start_time'].describe()

count    49286.000000
mean       673.830155
std        244.789537
min         -1.000000
25%        570.000000
50%        660.000000
75%        865.000000
max       1260.000000
Name: start_time, dtype: float64

In [196]:
# check start times without -1 (no start time)
df_fe[df_fe['start_time'] != -1].describe()

Unnamed: 0,start_time,num_all_grades,avg_letter_grade,year,class_length,days_per_week
count,45479.0,45479.0,45479.0,45479.0,45479.0,45479.0
mean,730.319488,38.544757,3.490403,12.187075,79.992854,2.199499
std,153.705467,53.615894,0.369939,3.397032,45.537697,0.91888
min,390.0,2.0,0.0,7.0,40.0,0.0
25%,595.0,14.0,3.245534,9.0,50.0,2.0
50%,725.0,20.0,3.529605,12.0,75.0,2.0
75%,865.0,37.0,3.791667,15.0,75.0,3.0
max,1260.0,712.0,4.0,18.0,600.0,7.0


In [197]:
# start_time is given in minutes, so it is a numerical variable; courses
# without an assigned start_time are coded with -1
# start_time shouldn't be treated as a numeric variable since there is not a
# natural size order to times; for example, it's possible that courses in the
# afternoon give out higher grades than both courses in the mornings and
# evenings

# we will bin courses by start_time to morning, afternoon, evening, and none
# then we will be able to one-hot encode this variable

# times:
# -1 = 'none'
# earliest start time is 6:30am (390)
# 390 - 479 = early morning (6:30-7:59am) # 1.5 hours
# 480 - 599 = mid-morning (8-9:59am) # 2 hours
# 600 - 719 = late morning (10-11:59am) # 2 hours
# 720 - 839 = early afternoon (noon-1:59pm) # 2 hours
# 840 - 959 = mid-afternoon (2-3:59pm) # 2 hours
# 960 - 1079 = late afternoon (4-5:59pm) # 2 hours
# 1080 - 1260 = evening (6-9pm) # 3 hours
# last start time is 9pm (1260)

for index in df_fe['start_time'].index:
    if df_fe.at[index, 'start_time'] == -1:
        df_fe.at[index, 'start_time'] = 'none'
    elif df_fe.at[index, 'start_time'] < 480:
        df_fe.at[index, 'start_time'] = 'early morning'
    elif df_fe.at[index, 'start_time'] < 600:
        df_fe.at[index, 'start_time'] = 'mid-morning'
    elif df_fe.at[index, 'start_time'] < 720:
        df_fe.at[index, 'start_time'] = 'late morning'
    elif df_fe.at[index, 'start_time'] < 840:
        df_fe.at[index, 'start_time'] = 'early afternoon'
    elif df_fe.at[index, 'start_time'] < 960:
        df_fe.at[index, 'start_time'] = 'mid-afternoon'
    elif df_fe.at[index, 'start_time'] < 1080:
        df_fe.at[index, 'start_time'] = 'late afternoon'
    else:
        df_fe.at[index, 'start_time'] = 'evening'
        
# check
df_fe['start_time'].value_counts()

mid-morning        12670
early afternoon    11368
late morning        8026
mid-afternoon       8014
late afternoon      3937
none                3807
early morning        767
evening              697
Name: start_time, dtype: int64

## Class length

In [198]:
# bin 'class_length'
df_fe['class_length'].describe()

count    49286.000000
mean        73.813963
std         48.678525
min          0.000000
25%         50.000000
50%         75.000000
75%         75.000000
max        600.000000
Name: class_length, dtype: float64

In [199]:
df_fe['class_length'].value_counts()

50.0     19298
75.0     15422
0.0       3807
150.0     3360
120.0     2019
         ...  
340.0        1
325.0        1
113.0        1
40.0         1
520.0        1
Name: class_length, Length: 61, dtype: int64

In [200]:
# inspect without 0-length classes
class_length_no_zero = df_fe[df_fe['class_length'] != 0]
class_length_no_zero['class_length'].describe()

count    45479.000000
mean        79.992854
std         45.537697
min         40.000000
25%         50.000000
50%         75.000000
75%         75.000000
max        600.000000
Name: class_length, dtype: float64

In [201]:
# bin into less than 75 and 75 or more
df_fe['class_length'] = df_fe['class_length'].apply(lambda x: 'shorter' 
                                                    if x < 75 
                                                    else 'longer')

# check
df_fe['class_length'].value_counts()

longer     25638
shorter    23648
Name: class_length, dtype: int64

## 'num_all_grades'

The size of a class may have some predictive power on the class's average grade, but it will not necessarily be a linear relationship. So, we will bin 'num_all_grades' here as we binned other numerical categories above. 

In [202]:
# see description of column
df_fe['num_all_grades'].describe()

count    49286.000000
mean        37.348943
std         52.927500
min          2.000000
25%         13.000000
50%         20.000000
75%         35.000000
max        712.000000
Name: num_all_grades, dtype: float64

In [203]:
# small = under 15
# medium = 16-35
# large = over 35

# bin 
for index in df_fe['num_all_grades'].index:
    if df_fe.at[index, 'num_all_grades'] < 16:
        df_fe.at[index, 'num_all_grades'] = 'small'
    elif df_fe.at[index, 'num_all_grades'] < 36:
        df_fe.at[index, 'num_all_grades'] = 'medium'
    else:
        df_fe.at[index, 'num_all_grades'] = 'large'

# check
df_fe['num_all_grades'].value_counts()

medium    20605
small     16513
large     12168
Name: num_all_grades, dtype: int64

In [204]:
# rename column
df_fe.rename(columns = {'num_all_grades': 'class_size'}, inplace = True)

# check
'num_all_grades' in df_fe.columns

False

# New preprocessor for Feature-Engineered Data

In [205]:
df_fe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49286 entries, 0 to 50205
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   section_type       49286 non-null  object 
 1   instructor_id      49286 non-null  object 
 2   facility_code      45608 non-null  object 
 3   start_time         49286 non-null  object 
 4   subject_name       49286 non-null  object 
 5   class_size         49286 non-null  object 
 6   avg_letter_grade   49286 non-null  float64
 7   year               49286 non-null  int64  
 8   term               49286 non-null  object 
 9   class_length       49286 non-null  object 
 10  course_difficulty  49286 non-null  object 
 11  days_per_week      49286 non-null  int64  
 12  schedule_days      49286 non-null  object 
dtypes: float64(1), int64(2), object(10)
memory usage: 6.3+ MB


In [207]:
for col in df_fe.columns:
    print(col)
    print(df_fe[col].value_counts())
    print()

section_type
lec    39181
lab     4767
sem     2566
ind     1988
fld      543
dis      241
Name: section_type, dtype: int64

instructor_id
other        43768
566960.0       132
496397.0       106
2601912.0       89
2601706.0       85
             ...  
2600012.0       39
777651.0        39
636841.0        38
960897.0        38
3659559.0       38
Name: instructor_id, Length: 107, dtype: int64

facility_code
0482     7446
0469     4542
0140     3446
0408     2532
0545     2336
         ... 
0039        1
0084b       1
0033        1
1400g       1
1400e       1
Name: facility_code, Length: 119, dtype: int64

start_time
mid-morning        12670
early afternoon    11368
late morning        8026
mid-afternoon       8014
late afternoon      3937
none                3807
early morning        767
evening              697
Name: start_time, dtype: int64

subject_name
spanish (spanish and portuguese)                 2965
english                                          2791
communication arts      

In [208]:
# binary encode 'term'
df_fe['term'].replace({'fall': 0, 'spring': 1}, inplace = True)

# rename column
df_fe.rename(columns = {'term': 'is_spring'}, inplace = True)

# check
print(df_fe['is_spring'].value_counts())
print('term' in df_fe.columns)

0    27630
1    21656
Name: is_spring, dtype: int64
False


In [209]:
# binary enocde 'class_length'
df_fe['class_length'].replace({'shorter': 0, 'longer': 1}, inplace = True)

# rename column
df_fe.rename(columns = {'class_length': 'is_longer'}, inplace = True)

# check
print(df_fe['is_longer'].value_counts())
print('class_length' in df_fe.columns)

1    25638
0    23648
Name: is_longer, dtype: int64
False


In [210]:
# ohe 

# instantiate onehotencoder
ohe = OneHotEncoder(sparse = False, 
                    handle_unknown = 'ignore')

# grab columns
ohe_cols = ['section_type', 
            'instructor_id', 
            'start_time',
            'subject_name', 
            'year',  
            'course_difficulty', 
            'schedule_days', 
            'days_per_week',
            'class_size']

# make tuple
ohe_tuple = (ohe, ohe_cols)

In [211]:
# impute 'missing' and ohe

# grab columns
imp_and_ohe_cols = ['facility_code']

# instantiate imputer
missing_imputer = SimpleImputer(strategy = 'constant',
                               fill_value = 'missing')

# instantiate ohe
ohe = OneHotEncoder(sparse = False,
                   handle_unknown = 'ignore')

# make pipeline
imp_and_ohe_pipe = make_pipeline(missing_imputer, ohe)

# make tuple
imp_and_ohe_tuple = (imp_and_ohe_pipe, imp_and_ohe_cols)

In [212]:
# already binary (passthrough): 'is_spring', 'is_longer'

preprocessor2 = make_column_transformer(ohe_tuple,
                                        imp_and_ohe_tuple,
                                        remainder = 'passthrough')

# check
preprocessor2

# Modeling on Feature-Engineered Data

The highest-performing models with our original features are below:

In [213]:
metrics_df.sort_values(by = 'Test R2', ascending = False).head()

Unnamed: 0,Train R2,Test R2,Train MAE,Test MAE,Train MAPE,Test MAPE,Train RMSE,Test RMSE
Def RF,0.942911,0.624926,0.064763,0.168989,0.019262,1461970000000.0,0.089526,0.230476
Tun BR,0.940895,0.621733,0.065444,0.169767,0.019467,1461970000000.0,0.091093,0.231455
Tun XGB,0.849687,0.619919,0.10586,0.170964,0.031267,1438637000000.0,0.145268,0.232009
Tun RF,0.878312,0.618507,0.097912,0.175654,0.029102,1442124000000.0,0.130706,0.232439
Tun GBR,0.803271,0.616608,0.126062,0.174759,0.036942,1430556000000.0,0.16619,0.233017


Let's test the top four models (GBR, RF, BR, and XGB) with our feature-engineered dataset to see if we can improve the performance of any models.

In [214]:
# split df_fe into X and y
target = 'avg_letter_grade'
df_fe_y = df_fe[target]
df_fe_X = df_fe.drop(columns = target)

# validate model with train/test split
df_fe_X_train, df_fe_X_test, df_fe_y_train, df_fe_y_test = \
train_test_split(df_fe_X, df_fe_y, random_state = 42)

## XGB (Default and Tuned)

In [215]:
%%time
# time: ~10 seconds

# Def XGBRegressor on feature-engineered data

fe_def_xgb = XGBRegressor()
fe_def_xgb_pipe = make_pipeline(preprocessor2, fe_def_xgb)
fe_def_xgb_pipe.fit(df_fe_X_train, df_fe_y_train)
get_metrics(fe_def_xgb_pipe, 
            df_fe_X_train, 
            df_fe_X_test, 
            df_fe_y_train, 
            df_fe_y_test, 
            'FE Def XGB')

allDone()

Train R2      6.479061e-01
Test R2       5.948829e-01
Train MAE     1.698384e-01
Test MAE      1.814810e-01
Train MAPE    5.047456e-02
Test MAPE     1.418810e+12
Train RMSE    2.223311e-01
Test RMSE     2.395283e-01
Name: FE Def XGB, dtype: float64


CPU times: total: 1min 23s
Wall time: 7.47 s


In [96]:
%%time
# time: ~15 minutes

# Tun XGBRegressor on feature-engineered data

# instantiate
fe_xgb_tun = XGBRegressor()

# pipeline
fe_xgb_tun_pipe = make_pipeline(preprocessor2, fe_xgb_tun)

# params
fe_xgb_params = {}

# tune max_depth and n_estimators
fe_xgb_params['xgbregressor__max_depth'] = [20, 40, 60]
fe_xgb_params['xgbregressor__n_estimators'] = [20, 40, 60]

# gridsearchcv
fe_xgb_gs = GridSearchCV(fe_xgb_tun_pipe, 
                         fe_xgb_params,
                         scoring = 'r2')

fe_xgb_gs.fit(df_fe_X_train, df_fe_y_train)

allDone()

CPU times: total: 2h 20min 51s
Wall time: 12min 29s


In [97]:
# see best score from gridsearchcv
fe_xgb_gs.best_score_ # 0.596268858437248

0.5766970609561108

In [98]:
# see the best parameters from the tuned model
fe_xgb_gs.best_params_ 

# max_depth: 20
# n_estimators: 40

{'xgbregressor__max_depth': 20, 'xgbregressor__n_estimators': 40}

In [99]:
# instantiate tuned model
fe_xgb_tun_pipe = fe_xgb_gs.best_estimator_

# print and store metrics
get_metrics(fe_xgb_tun_pipe, 
            df_fe_X_train, 
            df_fe_X_test, 
            df_fe_y_train, 
            df_fe_y_test, 
            "FE Tun XGB")

Train R2      7.772439e-01
Test R2       5.858570e-01
Train MAE     1.304080e-01
Test MAE      1.801500e-01
Train MAPE    3.862324e-02
Test MAPE     1.433237e+12
Train RMSE    1.768422e-01
Test RMSE     2.421819e-01
Name: FE Tun XGB, dtype: float64


In [100]:
# # hard coding results into hard_metrics_df so I don't have to re-run the 
# # notebook cells that take a long time to run every time I open it up

# metrics_df.at['FE Tun XGB', 'Train R2'] = 0.8098826
# metrics_df.at['FE Tun XGB', 'Test R2'] = 0.6046896
# metrics_df.at['FE Tun XGB', 'Train MAE'] = 0.1194765
# metrics_df.at['FE Tun XGB', 'Test MAE'] = 0.1751210
# metrics_df.at['FE Tun XGB', 'Train MAPE'] = 03.537763
# metrics_df.at['FE Tun XGB', 'Test MAPE'] = 0.1438524
# metrics_df.at['FE Tun XGB', 'Train RMSE'] = 0.1633736
# metrics_df.at['FE Tun XGB', 'Test RMSE'] = 0.2366114

In [103]:
fe_tun_xgb = XGBRegressor(max_depth = 20, n_estimators = 40)
fe_tun_xgb_pipe = make_pipeline(preprocessor2, fe_tun_xgb)
fe_tun_xgb_pipe.fit(df_fe_X_train, df_fe_y_train)
get_metrics(fe_tun_xgb_pipe, 
            df_fe_X_train, 
            df_fe_X_test, 
            df_fe_y_train, 
            df_fe_y_test, 
            'FE Tun XGB')

Train R2      8.305437e-01
Test R2       6.037640e-01
Train MAE     1.107745e-01
Test MAE      1.746133e-01
Train MAPE    3.282673e-02
Test MAPE     1.452977e+12
Train RMSE    1.542410e-01
Test RMSE     2.368882e-01
Name: FE Tun XGB, dtype: float64


## Random Forest (Default and Tuned)

In [101]:
%%time
# time: ~3 minutes

# Default RandomForest
fe_def_rf = RandomForestRegressor()
fe_def_rf_pipe = make_pipeline(preprocessor2, fe_def_rf)
fe_def_rf_pipe.fit(df_fe_X_train, df_fe_y_train)
get_metrics(fe_def_rf_pipe, 
            df_fe_X_train, 
            df_fe_X_test, 
            df_fe_y_train, 
            df_fe_y_test, 
            'FE Def RF')

allDone()

Train R2      8.757629e-01
Test R2       5.533976e-01
Train MAE     9.043927e-02
Test MAE      1.838231e-01
Train MAPE    2.699290e-02
Test MAPE     1.461666e+12
Train RMSE    1.320677e-01
Test RMSE     2.514936e-01
Name: FE Def RF, dtype: float64


CPU times: total: 2min 15s
Wall time: 2min 14s


In [102]:
# # hard coding results into hard_metrics_df so I don't have to re-run the 
# # notebook cells that take a long time to run every time I open it up

# metrics_df.at['FE Def RF', 'Train R2'] = 0.9003948
# metrics_df.at['FE Def RF', 'Test R2'] = 0.5860328
# metrics_df.at['FE Def RF', 'Train MAE'] = 0.8062810
# metrics_df.at['FE Def RF', 'Test MAE'] = 0.1762535
# metrics_df.at['FE Def RF', 'Train MAPE'] = 0.2409774
# metrics_df.at['FE Def RF', 'Test MAPE'] = 0.1453729
# metrics_df.at['FE Def RF', 'Train RMSE'] = 0.1182530
# metrics_df.at['FE Def RF', 'Test RMSE'] = 0.2421305

In [103]:
# get depth from max of estimator depths in the default random forest model
est_depths = [estimator.get_depth() for estimator 
              in fe_def_rf_pipe['randomforestregressor'].estimators_]
max_depth_fe_rf = max(est_depths)
max_depth_fe_rf # 138

138

In [104]:
%%time
# time: ~6 minutes

# instantiate
fe_rf_tun = RandomForestRegressor(random_state = 42, n_jobs = -1)

# pipeline
fe_rf_tun_pipe = make_pipeline(preprocessor2, fe_rf_tun)

# params
fe_rf_params = {}

# max_depth (range up to almost half of depth of default model)
fe_rf_params['randomforestregressor__max_depth'] = range(10, 61, 10)

# max_features (default 1; range over middle several columns without 
# dummies); 18 feature columns
fe_rf_params['randomforestregressor__max_features'] = range(7, 12)

# gridsearchcv
fe_rf_gs = GridSearchCV(fe_rf_tun_pipe, 
                        fe_rf_params,
                        scoring = 'r2')

fe_rf_gs.fit(df_fe_X_train, df_fe_y_train)

allDone()

CPU times: total: 2min 29s
Wall time: 5min 33s


In [105]:
# see best score from randomizedsearchcv
fe_rf_gs.best_score_ # 0.5877687219325837

0.5616265513022651

In [106]:
# see the best parameters from the tuned model
fe_rf_gs.best_params_

# max_depth: 60
# max_features: 11

{'randomforestregressor__max_depth': 60,
 'randomforestregressor__max_features': 11}

In [107]:
# instantiate tuned model
fe_rf_tun_pipe = fe_rf_gs.best_estimator_

# print and store metrics
get_metrics(fe_rf_tun_pipe, 
            df_fe_X_train, 
            df_fe_X_test, 
            df_fe_y_train, 
            df_fe_y_test, 
            "FE Tun RF")

Train R2      8.287258e-01
Test R2       5.645499e-01
Train MAE     1.150044e-01
Test MAE      1.871522e-01
Train MAPE    3.423011e-02
Test MAPE     1.400700e+12
Train RMSE    1.550662e-01
Test RMSE     2.483337e-01
Name: FE Tun RF, dtype: float64


In [108]:
# # hard coding results into hard_metrics_df so I don't have to re-run the 
# # notebook cells that take a long time to run every time I open it up

# metrics_df.at['FE Tun RF', 'Train R2'] = 0.8712598
# metrics_df.at['FE Tun RF', 'Test R2'] = 0.5928270
# metrics_df.at['FE Tun RF', 'Train MAE'] = 0.9760140
# metrics_df.at['FE Tun RF', 'Test MAE'] = 0.1794536
# metrics_df.at['FE Tun RF', 'Train MAPE'] = 0.2911830
# metrics_df.at['FE Tun RF', 'Test MAPE'] = 0.1449994
# metrics_df.at['FE Tun RF', 'Train RMSE'] = 0.1344399
# metrics_df.at['FE Tun RF', 'Test RMSE'] = 0.2401353

## Bagging Regressor (Default and Tuned)

In [109]:
%%time
# time: ~20 seconds

# Default BaggingRegressor on feature-engineered data

fe_def_br = BaggingRegressor()
fe_def_br_pipe = make_pipeline(preprocessor2, fe_def_br)
fe_def_br_pipe.fit(df_fe_X_train, df_fe_y_train)
get_metrics(fe_def_br_pipe, 
            df_fe_X_train, 
            df_fe_X_test, 
            df_fe_y_train, 
            df_fe_y_test, 
            'FE Def BR')

allDone()

Train R2      8.590447e-01
Test R2       5.336360e-01
Train MAE     9.467162e-02
Test MAE      1.876006e-01
Train MAPE    2.822277e-02
Test MAPE     1.461970e+12
Train RMSE    1.406733e-01
Test RMSE     2.569975e-01
Name: FE Def BR, dtype: float64


CPU times: total: 15.3 s
Wall time: 15.3 s


In [110]:
# # hard coding results into hard_metrics_df so I don't have to re-run the 
# # notebook cells that take a long time to run every time I open it up

# metrics_df.at['FE Def BR', 'Train R2'] = 0.8835032
# metrics_df.at['FE Def BR', 'Test R2'] = 0.5650689
# metrics_df.at['FE Def BR', 'Train MAE'] = 0.8532929
# metrics_df.at['FE Def BR', 'Test MAE'] = 0.1807083
# metrics_df.at['FE Def BR', 'Train MAPE'] = 0.2546195
# metrics_df.at['FE Def BR', 'Test MAPE'] = 0.1461970
# metrics_df.at['FE Def BR', 'Train RMSE'] = 0.1278875
# metrics_df.at['FE Def BR', 'Test RMSE'] = 0.2481857

In [111]:
%%time
# time: ~15 minutes

# Tuned BaggingRegressor on feature-engineered data

# instantiate
fe_br_tun = BaggingRegressor(random_state = 42)

# pipeline
fe_br_tun_pipe = make_pipeline(preprocessor2, fe_br_tun)

# params
fe_br_params = {'baggingregressor__n_estimators': [10, 20, 30, 40, 50]}

# grid search
fe_br_gs = GridSearchCV(fe_br_tun_pipe, fe_br_params, scoring = 'r2')

# fit
fe_br_gs.fit(df_fe_X_train, df_fe_y_train)

# see best parameters
print(fe_br_gs.best_params_) # n_estimators: 50

# see best score
print(fe_br_gs.best_score_) # 0.5805967099947112

allDone()

{'baggingregressor__n_estimators': 50}
0.5506607658908474


CPU times: total: 13min 25s
Wall time: 13min 26s


In [112]:
# instantiate tuned model
fe_br_tun_pipe = fe_br_gs.best_estimator_

# print and store metrics
get_metrics(fe_br_tun_pipe, 
            df_fe_X_train, 
            df_fe_X_test, 
            df_fe_y_train, 
            df_fe_y_test, 
            "FE Tun BR")

Train R2      8.740310e-01
Test R2       5.498205e-01
Train MAE     9.094618e-02
Test MAE      1.844626e-01
Train MAPE    2.713628e-02
Test MAPE     1.460752e+12
Train RMSE    1.329850e-01
Test RMSE     2.524988e-01
Name: FE Tun BR, dtype: float64


In [113]:
# # hard coding results into hard_metrics_df so I don't have to re-run the 
# # notebook cells that take a long time to run every time I open it up

# metrics_df.at['FE Tun BR', 'Train R2'] = 0.8985798
# metrics_df.at['FE Tun BR', 'Test R2'] = 0.5831421
# metrics_df.at['FE Tun BR', 'Train MAE'] = 0.8127283
# metrics_df.at['FE Tun BR', 'Test MAE'] = 0.1767775
# metrics_df.at['FE Tun BR', 'Train MAPE'] = 0.2427930
# metrics_df.at['FE Tun BR', 'Test MAPE'] = 0.1450474
# metrics_df.at['FE Tun BR', 'Train RMSE'] = 0.1193255
# metrics_df.at['FE Tun BR', 'Test RMSE'] = 0.2429744

## Gradient Boosting Regressor (Default and Tuned)

In [114]:
%%time
# time: ~1 minute

# Default Gradient Boosting Regressor

# instantiate
fe_gbr_def = GradientBoostingRegressor()

# pipeline
fe_gbr_def_pipe = make_pipeline(preprocessor2, fe_gbr_def)

# fit
fe_gbr_def_pipe.fit(df_fe_X_train, df_fe_y_train)

# evaluate
get_metrics(fe_gbr_def_pipe, 
            df_fe_X_train, 
            df_fe_X_test, 
            df_fe_y_train, 
            df_fe_y_test, 
            "FE Def GBR")

allDone()

Train R2      4.569718e-01
Test R2       4.548532e-01
Train MAE     2.192822e-01
Test MAE      2.194476e-01
Train MAPE    6.519473e-02
Test MAPE     1.330057e+12
Train RMSE    2.761101e-01
Test RMSE     2.778582e-01
Name: FE Def GBR, dtype: float64


CPU times: total: 59.7 s
Wall time: 59.7 s


In [115]:
# # hard coding results into hard_metrics_df so I don't have to re-run the 
# # notebook cells that take a long time to run every time I open it up

# metrics_df.at['FE Def GBR', 'Train R2'] = 0.4881094
# metrics_df.at['FE Def GBR', 'Test R2'] = 0.4834946
# metrics_df.at['FE Def GBR', 'Train MAE'] = 0.2112037
# metrics_df.at['FE Def GBR', 'Test MAE'] = 0.2120858
# metrics_df.at['FE Def GBR', 'Train MAPE'] = 0.6288145
# metrics_df.at['FE Def GBR', 'Test MAPE'] = 0.1378008
# metrics_df.at['FE Def GBR', 'Train RMSE'] = 0.2680770
# metrics_df.at['FE Def GBR', 'Test RMSE'] = 0.2704605

In [116]:
%%time
# time: ~1 hour

# Tuned Gradient Boosting Regressor

# instantiate
fe_gbr_tun = GradientBoostingRegressor()

# pipeline
fe_gbr_tun_pipe = make_pipeline(preprocessor2, fe_gbr_tun)

# params
fe_gbr_params = {}

# tune n_neighbors
fe_gbr_params['gradientboostingregressor__max_depth'] = [5, 10, 15, 20]
fe_gbr_params['gradientboostingregressor__n_estimators'] = [10, 20, 30, 
                                                            40, 50]

# gridsearchcv
fe_gbr_gs = GridSearchCV(fe_gbr_tun_pipe, 
                         fe_gbr_params,
                         scoring = 'r2')

fe_gbr_gs.fit(df_fe_X_train, df_fe_y_train)

allDone()

CPU times: total: 1h 11min 13s
Wall time: 1h 11min 15s


In [117]:
# see best score from randomizedsearchcv
fe_gbr_gs.best_score_ # 0.5837138019978324

0.5590917712114484

In [118]:
# see the best parameters from the tuned model
fe_gbr_gs.best_params_

# max_depth: 15
# n_estimators: 50

{'gradientboostingregressor__max_depth': 15,
 'gradientboostingregressor__n_estimators': 50}

In [119]:
# instantiate tuned model
fe_gbr_tun_pipe = fe_gbr_gs.best_estimator_

# print and store metrics
get_metrics(fe_gbr_tun_pipe, 
            df_fe_X_train, 
            df_fe_X_test, 
            df_fe_y_train, 
            df_fe_y_test, 
            "FE Tun GBR")

Train R2      7.172304e-01
Test R2       5.647684e-01
Train MAE     1.527152e-01
Test MAE      1.875028e-01
Train MAPE    4.508698e-02
Test MAPE     1.402419e+12
Train RMSE    1.992450e-01
Test RMSE     2.482714e-01
Name: FE Tun GBR, dtype: float64


In [120]:
# # hard coding results into hard_metrics_df so I don't have to re-run the 
# # notebook cells that take a long time to run every time I open it up

# metrics_df.at['FE Tun GBR', 'Train R2'] = 0.7555908
# metrics_df.at['FE Tun GBR', 'Test R2'] = 0.5935101
# metrics_df.at['FE Tun GBR', 'Train MAE'] = 0.1406146
# metrics_df.at['FE Tun GBR', 'Test MAE'] = 0.1798647
# metrics_df.at['FE Tun GBR', 'Train MAPE'] = 0.4149077
# metrics_df.at['FE Tun GBR', 'Test MAPE'] = 0.1418538
# metrics_df.at['FE Tun GBR', 'Train RMSE'] = 0.1852379
# metrics_df.at['FE Tun GBR', 'Test RMSE'] = 0.2399337

# Model Selection

In [121]:
metrics_df.sort_values(by = 'Test R2', ascending = False)

Unnamed: 0,Train R2,Test R2,Train MAE,Test MAE,Train MAPE,Test MAPE,Train RMSE,Test RMSE
Def RF,0.942911,0.6249257,0.064763,0.1689894,0.019262,1461970000000.0,0.089526,0.2304757
Tun BR,0.940895,0.6217327,0.065444,0.1697671,0.019467,1461970000000.0,0.091093,0.2314546
Tun XGB,0.849687,0.6199189,0.10586,0.170964,0.031267,1438637000000.0,0.145268,0.2320089
Tun RF,0.878312,0.6185071,0.097912,0.1756538,0.029102,1442124000000.0,0.130706,0.2324394
Tun GBR,0.803271,0.6166083,0.126062,0.1747592,0.036942,1430556000000.0,0.16619,0.2330171
Def XGB,0.662758,0.6082175,0.166119,0.1788502,0.049333,1411779000000.0,0.217592,0.2355532
Tun KNN,0.676508,0.6069203,0.159823,0.1754335,0.04741,1461970000000.0,0.21311,0.2359428
Def BR,0.924526,0.5958595,0.070402,0.1750572,0.020931,1461970000000.0,0.102937,0.2392394
Def KNN,0.731461,0.5949229,0.143458,0.1753353,0.042585,1461970000000.0,0.194167,0.2395164
Def LGBM,0.614156,0.5922337,0.178591,0.1835697,0.053114,1411605000000.0,0.232743,0.2403102


This round of feature selection did not help our models perform better.

# Export Feature-Engineered Data

In [216]:
df_fe.to_csv('Data/feature_engineered_data.csv')