In [75]:
# display a sound when cell is done running
# source: https://forums.fast.ai/t/sound-alerts-in-jupyter-for-code-completion-and-exceptions/4614

## Import up sound alert dependencies
from IPython.display import Audio, display

def allDone():
    """Signal that a cell has finished by playing the notification sound.

    Builds an autoplaying IPython ``Audio`` object from the mp3 path below
    and displays it in the current cell's output.
    """
    notification = Audio('f1-radio-notification-By-tuna.voicemod.net.mp3',
                         autoplay=True)
    display(notification)

allDone()

# **Number of Students: Regression Processing and Modeling**

# Preliminary Steps

In [76]:
# import libraries

import warnings
warnings.filterwarnings('ignore')

# general
import numpy as np
import pandas as pd

# preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

# modeling
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, \
GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# evaluation
from sklearn.metrics import r2_score, mean_absolute_error, \
mean_squared_error, mean_absolute_percentage_error

# tuning
from sklearn.model_selection import GridSearchCV

# feature engineering
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn import set_config
set_config(display="diagram") # can change to text
from kneed import KneeLocator

In [77]:
# load data
path = 'Data/all_grades_data_cleaned.csv'
df = pd.read_csv(path, index_col = 0)

In [78]:
# inspect
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49975 entries, 0 to 50205
Data columns (total 51 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   section_type       49975 non-null  object 
 1   instructor_id      49975 non-null  object 
 2   facility_code      46160 non-null  object 
 3   start_time         49975 non-null  float64
 4   mon                49975 non-null  bool   
 5   tues               49975 non-null  bool   
 6   wed                49975 non-null  bool   
 7   thurs              49975 non-null  bool   
 8   fri                49975 non-null  bool   
 9   sat                49975 non-null  bool   
 10  sun                49975 non-null  bool   
 11  subject_name       49975 non-null  object 
 12  course_name        49975 non-null  object 
 13  a_count            49975 non-null  int64  
 14  ab_count           49975 non-null  int64  
 15  b_count            49975 non-null  int64  
 16  bc_count           499

In [79]:
# check for duplicates
df.duplicated().sum()

0

In [80]:
# check for missing values
df.isna().sum()

# 'facility_code' is the only feature column with missing values; it will
# need to be imputed
# 'avg_letter_grade' also has missing values, but that column will be dropped
# because we are using 'num_all_grades' as our target variable in this notebook

section_type            0
instructor_id           0
facility_code        3815
start_time              0
mon                     0
tues                    0
wed                     0
thurs                   0
fri                     0
sat                     0
sun                     0
subject_name            0
course_name             0
a_count                 0
ab_count                0
b_count                 0
bc_count                0
c_count                 0
d_count                 0
f_count                 0
s_count                 0
u_count                 0
cr_count                0
n_count                 0
p_count                 0
i_count                 0
nw_count                0
nr_count                0
other_count             0
num_all_grades          0
a_proportion            0
ab_proportion           0
b_proportion            0
bc_proportion           0
c_proportion            0
d_proportion            0
f_proportion            0
s_proportion            0
u_proportion

# Preprocessing

## Target: Number of Grades

In this notebook, we will see if we can predict the number of students in each class as a proxy for how popular each class is. We will use the number of all grades given in each class as the number of students who took the class. This is not a perfect measure since students can drop classes before getting their final grades, and this dataset does not include information on drops.

In [81]:
# remove the per-grade count and proportion columns and the average letter
# grade; none of them is the target of this notebook

drop_columns = [
    'a_count', 'ab_count', 'b_count', 'bc_count',
    'c_count', 'd_count', 'f_count', 's_count',
    'u_count', 'cr_count', 'n_count', 'p_count',
    'i_count', 'nw_count', 'nr_count', 'other_count',
    'a_proportion', 'ab_proportion', 'b_proportion', 'bc_proportion',
    'c_proportion', 'd_proportion', 'f_proportion', 's_proportion',
    'u_proportion', 'cr_proportion', 'n_proportion', 'p_proportion',
    'i_proportion', 'nw_proportion', 'nr_proportion', 'other_proportion',
    'avg_letter_grade',
]

df.drop(labels = drop_columns, axis = 'columns', inplace = True)

# check: every line should report False
remaining_columns = set(df.columns)
for col in drop_columns:
    print(f"col {col} is in df: {col in remaining_columns}")

col a_count is in df: False
col ab_count is in df: False
col b_count is in df: False
col bc_count is in df: False
col c_count is in df: False
col d_count is in df: False
col f_count is in df: False
col s_count is in df: False
col u_count is in df: False
col cr_count is in df: False
col n_count is in df: False
col p_count is in df: False
col i_count is in df: False
col nw_count is in df: False
col nr_count is in df: False
col other_count is in df: False
col a_proportion is in df: False
col ab_proportion is in df: False
col b_proportion is in df: False
col bc_proportion is in df: False
col c_proportion is in df: False
col d_proportion is in df: False
col f_proportion is in df: False
col s_proportion is in df: False
col u_proportion is in df: False
col cr_proportion is in df: False
col n_proportion is in df: False
col p_proportion is in df: False
col i_proportion is in df: False
col nw_proportion is in df: False
col nr_proportion is in df: False
col other_proportion is in df: False
col av

## Features

Here we will manipulate our feature variables in the same way as in the feature engineering section of the average_grade_regression_preprocessing_and_modeling notebook. Those manipulations made the models predicting average letter grade much more accurate, so we hope they will do the same for this target variable.

In [82]:
df.columns

Index(['section_type', 'instructor_id', 'facility_code', 'start_time', 'mon',
       'tues', 'wed', 'thurs', 'fri', 'sat', 'sun', 'subject_name',
       'course_name', 'num_all_grades', 'year', 'term', 'class_length',
       'course_difficulty'],
      dtype='object')

### Schedule

Create a new column for how many days per week the course is held (numeric, to be one-hot encoded in preprocessor).

In [83]:
# first convert boolean values (True, False) to 1s and 0s so they can be
# summed

days_in_week = ['mon', 'tues', 'wed', 'thurs', 'fri', 'sat', 'sun']

# assign the converted columns back explicitly: calling
# .replace(..., inplace = True) on df[day] operates on a temporary Series,
# which under pandas copy-on-write leaves df itself unchanged (and the
# warning about it is hidden by warnings.filterwarnings('ignore'))
df[days_in_week] = df[days_in_week].astype('int64')

# check
df[days_in_week]

Unnamed: 0,mon,tues,wed,thurs,fri,sat,sun
0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0
2,0,1,0,1,0,0,0
3,1,0,1,0,1,0,0
4,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...
50201,1,1,0,0,1,0,0
50202,1,1,0,0,1,0,0
50203,1,1,0,0,1,0,0
50204,1,1,0,0,1,0,0


In [84]:
# make new column: number of days per week on which the course meets
# cast to integers before summing so the result is a true count even if the
# day columns are still boolean (adding boolean Series does not count them)
df['days_per_week'] = df[days_in_week].astype('int64').sum(axis = 1)

# check
df[days_in_week + ['days_per_week']]

Unnamed: 0,mon,tues,wed,thurs,fri,sat,sun,days_per_week
0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,2
2,0,1,0,1,0,0,0,2
3,1,0,1,0,1,0,0,3
4,1,0,1,0,1,0,0,3
...,...,...,...,...,...,...,...,...
50201,1,1,0,0,1,0,0,3
50202,1,1,0,0,1,0,0,3
50203,1,1,0,0,1,0,0,3
50204,1,1,0,0,1,0,0,3


In [85]:
# change 1s and 0s in days of week columns to abbreviations
# (M, T, W, R, F, A, U); a day on which the course does not meet becomes ''

day_abbreviations = {'mon': 'M', 'tues': 'T', 'wed': 'W', 'thurs': 'R',
                     'fri': 'F', 'sat': 'A', 'sun': 'U'}

# assign each converted column back to df instead of calling
# df[col].replace(..., inplace = True): the in-place call acts on a temporary
# Series and leaves df unchanged under pandas copy-on-write
# .astype(bool) accepts 1/0 and True/False alike, and maps an already
# converted column ('M' / '') back onto itself, so the cell can be re-run
for day, abbreviation in day_abbreviations.items():
    df[day] = np.where(df[day].astype(bool), abbreviation, '')

# check
df[['mon', 'tues', 'wed', 'thurs', 'fri', 'sat', 'sun', 'days_per_week']]

Unnamed: 0,mon,tues,wed,thurs,fri,sat,sun,days_per_week
0,,,,,,,,0
1,,T,,R,,,,2
2,,T,,R,,,,2
3,M,,W,,F,,,3
4,M,,W,,F,,,3
...,...,...,...,...,...,...,...,...
50201,M,T,,,F,,,3
50202,M,T,,,F,,,3
50203,M,T,,,F,,,3
50204,M,T,,,F,,,3


In [86]:
# join the seven day-abbreviation columns (Monday first) into one string per
# course, e.g. 'M' + '' + 'W' + '' + 'F' + '' + '' -> 'MWF'
schedule_days = df[days_in_week[0]]
for day in days_in_week[1:]:
    schedule_days = schedule_days + df[day]
df['schedule_days'] = schedule_days

# check
df['schedule_days'].value_counts()

TR         15477
MWF         9313
MW          7150
            3899
T           2485
W           2334
M           1985
MTWR        1816
R           1534
MTWRF       1240
F            893
MTRF         870
A            217
MF           206
WF           162
MWR           88
MTWF          69
U             40
MTR           31
TRF           31
TWRF          27
MR            22
FA            18
MT            14
WR            11
MWRF           9
MTF            7
MRF            5
TF             4
RF             4
MTW            3
TWR            3
TRA            2
WRF            2
TW             2
MA             1
MTWRFAU        1
Name: schedule_days, dtype: int64

In [87]:
# courses with no meeting days have an empty schedule string; label them
# 'none' so they form an explicit category
has_no_days = df['schedule_days'] == ''
df['schedule_days'] = df['schedule_days'].mask(has_no_days, 'none')

# check
df['schedule_days'].value_counts()

TR         15477
MWF         9313
MW          7150
none        3899
T           2485
W           2334
M           1985
MTWR        1816
R           1534
MTWRF       1240
F            893
MTRF         870
A            217
MF           206
WF           162
MWR           88
MTWF          69
U             40
MTR           31
TRF           31
TWRF          27
MR            22
FA            18
MT            14
WR            11
MWRF           9
MTF            7
MRF            5
TF             4
RF             4
MTW            3
TWR            3
TRA            2
WRF            2
TW             2
MA             1
MTWRFAU        1
Name: schedule_days, dtype: int64

In [88]:
# bin all schedules with fewer than 100 values into 'other'
# the list of schedules to keep is derived from the counts rather than typed
# by hand, so it cannot drift out of sync with the threshold or the data

MIN_SCHEDULE_COUNT = 100

schedule_counts = df['schedule_days'].value_counts()
keep_schedules = schedule_counts[
    schedule_counts >= MIN_SCHEDULE_COUNT].index.tolist()

# keep frequent schedules as they are; everything else becomes 'other'
is_common_schedule = df['schedule_days'].isin(keep_schedules)
df['schedule_days'] = df['schedule_days'].where(is_common_schedule, 'other')

# check
df['schedule_days'].value_counts()

TR       15477
MWF       9313
MW        7150
none      3899
T         2485
W         2334
M         1985
MTWR      1816
R         1534
MTWRF     1240
F          893
MTRF       870
other      394
A          217
MF         206
WF         162
Name: schedule_days, dtype: int64

In [89]:
# the individual day columns are now summarised by 'days_per_week' and
# 'schedule_days', so remove them
df.drop(labels = days_in_week, axis = 'columns', inplace = True)

# check: every line should report False
columns_left = set(df.columns)
for col in days_in_week:
    print(f"{col} col in df: {col in columns_left}")

mon col in df: False
tues col in df: False
wed col in df: False
thurs col in df: False
fri col in df: False
sat col in df: False
sun col in df: False


### Course name

In [90]:
# 'course_name' is a finer-grained version of 'subject_name', so only
# 'subject_name' is kept

df.drop(labels = ['course_name'], axis = 'columns', inplace = True)

# check
'course_name' in df.columns

False

### Start time

In [91]:
# start_time is given in minutes, so it is a numerical variable; courses
# without an assigned start_time are coded with -1
# start_time shouldn't be treated as a numeric variable since there is not a
# natural size order to times; for example, it's possible that courses in the
# afternoon give out higher grades than both courses in the mornings and
# evenings

# we will bin courses by start_time to morning, afternoon, evening, and none
# then we will be able to one-hot encode this variable

# times:
# -1 = 'none'
# earliest start time is 6:30am (390)
# 390 - 479 = early morning (6:30-7:59am) # 1.5 hours
# 480 - 599 = mid-morning (8-9:59am) # 2 hours
# 600 - 719 = late morning (10-11:59am) # 2 hours
# 720 - 839 = early afternoon (noon-1:59pm) # 2 hours
# 840 - 959 = mid-afternoon (2-3:59pm) # 2 hours
# 960 - 1079 = late afternoon (4-5:59pm) # 2 hours
# 1080 - 1260 = evening (6-9pm) # 3 hours
# last start time is 9pm (1260)

# bin start_time (minutes after midnight, -1 = no assigned time) into named
# periods
# this is done in one vectorized step instead of writing strings into the
# float column one row at a time, which is slow and relies on pandas
# silently changing the column's dtype part-way through the loop
# the numeric-dtype guard makes the cell safe to re-run: once the column
# holds labels there is nothing left to bin
if pd.api.types.is_numeric_dtype(df['start_time']):
    start_minutes = df['start_time']

    # conditions are checked in order and the first match wins, so each
    # upper bound only needs to exclude the bins listed before it
    conditions = [start_minutes == -1,
                  start_minutes < 480,
                  start_minutes < 600,
                  start_minutes < 720,
                  start_minutes < 840,
                  start_minutes < 960,
                  start_minutes < 1080]

    labels = ['none',
              'early morning',
              'mid-morning',
              'late morning',
              'early afternoon',
              'mid-afternoon',
              'late afternoon']

    # anything that matches no condition (1080 and later) is 'evening'
    df['start_time'] = np.select(conditions, labels, default = 'evening')

# check
df['start_time'].value_counts()

mid-morning        12819
early afternoon    11594
late morning        8105
mid-afternoon       8090
late afternoon      3975
none                3920
early morning        769
evening              703
Name: start_time, dtype: int64

### Class length

In [92]:
# two bins: 'shorter' for class lengths below 75, 'longer' for 75 and above
is_shorter = df['class_length'] < 75
df['class_length'] = np.where(is_shorter, 'shorter', 'longer')

# check
df['class_length'].value_counts()

longer     25813
shorter    24162
Name: class_length, dtype: int64

## Create preprocessor

In [93]:
# separate the target column (y) from the feature columns (X)
target = 'num_all_grades'
X = df.drop(columns = [target])
y = df[target]

# check
print(f"y:\n{y}")
print(f"X:\n{X}")

y:
0         8.0
1        13.0
2        13.0
3        12.0
4        11.0
         ... 
50201    81.0
50202    81.0
50203    88.0
50204    80.0
50205    21.0
Name: num_all_grades, Length: 49975, dtype: float64
X:
      section_type instructor_id facility_code       start_time  \
0              lec         other           NaN             none   
1              lec         other          0545     late morning   
2              lec         other          0545     late morning   
3              lec         other          0545      mid-morning   
4              lec         other          0545      mid-morning   
...            ...           ...           ...              ...   
50201          lec         other          0093  early afternoon   
50202          lec         other          0093  early afternoon   
50203          lec         other          0093  early afternoon   
50204          lec         other          0093  early afternoon   
50205          lab         other          0021     

In [94]:
# hold out a test set for validation; test_size = 0.25 is the
# train_test_split default, written out here to make the 75/25 split visible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.25, random_state = 42)

# check
print(f"X_train shape: {X_train.shape}",
      f"X_test shape: {X_test.shape}",
      f"y_train shape: {y_train.shape}",
      f"y_test shape: {y_test.shape}",
      sep = "\n")

X_train shape: (37481, 11)
X_test shape: (12494, 11)
y_train shape: (37481,)
y_test shape: (12494,)


In [95]:
X.columns

Index(['section_type', 'instructor_id', 'facility_code', 'start_time',
       'subject_name', 'year', 'term', 'class_length', 'course_difficulty',
       'days_per_week', 'schedule_days'],
      dtype='object')

In [96]:
# one-hot encode

ohe_cols = ['section_type', 'instructor_id', 'start_time', 'days_per_week', 
            'subject_name', 'year', 'term', 'class_length', 'schedule_days',
            'course_difficulty']

# categories that appear only in the test data are encoded as all zeros
# (handle_unknown = 'ignore'), and the encoder returns a dense array
# scikit-learn 1.2 renamed 'sparse' to 'sparse_output' and 1.4 removed the
# old name, so try the new keyword first and fall back for older versions
try:
    ohe = OneHotEncoder(handle_unknown = 'ignore',
                        sparse_output = False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown = 'ignore',
                        sparse = False)

# (transformer, columns) pair consumed by make_column_transformer
ohe_tuple = (ohe, ohe_cols)

In [97]:
# impute and one-hot encode

imp_and_ohe_cols = ['facility_code']

# missing facility codes become their own category, 'missing'
missing_imputer = SimpleImputer(strategy = 'constant', 
                                fill_value = 'missing')

# scikit-learn 1.2 renamed 'sparse' to 'sparse_output' and 1.4 removed the
# old name, so try the new keyword first and fall back for older versions
try:
    ohe = OneHotEncoder(handle_unknown = 'ignore',
                        sparse_output = False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown = 'ignore',
                        sparse = False)

# impute first, then encode
imp_and_ohe_pipe = make_pipeline(missing_imputer, ohe)

# (transformer, columns) pair consumed by make_column_transformer
imp_and_ohe_tuple = (imp_and_ohe_pipe, imp_and_ohe_cols)

In [98]:
# combine the two (transformer, columns) pairs into a single preprocessor;
# any column not named in either pair is dropped
transformer_specs = (ohe_tuple, imp_and_ohe_tuple)
preprocessor = make_column_transformer(*transformer_specs,
                                       remainder = 'drop')

# check
preprocessor

# Create helper functions

In [99]:
# create dataframe to save evaluation metrics
# it starts empty; evaluate_regression() and the hard-coded metric cells add
# one row per model (index = model name) and one column per metric
regression_metrics_df = pd.DataFrame()

# check
regression_metrics_df.shape

(0, 0)

In [100]:
# define function that will print regression metrics for the current model and
# store them in regression_metrics_df alongside all other models' metrics
# for easy viewing

def evaluate_regression(model_pipe, X_train, X_test, y_train, y_test, name):
    """Score a fitted model on the train and test sets and record the results.

    Predicts on X_train and X_test with ``model_pipe``, computes R2, MAE,
    MAPE and RMSE for each set, writes the eight values into the row ``name``
    of the notebook-level ``regression_metrics_df``, then prints that row.

    Parameters
    ----------
    model_pipe : object with a ``predict`` method, already fitted
    X_train, X_test : feature sets passed to ``model_pipe.predict``
    y_train, y_test : true target values for the matching feature set
    name : row label under which the metrics are stored

    Returns
    -------
    None
    """
    # calculate predictions
    train_pred = model_pipe.predict(X_train)
    test_pred = model_pipe.predict(X_test)

    def root_mean_squared_error(y_true, y_pred):
        # RMSE is the square root of the mean squared error
        return np.sqrt(mean_squared_error(y_true, y_pred))

    # (train column, test column, metric function), in the order in which
    # the columns appear in regression_metrics_df
    metric_table = (
        ('Train R2', 'Test R2', r2_score),
        ('Train MAE', 'Test MAE', mean_absolute_error),
        ('Train MAPE', 'Test MAPE', mean_absolute_percentage_error),
        ('Train RMSE', 'Test RMSE', root_mean_squared_error),
    )

    # store scores
    for train_column, test_column, metric in metric_table:
        regression_metrics_df.at[name, train_column] = \
            metric(y_train, train_pred)
        regression_metrics_df.at[name, test_column] = \
            metric(y_test, test_pred)

    # show scores for this model only (can call regression_metrics_df to 
    # see all scores)
    print(regression_metrics_df.loc[name, :])

# Modeling: Round 1

## Dummy Model

In [101]:
%%time
# wall time: 705 milliseconds

# baseline model: always predicts the mean of the training target
dummy_regressor = DummyRegressor(strategy = 'mean')

# chain preprocessor and model, then fit on the training data
# (Pipeline.fit returns the fitted pipeline itself)
dummy_pipe = make_pipeline(preprocessor, dummy_regressor).fit(X_train,
                                                              y_train)

# calculate, show, and store metrics
evaluate_regression(model_pipe = dummy_pipe,
                    X_train = X_train,
                    X_test = X_test,
                    y_train = y_train,
                    y_test = y_test,
                    name = "Dummy")

Train R2       0.000000
Test R2       -0.000227
Train MAE     30.284737
Test MAE      30.792237
Train MAPE     1.471100
Test MAPE      1.465341
Train RMSE    52.317775
Test RMSE     53.851290
Name: Dummy, dtype: float64
CPU times: total: 453 ms
Wall time: 448 ms


## Model 1: Linear Regression

In [102]:
%%time
# wall time: 2 seconds

# ordinary least squares with default settings
lr = LinearRegression()

# chain preprocessor and model, then fit on the training data
# (Pipeline.fit returns the fitted pipeline itself)
lr_pipe = make_pipeline(preprocessor, lr).fit(X_train, y_train)

# NOTE(review): the recorded test metrics for this model are astronomically
# bad while the train metrics are ordinary; this may come from fitting an
# unregularised model on perfectly collinear one-hot columns -- confirm
# before drawing conclusions from this row
evaluate_regression(model_pipe = lr_pipe,
                    X_train = X_train,
                    X_test = X_test,
                    y_train = y_train,
                    y_test = y_test,
                    name = "Linear")

Train R2      3.964443e-01
Test R2      -3.196729e+19
Train MAE     2.387622e+01
Test MAE      4.917438e+09
Train MAPE    1.138980e+00
Test MAPE     3.693067e+08
Train RMSE    4.064507e+01
Test RMSE     3.044386e+11
Name: Linear, dtype: float64
CPU times: total: 4.86 s
Wall time: 1.31 s


## Model 2: Decision Tree Regressor

### Default

In [103]:
%%time
# wall time: 7 seconds

# decision tree with default hyperparameters; seeded for reproducibility
dt_def = DecisionTreeRegressor(random_state = 42)

# chain preprocessor and model, then fit on the training data
# (Pipeline.fit returns the fitted pipeline itself)
dt_def_pipe = make_pipeline(preprocessor, dt_def).fit(X_train, y_train)

# calculate, show, and store metrics
evaluate_regression(model_pipe = dt_def_pipe,
                    X_train = X_train,
                    X_test = X_test,
                    y_train = y_train,
                    y_test = y_test,
                    name = "Def DT")

Train R2       0.931858
Test R2        0.552381
Train MAE      2.999277
Test MAE      14.998740
Train MAPE     0.118540
Test MAPE      0.528410
Train RMSE    13.657047
Test RMSE     36.024753
Name: Def DT, dtype: float64
CPU times: total: 4.77 s
Wall time: 3.91 s


### Tuned on 'max_depth'

In [104]:
# # get depth from default tree where max_depth = None
# def_depth = dt_def_pipe['decisiontreeregressor'].get_depth()
# def_depth

# # 133

# # this is a very deep tree, which is overfitting by a lot; for the
# # tuned model, I will only try up to half this depth

In [105]:
# %%time
# # wall time: 4 minutes

# # instantiate
# dt_tun = DecisionTreeRegressor(random_state = 42)

# # pipeline
# dt_tun_pipe = make_pipeline(preprocessor, dt_tun)

# # params
# dt_params = {'decisiontreeregressor__max_depth': range(1, 67, 5)}

# # gridsearch
# dt_gs = GridSearchCV(dt_tun_pipe, dt_params, scoring = 'r2')

# # fit
# dt_gs.fit(X_train, y_train)

# # see best parameters
# print(dt_gs.best_params_) # max_depth: 26

# # see best score
# print(dt_gs.best_score_) # 0.552244552061277

# allDone()

In [106]:
# # instantiate tuned model
# dt_tun_pipe = dt_gs.best_estimator_

# # print and store metrics
# evaluate_regression(dt_tun_pipe, 
#                     X_train, 
#                     X_test, 
#                     y_train, 
#                     y_test, 
#                     "Tun DT")

In [107]:
# metrics from an earlier run of the tuned decision tree, stored by hand so
# the slow grid-search cells do not have to be re-run every time the notebook
# is opened

tun_dt_metrics = {
    'Train R2': 0.816726,
    'Test R2': 0.567127,
    'Train MAE': 10.464698,
    'Test MAE': 16.181360,
    'Train MAPE': 0.492257,
    'Test MAPE': 0.639247,
    'Train RMSE': 22.397496,
    'Test RMSE': 35.426392,
}

for metric_name, metric_value in tun_dt_metrics.items():
    regression_metrics_df.at['Tun DT', metric_name] = metric_value

## Model 3: Bagged Tree Regressor

### Default

In [108]:
# %%time
# # wall time:  33 seconds

# # instantiate
# br_def = BaggingRegressor(random_state = 42)

# # pipeline
# br_def_pipe = make_pipeline(preprocessor, br_def)

# # fit
# br_def_pipe.fit(X_train, y_train)

# # evaluate
# evaluate_regression(br_def_pipe, 
#                     X_train, 
#                     X_test, 
#                     y_train, 
#                     y_test, 
#                     "Def BR")

In [109]:
# metrics from an earlier run of the default bagging regressor, stored by
# hand so the slow cell does not have to be re-run every time the notebook
# is opened

def_br_metrics = {
    'Train R2': 0.898944,
    'Test R2': 0.667539,
    'Train MAE': 6.327471,
    'Test MAE': 13.434552,
    'Train MAPE': 0.238370,
    'Test MAPE': 0.495508,
    'Train RMSE': 16.631432,
    'Test RMSE': 31.046835,
}

for metric_name, metric_value in def_br_metrics.items():
    regression_metrics_df.at['Def BR', metric_name] = metric_value

### BR Tuned 1: 'n_estimators'

In [110]:
# %%time
# # time: 31 minutes

# # instantiate
# br_tun = BaggingRegressor(random_state = 42)

# # pipeline
# br_tun_pipe = make_pipeline(preprocessor, br_tun)

# # params
# br_params = {'baggingregressor__n_estimators': [10, 20, 30, 40, 50]}

# # grid search
# br_gs = GridSearchCV(br_tun_pipe, br_params, scoring = 'r2')

# # fit
# br_gs.fit(X_train, y_train)

# # see best parameters
# print(br_gs.best_params_) # n_estimators: 

# # see best score
# print(br_gs.best_score_) # 

# allDone()

In [111]:
# # instantiate tuned model
# br_tun_pipe = br_gs.best_estimator_

# # print and store metrics
# evaluate_regression(br_tun_pipe, 
#                     X_train, 
#                     X_test, 
#                     y_train, 
#                     y_test, 
#                     "Tun BR")

In [112]:
# metrics from an earlier run of the bagging regressor tuned on
# n_estimators, stored by hand in regression_metrics_df so the slow
# grid-search cells do not have to be re-run every time the notebook is opened

tun_br_metrics = {
    'Train R2': 0.907775,
    'Test R2': 0.678912,
    'Train MAE': 6.087626,
    'Test MAE': 13.107256,
    'Train MAPE': 0.230614,
    'Test MAPE': 0.485255,
    'Train RMSE': 15.888121,
    'Test RMSE': 30.511182,
}

for metric_name, metric_value in tun_br_metrics.items():
    regression_metrics_df.at['Tun BR', metric_name] = metric_value

### BR Tuned 2: 'max_features' and 'max_samples'

In [182]:
# %%time
# # time: 3 minutes 18 seconds

# # instantiate
# br_tun2 = BaggingRegressor(random_state = 42)

# # pipeline
# br_tun2_pipe = make_pipeline(preprocessor, br_tun2)

# # params
# br2_params = {'baggingregressor__max_features': [.25, .5, .75],
#               'baggingregressor__max_samples': [.25, .5, .75]}

# # grid search
# br2_gs = GridSearchCV(br_tun2_pipe, br2_params, scoring = 'r2')

# # fit
# br2_gs.fit(X_train, y_train)

# allDone()

CPU times: total: 3min 18s
Wall time: 3min 18s


In [183]:
# # see best parameters
# print(br2_gs.best_params_) 
# # max_features: .75
# # max_samples: .75

{'baggingregressor__max_features': 0.75, 'baggingregressor__max_samples': 0.75}


In [184]:
# # see best score
# print(br2_gs.best_score_) # 0.6495152892833793

0.6495152892833793


In [185]:
# # instantiate tuned model
# br_tun2_pipe = br2_gs.best_estimator_

# # print and store metrics
# evaluate_regression(br_tun2_pipe, 
#                     X_train, 
#                     X_test, 
#                     y_train, 
#                     y_test, 
#                     "Tun 2 BR")

Train R2       0.855164
Test R2        0.675588
Train MAE      9.061913
Test MAE      14.971682
Train MAPE     0.369960
Test MAPE      0.621645
Train RMSE    19.910724
Test RMSE     30.668712
Name: Tun 2 BR, dtype: float64


In [186]:
# metrics from an earlier run of the bagging regressor tuned on max_features
# and max_samples, stored by hand in regression_metrics_df so the slow
# grid-search cells do not have to be re-run every time the notebook is opened

tun2_br_metrics = {
    'Train R2': 0.855164,
    'Test R2': 0.675588,
    'Train MAE': 9.061913,
    'Test MAE': 14.971682,
    'Train MAPE': 0.369960,
    'Test MAPE': 0.621645,
    'Train RMSE': 19.910724,
    'Test RMSE': 30.668712,
}

for metric_name, metric_value in tun2_br_metrics.items():
    regression_metrics_df.at['Tun 2 BR', metric_name] = metric_value

## Model 4: Random Forest Regressor

### Default

In [113]:
# %%time
# # time: 5 minutes

# # instantiate
# rf_def = RandomForestRegressor(random_state = 42)

# # pipeline
# rf_def_pipe = make_pipeline(preprocessor, rf_def)

# # fit
# rf_def_pipe.fit(X_train, y_train)

# # evaluate
# evaluate_regression(rf_def_pipe, 
#                     X_train, 
#                     X_test, 
#                     y_train, 
#                     y_test, 
#                     "Def RF")

# allDone()

In [114]:
# metrics from an earlier run of the default random forest, stored by hand
# in regression_metrics_df so the slow cell does not have to be re-run every
# time the notebook is opened

def_rf_metrics = {
    'Train R2': 0.908955,
    'Test R2': 0.679376,
    'Train MAE': 6.054180,
    'Test MAE': 13.062209,
    'Train MAPE': 0.229691,
    'Test MAPE': 0.482821,
    'Train RMSE': 15.786208,
    'Test RMSE': 30.489094,
}

for metric_name, metric_value in def_rf_metrics.items():
    regression_metrics_df.at['Def RF', metric_name] = metric_value

### RF Tuned 1: 'max_depth' and 'max_features'

In [115]:
# # get depth from max of estimator depths in the default random forest model
# est_depths = [estimator.get_depth() for estimator 
#               in rf_def_pipe['randomforestregressor'].estimators_]
# max_depth_rf = max(est_depths)
# max_depth_rf # 154

In [116]:
# %%time
# # time: 8 minutes 29 seconds

# # instantiate
# rf_tun = RandomForestRegressor(random_state = 42, n_jobs = -1)

# # pipeline
# rf_tun_pipe = make_pipeline(preprocessor, rf_tun)

# # params
# rf_params = {}

# # max_depth (range up to almost half of depth of default model)
# rf_params['randomforestregressor__max_depth'] = range(10, 61, 10)

# # max_features (default 1; range over middle several columns without 
# # dummies); 18 feature columns
# rf_params['randomforestregressor__max_features'] = range(7, 12)

# # gridsearchcv
# rf_gs = GridSearchCV(rf_tun_pipe, 
#                      rf_params,
#                      scoring = 'r2')

# rf_gs.fit(X_train, y_train)

# allDone()

In [117]:
# # see best score from gridsearchcv
# rf_gs.best_score_ # 0.6396938929084938

In [118]:
# # see the best parameters from the tuned model
# rf_gs.best_params_

# # max_depth: 60
# # max_features: 11

In [119]:
# # instantiate tuned model
# rf_tun_pipe = rf_gs.best_estimator_

# # print and store metrics
# evaluate_regression(rf_tun_pipe, 
#                     X_train, 
#                     X_test, 
#                     y_train, 
#                     y_test, 
#                     "Tun RF")

In [120]:
# metrics from an earlier run of the random forest tuned on max_depth and
# max_features, stored by hand in regression_metrics_df so the slow
# grid-search cells do not have to be re-run every time the notebook is opened

tun_rf_metrics = {
    'Train R2': 0.874848,
    'Test R2': 0.662458,
    'Train MAE': 8.797796,
    'Test MAE': 15.259658,
    'Train MAPE': 0.374132,
    'Test MAPE': 0.632943,
    'Train RMSE': 18.508387,
    'Test RMSE': 31.283174,
}

for metric_name, metric_value in tun_rf_metrics.items():
    regression_metrics_df.at['Tun RF', metric_name] = metric_value

### RF Tuned 2: 'max_features' and 'n_estimators'

In [187]:
# %%time
# # time: 7 minutes 41 seconds

# # instantiate
# rf_tun2 = RandomForestRegressor(random_state = 42, n_jobs = -1)

# # pipeline
# rf_tun2_pipe = make_pipeline(preprocessor, rf_tun2)

# # params
# rf2_params = {}
# rf2_params['randomforestregressor__max_features'] = [.25, .5, .75]
# rf2_params['randomforestregressor__n_estimators'] = [10, 50, 100]

# # gridsearchcv
# rf2_gs = GridSearchCV(rf_tun2_pipe, 
#                      rf2_params,
#                      scoring = 'r2')

# rf2_gs.fit(X_train, y_train)

# allDone()

CPU times: total: 4min 34s
Wall time: 7min 41s


In [191]:
# # see best score from gridsearchcv
# rf2_gs.best_score_ # 0.6864281542491636

0.6864281542491636

In [192]:
# # see the best parameters from the tuned model
# rf2_gs.best_params_

# # max_features: 0.5
# # n_estimators: 100

{'randomforestregressor__max_features': 0.5,
 'randomforestregressor__n_estimators': 100}

In [193]:
# # instantiate tuned model
# rf_tun2_pipe = rf2_gs.best_estimator_

# # print and store metrics
# evaluate_regression(rf_tun2_pipe, 
#                     X_train, 
#                     X_test, 
#                     y_train, 
#                     y_test, 
#                     "Tun 2 RF")

Train R2       0.909118
Test R2        0.695232
Train MAE      6.071088
Test MAE      12.917984
Train MAPE     0.231531
Test MAPE      0.481957
Train RMSE    15.772064
Test RMSE     29.725659
Name: Tun 2 RF, dtype: float64


In [194]:
# Metrics recorded from an earlier run of the (now commented-out) "Tun 2 RF"
# grid search, written straight into regression_metrics_df so the slow cells
# above do not have to be re-run each time the notebook is opened.
tun_2_rf_metrics = {
    'Train R2': 0.909118,
    'Test R2': 0.695232,
    'Train MAE': 6.071088,
    'Test MAE': 12.917984,
    'Train MAPE': 0.231531,
    'Test MAPE': 0.481957,
    'Train RMSE': 15.772064,
    'Test RMSE': 29.725659,
}

# one cell write per metric, in the same order as the table columns
for cached_metric, cached_value in tun_2_rf_metrics.items():
    regression_metrics_df.at['Tun 2 RF', cached_metric] = cached_value

## Model 5: K-Nearest Neighbors Regressor

### Default

In [121]:
# %%time
# # time: 12 seconds

# # instantiate
# kn_def = KNeighborsRegressor()

# # pipeline
# kn_def_pipe = make_pipeline(preprocessor, kn_def)

# # fit
# kn_def_pipe.fit(X_train, y_train)

# # evaluate
# evaluate_regression(kn_def_pipe, 
#                     X_train, 
#                     X_test, 
#                     y_train, 
#                     y_test, 
#                     "Def KNN")

# allDone()

In [122]:
# Metrics recorded from an earlier run of the (now commented-out) default
# KNN cell, stored directly in regression_metrics_df so that cell does not
# have to be re-run each time the notebook is opened.
def_knn_metrics = {
    'Train R2': 0.728828,
    'Test R2': 0.602546,
    'Train MAE': 12.618852,
    'Test MAE': 15.934048,
    'Train MAPE': 0.478908,
    'Test MAPE': 0.611594,
    'Train RMSE': 27.244059,
    'Test RMSE': 33.946147,
}

# one cell write per metric, in the same order as the table columns
for cached_metric, cached_value in def_knn_metrics.items():
    regression_metrics_df.at['Def KNN', cached_metric] = cached_value

### Tuned on 'n_neighbors'

In [123]:
# %%time
# # time: 

# # instantiate
# kn_tun = KNeighborsRegressor()

# # pipeline
# kn_tun_pipe = make_pipeline(preprocessor, kn_tun)

# # params
# kn_params = {}

# # tune n_neighbors
# kn_params['kneighborsregressor__n_neighbors'] = range(10, 311, 50)

# # gridsearchcv
# kn_gs = GridSearchCV(kn_tun_pipe, 
#                      kn_params,
#                      scoring = 'r2')

# kn_gs.fit(X_train, y_train)

# allDone()

In [124]:
# # see best score from gridsearchcv
# kn_gs.best_score_ # 0.5779756101534584

In [125]:
# # see the best parameters from the tuned model
# kn_gs.best_params_

# # n_neighbors: 10

In [126]:
# # instantiate tuned model
# kn_tun_pipe = kn_gs.best_estimator_

# # print and store metrics
# evaluate_regression(kn_tun_pipe, 
#                     X_train, 
#                     X_test, 
#                     y_train, 
#                     y_test, 
#                     "Tun KNN")

In [127]:
# Metrics recorded from an earlier run of the (now commented-out) tuned KNN
# grid search, stored directly in regression_metrics_df so the slow cells
# above do not have to be re-run each time the notebook is opened.
tun_knn_metrics = {
    'Train R2': 0.674011,
    'Test R2': 0.608170,
    'Train MAE': 14.439231,
    'Test MAE': 16.487466,
    'Train MAPE': 0.575196,
    'Test MAPE': 0.650102,
    'Train RMSE': 29.871061,
    'Test RMSE': 33.705102,
}

# one cell write per metric, in the same order as the table columns
for cached_metric, cached_value in tun_knn_metrics.items():
    regression_metrics_df.at['Tun KNN', cached_metric] = cached_value

## Model 6: Extreme Gradient Boosting Regressor

### Default

In [128]:
%%time
# time: 10 seconds

# instantiate
xgb_def = XGBRegressor()

# pipeline
xgb_def_pipe = make_pipeline(preprocessor, xgb_def)

# fit
xgb_def_pipe.fit(X_train, y_train)

# evaluate
evaluate_regression(xgb_def_pipe, 
                    X_train, 
                    X_test, 
                    y_train, 
                    y_test, 
                    "Def XGB")

allDone()

Train R2       0.723095
Test R2        0.667440
Train MAE     14.561838
Test MAE      15.973772
Train MAPE     0.629571
Test MAPE      0.673783
Train RMSE    27.530559
Test RMSE     31.051453
Name: Def XGB, dtype: float64


CPU times: total: 1min 37s
Wall time: 9.61 s


### Tuned on 'max_depth' and 'n_estimators'

In [129]:
# %%time
# # time: 15 minutes

# # instantiate
# xgb_tun = XGBRegressor()

# # pipeline
# xgb_tun_pipe = make_pipeline(preprocessor, xgb_tun)

# # params
# xgb_params = {}

# # tune max_depth and n_estimators
# xgb_params['xgbregressor__max_depth'] = [20, 40, 60]
# xgb_params['xgbregressor__n_estimators'] = [20, 40, 60]

# # gridsearchcv
# xgb_gs = GridSearchCV(xgb_tun_pipe, 
#                      xgb_params,
#                      scoring = 'r2')

# xgb_gs.fit(X_train, y_train)

# allDone()

In [130]:
# # see best score from gridsearchcv
# xgb_gs.best_score_ # 0.6610792782418022

In [131]:
# # see the best parameters from the tuned model
# xgb_gs.best_params_ 

# # max_depth: 20
# # n_estimators: 40

In [132]:
# # instantiate tuned model
# xgb_tun_pipe = xgb_gs.best_estimator_

# # print and store metrics
# evaluate_regression(xgb_tun_pipe, 
#                     X_train, 
#                     X_test, 
#                     y_train, 
#                     y_test, 
#                     "Tun XGB")

In [133]:
# Metrics recorded from an earlier run of the (now commented-out) tuned
# XGBoost grid search, stored directly in regression_metrics_df so the slow
# cells above do not have to be re-run each time the notebook is opened.
tun_xgb_metrics = {
    'Train R2': 0.886501,
    'Test R2': 0.670250,
    'Train MAE': 8.081317,
    'Test MAE': 13.811605,
    'Train MAPE': 0.356713,
    'Test MAPE': 0.539004,
    'Train RMSE': 17.625647,
    'Test RMSE': 30.919982,
}

# one cell write per metric, in the same order as the table columns
for cached_metric, cached_value in tun_xgb_metrics.items():
    regression_metrics_df.at['Tun XGB', cached_metric] = cached_value

## Model 7: Light Gradient Boosting Machine Regressor

### Default

In [134]:
%%time
# time: 655 milliseconds

# instantiate
lgbm_def = LGBMRegressor()

# pipeline
lgbm_def_pipe = make_pipeline(preprocessor, lgbm_def)

# fit
lgbm_def_pipe.fit(X_train, y_train)

# evaluate
evaluate_regression(lgbm_def_pipe, 
                    X_train, 
                    X_test, 
                    y_train, 
                    y_test, 
                    "Def LGBM")

allDone()

Train R2       0.664614
Test R2        0.642854
Train MAE     16.033627
Test MAE      16.841458
Train MAPE     0.679746
Test MAPE      0.703099
Train RMSE    30.298529
Test RMSE     32.178775
Name: Def LGBM, dtype: float64


CPU times: total: 4.67 s
Wall time: 900 ms


### Tuned on 'max_depth' and 'n_estimators'

In [135]:
# %%time
# # time: 19 seconds

# # instantiate
# lgbm_tun = LGBMRegressor()

# # pipeline
# lgbm_tun_pipe = make_pipeline(preprocessor, lgbm_tun)

# # params
# lgbm_params = {}

# # tune max_depth and n_estimators
# lgbm_params['lgbmregressor__max_depth'] = [5, 10, 15, 20]
# lgbm_params['lgbmregressor__n_estimators'] = [10, 25, 50]

# # gridsearchcv
# lgbm_gs = GridSearchCV(lgbm_tun_pipe, 
#                        lgbm_params,
#                        scoring = 'r2')

# lgbm_gs.fit(X_train, y_train)

# allDone()

In [136]:
# # see best score from gridsearchcv
# lgbm_gs.best_score_ # 0.592515714044081

In [137]:
# # see the best parameters from the tuned model
# lgbm_gs.best_params_

# # max_depth: 20
# # n_estimators: 50 

In [138]:
# # instantiate tuned model
# lgbm_tun_pipe = lgbm_gs.best_estimator_

# # print and store metrics
# evaluate_regression(lgbm_tun_pipe, 
#                     X_train, 
#                     X_test, 
#                     y_train, 
#                     y_test, 
#                     "Tun LGBM")

In [139]:
# Metrics recorded from an earlier run of the (now commented-out) tuned
# LightGBM grid search, stored directly in regression_metrics_df so the
# cells above do not have to be re-run each time the notebook is opened.
tun_lgbm_metrics = {
    'Train R2': 0.619190,
    'Test R2': 0.603308,
    'Train MAE': 17.430325,
    'Test MAE': 18.123538,
    'Train MAPE': 0.754229,
    'Test MAPE': 0.773916,
    'Train RMSE': 32.285186,
    'Test RMSE': 33.913575,
}

# one cell write per metric, in the same order as the table columns
for cached_metric, cached_value in tun_lgbm_metrics.items():
    regression_metrics_df.at['Tun LGBM', cached_metric] = cached_value

## Model 8: Gradient Boosting Regressor

### Default

In [140]:
# %%time
# # time: 1 minute 18 seconds

# # instantiate
# gbr_def = GradientBoostingRegressor()

# # pipeline
# gbr_def_pipe = make_pipeline(preprocessor, gbr_def)

# # fit
# gbr_def_pipe.fit(X_train, y_train)

# # evaluate
# evaluate_regression(gbr_def_pipe, 
#                     X_train, 
#                     X_test, 
#                     y_train, 
#                     y_test, 
#                     "Def GBR")

# allDone()

In [141]:
# Metrics recorded from an earlier run of the (now commented-out) default
# GradientBoostingRegressor cell, stored directly in regression_metrics_df
# so that cell does not have to be re-run each time the notebook is opened.
def_gbr_metrics = {
    'Train R2': 0.438490,
    'Test R2': 0.422539,
    'Train MAE': 21.744673,
    'Test MAE': 22.299318,
    'Train MAPE': 0.985450,
    'Test MAPE': 0.995671,
    'Train RMSE': 39.203798,
    'Test RMSE': 40.917422,
}

# one cell write per metric, in the same order as the table columns
for cached_metric, cached_value in def_gbr_metrics.items():
    regression_metrics_df.at['Def GBR', cached_metric] = cached_value

### Tuned on 'max_depth' and 'n_estimators'

In [142]:
# %%time
# # time: 43 minutes

# # instantiate
# gbr_tun = GradientBoostingRegressor()

# # pipeline
# gbr_tun_pipe = make_pipeline(preprocessor, gbr_tun)

# # params
# gbr_params = {}

# # tune max_depth and n_estimators
# gbr_params['gradientboostingregressor__max_depth'] = [10, 15, 20]
# gbr_params['gradientboostingregressor__n_estimators'] = [20, 30, 40]

# # gridsearchcv
# gbr_gs = GridSearchCV(gbr_tun_pipe, 
#                      gbr_params,
#                      scoring = 'r2')

# gbr_gs.fit(X_train, y_train)

# allDone()

In [143]:
# # see best score from gridsearchcv
# gbr_gs.best_score_ # 0.6530051823484924

In [144]:
# # see the best parameters from the tuned model
# gbr_gs.best_params_

# # max_depth: 20
# # n_estimators: 40

In [145]:
# # instantiate tuned model
# gbr_tun_pipe = gbr_gs.best_estimator_

# # print and store metrics
# evaluate_regression(gbr_tun_pipe, 
#                     X_train, 
#                     X_test, 
#                     y_train, 
#                     y_test, 
#                     "Tun GBR")

In [146]:
# Metrics recorded from an earlier run of the (now commented-out) tuned
# GradientBoostingRegressor grid search, stored directly in
# regression_metrics_df so the slow cells above do not have to be re-run
# each time the notebook is opened.
tun_gbr_metrics = {
    'Train R2': 0.858627,
    'Test R2': 0.654365,
    'Train MAE': 10.075562,
    'Test MAE': 14.710098,
    'Train MAPE': 0.462800,
    'Test MAPE': 0.602836,
    'Train RMSE': 19.671273,
    'Test RMSE': 31.655980,
}

# one cell write per metric, in the same order as the table columns
for cached_metric, cached_value in tun_gbr_metrics.items():
    regression_metrics_df.at['Tun GBR', cached_metric] = cached_value

# Model Decision

Let's see the results of all the models we've made so far.

In [147]:
# rank every recorded model by its test-set R2, best first
regression_metrics_df.sort_values('Test R2', ascending = False)

Unnamed: 0,Train R2,Test R2,Train MAE,Test MAE,Train MAPE,Test MAPE,Train RMSE,Test RMSE
Def RF,0.908955,0.679376,6.05418,13.06221,0.229691,0.482821,15.786208,30.48909
Tun BR,0.907775,0.678912,6.087626,13.10726,0.230614,0.485255,15.888121,30.51118
Tun XGB,0.886501,0.67025,8.081317,13.81161,0.356713,0.539004,17.625647,30.91998
Def BR,0.898944,0.667539,6.327471,13.43455,0.23837,0.495508,16.631432,31.04684
Def XGB,0.723095,0.6674398,14.561838,15.97377,0.629571,0.6737828,27.530559,31.05145
Tun RF,0.874848,0.662458,8.797796,15.25966,0.374132,0.632943,18.508387,31.28317
Tun GBR,0.858627,0.654365,10.075562,14.7101,0.4628,0.602836,19.671273,31.65598
Def LGBM,0.664614,0.6428543,16.033627,16.84146,0.679746,0.7030994,30.298529,32.17878
Tun KNN,0.674011,0.60817,14.439231,16.48747,0.575196,0.650102,29.871061,33.7051
Tun LGBM,0.61919,0.603308,17.430325,18.12354,0.754229,0.773916,32.285186,33.91358
