<a href="https://colab.research.google.com/github/kellianneyang/grades-project/blob/main/preprocessing_and_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Grades Project: Preprocessing and Modeling**

# Preliminary Steps

In [None]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.dummy import DummyRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, \
mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, \
GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [None]:
# mount drive
# (Colab-specific step: makes Google Drive visible under /content/drive/ so the
# CSV path used in the next cell resolves)
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# load data
# NOTE(review): absolute path into one user's mounted Drive -- anyone else
# running this notebook has to edit it before this cell will work
path = '/content/drive/MyDrive/Coding Dojo/Project 2: Grades/all_grades_data_cleaned.csv'
df = pd.read_csv(path)

In [None]:
# inspect
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49012 entries, 0 to 49011
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   section_type       49012 non-null  object 
 1   instructor_id      49012 non-null  object 
 2   facility_code      45521 non-null  object 
 3   start_time         49012 non-null  float64
 4   mon                49012 non-null  bool   
 5   tues               49012 non-null  bool   
 6   wed                49012 non-null  bool   
 7   thurs              49012 non-null  bool   
 8   fri                49012 non-null  bool   
 9   subject_name       49012 non-null  object 
 10  course_name        49012 non-null  object 
 11  a_proportion       49012 non-null  float64
 12  f_proportion       49012 non-null  float64
 13  avg_grade          49012 non-null  float64
 14  year               49012 non-null  int64  
 15  term               49012 non-null  object 
 16  class_length       490

In [None]:
# check for duplicates
df.duplicated().sum()

0

In [None]:
# check for missing values
df.isna().sum()

# 'facility_code' is the only column with missing values; will need to impute

section_type            0
instructor_id           0
facility_code        3491
start_time              0
mon                     0
tues                    0
wed                     0
thurs                   0
fri                     0
subject_name            0
course_name             0
a_proportion            0
f_proportion            0
avg_grade               0
year                    0
term                    0
class_length            0
total_time              0
weekend                 0
course_difficulty       0
dtype: int64

# Preprocessing

In [None]:
# assign X and y
# three candidate targets; all three are dropped from every feature matrix so
# that no target can leak into the features of another
target_a = 'a_proportion'
target_f = 'f_proportion'
target_avg = 'avg_grade'

X_a = df.drop(columns = [target_a, target_f, target_avg])
X_f = X_a.copy()
X_avg = X_a.copy()

y_a, y_f, y_avg = df[target_a], df[target_f], df[target_avg]

# check
for label, obj in [('y_a', y_a), ('y_f', y_f), ('y_avg', y_avg),
                   ('X_a', X_a), ('X_f', X_f), ('X_avg', X_avg)]:
  print(f"{label}: \n{obj}")

y_a: 
0        0.250000
1        0.230769
2        0.153846
3        0.500000
4        0.363636
           ...   
49007    0.111111
49008    0.209877
49009    0.261364
49010    0.225000
49011    0.611111
Name: a_proportion, Length: 49012, dtype: float64
y_f: 
0        0.000000
1        0.000000
2        0.000000
3        0.000000
4        0.000000
           ...   
49007    0.012346
49008    0.000000
49009    0.000000
49010    0.012500
49011    0.055556
Name: f_proportion, Length: 49012, dtype: float64
y_avg: 
0        3.625000
1        3.346154
2        3.500000
3        3.750000
4        3.454545
           ...   
49007    3.000000
49008    3.308642
49009    3.210227
49010    3.087500
49011    3.583333
Name: avg_grade, Length: 49012, dtype: float64
X_a: 
      section_type instructor_id facility_code  start_time    mon   tues  \
0              lec         other           NaN        -1.0  False  False   
1              lec         other          0545       660.0  False   True   
2    

In [None]:
# validate model with train/test split
# one split per target, all with the same random_state and default test size
X_a_train, X_a_test, y_a_train, y_a_test = train_test_split(
    X_a, y_a, random_state = 42)
X_f_train, X_f_test, y_f_train, y_f_test = train_test_split(
    X_f, y_f, random_state = 42)
X_avg_train, X_avg_test, y_avg_train, y_avg_test = train_test_split(
    X_avg, y_avg, random_state = 42)

# check
for label, part in [('X_a_train', X_a_train), ('X_a_test', X_a_test),
                    ('y_a_train', y_a_train), ('y_a_test', y_a_test),
                    ('X_f_train', X_f_train), ('X_f_test', X_f_test),
                    ('y_f_train', y_f_train), ('y_f_test', y_f_test),
                    ('X_avg_train', X_avg_train), ('X_avg_test', X_avg_test),
                    ('y_avg_train', y_avg_train), ('y_avg_test', y_avg_test)]:
  print(f"{label} shape: {part.shape}")

X_a_train shape: (36759, 17)
X_a_test shape: (12253, 17)
y_a_train shape: (36759,)
y_a_test shape: (12253,)
X_f_train shape: (36759, 17)
X_f_test shape: (12253, 17)
y_f_train shape: (36759,)
y_f_test shape: (12253,)
X_avg_train shape: (36759, 17)
X_avg_test shape: (12253, 17)
y_avg_train shape: (36759,)
y_avg_test shape: (12253,)


For the rest of this notebook, I will be using the target variable 'avg_grade'. 

In [None]:
# check columns to plan preprocessing steps
# these two lists act as checklists: each later cell removes the columns it
# has assigned to a transformer, so both should end up empty
column_check_train = X_avg_train.columns.tolist()
column_check_test = X_avg_test.columns.tolist()

In [None]:
# binary encode ('passthrough' in preprocessor):
# 'mon', 'tues', 'wed', 'thurs', 'fri', 'weekend'

binary_cols = ['mon', 'tues', 'wed', 'thurs', 'fri', 'weekend']

# Cast True/False to 1/0 by building a new frame and rebinding the name.
# The previous form, X[col].replace(..., inplace = True), mutates a temporary
# column selection: pandas warns about it and, with Copy-on-Write enabled,
# the frame itself is left unchanged.
binary_dtypes = {col: 'int64' for col in binary_cols}
X_avg_train = X_avg_train.astype(binary_dtypes)
X_avg_test = X_avg_test.astype(binary_dtypes)

# tick the binary columns off both checklists (filtering instead of
# list.remove keeps this cell safe to re-run)
column_check_train = [col for col in column_check_train
                      if col not in binary_cols]
column_check_test = [col for col in column_check_test
                     if col not in binary_cols]

# check
display(X_avg_train[binary_cols])
display(X_avg_test[binary_cols])

Unnamed: 0,mon,tues,wed,thurs,fri,weekend
6898,0,0,0,0,0,0
46896,1,0,1,0,0,0
22270,1,0,1,0,1,0
30130,0,1,0,1,0,0
35714,0,0,1,0,0,0
...,...,...,...,...,...,...
11284,1,0,1,0,1,0
44732,1,1,1,1,0,0
38158,1,1,0,1,1,0
860,1,0,1,0,1,0


Unnamed: 0,mon,tues,wed,thurs,fri,weekend
34805,0,0,0,1,0,0
5146,0,0,0,0,0,0
29624,0,1,0,1,0,0
10172,1,0,1,0,1,0
644,1,0,1,0,1,0
...,...,...,...,...,...,...
30853,0,1,0,1,0,0
20761,0,1,0,1,0,0
4998,1,0,0,0,0,0
23647,0,1,0,1,0,0


In [None]:
# check
for col in binary_cols:
  print(X_avg_train[col].value_counts())
  print(X_avg_test[col].value_counts())

0    19922
1    16837
Name: mon, dtype: int64
0    6637
1    5616
Name: mon, dtype: int64
0    20291
1    16468
Name: tues, dtype: int64
0    6857
1    5396
Name: tues, dtype: int64
0    20407
1    16352
Name: wed, dtype: int64
0    6773
1    5480
Name: wed, dtype: int64
0    20995
1    15764
Name: thurs, dtype: int64
0    7027
1    5226
Name: thurs, dtype: int64
0    27397
1     9362
Name: fri, dtype: int64
0    9094
1    3159
Name: fri, dtype: int64
0    36562
1      197
Name: weekend, dtype: int64
0    12185
1       68
Name: weekend, dtype: int64


In [None]:
# check columns left
print(column_check_train)
print(column_check_test)

['section_type', 'instructor_id', 'facility_code', 'start_time', 'subject_name', 'course_name', 'year', 'term', 'class_length', 'total_time', 'course_difficulty']
['section_type', 'instructor_id', 'facility_code', 'start_time', 'subject_name', 'course_name', 'year', 'term', 'class_length', 'total_time', 'course_difficulty']


In [None]:
# one-hot encode:
# 'section_type', 'instructor_id', 'subject_name', 'course_name', 'term',
# 'course_difficulty', 'year'
# ('year' is stored as int64 but is listed here, so it is encoded as a
# category rather than scaled)

# categories not seen during fit are encoded as all zeros instead of raising
ohe = OneHotEncoder(handle_unknown = 'ignore')

ohe_cols = ['section_type', 'instructor_id', 'subject_name', 'course_name',
            'term', 'course_difficulty', 'year']

ohe_tuple = (ohe, ohe_cols)

# tick these columns off both checklists
for col in ohe_cols:
  for checklist in (column_check_train, column_check_test):
    checklist.remove(col)

# check columns left
print(column_check_train)
print(column_check_test)

['facility_code', 'start_time', 'class_length', 'total_time']
['facility_code', 'start_time', 'class_length', 'total_time']


In [None]:
# one-hot encode and impute constant 'missing':
# 'facility_code'

# a fresh encoder for this pipeline (rebinds the name `ohe`; the encoder
# already stored in ohe_tuple is a separate object)
ohe = OneHotEncoder(handle_unknown = 'ignore')

# missing facility codes become their own category, 'missing'
missing_imputer = SimpleImputer(strategy = 'constant', fill_value = 'missing')

# impute first, then encode
facility_code_pipe = make_pipeline(missing_imputer, ohe)

facility_code_tuple = (facility_code_pipe, ['facility_code'])

# tick the column off both checklists
for checklist in (column_check_train, column_check_test):
  checklist.remove('facility_code')

# check columns left
print(column_check_train)
print(column_check_test)

['start_time', 'class_length', 'total_time']
['start_time', 'class_length', 'total_time']


In [None]:
# scale:
# 'start_time', 'class_length', 'total_time'
# NOTE(review): the X_a preview shows start_time = -1.0 on a row whose
# facility_code is NaN; if -1 is a "no start time" sentinel it is being
# scaled as if it were a real time -- confirm

scale_cols = ['start_time', 'class_length', 'total_time']

scaler = StandardScaler()

scale_tuple = (scaler, scale_cols)

# tick these columns off both checklists
for col in scale_cols:
  for checklist in (column_check_train, column_check_test):
    checklist.remove(col)

# check columns left
print(column_check_train)
print(column_check_test)

[]
[]


In [None]:
# create preprocessor
# columns not named in any tuple (the 0/1 day-of-week and weekend columns)
# are passed through unchanged
preprocessor = make_column_transformer(
    ohe_tuple,
    facility_code_tuple,
    scale_tuple,
    remainder = 'passthrough',
)

In [None]:
# # check transformation by preprocessor
# X_avg_train_processed = preprocessor.fit_transform(X_avg_train)
# X_avg_test_processed = preprocessor.transform(X_avg_test)

# # check for missing values
# print(np.isnan(X_avg_train_processed).sum().sum(), 'missing values in train')
# print(np.isnan(X_avg_test_processed).sum().sum(), 'missing values in test')

# # check that all data is numeric
# print('All data in train is ', X_avg_train_processed.dtype)
# print('All data in test is ', X_avg_test_processed.dtype)

# # check shape of data to infer that categorical columns were one hot encoded
# print('Shape of X_avg_train_processed data is', X_avg_train_processed.shape)
# print('Shape of X_avg_test_processed data is', X_avg_test_processed.shape)

# # check arrays to see that numeric data was scaled
# print(X_avg_train_processed)
# print(X_avg_test_processed)

# Modeling Functions

In [None]:
# create dataframe to save metrics
metrics_df = pd.DataFrame()

# check
metrics_df

In [None]:
# define function that will print regression metrics and store metrics in a 
# dataframe for easy viewing
def get_metrics(model_pipe, X_train, X_test, y_train, y_test, name):
  """Score a fitted model on train and test data and record the results.

  Predicts with `model_pipe` on both feature sets, computes R2, MAE, MAPE and
  RMSE for each, writes them into row `name` of the module-level `metrics_df`
  (columns 'Train R2', 'Test R2', 'Train MAE', ..., 'Test RMSE'), and prints
  that row.

  Parameters:
    model_pipe: already-fitted object exposing .predict(X)
    X_train, X_test: feature sets passed to model_pipe.predict
    y_train, y_test: true target values matching X_train / X_test
    name: row label under which the scores are stored in metrics_df

  Returns:
    None. The scores are stored in metrics_df and printed.

  NOTE(review): the printed train MAPE values in this notebook are ~1e11,
  presumably because some training targets are exactly 0 (MAPE divides by
  the true value) -- confirm before relying on that column.
  """

  # calculate predictions
  splits = {'Train': (y_train, model_pipe.predict(X_train)),
            'Test': (y_test, model_pipe.predict(X_test))}

  # metric label -> function(y_true, y_pred); RMSE is the root of the MSE
  scorers = {
      'R2': r2_score,
      'MAE': mean_absolute_error,
      'MAPE': mean_absolute_percentage_error,
      'RMSE': lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true,
                                                                y_pred)),
  }

  # store scores (metric outer, split inner, so the columns are created in
  # the order Train R2, Test R2, Train MAE, Test MAE, ...)
  for metric_label, scorer in scorers.items():
    for split_label, (y_true, y_pred) in splits.items():
      metrics_df.at[name, f'{split_label} {metric_label}'] = scorer(y_true,
                                                                    y_pred)

  # show scores for this model only (can call metrics_df to see all scores)
  print(metrics_df.loc[name, :])

# Dummy Model

In [None]:
# instantiate (baseline that always predicts the mean of the training target)
dummy_regressor = DummyRegressor(strategy = 'mean')

# combine with preprocessor and fit on training data
# (fit returns the pipeline itself, so the two steps can be chained)
dummy_pipe = make_pipeline(preprocessor, dummy_regressor).fit(X_avg_train,
                                                              y_avg_train)

# calculate, show, and store metrics
get_metrics(model_pipe = dummy_pipe,
            X_train = X_avg_train,
            X_test = X_avg_test,
            y_train = y_avg_train,
            y_test = y_avg_test,
            name = "Dummy")

Train R2      0.000000e+00
Test R2      -1.234698e-04
Train MAE     3.083861e-01
Test MAE      3.092590e-01
Train MAPE    4.307289e+11
Test MAPE     9.236957e-02
Train RMSE    3.742595e-01
Test RMSE     3.747394e-01
Name: Dummy, dtype: float64


In [None]:
# check metrics_df
metrics_df

Unnamed: 0,Train R2,Test R2,Train MAE,Test MAE,Train MAPE,Test MAPE,Train RMSE,Test RMSE
Dummy,0.0,-0.000123,0.308386,0.309259,430728900000.0,0.09237,0.37426,0.374739


In [None]:
# hard coding results into hard_metrics_df so I don't have to
# re-run the notebook cells that take a long time to run
# every time I open it up
#
# Values are copied from the output of the "Dummy" cell above, which prints in
# scientific notation -- the exponent has to be applied, not just the mantissa
# (Test R2 is -1.234698e-04, Train MAPE is 4.307289e+11, Test MAPE is
# 9.236957e-02).

hard_metrics_df = pd.DataFrame()

hard_metrics_df.at['Dummy', 'Train R2'] = 0.0000000
hard_metrics_df.at['Dummy', 'Test R2'] = -0.0001234698
hard_metrics_df.at['Dummy', 'Train MAE'] = 0.3083861
hard_metrics_df.at['Dummy', 'Test MAE'] = 0.3092590
hard_metrics_df.at['Dummy', 'Train MAPE'] = 4.307289e+11
hard_metrics_df.at['Dummy', 'Test MAPE'] = 0.09236957
hard_metrics_df.at['Dummy', 'Train RMSE'] = 0.3742595
hard_metrics_df.at['Dummy', 'Test RMSE'] = 0.3747394

# Model 1: Linear Regression

In [None]:
# instantiate
lr = LinearRegression()

# pipeline, fitted on the training data
# (fit returns the pipeline itself, so the two steps can be chained)
lr_pipe = make_pipeline(preprocessor, lr).fit(X_avg_train, y_avg_train)

# evaluate
get_metrics(model_pipe = lr_pipe,
            X_train = X_avg_train,
            X_test = X_avg_test,
            y_train = y_avg_train,
            y_test = y_avg_test,
            name = "Linear")

Train R2      6.340206e-01
Test R2       6.170483e-01
Train MAE     1.704007e-01
Test MAE      1.761336e-01
Train MAPE    4.674913e+11
Test MAPE     5.221217e-02
Train RMSE    2.264129e-01
Test RMSE     2.318860e-01
Name: Linear, dtype: float64


In [None]:
# hard coding results into hard_metrics_df so I don't have to
# re-run the notebook cells that take a long time to run
# every time I open it up
#
# Values are copied from the output of the "Linear" cell above, which prints
# in scientific notation -- the exponent has to be applied, not just the
# mantissa (Train MAPE is 4.674913e+11, Test MAPE is 5.221217e-02).

hard_metrics_df.at['Linear', 'Train R2'] = 0.6340206
hard_metrics_df.at['Linear', 'Test R2'] = 0.6170483
hard_metrics_df.at['Linear', 'Train MAE'] = 0.1704007
hard_metrics_df.at['Linear', 'Test MAE'] = 0.1761336
hard_metrics_df.at['Linear', 'Train MAPE'] = 4.674913e+11
hard_metrics_df.at['Linear', 'Test MAPE'] = 0.05221217
hard_metrics_df.at['Linear', 'Train RMSE'] = 0.2264129
hard_metrics_df.at['Linear', 'Test RMSE'] = 0.2318860

# Model 2: Decision Tree Regressor

## Default

In [None]:
# # instantiate
# dt_def = DecisionTreeRegressor(random_state = 42)

# # pipeline
# dt_def_pipe = make_pipeline(preprocessor, dt_def)

# # fit
# dt_def_pipe.fit(X_avg_train, y_avg_train)

# # evaluate
# get_metrics(dt_def_pipe, 
#             X_avg_train, 
#             X_avg_test, 
#             y_avg_train, 
#             y_avg_test, 
#             "Def DT")

In [None]:
# hard coding results into hard_metrics_df so I don't have to
# re-run the notebook cells that take a long time to run
# every time I open it up
#
# Row 'Def DT' = default DecisionTreeRegressor. The cell that produced these
# numbers is commented out above and its output is not saved in the notebook,
# so the values below cannot be checked against a printed result.

hard_metrics_df.at['Def DT', 'Train R2'] = 0.978962
hard_metrics_df.at['Def DT', 'Test R2'] = 0.409189
hard_metrics_df.at['Def DT', 'Train MAE'] = 0.012814
hard_metrics_df.at['Def DT', 'Test MAE'] = 0.207481
hard_metrics_df.at['Def DT', 'Train MAPE'] = 0.003885
hard_metrics_df.at['Def DT', 'Test MAPE'] = 0.061587
hard_metrics_df.at['Def DT', 'Train RMSE'] = 0.054284
hard_metrics_df.at['Def DT', 'Test RMSE'] = 0.288023

- Very overfit (train R2 .98, test R2 .41)

## Tuned on 'max_depth'

In [None]:
# # get depth from default tree where max_depth = None
# def_depth = dt_def_pipe['decisiontreeregressor'].get_depth()
# def_depth

In [None]:
# # tune max_depth with gridsearch
# # 555 models
# # took 28 minutes

# # instantiate
# dt_tun = DecisionTreeRegressor(random_state = 42)

# # pipeline
# dt_tun_pipe = make_pipeline(preprocessor, dt_tun)

# # params
# dt_params = {'decisiontreeregressor__max_depth': range(1, (def_depth + 1))}

# # grid search
# dt_gs = GridSearchCV(dt_tun_pipe, dt_params, scoring = 'r2')

# # fit
# dt_gs.fit(X_avg_train, y_avg_train)

# # see best parameters
# print(dt_gs.best_params_)

# # see best score
# print(dt_gs.best_score_)

In [None]:
# # instantiate tuned model
# dt_tun_pipe = dt_gs.best_estimator_

# # print and store metrics
# get_metrics(dt_tun_pipe, 
#             X_avg_train, 
#             X_avg_test, 
#             y_avg_train, 
#             y_avg_test, 
#             "Tun DT")

In [None]:
# hard coding results into hard_metrics_df so I don't have to
# re-run the notebook cells that take a long time to run
# every time I open it up
#
# NOTE(review): 'Test MAPE' (0.5964366) is about 10x the 0.061587 stored for
# 'Def DT' and the 5e-02 to 9e-02 test MAPE printed by the Dummy / Linear /
# LGBM cells. It looks like the mantissa of a number printed in scientific
# notation was copied without its exponent (i.e. probably 0.0596). 'Train
# MAPE' may have the same problem. Confirm by re-running the tuned tree.

hard_metrics_df.at['Tun DT', 'Train R2'] = 0.6771564
hard_metrics_df.at['Tun DT', 'Test R2'] = 0.4805888
hard_metrics_df.at['Tun DT', 'Train MAE'] = 0.1434812
hard_metrics_df.at['Tun DT', 'Test MAE'] = 0.2005607
hard_metrics_df.at['Tun DT', 'Train MAPE'] = 0.4537716
hard_metrics_df.at['Tun DT', 'Test MAPE'] = 0.5964366
hard_metrics_df.at['Tun DT', 'Train RMSE'] = 0.2126517
hard_metrics_df.at['Tun DT', 'Test RMSE'] = 0.2700587

- Overfit (train R2 .68, test R2 .48)
- Underfit (not predicting much variance in target data, but more than default decision tree model)

# Model 3: Bagged Tree Regressor

## Default

In [None]:
# # instantiate
# br_def = BaggingRegressor(random_state = 42)

# # pipeline
# br_def_pipe = make_pipeline(preprocessor, br_def)

# # fit
# br_def_pipe.fit(X_avg_train, y_avg_train)

# # evaluate
# get_metrics(br_def_pipe, 
#             X_avg_train, 
#             X_avg_test, 
#             y_avg_train, 
#             y_avg_test, 
#             "Def BR")

In [None]:
# hard coding results into hard_metrics_df so I don't have to
# re-run the notebook cells that take a long time to run
# every time I open it up
#
# NOTE(review): these numbers are not internally consistent. 'Train MAE'
# (0.7091014) is larger than 'Train RMSE' (0.1060995), which cannot happen
# for the same predictions (MAE <= RMSE). It looks like the mantissa of a
# value printed in scientific notation was copied without its exponent
# (i.e. probably 0.0709). Both MAPE values look affected the same way
# (test MAPE printed elsewhere in this notebook is 5e-02 to 9e-02).
# Confirm by re-running the default bagging model.

hard_metrics_df.at['Def BR', 'Train R2'] = 0.9196325
hard_metrics_df.at['Def BR', 'Test R2'] = 0.6168629
hard_metrics_df.at['Def BR', 'Train MAE'] = 0.7091014
hard_metrics_df.at['Def BR', 'Test MAE'] = 0.1693241
hard_metrics_df.at['Def BR', 'Train MAPE'] = 0.9801354
hard_metrics_df.at['Def BR', 'Test MAPE'] = 0.5034662
hard_metrics_df.at['Def BR', 'Train RMSE'] = 0.1060995
hard_metrics_df.at['Def BR', 'Test RMSE'] = 0.2319421

- Overfit (train R2 .92, test R2 .62)
- Less underfit than decision tree models (predicts more variance in data)

## Tuned on 'n_estimators'

In [None]:
# %%time
# # tune n_estimators with gridsearch
# # 44 minutes

# # instantiate
# br_tun = BaggingRegressor(random_state = 42)

# # pipeline
# br_tun_pipe = make_pipeline(preprocessor, br_tun)

# # params
# br_params = {'baggingregressor__n_estimators': [10, 101, 20]}

# # grid search
# br_gs = GridSearchCV(br_tun_pipe, br_params, scoring = 'r2')

# # fit
# br_gs.fit(X_avg_train, y_avg_train)

# # see best parameters
# print(br_gs.best_params_)

# # see best score
# print(br_gs.best_score_)

In [None]:
# # instantiate tuned model
# br_tun_pipe = br_gs.best_estimator_

# # print and store metrics
# get_metrics(br_tun_pipe, 
#             X_avg_train, 
#             X_avg_test, 
#             y_avg_train, 
#             y_avg_test, 
#             "Tun BR")

In [None]:
# hard coding results into hard_metrics_df so I don't have to
# re-run the notebook cells that take a long time to run
# every time I open it up
#
# NOTE(review): these numbers are not internally consistent. Train R2 (0.94)
# is far above test R2 (0.65), yet 'Train MAE' (0.6603890) and 'Train RMSE'
# (0.9530044) are several times larger than their test counterparts. It
# looks like mantissas of values printed in scientific notation were copied
# without their exponents (i.e. probably 0.0660 and 0.0953). Both MAPE values
# look affected the same way. Confirm by re-running the tuned bagging model.

hard_metrics_df.at['Tun BR', 'Train R2'] = 0.9351598
hard_metrics_df.at['Tun BR', 'Test R2'] = 0.6450013
hard_metrics_df.at['Tun BR', 'Train MAE'] = 0.6603890
hard_metrics_df.at['Tun BR', 'Test MAE'] = 0.1630311
hard_metrics_df.at['Tun BR', 'Train MAPE'] = 0.1967819
hard_metrics_df.at['Tun BR', 'Test MAPE'] = 0.4849256
hard_metrics_df.at['Tun BR', 'Train RMSE'] = 0.9530044
hard_metrics_df.at['Tun BR', 'Test RMSE'] = 0.2232626

- Overfit (train R2 .94, test R2 .65)
- Improvement of about .03 in test R2 from default bagging regressor (.62 to .65)

# Model 4: Random Forest Regressor

## Default

In [None]:
# %%time

# # instantiate
# rf_def = RandomForestRegressor(random_state = 42)

# # pipeline
# rf_def_pipe = make_pipeline(preprocessor, rf_def)

# # fit
# rf_def_pipe.fit(X_avg_train, y_avg_train)

# # evaluate
# get_metrics(rf_def_pipe, 
#             X_avg_train, 
#             X_avg_test, 
#             y_avg_train, 
#             y_avg_test, 
#             "Def RF")

In [None]:
# hard coding results into hard_metrics_df so I don't have to
# re-run the notebook cells that take a long time to run
# every time I open it up
#
# NOTE(review): these numbers are not internally consistent. Train R2 (0.94)
# is far above test R2 (0.65), yet 'Train MAE' (0.6602899) and 'Train RMSE'
# (0.9532490) are several times larger than their test counterparts. It
# looks like mantissas of values printed in scientific notation were copied
# without their exponents (i.e. probably 0.0660 and 0.0953). Both MAPE values
# look affected the same way. Confirm by re-running the default forest.

hard_metrics_df.at['Def RF', 'Train R2'] = 0.9351265
hard_metrics_df.at['Def RF', 'Test R2'] = 0.6453246
hard_metrics_df.at['Def RF', 'Train MAE'] = 0.6602899
hard_metrics_df.at['Def RF', 'Test MAE'] = 0.1629727
hard_metrics_df.at['Def RF', 'Train MAPE'] = 0.1943255
hard_metrics_df.at['Def RF', 'Test MAPE'] = 0.4847181
hard_metrics_df.at['Def RF', 'Train RMSE'] = 0.9532490
hard_metrics_df.at['Def RF', 'Test RMSE'] = 0.2231609

- Still very overfit (train R2 .94 and test R2 .65)
- About as good as tuned bagging regressor in predicting variance in target

## Tuned on 'max_depth' and 'max_features'

In [None]:
# # get depth from max of estimator depths in the default random forest model
# est_depths = [estimator.get_depth() for estimator 
#               in rf_def_pipe['randomforestregressor'].estimators_]
# max_depth_rf = max(est_depths)
# max_depth_rf

In [None]:
# %%time

# # tune max_depth and max_features with gridsearch
# # 320 models
# # took 5 hours, 20 minutes

# # instantiate
# rf_tun = RandomForestRegressor(random_state = 42, n_jobs = -1)

# # pipeline
# rf_tun_pipe = make_pipeline(preprocessor, rf_tun)

# # params
# rf_params = {}

# # max_depth (range between 1 and estimated max_depth of default)
# rf_params['randomforestregressor__max_depth'] = range(1, max_depth_rf+1, 25)
# # 8 values

# # max_features (default 1; range over middle 50% of number of columns without 
# # dummies); 17 feature columns, so range over about 4-12
# rf_params['randomforestregressor__max_features'] = range(4, 12)
# # 8 values

# # gridsearchcv
# rf_gs = GridSearchCV(rf_tun_pipe, 
#                      rf_params,
#                      scoring = 'r2')

# rf_gs.fit(X_avg_train, y_avg_train)

In [None]:
# # see best score from gridsearchcv
# rf_gs.best_score_

In [None]:
# # see the best parameters from the tuned model
# rf_gs.best_params_

In [None]:
# # instantiate tuned model
# rf_tun_pipe = rf_gs.best_estimator_

# # print and store metrics
# get_metrics(rf_tun_pipe, 
#             X_avg_train, 
#             X_avg_test, 
#             y_avg_train, 
#             y_avg_test, 
#             "Tun RF")

In [None]:
# hard coding results into hard_metrics_df so I don't have to
# re-run the notebook cells that take a long time to run
# every time I open it up
#
# NOTE(review): these numbers are not internally consistent. Train R2 (0.93)
# is far above test R2 (0.63), yet 'Train MAE' (0.7003219) and 'Train RMSE'
# (0.9767525) are several times larger than their test counterparts. It
# looks like mantissas of values printed in scientific notation were copied
# without their exponents (i.e. probably 0.0700 and 0.0977). Both MAPE values
# look affected the same way. Confirm by re-running the tuned forest.

hard_metrics_df.at['Tun RF', 'Train R2'] = 0.9318880
hard_metrics_df.at['Tun RF', 'Test R2'] = 0.6335185
hard_metrics_df.at['Tun RF', 'Train MAE'] = 0.7003219
hard_metrics_df.at['Tun RF', 'Test MAE'] = 0.1707381
hard_metrics_df.at['Tun RF', 'Train MAPE'] = 0.1941178
hard_metrics_df.at['Tun RF', 'Test MAPE'] = 0.5076128
hard_metrics_df.at['Tun RF', 'Train RMSE'] = 0.9767525
hard_metrics_df.at['Tun RF', 'Test RMSE'] = 0.2268446

- Still very overfit and not as high of a test R2 score as the default RF

# Model 5: K-Nearest Neighbors Regressor

## Default

In [None]:
# %%time

# # 8 minutes, 24 seconds

# # instantiate
# kn_def = KNeighborsRegressor()

# # pipeline
# kn_def_pipe = make_pipeline(preprocessor, kn_def)

# # fit
# kn_def_pipe.fit(X_avg_train, y_avg_train)

# # evaluate
# get_metrics(kn_def_pipe, 
#             X_avg_train, 
#             X_avg_test, 
#             y_avg_train, 
#             y_avg_test, 
#             "Def KNN")

In [None]:
# hard coding results into hard_metrics_df so I don't have to
# re-run the notebook cells that take a long time to run
# every time I open it up
#
# NOTE(review): 'Test MAPE' (0.5073286) is about 10x the 5e-02 to 9e-02 test
# MAPE printed by the Dummy / Linear / LGBM cells. It looks like the mantissa
# of a value printed in scientific notation was copied without its exponent
# (i.e. probably 0.0507). 'Train MAPE' may have the same problem. Confirm by
# re-running the default KNN model.

hard_metrics_df.at['Def KNN', 'Train R2'] = 0.7435497
hard_metrics_df.at['Def KNN', 'Test R2'] = 0.6182448
hard_metrics_df.at['Def KNN', 'Train MAE'] = 0.1390834
hard_metrics_df.at['Def KNN', 'Test MAE'] = 0.1711105
hard_metrics_df.at['Def KNN', 'Train MAPE'] = 0.3920542
hard_metrics_df.at['Def KNN', 'Test MAPE'] = 0.5073286
hard_metrics_df.at['Def KNN', 'Train RMSE'] = 0.1895285
hard_metrics_df.at['Def KNN', 'Test RMSE'] = 0.2315235

- Significantly less overfit than the other models, but not as predictive as the RF.

## Tuned on 'n_neighbors'

In [None]:
# %%time

# # 15 minutes

# # instantiate
# kn_tun = KNeighborsRegressor()

# # pipeline
# kn_tun_pipe = make_pipeline(preprocessor, kn_tun)

# # params
# kn_params = {}

# # tune n_neighbors
# kn_params['kneighborsregressor__n_neighbors'] = [10, 50, 100]

# # gridsearchcv
# kn_gs = GridSearchCV(kn_tun_pipe, 
#                      kn_params,
#                      scoring = 'r2')

# kn_gs.fit(X_avg_train, y_avg_train)

In [None]:
# # see best score from gridsearchcv
# kn_gs.best_score_

In [None]:
# # see the best parameters from the tuned model
# kn_gs.best_params_

In [None]:
# # instantiate tuned model
# kn_tun_pipe = kn_gs.best_estimator_

# # print and store metrics
# get_metrics(kn_tun_pipe, 
#             X_avg_train, 
#             X_avg_test, 
#             y_avg_train, 
#             y_avg_test, 
#             "Tun KNN")

In [None]:
# hard coding results into hard_metrics_df so I don't have to
# re-run the notebook cells that take a long time to run
# every time I open it up
#
# NOTE(review): 'Test MAPE' (0.5141256) is about 10x the 5e-02 to 9e-02 test
# MAPE printed by the Dummy / Linear / LGBM cells. It looks like the mantissa
# of a value printed in scientific notation was copied without its exponent
# (i.e. probably 0.0514). 'Train MAPE' may have the same problem. Confirm by
# re-running the tuned KNN model.

hard_metrics_df.at['Tun KNN', 'Train R2'] = 0.6868856
hard_metrics_df.at['Tun KNN', 'Test R2'] = 0.6185721
hard_metrics_df.at['Tun KNN', 'Train MAE'] = 0.1566485
hard_metrics_df.at['Tun KNN', 'Test MAE'] = 0.1734479
hard_metrics_df.at['Tun KNN', 'Train MAPE'] = 0.4410609
hard_metrics_df.at['Tun KNN', 'Test MAPE'] = 0.5141256
hard_metrics_df.at['Tun KNN', 'Train RMSE'] = 0.2094230
hard_metrics_df.at['Tun KNN', 'Test RMSE'] = 0.2314242

# Model 6: Extreme Gradient Boosting Regressor

In [None]:
# %%time

# # 5.5 seconds

# # instantiate
# xgb_def = XGBRegressor()

# # pipeline
# xgb_def_pipe = make_pipeline(preprocessor, xgb_def)

# # fit
# xgb_def_pipe.fit(X_avg_train, y_avg_train)

# # evaluate
# get_metrics(xgb_def_pipe, 
#             X_avg_train, 
#             X_avg_test, 
#             y_avg_train, 
#             y_avg_test, 
#             "Def XGB")

In [None]:
# hard coding results into hard_metrics_df so I don't have to
# re-run the notebook cells that take a long time to run
# every time I open it up
#
# NOTE(review): 'Test MAPE' (0.5393175) is about 10x the 5e-02 to 9e-02 test
# MAPE printed by the Dummy / Linear / LGBM cells. It looks like the mantissa
# of a value printed in scientific notation was copied without its exponent
# (i.e. probably 0.0539). 'Train MAPE' may have the same problem. Confirm by
# re-running the default XGBoost model.

hard_metrics_df.at['Def XGB', 'Train R2'] = 0.6434595
hard_metrics_df.at['Def XGB', 'Test R2'] = 0.6049974
hard_metrics_df.at['Def XGB', 'Train MAE'] = 0.1723396
hard_metrics_df.at['Def XGB', 'Test MAE'] = 0.1814656
hard_metrics_df.at['Def XGB', 'Train MAPE'] = 0.3277792
hard_metrics_df.at['Def XGB', 'Test MAPE'] = 0.5393175
hard_metrics_df.at['Def XGB', 'Train RMSE'] = 0.2234741
hard_metrics_df.at['Def XGB', 'Test RMSE'] = 0.2355063

# Model 7: Light Gradient Boosting Machine Regressor

In [None]:
%%time

# 2 seconds

# instantiate
lgbm_def = LGBMRegressor()

# pipeline
lgbm_def_pipe = make_pipeline(preprocessor, lgbm_def)

# fit
lgbm_def_pipe.fit(X_avg_train, y_avg_train)

# evaluate
get_metrics(lgbm_def_pipe, 
            X_avg_train, 
            X_avg_test, 
            y_avg_train, 
            y_avg_test, 
            "Def LGBM")

Train R2      6.047285e-01
Test R2       5.888200e-01
Train MAE     1.808934e-01
Test MAE      1.850531e-01
Train MAPE    4.574953e+11
Test MAPE     5.500878e-02
Train RMSE    2.352993e-01
Test RMSE     2.402805e-01
Name: Def LGBM, dtype: float64
CPU times: user 3.68 s, sys: 29.8 ms, total: 3.71 s
Wall time: 2.95 s


In [None]:
# hard coding results into hard_metrics_df so I don't have to
# re-run the notebook cells that take a long time to run
# every time I open it up
#
# Values are copied from the output of the "Def LGBM" cell above, which prints
# in scientific notation -- the exponent has to be applied, not just the
# mantissa (Train MAPE is 4.574953e+11, Test MAPE is 5.500878e-02).

hard_metrics_df.at['Def LGBM', 'Train R2'] = 0.6047285
hard_metrics_df.at['Def LGBM', 'Test R2'] = 0.5888200
hard_metrics_df.at['Def LGBM', 'Train MAE'] = 0.1808934
hard_metrics_df.at['Def LGBM', 'Test MAE'] = 0.1850531
hard_metrics_df.at['Def LGBM', 'Train MAPE'] = 4.574953e+11
hard_metrics_df.at['Def LGBM', 'Test MAPE'] = 0.05500878
hard_metrics_df.at['Def LGBM', 'Train RMSE'] = 0.2352993
hard_metrics_df.at['Def LGBM', 'Test RMSE'] = 0.2402805

# Model 8: Gradient Boosting Regressor

In [None]:
# %%time

# # 4.5 seconds

# # instantiate
# gbr_def = GradientBoostingRegressor()

# # pipeline
# gbr_def_pipe = make_pipeline(preprocessor, gbr_def)

# # fit
# gbr_def_pipe.fit(X_avg_train, y_avg_train)

# # evaluate
# get_metrics(gbr_def_pipe, 
#             X_avg_train, 
#             X_avg_test, 
#             y_avg_train, 
#             y_avg_test, 
#             "Def GBR")

In [None]:
# hard coding results into hard_metrics_df so I don't have to
# re-run the notebook cells that take a long time to run
# every time I open it up
#
# NOTE(review): 'Test MAPE' (0.6611475) is about 10x the 5e-02 to 9e-02 test
# MAPE printed by the Dummy / Linear / LGBM cells. It looks like the mantissa
# of a value printed in scientific notation was copied without its exponent
# (i.e. probably 0.0661). 'Train MAPE' may have the same problem. Confirm by
# re-running the default gradient boosting model.

hard_metrics_df.at['Def GBR', 'Train R2'] = 0.4528303
hard_metrics_df.at['Def GBR', 'Test R2'] = 0.4527176
hard_metrics_df.at['Def GBR', 'Train MAE'] = 0.2212064
hard_metrics_df.at['Def GBR', 'Test MAE'] = 0.2220065
hard_metrics_df.at['Def GBR', 'Train MAPE'] = 0.4547432
hard_metrics_df.at['Def GBR', 'Test MAPE'] = 0.6611475
hard_metrics_df.at['Def GBR', 'Train RMSE'] = 0.2768432
hard_metrics_df.at['Def GBR', 'Test RMSE'] = 0.2772096

# Model Decision

In [None]:
hard_metrics_df.sort_values(by = 'Test R2', ascending = False)

Unnamed: 0,Train R2,Test R2,Train MAE,Test MAE,Train MAPE,Test MAPE,Train RMSE,Test RMSE
Def RF,0.935126,0.645325,0.66029,0.162973,0.194326,0.484718,0.953249,0.223161
Tun BR,0.93516,0.645001,0.660389,0.163031,0.196782,0.484926,0.953004,0.223263
Tun RF,0.931888,0.633518,0.700322,0.170738,0.194118,0.507613,0.976753,0.226845
Tun KNN,0.686886,0.618572,0.156648,0.173448,0.441061,0.514126,0.209423,0.231424
Def KNN,0.74355,0.618245,0.139083,0.17111,0.392054,0.507329,0.189528,0.231523
Linear,0.634021,0.617048,0.170401,0.176134,0.467491,0.522122,0.226413,0.231886
Def BR,0.919632,0.616863,0.709101,0.169324,0.980135,0.503466,0.106099,0.231942
Def XGB,0.643459,0.604997,0.17234,0.181466,0.327779,0.539318,0.223474,0.235506
Def LGBM,0.604729,0.58882,0.180893,0.185053,0.457495,0.550088,0.235299,0.240281
Tun DT,0.677156,0.480589,0.143481,0.200561,0.453772,0.596437,0.212652,0.270059


# Next Steps

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49012 entries, 0 to 49011
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   section_type       49012 non-null  object 
 1   instructor_id      49012 non-null  object 
 2   facility_code      45521 non-null  object 
 3   start_time         49012 non-null  float64
 4   mon                49012 non-null  bool   
 5   tues               49012 non-null  bool   
 6   wed                49012 non-null  bool   
 7   thurs              49012 non-null  bool   
 8   fri                49012 non-null  bool   
 9   subject_name       49012 non-null  object 
 10  course_name        49012 non-null  object 
 11  a_proportion       49012 non-null  float64
 12  f_proportion       49012 non-null  float64
 13  avg_grade          49012 non-null  float64
 14  year               49012 non-null  int64  
 15  term               49012 non-null  object 
 16  class_length       490

1. Tune hyperparameters for boosting models

2. Try to bin the course names and subject names into department- or subject-related categories

3. Feature engineering?
  - Take out either course_name or subject_name, since they should be at least somewhat correlated?
  - Take out or somehow combine day-of-week columns to reduce number of columns describing schedule?
  - Take out either class_length or total_time, since they should be at least somewhat correlated?

4. Try to predict other targets (a_proportion, f_proportion)

5. Convert to a classification problem (average grade targets 3.0-4.0, 2.0-3.0, 1.0-2.0, < 1.0, for example), to show ROC visualizations