In [1]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn import metrics
import statsmodels.api as sm

from sklearn.preprocessing import OneHotEncoder

from sklearn.neighbors import KNeighborsRegressor

from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import BaggingRegressor, VotingRegressor

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor

In [2]:
shift_df = pd.read_csv('data/shift_details.csv')

In [3]:
shift_df.head()

Unnamed: 0,SITE_NAME,DATE1,DAY_OF_WEEK,INSPECTOR_ID,PAY_VOL,SHIFT_START,SHIFT_END,TOTALINSP,NUMINVASIVE,TOWN,WATERBODY,SHIFT_LENGTH,DATE,month,year
0,Launch Drive,05/28/2021,Fri,4771,1,12:00,18:00,33.0,0.0,Launch Drive,Cobbosseecontee Lake,360.0,2021-05-28,5,2021
1,East Winthrop,05/28/2021,Fri,4485,1,12:00,18:00,2.0,0.0,Winthrop,Cobbosseecontee Lake,360.0,2021-05-28,5,2021
2,Augusta West Kampground,05/28/2021,Fri,4769,1,12:00,18:00,1.0,0.0,Winthrop,Annabessacook Lake,360.0,2021-05-28,5,2021
3,Whippoorwill Road,05/28/2021,Fri,4174,1,12:00,18:00,13.0,0.0,Litchfield,Woodbury Pond,360.0,2021-05-28,5,2021
4,Thorofare Rd,05/29/2021,Sat,4944,1,07:00,17:00,11.0,0.0,Litchfield,Pleasant Pond,600.0,2021-05-29,5,2021


In [4]:
y = shift_df['TOTALINSP']
X = shift_df.drop(columns=['DATE1', 'TOTALINSP', 'NUMINVASIVE', 'TOWN', 'WATERBODY', 'DATE', 'INSPECTOR_ID'])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=.3)

In [37]:
X_train

Unnamed: 0,SITE_NAME,DAY_OF_WEEK,PAY_VOL,SHIFT_START,SHIFT_END,SHIFT_LENGTH,month,year
1510,Old Kents Hill Road,Mon,1,09:00,19:00,600.0,5,2023
1061,Launch Drive Cobbosseecontee,Sun,1,13:00,19:00,360.0,7,2022
2040,East Winthrop Cobbosseecontee,Thu,1,13:00,19:00,360.0,8,2023
123,Whippoorwill Road,Sun,1,13:00,19:00,360.0,6,2021
729,Rt 41 North Basin Maranacook,Sun,1,13:00,19:00,360.0,6,2022
...,...,...,...,...,...,...,...,...
1638,East Winthrop Cobbosseecontee,Tue,1,13:00,19:00,360.0,6,2023
1095,Launch Drive Cobbosseecontee,Fri,1,07:00,13:00,360.0,7,2022
1130,Rt 41 North Basin Maranacook,Sun,1,07:00,13:00,360.0,7,2022
1294,Rt 41 North Basin Maranacook,Sun,1,13:00,19:00,360.0,8,2022


In [12]:
# Calculate the baseline r**2
shift_df['y_preds'] = np.mean(shift_df.TOTALINSP)

In [13]:
metrics.mean_squared_error(shift_df.TOTALINSP, shift_df.y_preds)

294.8412280701754

In [15]:
metrics.r2_score(shift_df.TOTALINSP, shift_df.y_preds)

0.0

In [None]:
# The baseline r2 is very low- 0

In [6]:
categorical_columns = ['SITE_NAME', 'DAY_OF_WEEK', 'month', 'year']
oh = OneHotEncoder(handle_unknown='ignore', drop='first')
X_train_transformed = oh.fit_transform(X_train[categorical_columns])
X_test_transformed= oh.transform(X_test[categorical_columns])

In [43]:
# Basic Linear Regression Model
lr = LinearRegression()
lr.fit(X_train_transformed, y_train)

In [44]:
lr.score(X_train_transformed, y_train)

0.3142477893028155

In [46]:
lr.score(X_test_transformed, y_test)

0.26526984756509553

In [16]:
# Random forest model
rf = RandomForestRegressor(n_estimators=100, n_jobs=2, random_state=42, oob_score=True)
rf.fit(X_train_transformed, y_train)
print(rf.score(X_train_transformed, y_train))
print(rf.score(X_test_transformed, y_test))

0.5823881813796656
0.28061943955375523


In [17]:
# Extra Trees model
et = ExtraTreesRegressor(n_estimators=100, n_jobs=2, random_state=42)
et.fit(X_train_transformed, y_train)
print(et.score(X_train_transformed, y_train))
print(et.score(X_test_transformed, y_test))

0.5982039094617482
0.19747301885075785


Both tree models are very ovefit- adding regularization and gridsearching over to find best params could improve the models and improve the testing r2. However, despite being overfit, the random forest model with an r2 of .28 is promising. Both models are potentially decent starting points and using a combination of new data (weather) and feature engineering (holidays and inspector experience) could further improve the models' success. 

In [20]:
# AdaBoost Model
ada = AdaBoostRegressor()
ada.fit(X_train_transformed, y_train)
print(ada.score(X_train_transformed, y_train))
print(ada.score(X_test_transformed, y_test))

0.1631178499776783
0.057888955598118574


In [22]:
# Gradient Boost Model
grad = GradientBoostingRegressor()
grad.fit(X_train_transformed, y_train)
print(grad.score(X_train_transformed, y_train))
print(grad.score(X_test_transformed, y_test))

0.4110300421256725
0.2711720175379255


In [27]:
# KNN model
sc = StandardScaler()
sc.fit(X_train_transformed)
X_train_sc = sc.tranform(X_train_transformed)
X_test_sc = sc.transform(X_test_transformed)

knn = KNeighborsRegressor()
knn.fit(X_train_sc, y_train)
print(knn.score(X_train_sc, y_train))
print(knn.score(X_test_sc, y_test))

ValueError: Cannot center sparse matrices: pass `with_mean=False` instead. See docstring for motivation and alternatives.