In [38]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, KFold
from sklearn import metrics
import statsmodels.api as sm

from sklearn.preprocessing import OneHotEncoder

from sklearn.neighbors import KNeighborsRegressor

from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import BaggingRegressor, VotingRegressor

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor

In [2]:
shift_df = pd.read_csv('data/shift_details.csv')

In [3]:
shift_df.head()

Unnamed: 0,SITE_NAME,DATE1,DAY_OF_WEEK,INSPECTOR_ID,PAY_VOL,SHIFT_START,SHIFT_END,TOTALINSP,NUMINVASIVE,TOWN,WATERBODY,SHIFT_LENGTH,DATE,month,year
0,Launch Drive,05/28/2021,Fri,4771,1,12:00,18:00,33.0,0.0,Launch Drive,Cobbosseecontee Lake,360.0,2021-05-28,5,2021
1,East Winthrop,05/28/2021,Fri,4485,1,12:00,18:00,2.0,0.0,Winthrop,Cobbosseecontee Lake,360.0,2021-05-28,5,2021
2,Augusta West Kampground,05/28/2021,Fri,4769,1,12:00,18:00,1.0,0.0,Winthrop,Annabessacook Lake,360.0,2021-05-28,5,2021
3,Whippoorwill Road,05/28/2021,Fri,4174,1,12:00,18:00,13.0,0.0,Litchfield,Woodbury Pond,360.0,2021-05-28,5,2021
4,Thorofare Rd,05/29/2021,Sat,4944,1,07:00,17:00,11.0,0.0,Litchfield,Pleasant Pond,600.0,2021-05-29,5,2021


In [4]:
y = shift_df['TOTALINSP']
X = shift_df.drop(columns=['DATE1', 'TOTALINSP', 'NUMINVASIVE', 'TOWN', 'WATERBODY', 'DATE', 'INSPECTOR_ID'])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=.3)

In [37]:
X_train

Unnamed: 0,SITE_NAME,DAY_OF_WEEK,PAY_VOL,SHIFT_START,SHIFT_END,SHIFT_LENGTH,month,year
1510,Old Kents Hill Road,Mon,1,09:00,19:00,600.0,5,2023
1061,Launch Drive Cobbosseecontee,Sun,1,13:00,19:00,360.0,7,2022
2040,East Winthrop Cobbosseecontee,Thu,1,13:00,19:00,360.0,8,2023
123,Whippoorwill Road,Sun,1,13:00,19:00,360.0,6,2021
729,Rt 41 North Basin Maranacook,Sun,1,13:00,19:00,360.0,6,2022
...,...,...,...,...,...,...,...,...
1638,East Winthrop Cobbosseecontee,Tue,1,13:00,19:00,360.0,6,2023
1095,Launch Drive Cobbosseecontee,Fri,1,07:00,13:00,360.0,7,2022
1130,Rt 41 North Basin Maranacook,Sun,1,07:00,13:00,360.0,7,2022
1294,Rt 41 North Basin Maranacook,Sun,1,13:00,19:00,360.0,8,2022


In [12]:
# Calculate the baseline r**2
shift_df['y_preds'] = np.mean(shift_df.TOTALINSP)

In [13]:
metrics.mean_squared_error(shift_df.TOTALINSP, shift_df.y_preds)

294.8412280701754

In [15]:
metrics.r2_score(shift_df.TOTALINSP, shift_df.y_preds)

0.0

In [None]:
# The baseline r2 is very low- 0

In [6]:
categorical_columns = ['SITE_NAME', 'DAY_OF_WEEK', 'month', 'year']
oh = OneHotEncoder(handle_unknown='ignore', drop='first')
X_train_transformed = oh.fit_transform(X_train[categorical_columns])
X_test_transformed= oh.transform(X_test[categorical_columns])

In [43]:
# Basic Linear Regression Model
lr = LinearRegression()
lr.fit(X_train_transformed, y_train)

In [44]:
lr.score(X_train_transformed, y_train)

0.3142477893028155

In [46]:
lr.score(X_test_transformed, y_test)

0.26526984756509553

In [16]:
# Random forest model
rf = RandomForestRegressor(n_estimators=100, n_jobs=2, random_state=42, oob_score=True)
rf.fit(X_train_transformed, y_train)
print(rf.score(X_train_transformed, y_train))
print(rf.score(X_test_transformed, y_test))

0.5823881813796656
0.28061943955375523


In [17]:
# Extra Trees model
et = ExtraTreesRegressor(n_estimators=100, n_jobs=2, random_state=42)
et.fit(X_train_transformed, y_train)
print(et.score(X_train_transformed, y_train))
print(et.score(X_test_transformed, y_test))

0.5982039094617482
0.19747301885075785


Both tree models are very ovefit- adding regularization and gridsearching over to find best params could improve the models and improve the testing r2. However, despite being overfit, the random forest model with an r2 of .28 is promising. Both models are potentially decent starting points and using a combination of new data (weather) and feature engineering (holidays and inspector experience) could further improve the models' success. 

In [20]:
# AdaBoost Model
ada = AdaBoostRegressor()
ada.fit(X_train_transformed, y_train)
print(ada.score(X_train_transformed, y_train))
print(ada.score(X_test_transformed, y_test))

0.1631178499776783
0.057888955598118574


In [22]:
# Gradient Boost Model
grad = GradientBoostingRegressor()
grad.fit(X_train_transformed, y_train)
print(grad.score(X_train_transformed, y_train))
print(grad.score(X_test_transformed, y_test))

0.4110300421256725
0.2711720175379255


In [27]:
# KNN model
sc = StandardScaler()
sc.fit(X_train_transformed)
X_train_sc = sc.tranform(X_train_transformed)
X_test_sc = sc.transform(X_test_transformed)

knn = KNeighborsRegressor()
knn.fit(X_train_sc, y_train)
print(knn.score(X_train_sc, y_train))
print(knn.score(X_test_sc, y_test))

ValueError: Cannot center sparse matrices: pass `with_mean=False` instead. See docstring for motivation and alternatives.

In [2]:
# Initial model with weather

df = pd.read_csv('data/df_with_weather.csv')

In [3]:
df.head()

Unnamed: 0,DATE1,DAY_OF_WEEK,SITE_NAME,TOWN,WATERBODY,INSPECTOR_ID,PAY_VOL,SHIFT_START,TRAILERS,SHIFT_END,SHIFT_LENGTH,TOTALINSP,PRCP,TMAX,TMIN,TOBS
0,2021-05-28,Fri,Launch Drive,Monmouth,Cobbosseecontee Lake,4771,Paid,12:00,10.0,18:00,360,33.0,0.0,72.0,45.0,49.0
1,2021-05-28,Fri,Whippoorwill Road,Litchfield,Woodbury Pond,4174,Paid,12:00,1.0,18:00,360,13.0,0.0,72.0,45.0,49.0
2,2021-05-28,Fri,Augusta West Kampground,Winthrop,Annabessacook Lake,4769,Paid,12:00,0.0,18:00,360,1.0,0.0,72.0,45.0,49.0
3,2021-05-28,Fri,East Winthrop,Winthrop,Cobbosseecontee Lake,4485,Paid,12:00,1.0,18:00,360,2.0,0.0,72.0,45.0,49.0
4,2021-05-29,Sat,Thorofare Rd,Litchfield,Pleasant Pond,4944,Paid,7:00,1.0,17:00,600,11.0,0.02,62.0,45.0,46.0


In [96]:
df.drop(columns=['TRAILERS'], inplace=True)

In [97]:
df.isna().sum()

DATE1            0
DAY_OF_WEEK      0
SITE_NAME        0
TOWN             0
WATERBODY        0
INSPECTOR_ID     0
PAY_VOL          0
SHIFT_START      0
SHIFT_END        0
SHIFT_LENGTH     0
TOTALINSP        2
PRCP            27
TMAX            91
TMIN            95
TOBS            55
dtype: int64

In [98]:
df.shape

(2669, 15)

In [99]:
df.dropna(inplace=True)

In [100]:
df.isna().sum()

DATE1           0
DAY_OF_WEEK     0
SITE_NAME       0
TOWN            0
WATERBODY       0
INSPECTOR_ID    0
PAY_VOL         0
SHIFT_START     0
SHIFT_END       0
SHIFT_LENGTH    0
TOTALINSP       0
PRCP            0
TMAX            0
TMIN            0
TOBS            0
dtype: int64

In [101]:
df.shape

(2565, 15)

In [102]:
df.head()

Unnamed: 0,DATE1,DAY_OF_WEEK,SITE_NAME,TOWN,WATERBODY,INSPECTOR_ID,PAY_VOL,SHIFT_START,SHIFT_END,SHIFT_LENGTH,TOTALINSP,PRCP,TMAX,TMIN,TOBS
0,2021-05-28,Fri,Launch Drive,Monmouth,Cobbosseecontee Lake,4771,Paid,12:00,18:00,360,33.0,0.0,72.0,45.0,49.0
1,2021-05-28,Fri,Whippoorwill Road,Litchfield,Woodbury Pond,4174,Paid,12:00,18:00,360,13.0,0.0,72.0,45.0,49.0
2,2021-05-28,Fri,Augusta West Kampground,Winthrop,Annabessacook Lake,4769,Paid,12:00,18:00,360,1.0,0.0,72.0,45.0,49.0
3,2021-05-28,Fri,East Winthrop,Winthrop,Cobbosseecontee Lake,4485,Paid,12:00,18:00,360,2.0,0.0,72.0,45.0,49.0
4,2021-05-29,Sat,Thorofare Rd,Litchfield,Pleasant Pond,4944,Paid,7:00,17:00,600,11.0,0.02,62.0,45.0,46.0


In [103]:
# Convert 'date1' to datetime
df['DATE'] = pd.to_datetime(df['DATE1'], format='%Y/%m/%d')

# Create 'month' and 'year' columns
df['month'] = df['DATE'].dt.month
df['year'] = df['DATE'].dt.year

In [104]:
df.head()

Unnamed: 0,DATE1,DAY_OF_WEEK,SITE_NAME,TOWN,WATERBODY,INSPECTOR_ID,PAY_VOL,SHIFT_START,SHIFT_END,SHIFT_LENGTH,TOTALINSP,PRCP,TMAX,TMIN,TOBS,DATE,month,year
0,2021-05-28,Fri,Launch Drive,Monmouth,Cobbosseecontee Lake,4771,Paid,12:00,18:00,360,33.0,0.0,72.0,45.0,49.0,2021-05-28,5,2021
1,2021-05-28,Fri,Whippoorwill Road,Litchfield,Woodbury Pond,4174,Paid,12:00,18:00,360,13.0,0.0,72.0,45.0,49.0,2021-05-28,5,2021
2,2021-05-28,Fri,Augusta West Kampground,Winthrop,Annabessacook Lake,4769,Paid,12:00,18:00,360,1.0,0.0,72.0,45.0,49.0,2021-05-28,5,2021
3,2021-05-28,Fri,East Winthrop,Winthrop,Cobbosseecontee Lake,4485,Paid,12:00,18:00,360,2.0,0.0,72.0,45.0,49.0,2021-05-28,5,2021
4,2021-05-29,Sat,Thorofare Rd,Litchfield,Pleasant Pond,4944,Paid,7:00,17:00,600,11.0,0.02,62.0,45.0,46.0,2021-05-29,5,2021


In [105]:
df.SHIFT_START = pd.to_datetime(df.SHIFT_START, format='%H:%M')

In [106]:
df.SHIFT_END = pd.to_datetime(df.SHIFT_END)

In [107]:
X = df.drop(columns=['DATE1', 'TOWN', 'WATERBODY', 'INSPECTOR_ID', 'TOTALINSP', 'TOBS', 'DATE'])
y = df.TOTALINSP            

In [108]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=.2)

In [109]:
categorical_columns = ['SITE_NAME', 'DAY_OF_WEEK', 'month', 'year', 'PAY_VOL']
oh = OneHotEncoder(handle_unknown='ignore', drop='first')
X_train_transformed = oh.fit_transform(X_train[categorical_columns])
X_test_transformed= oh.transform(X_test[categorical_columns])

In [110]:
lr = LinearRegression()
lr.fit(X_train_transformed, y_train)
print(lr.score(X_train_transformed, y_train))
print(lr.score(X_test_transformed, y_test))

0.3513358726642821
0.33841873049366566


In [111]:
rf = RandomForestRegressor()
rf.fit(X_train_transformed, y_train)
print(rf.score(X_train_transformed, y_train))
print(rf.score(X_test_transformed, y_test))      

0.6434682293497731
0.3232590228474176


In [112]:
et = ExtraTreesRegressor()
et.fit(X_train_transformed, y_train)
print(et.score(X_train_transformed, y_train))
print(et.score(X_test_transformed, y_test))

0.6573212547527139
0.25420966053172644


From these initial models, adding weather data seems to improve the model performance significantly. The models become significantly less overgit. The next step will be to try to add in the holidays as another column, and then to use feature engineering with shift length, shift start and shift end to attempt to somehow capture time of day.

The models were all less overfit, however the tree models are still overfit. Parameter optimization could help reduce the overfit as well to balance out the metrics of the tree based models between the training and test scores. 

#### Feature Engineering Options:
- Transform the shift_start and end_shift columns from numbers to a categorical column of morning, late morning, afternoon, late afternoon, and evening
- Interaction variables?
- Holidays

In [113]:
# Converting times to 'time of day'
df.head(1)

Unnamed: 0,DATE1,DAY_OF_WEEK,SITE_NAME,TOWN,WATERBODY,INSPECTOR_ID,PAY_VOL,SHIFT_START,SHIFT_END,SHIFT_LENGTH,TOTALINSP,PRCP,TMAX,TMIN,TOBS,DATE,month,year
0,2021-05-28,Fri,Launch Drive,Monmouth,Cobbosseecontee Lake,4771,Paid,1900-01-01 12:00:00,2023-12-03 18:00:00,360,33.0,0.0,72.0,45.0,49.0,2021-05-28,5,2021


In [114]:
df.SHIFT_START.value_counts()

1900-01-01 07:00:00    902
1900-01-01 13:00:00    604
1900-01-01 09:00:00    241
1900-01-01 12:00:00    164
1900-01-01 08:00:00    119
                      ... 
1900-01-01 08:17:00      1
1900-01-01 07:40:00      1
1900-01-01 04:41:00      1
1900-01-01 12:55:00      1
1900-01-01 09:40:00      1
Name: SHIFT_START, Length: 139, dtype: int64

In [115]:
df.dtypes

DATE1                   object
DAY_OF_WEEK             object
SITE_NAME               object
TOWN                    object
WATERBODY               object
INSPECTOR_ID             int64
PAY_VOL                 object
SHIFT_START     datetime64[ns]
SHIFT_END       datetime64[ns]
SHIFT_LENGTH             int64
TOTALINSP              float64
PRCP                   float64
TMAX                   float64
TMIN                   float64
TOBS                   float64
DATE            datetime64[ns]
month                    int64
year                     int64
dtype: object

In [116]:
df['SHIFT_START_CATEGORY'] = pd.cut(pd.to_datetime(df['SHIFT_START'], format='%H:%M').dt.hour,
                                    bins =[-1, 8, 11, 14, 17, 24],
                                    labels=['Early Morning', 'Morning', 'Afternoon', 'Late Afternoon', 'Evening'])
                        

In [117]:
df.isna().sum()

DATE1                   0
DAY_OF_WEEK             0
SITE_NAME               0
TOWN                    0
WATERBODY               0
INSPECTOR_ID            0
PAY_VOL                 0
SHIFT_START             0
SHIFT_END               0
SHIFT_LENGTH            0
TOTALINSP               0
PRCP                    0
TMAX                    0
TMIN                    0
TOBS                    0
DATE                    0
month                   0
year                    0
SHIFT_START_CATEGORY    0
dtype: int64

In [118]:
df.SHIFT_START_CATEGORY.value_counts()

Early Morning     1202
Afternoon          871
Morning            371
Late Afternoon     105
Evening             16
Name: SHIFT_START_CATEGORY, dtype: int64

In [119]:
df['SHIFT_END_CATEGORY'] = pd.cut(pd.to_datetime(df['SHIFT_END'], format='%H:%M').dt.hour,
                                    bins =[-1, 8, 11, 14, 17, 19, 24],
                                    labels=['Early Morning', 'Morning', 'Afternoon', 'Late Afternoon', 'Evening', 'Night'])
                        

In [120]:
df.SHIFT_END_CATEGORY.value_counts()

Evening           1004
Afternoon          810
Late Afternoon     511
Morning            108
Early Morning       88
Night               44
Name: SHIFT_END_CATEGORY, dtype: int64

In [121]:
X1 = df.drop(columns=['DATE1', 'TOWN', 'WATERBODY', 'INSPECTOR_ID', 'TOTALINSP', 'TOBS', 'DATE', 'SHIFT_START', 'SHIFT_END'])
y1 = df.TOTALINSP            

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, random_state=42, test_size=.2)

In [122]:
categorical_columns = ['SITE_NAME', 'DAY_OF_WEEK', 'month', 'year', 'PAY_VOL', 'SHIFT_START_CATEGORY', 'SHIFT_END_CATEGORY']
oh = OneHotEncoder(handle_unknown='ignore', drop='first')
X1_train_transformed = oh.fit_transform(X1_train[categorical_columns])
X1_test_transformed= oh.transform(X1_test[categorical_columns])

In [123]:
lr1 = LinearRegression()
lr1.fit(X1_train_transformed, y1_train)
print(lr1.score(X1_train_transformed, y1_train))
print(lr1.score(X1_test_transformed, y1_test))

0.43299897430845313
0.45589238044822944


In [124]:
rf1 = RandomForestRegressor()
rf1.fit(X1_train_transformed, y1_train)
print(rf1.score(X1_train_transformed, y1_train))
print(rf1.score(X1_test_transformed, y1_test))     

0.7985128527367691
0.3845439939140577


In [125]:
et1 =ExtraTreesRegressor()
et1.fit(X1_train_transformed, y1_train)
print(et1.score(X1_train_transformed, y1_train))
print(et1.score(X1_test_transformed, y1_test))      

0.8333177482959641
0.18467138568428598


In [126]:
# List of holidays
# 2021: Monday, May 31 Memorial Day, Sunday July 4 (Observed Monday), Monday September 6 (Labor Day), Sunday May 9 (Mother's Day), Father's Day (Sunday, June 20)
# 2022: Sunday May 08, Mother's Day, 'Monday May 30 Memorial Day, Monday September 5 Labor Day, Monday July 4, Sunday June 19 Father's Day
# 2023: Monday, May 29 Memorial Day, Monday June19 Juneteenth, Tuesday, July 04, Monday September 04 Labor Day, Mother's Day Sunday May 14, Father's Day Sunday June 18
#'2021-05-09', '2022-05-30', , '2023-06-19'
holiday_list = ['2021-05-31', '2021-07-04', '2021-09-06', '2021-06-20',
                '2022-05-08', '2022-05-30', '2022-07-04', '2022-06-19',
                '2023-05-29', '2023-07-04', '2023-09-04', '2023-06-18']


In [127]:
df['holiday']= np.where(df['DATE'].isin(holiday_list), 1, 0)

In [128]:
df.sample(25)

Unnamed: 0,DATE1,DAY_OF_WEEK,SITE_NAME,TOWN,WATERBODY,INSPECTOR_ID,PAY_VOL,SHIFT_START,SHIFT_END,SHIFT_LENGTH,...,PRCP,TMAX,TMIN,TOBS,DATE,month,year,SHIFT_START_CATEGORY,SHIFT_END_CATEGORY,holiday
272,2021-07-16,Fri,Augusta West Kampground,Winthrop,Annabessacook Lake,4769,Paid,1900-01-01 12:00:00,2023-12-03 18:00:00,360,...,0.0,88.0,67.0,74.0,2021-07-16,7,2021,Afternoon,Evening,0
1523,2022-08-28,Sun,Launch Drive Cobbosseecontee,Monmouth,Cobbosseecontee Lake,5580,Paid,1900-01-01 07:00:00,2023-12-03 13:00:00,360,...,0.0,78.0,62.0,67.0,2022-08-28,8,2022,Early Morning,Afternoon,0
1671,2022-09-17,Sat,East Winthrop Cobbosseecontee,Winthrop,Cobbosseecontee Lake,3504,Paid,1900-01-01 13:00:00,2023-12-03 19:00:00,360,...,0.0,67.0,44.0,50.0,2022-09-17,9,2022,Afternoon,Evening,0
2043,2023-07-09,Sun,Holmes Road Annabessacook,Winthrop,Annabessacook Lake,5648,Paid,1900-01-01 13:00:00,2023-12-03 19:00:00,360,...,0.01,79.0,68.0,70.0,2023-07-09,7,2023,Afternoon,Evening,0
2199,2023-07-26,Wed,East Winthrop Cobbosseecontee,Winthrop,Cobbosseecontee Lake,5670,Paid,1900-01-01 07:00:00,2023-12-03 13:00:00,360,...,1.24,88.0,65.0,71.0,2023-07-26,7,2023,Early Morning,Afternoon,0
361,2021-07-31,Sat,East Winthrop,Winthrop,Cobbosseecontee Lake,5367,Paid,1900-01-01 07:00:00,2023-12-03 13:00:00,360,...,0.0,75.0,56.0,63.0,2021-07-31,7,2021,Early Morning,Afternoon,0
561,2021-08-28,Sat,Wilson Pond Road,Monmouth,Wilson Pond,4770,Paid,1900-01-01 07:00:00,2023-12-03 17:00:00,600,...,0.0,84.0,54.0,62.0,2021-08-28,8,2021,Early Morning,Late Afternoon,0
484,2021-08-15,Sun,Augusta West Kampground,Winthrop,Annabessacook Lake,4769,Paid,1900-01-01 08:00:00,2023-12-03 18:00:00,600,...,0.34,89.0,60.0,65.0,2021-08-15,8,2021,Early Morning,Evening,0
570,2021-08-28,Sat,Old Kents Hill Road,Readfield,Torsey Lake,5451,Volunteer,1900-01-01 15:00:00,2023-12-03 17:00:00,120,...,0.0,84.0,54.0,62.0,2021-08-28,8,2021,Late Afternoon,Late Afternoon,0
744,2022-06-12,Sun,Launch Drive Cobbosseecontee,Monmouth,Cobbosseecontee Lake,5005,Paid,1900-01-01 13:00:00,2023-12-03 19:00:00,360,...,0.01,78.0,53.0,68.0,2022-06-12,6,2022,Afternoon,Evening,0


In [129]:
X2 = df.drop(columns=['DATE1', 'TOWN', 'WATERBODY', 'INSPECTOR_ID', 'TOTALINSP', 'TOBS', 'DATE', 'SHIFT_START', 'SHIFT_END'])
y2 = df.TOTALINSP            

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=42, test_size=.2)

In [130]:
categorical_columns = ['SITE_NAME', 'DAY_OF_WEEK', 'month', 'year', 'PAY_VOL', 'SHIFT_START_CATEGORY', 'SHIFT_END_CATEGORY']
oh = OneHotEncoder(handle_unknown='ignore', drop='first')
X2_train_transformed = oh.fit_transform(X2_train[categorical_columns])
X2_test_transformed= oh.transform(X2_test[categorical_columns])

In [131]:
lr2 = LinearRegression()
lr2.fit(X2_train_transformed, y2_train)
print(lr2.score(X2_train_transformed, y2_train))
print(lr2.score(X2_test_transformed, y2_test))

0.43299897430845313
0.45589238044822944


In [132]:
rf2 = RandomForestRegressor()
rf2.fit(X2_train_transformed, y2_train)
print(rf2.score(X2_train_transformed, y2_train))
print(rf1.score(X2_test_transformed, y2_test))    

0.7988425326030364
0.3845439939140577


In [133]:
et2 =ExtraTreesRegressor()
et2.fit(X2_train_transformed, y2_train)
print(et2.score(X2_train_transformed, y2_train))
print(et2.score(X2_test_transformed, y2_test))     

0.8333177482959641
0.18351178227378473


In [134]:
# List of holidays
# 2021: Monday, May 31 Memorial Day, Sunday July 4 (Observed Monday), Monday September 6 (Labor Day), Sunday May 9 (Mother's Day), Father's Day (Sunday, June 20)
# 2022: Sunday May 08, Mother's Day, 'Monday May 30 Memorial Day, Monday September 5 Labor Day, Monday July 4, Sunday June 19 Father's Day
# 2023: Monday, May 29 Memorial Day, Monday June19 Juneteenth, Tuesday, July 04, Monday September 04 Labor Day, Mother's Day Sunday May 14, Father's Day Sunday June 18
#'2021-05-09', '2022-05-08', , '2023-06-19'
holiday_list_weekends = ['2021-05-31', '2021-05-29', '2021-05-30', '2021-07-04', '2021-07-03', '2021-07-05', '2021-09-06', '2021-09-04', '2021-09-05', '2021-06-19', '2021-06-20',
                '2022-05-28', '2022-05-29', '2022-05-30', '2022-07-02', '2022-07-03', '2022-07-04', '2022-06-19', '2022-09-03', '2022-09-04', '2022-06-18',
                '2023-05-29', '2023-05-27', '2023-05-28', '2023-07-01', '2023-07-02', '2023-07-04', '2023-09-02', '2023-09-03', '2023-09-04', '2023-06-18', '2023-06-17']

In [135]:
df['holiday']= np.where(df['DATE'].isin(holiday_list_weekends), 1, 0)

In [136]:
X3 = df.drop(columns=['DATE1', 'TOWN', 'WATERBODY', 'INSPECTOR_ID', 'TOTALINSP', 'TOBS', 'DATE', 'SHIFT_START', 'SHIFT_END'])
y3 = df.TOTALINSP            

X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, random_state=42, test_size=.2)

In [137]:
categorical_columns = ['SITE_NAME', 'DAY_OF_WEEK', 'month', 'year', 'PAY_VOL', 'SHIFT_START_CATEGORY', 'SHIFT_END_CATEGORY']
oh = OneHotEncoder(handle_unknown='ignore', drop='first')
X3_train_transformed = oh.fit_transform(X3_train[categorical_columns])
X3_test_transformed= oh.transform(X3_test[categorical_columns])

In [138]:
lr3 = LinearRegression()
lr3.fit(X3_train_transformed, y3_train)
print(lr3.score(X3_train_transformed, y3_train))
print(lr3.score(X3_test_transformed, y3_test))

0.43299897430845313
0.45589238044822944


In [139]:
rf3 = RandomForestRegressor()
rf3.fit(X3_train_transformed, y3_train)
print(rf3.score(X3_train_transformed, y3_train))
print(rf3.score(X3_test_transformed, y3_test))    

0.7988694185175699
0.3861419926788837


In [140]:
et3 =ExtraTreesRegressor()
et3.fit(X3_train_transformed, y3_train)
print(et3.score(X3_train_transformed, y3_train))
print(et3.score(X3_test_transformed, y3_test))     

0.8333177482959641
0.187679792316212


In [46]:
df['holiday'].value_counts()

0    2139
1     426
Name: holiday, dtype: int64

In [None]:
# Filling in the weather data using forward fill rather than dropping made the models worse not better.
# Filling in the weather data using back fill was better than forward fil- compare to dropping and compare to fill using other station if possible

In [4]:
df1 = pd.read_csv('data/df1_with_weather1.csv')

In [5]:
df1.head()

Unnamed: 0,DATE1,DAY_OF_WEEK,SITE_NAME,TOWN,WATERBODY,INSPECTOR_ID,PAY_VOL,SHIFT_START,TRAILERS,SHIFT_END,SHIFT_LENGTH,TOTALINSP,NAME,PRCP,TMAX,TMIN,ID_COUNT
0,2021-05-28,Fri,Launch Drive,Monmouth,Cobbosseecontee Lake,4771,Paid,12:00,10.0,18:00,360,33.0,"WINTHROP, ME US",0.0,72.0,45.0,55
1,2021-05-28,Fri,Whippoorwill Road,Litchfield,Woodbury Pond,4174,Paid,12:00,1.0,18:00,360,13.0,"WINTHROP, ME US",0.0,72.0,45.0,131
2,2021-05-28,Fri,Augusta West Kampground,Winthrop,Annabessacook Lake,4769,Paid,12:00,0.0,18:00,360,1.0,"WINTHROP, ME US",0.0,72.0,45.0,56
3,2021-05-28,Fri,East Winthrop,Winthrop,Cobbosseecontee Lake,4485,Paid,12:00,1.0,18:00,360,2.0,"WINTHROP, ME US",0.0,72.0,45.0,16
4,2021-05-29,Sat,Thorofare Rd,Litchfield,Pleasant Pond,4944,Paid,7:00,1.0,17:00,600,11.0,"WINTHROP, ME US",0.02,62.0,45.0,57


In [6]:
df1.drop(columns=['TRAILERS'], inplace=True)
df1.dropna(inplace=True)

In [7]:
# Convert 'date1' to datetime
df1['DATE'] = pd.to_datetime(df1['DATE1'], format='%Y/%m/%d')

# Create 'month' and 'year' columns
df1['month'] = df1['DATE'].dt.month
df1['year'] = df1['DATE'].dt.year

In [8]:
df1.SHIFT_START = pd.to_datetime(df1.SHIFT_START, format='%H:%M')
df1.SHIFT_END = pd.to_datetime(df1.SHIFT_END)

In [9]:
df1['SHIFT_START_CATEGORY'] = pd.cut(pd.to_datetime(df1['SHIFT_START'], format='%H:%M').dt.hour,
                                    bins =[-1, 8, 11, 14, 17, 24],
                                    labels=['Early Morning', 'Morning', 'Afternoon', 'Late Afternoon', 'Evening'])
           

In [10]:
df1['SHIFT_END_CATEGORY'] = pd.cut(pd.to_datetime(df1['SHIFT_END'], format='%H:%M').dt.hour,
                                    bins =[-1, 8, 11, 14, 17, 19, 24],
                                    labels=['Early Morning', 'Morning', 'Afternoon', 'Late Afternoon', 'Evening', 'Night'])
                        

In [11]:
holiday_list_weekends = ['2021-05-31', '2021-05-29', '2021-05-30', '2021-07-04', '2021-07-03', '2021-07-05', '2021-09-06', '2021-09-04', '2021-09-05', '2021-06-19', '2021-06-20',
                '2022-05-28', '2022-05-29', '2022-05-30', '2022-07-02', '2022-07-03', '2022-07-04', '2022-06-19', '2022-09-03', '2022-09-04', '2022-06-18',
                '2023-05-29', '2023-05-27', '2023-05-28', '2023-07-01', '2023-07-02', '2023-07-04', '2023-09-02', '2023-09-03', '2023-09-04', '2023-06-18', '2023-06-17']
df1['holiday']= np.where(df1['DATE'].isin(holiday_list_weekends), 1, 0)

In [13]:
X1 = df1.drop(columns=['DATE1', 'TOWN', 'WATERBODY', 'INSPECTOR_ID', 'TOTALINSP', 'DATE', 'SHIFT_START', 'SHIFT_END'])
y1 = df1.TOTALINSP            

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, random_state=42, test_size=.2)

In [14]:
categorical_columns = ['SITE_NAME', 'DAY_OF_WEEK', 'month', 'year', 'PAY_VOL', 'SHIFT_START_CATEGORY', 'SHIFT_END_CATEGORY']
oh = OneHotEncoder(handle_unknown='ignore', drop='first')
X1_train_transformed = oh.fit_transform(X1_train[categorical_columns])
X1_test_transformed= oh.transform(X1_test[categorical_columns])

In [15]:
lr1 = LinearRegression()
lr1.fit(X1_train_transformed, y1_train)
print(lr1.score(X1_train_transformed, y1_train))
print(lr1.score(X1_test_transformed, y1_test))

0.42266411837465556
0.48353193229186


In [16]:
rf1 = RandomForestRegressor()
rf1.fit(X1_train_transformed, y1_train)
print(rf1.score(X1_train_transformed, y1_train))
print(rf1.score(X1_test_transformed, y1_test))    

0.7893543495511808
0.44322031454391253


In [17]:
et1 =ExtraTreesRegressor()
et1.fit(X1_train_transformed, y1_train)
print(et1.score(X1_train_transformed, y1_train))
print(et1.score(X1_test_transformed, y1_test))   

0.8243313162085231
0.20099809447507122


In [18]:
df1 = pd.read_csv('data/df1_with_weather1.csv')

In [19]:
df1.drop(columns=['TRAILERS'], inplace=True)
df1.dropna(inplace=True)

In [20]:
# Convert 'date1' to datetime
df1['DATE'] = pd.to_datetime(df1['DATE1'], format='%Y/%m/%d')

# Create 'month' and 'year' columns
df1['month'] = df1['DATE'].dt.month
df1['year'] = df1['DATE'].dt.year

In [21]:
df1.SHIFT_START = pd.to_datetime(df1.SHIFT_START, format='%H:%M')
df1.SHIFT_END = pd.to_datetime(df1.SHIFT_END)

In [22]:
df1['SHIFT_START_CATEGORY'] = pd.cut(pd.to_datetime(df1['SHIFT_START'], format='%H:%M').dt.hour,
                                    bins =[-1, 8, 11, 14, 17, 24],
                                    labels=['Early Morning', 'Morning', 'Afternoon', 'Late Afternoon', 'Evening'])
           

In [23]:
df1['SHIFT_END_CATEGORY'] = pd.cut(pd.to_datetime(df1['SHIFT_END'], format='%H:%M').dt.hour,
                                    bins =[-1, 8, 11, 14, 17, 19, 24],
                                    labels=['Early Morning', 'Morning', 'Afternoon', 'Late Afternoon', 'Evening', 'Night'])
                        

In [24]:
# Taking father's day out of the holiday weekend
holiday_list_weekends = ['2021-05-31', '2021-05-29', '2021-05-30', '2021-07-04', '2021-07-03', '2021-07-05', '2021-09-06', '2021-09-04', '2021-09-05',
                '2022-05-28', '2022-05-29', '2022-05-30', '2022-07-02', '2022-07-03', '2022-07-04', '2022-09-03', '2022-09-04',
                '2023-05-29', '2023-05-27', '2023-05-28', '2023-07-01', '2023-07-02', '2023-07-04', '2023-09-02', '2023-09-03', '2023-09-04']
df1['holiday']= np.where(df1['DATE'].isin(holiday_list_weekends), 1, 0)

In [25]:
X1 = df1.drop(columns=['DATE1', 'TOWN', 'WATERBODY', 'INSPECTOR_ID', 'TOTALINSP', 'DATE', 'SHIFT_START', 'SHIFT_END'])
y1 = df1.TOTALINSP            

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, random_state=42, test_size=.2)

In [26]:
categorical_columns = ['SITE_NAME', 'DAY_OF_WEEK', 'month', 'year', 'PAY_VOL', 'SHIFT_START_CATEGORY', 'SHIFT_END_CATEGORY']
oh = OneHotEncoder(handle_unknown='ignore', drop='first')
X1_train_transformed = oh.fit_transform(X1_train[categorical_columns])
X1_test_transformed= oh.transform(X1_test[categorical_columns])

In [27]:
lr1 = LinearRegression()
lr1.fit(X1_train_transformed, y1_train)
print(lr1.score(X1_train_transformed, y1_train))
print(lr1.score(X1_test_transformed, y1_test))

0.42266411837465556
0.48353193229186


In [28]:
rf1 = RandomForestRegressor()
rf1.fit(X1_train_transformed, y1_train)
print(rf1.score(X1_train_transformed, y1_train))
print(rf1.score(X1_test_transformed, y1_test))   

0.7903965289224963
0.44141685599080527


In [29]:
et1 =ExtraTreesRegressor()
et1.fit(X1_train_transformed, y1_train)
print(et1.score(X1_train_transformed, y1_train))
print(et1.score(X1_test_transformed, y1_test))   

0.8243313162085231
0.20093195438709655


### Progress Check-in
- Adding the weather data made a signficant improvement
    - Need to determine the best way to deal with the missing weather data (drop, backfill, fill with different station- not front fill)
- Adding the number of shifts worked by the inspector improved the tree based model
    - Should I use more feature engineering and further manipulate this data?
- Linear Regression is decent and not overfit: maybe not solid enough yet for a production model?
- Tree models very overfit, but random forest has a decent testing R2 and if overfit is reduced this may be a winning model

In [30]:
ada = AdaBoostRegressor()
ada.fit(X1_train_transformed, y1_train)
print(ada.score(X1_train_transformed, y1_train))
print(ada.score(X1_test_transformed, y1_test))

0.23092726105173633
0.2081944617287067


In [31]:
grad = GradientBoostingRegressor()
grad.fit(X1_train_transformed, y1_train)
print(grad.score(X1_train_transformed, y1_train))
print(grad.score(X1_test_transformed, y1_test))

0.5272917851498473
0.5208917615164887


In [None]:
# Try some initial gridsearching
rf1 = RandomForestRegressor()
rf1.fit(X1_train_transformed, y1_train)
print(rf1.score(X1_train_transformed, y1_train))
print(rf1.score(X1_test_transformed, y1_test))   


In [32]:
grad = GradientBoostingRegressor()

pgrid = {
    'estimator__learning_rate': [0.1, 1, 10],
    'estimator__n_estimators': [10, 100],
    'estimator__max_depth': [None, 1, 2, 3]
}

kf1 = KFold(n_splits=10, shuffle=True, random_state=42)
gs1 = GridSearchCV(grad, pgrid, cv=kf1, n_jobs=-1)

NameError: name 'KFold' is not defined

In [33]:
params = {
    'max_features': np.arange(5, X1.shape[1] + 1),
    'max_depth': np.append(np.arange(1, 50), None),
    'min_samples_leaf': [2, 3],   
    'n_estimators': [50, 100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

rf_gb = GradientBoostingRegressor(random_state = 42)

rf_gb_county_cases = RandomizedSearchCV(rf_gb, params, n_iter=100, cv = 5, n_jobs = -1)

In [34]:
%%time
rf_gb.fit(X1_train_transformed, y1_train)

CPU times: total: 3.23 s
Wall time: 22min 10s


In [35]:
print(f'Train Score: {rf_gb.score(X1_train_transformed, y1_train)}')
print(f'Test Score: {rf_gb.score(X1_test_transformed, y1_test)}')

Train Score: 0.6150099401370822
Test Score: 0.5473442855804718


In [36]:
rf_gb.best_params_

{'n_estimators': 600,
 'min_samples_leaf': 2,
 'max_features': 10,
 'max_depth': 3}

In [39]:
params = {
    'max_features': np.arange(1, X1.shape[1] + 1), 
    'max_depth': np.append(np.arange(1, 10), None), 
    'min_samples_leaf': np.arange(1, 31) 
}
kf = KFold(n_splits=10, shuffle=True, random_state=2023)
ranfor = RandomForestRegressor(
    n_estimators=100,
    random_state=2023
)

gs2 = GridSearchCV(ranfor, params, cv=kf, n_jobs=2)

In [40]:
%%time
gs2.fit(X1_train_transformed, y1_train)

KeyboardInterrupt: 

In [None]:
print(gs2.score(X1_train_transformed, y1_train))
print(gs2.score(X1_test_transformed, y1_test))

In [None]:
gs2.best_params_