In [1]:
import utils
import math
import pandas as pd
import numpy as np
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

In [3]:
df = pd.read_csv('/Users/allen/Documents/data_512/Data/call_data_filtered.csv')

df['Original Time Queued'] = pd.to_datetime(df['Original Time Queued'])

# Features

In [4]:
ds = df.copy()

In [5]:
ds['Call_Hour'] = ds['Original Time Queued'].dt.hour
ds['Call_Weekday'] = ds['Original Time Queued'].dt.weekday

ds['Call_Hour_Bin'] = ds['Call_Hour'].apply(utils.create_hour_bin)


# extract holiday info
cal = calendar()
holidays = cal.holidays(start=ds['Original Time Queued'].min(), end=ds['Original Time Queued'].max())
ds['is_Holiday'] = ds['Original Time Queued'].dt.date.isin(pd.Series(holidays).dt.date).astype(int)

# encoding
ds = ds[['Call Type','Priority','Precinct','Sector','response_time','Call_Weekday','Call_Hour_Bin','is_Holiday']]

ds = pd.get_dummies(ds, 
               columns=['Call Type','Priority','Precinct','Sector','Call_Weekday','Call_Hour_Bin'],
               prefix=['Call Type','Priority','Precinct','Sector','Call_Weekday','Call_Hour_Bin'])

# Train and Validation

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_val, y_train, y_val = train_test_split(ds.drop(columns=['response_time']),
                                                  ds['response_time'],
                                                  test_size=0.2, 
                                                  random_state=10)

# Random Forest

In [12]:
from sklearn.ensemble import RandomForestRegressor

In [13]:
forest_reg = RandomForestRegressor(n_estimators=20, random_state=10, criterion='rmse',
                                  min_samples_leaf=10)
forest_reg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=20,
                      n_jobs=None, oob_score=False, random_state=10, verbose=0,
                      warm_start=False)

In [14]:
pred_forest = forest_reg.predict(X_val)

In [22]:
math.sqrt(sum((y_val - pred_forest)**2))

7745.610398011465

In [None]:
# param_grid = [{'max_depth': [10, 20, 30],
#                'min_samples_leaf': [1, 2, 4],
#                'min_samples_split': [2, 5, 10],
#                'n_estimators': [100, 200, 400]}]

# forest_clf = RandomForestRegressor(n_estimators=20, random_state=10)

# grid_search_full = GridSearchCV(forest_clf, param_grid, cv=3, verbose=3, n_jobs=-1)
# grid_search_full.fit(X, y)

# Xgboost

In [2]:
import xgboost as xgb

In [9]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

In [12]:
param = {
    'objective': 'reg:squarederror',
    'eva_metric': 'rmse',
    'eta': 0.03,
    'subsample': 0.8,
    'colsample': 0.8,
    'max_depth': 6
}

num_round = 500

In [14]:
# specify validations set to watch performance
watchlist = [(dval, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, 
                dtrain, 
                num_round,
                watchlist,
                early_stopping_rounds=10)

[0]	eval-rmse:21.1554	train-rmse:21.0545
Multiple eval metrics have been passed: 'train-rmse' will be used for early stopping.

Will train until train-rmse hasn't improved in 10 rounds.
[1]	eval-rmse:20.775	train-rmse:20.6748
[2]	eval-rmse:20.4102	train-rmse:20.3105
[3]	eval-rmse:20.0606	train-rmse:19.9623
[4]	eval-rmse:19.7267	train-rmse:19.628
[5]	eval-rmse:19.4067	train-rmse:19.3076
[6]	eval-rmse:19.1008	train-rmse:19.0016
[7]	eval-rmse:18.808	train-rmse:18.7085
[8]	eval-rmse:18.5287	train-rmse:18.4291
[9]	eval-rmse:18.2619	train-rmse:18.1628
[10]	eval-rmse:18.0072	train-rmse:17.9082
[11]	eval-rmse:17.7638	train-rmse:17.6651
[12]	eval-rmse:17.5323	train-rmse:17.4336
[13]	eval-rmse:17.311	train-rmse:17.2131
[14]	eval-rmse:17.1002	train-rmse:17.0029
[15]	eval-rmse:16.8995	train-rmse:16.8028
[16]	eval-rmse:16.7082	train-rmse:16.612
[17]	eval-rmse:16.5262	train-rmse:16.4312
[18]	eval-rmse:16.3532	train-rmse:16.2586
[19]	eval-rmse:16.1886	train-rmse:16.0943
[20]	eval-rmse:16.0322	train-r

[189]	eval-rmse:13.2159	train-rmse:13.1581
[190]	eval-rmse:13.2158	train-rmse:13.158
[191]	eval-rmse:13.2157	train-rmse:13.1579
[192]	eval-rmse:13.2156	train-rmse:13.1577
[193]	eval-rmse:13.2156	train-rmse:13.1577
[194]	eval-rmse:13.2155	train-rmse:13.1575
[195]	eval-rmse:13.2154	train-rmse:13.1574
[196]	eval-rmse:13.2153	train-rmse:13.1573
[197]	eval-rmse:13.2152	train-rmse:13.1572
[198]	eval-rmse:13.2151	train-rmse:13.1571
[199]	eval-rmse:13.215	train-rmse:13.157
[200]	eval-rmse:13.2149	train-rmse:13.1568
[201]	eval-rmse:13.2148	train-rmse:13.1567
[202]	eval-rmse:13.2147	train-rmse:13.1566
[203]	eval-rmse:13.2146	train-rmse:13.1565
[204]	eval-rmse:13.2146	train-rmse:13.1563
[205]	eval-rmse:13.2145	train-rmse:13.1562
[206]	eval-rmse:13.2145	train-rmse:13.156
[207]	eval-rmse:13.2144	train-rmse:13.1559
[208]	eval-rmse:13.2143	train-rmse:13.1559
[209]	eval-rmse:13.2142	train-rmse:13.1557
[210]	eval-rmse:13.2142	train-rmse:13.1555
[211]	eval-rmse:13.2141	train-rmse:13.1555
[212]	eval-rmse

KeyboardInterrupt: 