# Seattle Collisions - Model Training

This notebook trains a Facebook Prophet time-series model on daily Seattle collision counts, using daily weather values as extra regressors.

In [29]:
from fbprophet import Prophet
from fbprophet.serialize import model_to_json, model_from_json
from fbprophet.plot import plot_cross_validation_metric 
from fbprophet.diagnostics import performance_metrics
from fbprophet.diagnostics import cross_validation

from sklearn.neighbors import BallTree
from sklearn import metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

import json
from math import sqrt
import pandas as pd
import types
import itertools
import numpy as np
from datetime import datetime, timedelta, timezone 
import time

import os
import seaborn as sns
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
%matplotlib inline

In [121]:
# Load the prepared collisions file; the first CSV column is used as the index.
df = (
    pd.read_csv(
        'Seattle_Collisions_Final.csv',
        index_col=0,
        parse_dates=True,
        low_memory=False,
    )
    # Make sure the incident timestamp column is datetime typed.
    .assign(INCDTTM=lambda frame: pd.to_datetime(frame['INCDTTM']))
)

In [None]:
# Load the weather observations; the first CSV column is used as the index.
df_weather = pd.read_csv(
    'Seattle_Weather_Daily.csv',
    index_col=0,
    parse_dates=True,
    low_memory=False,
)
print('File downloaded')
# Summarise columns, dtypes and non-null counts of the loaded frame.
df_weather.info()

In [None]:
def get_day_max_temperature(dfw, target_date):
    """Return the largest TEMPERATURE among rows of `dfw` whose DATE equals `target_date`."""
    same_day = dfw['DATE'] == target_date
    return dfw.loc[same_day, 'TEMPERATURE'].max()

def get_day_min_temperature(dfw, target_date):
    """Return the smallest TEMPERATURE among rows of `dfw` whose DATE equals `target_date`."""
    same_day = dfw['DATE'] == target_date
    return dfw.loc[same_day, 'TEMPERATURE'].min()

def get_day_total_precipitation(dfw, target_date):
    """Return the sum of PRECIPITATION over rows of `dfw` whose DATE equals `target_date`."""
    same_day = dfw['DATE'] == target_date
    return dfw.loc[same_day, 'PRECIPITATION'].sum()

def get_day_solar_azimuth(dfw, target_date):
    """Return the largest SOLARAZIMUTH among rows of `dfw` whose DATE equals `target_date`."""
    same_day = dfw['DATE'] == target_date
    return dfw.loc[same_day, 'SOLARAZIMUTH'].max()

In [None]:
# Select the training rows: incidents from 2015-01-01 up to (not including)
# 2020-01-01 with HITPARKEDCAR == 0, PRECIPITATION >= 0 and WEEKDAY in 0-6.
# The lower bound is an inclusive `>= '2015-01-01'`: the earlier
# `> '2014-12-31'` compared against midnight, so any record on 2014-12-31 with
# a time of day slipped into the window as a partial extra day.
in_window = (df.INCDTTM >= '2015-01-01') & (df.INCDTTM < '2020-01-01')
dft = df[in_window
         & (df.HITPARKEDCAR == 0)
         & (df.PRECIPITATION >= 0)
         & (df.WEEKDAY.isin([0, 1, 2, 3, 4, 5, 6]))]
# Row count, and rows per day over the span between first and last incident.
print(len(dft), len(dft)/(dft.INCDTTM.max() - dft.INCDTTM.min()).days)

In [None]:
# Count collisions per calendar day and name the columns the way Prophet
# expects (`ds` = date, `y` = value to forecast).
# Resampling the rows directly, with explicit names, avoids relying on how
# `value_counts()` labels its output: pandas 2.0 changed those labels, which
# made the old rename map put the dates into `y`.
dft = (
    dft.set_index('INCDTTM')
       .resample('D')
       .size()
       .rename('y')
       .rename_axis('ds')
       .reset_index()
)

# Attach the per-day weather regressors, looked up by 'YYYY-MM-DD' string.
day_keys = dft['ds'].dt.strftime('%Y-%m-%d')
dft['rain'] = day_keys.map(lambda day: get_day_total_precipitation(df_weather, day))
dft['temp'] = day_keys.map(lambda day: get_day_min_temperature(df_weather, day))
dft['solar_azimuth'] = day_keys.map(lambda day: get_day_solar_azimuth(df_weather, day))
dft.head()

In [None]:
# Prophet model with a linear trend, multiplicative seasonality and three
# weather regressors.
# `dft` holds one observation per day, so a within-day (daily) seasonal
# pattern cannot be estimated from it; daily seasonality is therefore
# switched off instead of being forced on.
m = Prophet(
    growth='linear',
    daily_seasonality=False,
    weekly_seasonality=True,
    changepoint_prior_scale=0.5,
    seasonality_mode='multiplicative',
)
m.add_regressor('rain', mode='multiplicative')
m.add_regressor('temp', mode='additive')
m.add_regressor('solar_azimuth', mode='multiplicative')
m.fit(dft)

# Dates to predict: the full history plus 10 further days.
future = m.make_future_dataframe(periods=10, freq='D', include_history=True)

In [None]:
# Every regressor used in training must also be present on the prediction
# frame; look each one up from the weather data by 'YYYY-MM-DD' string.
future_day_keys = future['ds'].dt.strftime('%Y-%m-%d')
for regressor_name, weather_lookup in (
    ('rain', get_day_total_precipitation),
    ('temp', get_day_min_temperature),
    ('solar_azimuth', get_day_solar_azimuth),
):
    future[regressor_name] = future_day_keys.map(
        lambda day: weather_lookup(df_weather, day)
    )
future.head(), future.tail()

In [None]:
# Predict for every row of `future`, then join the predictions to the
# observed daily counts on the date column.
forecast = m.predict(future)
dft_results = pd.merge(dft, forecast, how='outer', on='ds')

summary_columns = [
    'ds', 'y',
    'yhat', 'yhat_lower', 'yhat_upper',
    'trend', 'additive_terms', 'multiplicative_terms',
]
dft_results[summary_columns].tail(20)

In [None]:
# In-sample error of the fitted model.
# Only rows that have both an observed count and a prediction are scored.
# The appended future rows have no `y`, so this mask replaces the previous
# hard-coded `[:-10]` slice, which silently depended on `periods=10` in the
# cell that builds `future`.
y_true = dft_results['y'].values
y_hat = dft_results['yhat'].values
scored = (dft_results['y'].notna() & dft_results['yhat'].notna()).values

mae = mean_absolute_error(y_true[scored], y_hat[scored])
print('MAE: %.3f' % mae)
rmse = sqrt(mean_squared_error(y_true[scored], y_hat[scored]))
rsme = rmse  # keep the old (misspelled) name available for any later cell
print('RMSE: %.3f' % rmse)

# Actual vs. predicted daily counts, plotted against the date.
fig, ax = plt.subplots(figsize=(26,10))
ax.plot(dft_results['ds'], y_true, label='Actual')
ax.plot(dft_results['ds'], y_hat, label='Predicted')
ax.set_xlabel('Date')
ax.set_ylabel('Collisions per day')
ax.set_title('Actual vs. predicted daily collisions')
ax.legend()

In [None]:
# Forecast plot, with a dashed vertical line at each of the model's changepoints.
f = m.plot(forecast, figsize=(20,10))
forecast_axes = f.gca()
for changepoint_date in m.changepoints:
    forecast_axes.axvline(changepoint_date, ls='--', lw=1)

In [None]:
# Plot the individual components of `forecast` (one panel per component).
f = m.plot_components(forecast)

In [None]:
# Persist the fitted model. `model_to_json` returns a string, which is then
# JSON-encoded once more before being written to disk.
with open('seattle_collision_model.json', 'w') as fout:
    fout.write(json.dumps(model_to_json(m)))  # Save model

Prophet has many built-in tools. I did not use them extensively due to time constraints; revisiting this section and improving on what's here is a TODO.

In [None]:
# Candidate values for the two prior scales; every combination is evaluated.
param_grid = {
    'changepoint_prior_scale': [0.001, 0.01, 0.1, 0.5],
    'seasonality_prior_scale': [0.01, 0.1, 1.0, 10.0],
}

# Generate all combinations of parameters
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
rmses = []  # Store the RMSEs for each params here

# Cross-validation reference: https://facebook.github.io/prophet/docs/diagnostics.html
# Settings used here: 1460 days of initial training data, a 365-day forecast
# horizon, and a 10-day spacing between cutoffs.
# NOTE(review): the candidate models are built from `params` only, i.e. without
# the regressors and multiplicative seasonality used for the trained model, so
# the best values found here may not transfer directly - confirm before reuse.

# Use cross validation to evaluate all parameters
for params in all_params:
    # Fit under a separate name so the trained model `m` is not overwritten;
    # otherwise re-running the plot/save cells would pick up the last candidate.
    m_cv = Prophet(**params).fit(dft)
    df_cv = cross_validation(m_cv, initial='1460 days', period='10 days', horizon='365 days')
    # rolling_window=1 is used so that a single rmse row is read per candidate.
    df_p = performance_metrics(df_cv, rolling_window=1)
    rmses.append(df_p['rmse'].values[0])

# Find the best parameters
tuning_results = pd.DataFrame(all_params)
tuning_results['rmse'] = rmses
print(tuning_results)
best_params = all_params[np.argmin(rmses)]
print(best_params)