In [13]:
from __future__ import print_function
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.gaussian_process import GaussianProcessRegressor as gpr
from sklearn.gaussian_process.kernels import RBF
import warnings
import pickle
import numpy as np
import pandas as pd
import plotly.graph_objects as go

#import numpy as np
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
plt.style.use('fivethirtyeight')
#import statsmodels.api as sm
import matplotlib

In [14]:
filename = 'stackedPeds.sav'
trainPeds = pickle.load(open(filename, 'rb'))

filename = 'stackedCycs.sav'
trainCycs = pickle.load(open(filename, 'rb'))

filename = 'Locations.sav';
Locations = pickle.load(open(filename, 'rb'))

filename = 'PedestriansData.sav'
Pedestrians = pickle.load(open(filename, 'rb'))

filename = 'CyclistsData.sav'
Cyclists = pickle.load(open(filename, 'rb'))

In [20]:
# Training on Pedestrians data
trainPeds = trainPeds.dropna()

X = trainPeds[['Hour','DayOfWeek','isHoliday','lat','lng']] #Best

pedsScaler = StandardScaler()
pedsScaler.fit(X)

X=pedsScaler.transform(X)

Y = trainPeds[['volume']]

kernel = RBF(length_scale=np.ones([5,]))
gprMdl = gpr(kernel=kernel, random_state=0, normalize_y=True, n_restarts_optimizer = 0).fit(X, Y)

filename = 'gprPedsMdl.sav'
with open(filename, 'wb') as handle:
    pickle.dump(gprMdl, handle, protocol=pickle.HIGHEST_PROTOCOL)   

In [21]:
# Validation

filename = 'gprPedsMdl.sav'
gprPedsMdl = pickle.load(open(filename, 'rb'))

Peds = Pedestrians[13]

Peds = Peds.dropna()

X = Peds[['Hour','DayOfWeek','isHoliday']]

lat = Locations.iloc[13, 0]; lng = Locations.iloc[13, 1]

X['lat']= lat*np.ones([len(X),1]);
X['lng']= lng*np.ones([len(X),1]);

xTest = pedsScaler.transform(X)

xTest = X

yTest = Peds['volume']

yPred, yStd = gprPedsMdl.predict(xTest, return_std=True)
yPred = yPred.astype('int'); yPred[yPred<0]=0

yPred = pd.DataFrame(yPred, columns =['forecast']); yPred.index = X.index;
yPred = yPred['forecast']

print(yPred)

yLow = yPred - 3*yStd; yLow = yLow.astype('int'); yLow[yLow<0]=0
yHigh = yPred + 3*yStd; yHigh = yHigh.astype('int');

fig = go.Figure()
fig.add_trace(go.Scatter(x=X.index, y=yTest, name= 'Observed'))
fig.add_trace(go.Scatter(x=X.index, y=yPred, name= 'Forecast'))
fig.add_trace(go.Scatter(x=X.index, y=yLow, 
                        mode='lines',
                        line=dict(width=0.1),
                        name=''))
fig.add_trace(go.Scatter(x=X.index, y=yHigh, fill = 'tonexty', 
                        mode='lines',
                        line=dict(width=0.1),
                        name='99.7% confidence interval'))
fig.update_xaxes(title_text='Time')
fig.update_yaxes(title_text='Pedestrian volume')
fig.update_layout(title_text = 'Pedestrian traffic forecast at King-John Intersection')
fig.show()

print('RMSE: '+str(np.sqrt(np.mean((yPred.values-yTest.values)**2))))
print('R_Squared: '+str(gprMdl.score(xTest,yTest)))

date_time
2019-05-01 00:00:00     1130
2019-05-01 01:00:00      890
2019-05-01 02:00:00      188
2019-05-01 03:00:00       98
2019-05-01 04:00:00      241
2019-05-01 05:00:00     1257
2019-05-01 06:00:00     3127
2019-05-01 07:00:00    10986
2019-05-01 08:00:00    28805
2019-05-01 09:00:00    19237
2019-05-01 10:00:00     6238
2019-05-01 11:00:00     1276
2019-05-01 12:00:00        0
2019-05-01 13:00:00      329
2019-05-01 14:00:00     8463
2019-05-01 15:00:00     6760
2019-05-01 16:00:00     6046
2019-05-01 17:00:00    10268
2019-05-01 18:00:00    10736
2019-05-01 19:00:00    10682
2019-05-01 20:00:00     7061
2019-05-01 21:00:00     3790
2019-05-01 22:00:00     3794
2019-05-01 23:00:00     1453
2019-05-02 00:00:00     1104
2019-05-02 01:00:00      881
2019-05-02 02:00:00      270
2019-05-02 03:00:00      247
2019-05-02 04:00:00      252
2019-05-02 05:00:00      913
                       ...  
2019-08-30 18:00:00    12819
2019-08-30 19:00:00     8564
2019-08-30 20:00:00     6313
2019

In [22]:
print(xTest)

                     Hour  DayOfWeek  isHoliday        lat       lng
date_time                                                           
2019-05-01 00:00:00     0          2        0.0  43.646594 -79.38974
2019-05-01 01:00:00     1          2        0.0  43.646594 -79.38974
2019-05-01 02:00:00     2          2        0.0  43.646594 -79.38974
2019-05-01 03:00:00     3          2        0.0  43.646594 -79.38974
2019-05-01 04:00:00     4          2        0.0  43.646594 -79.38974
2019-05-01 05:00:00     5          2        0.0  43.646594 -79.38974
2019-05-01 06:00:00     6          2        0.0  43.646594 -79.38974
2019-05-01 07:00:00     7          2        0.0  43.646594 -79.38974
2019-05-01 08:00:00     8          2        0.0  43.646594 -79.38974
2019-05-01 09:00:00     9          2        0.0  43.646594 -79.38974
2019-05-01 10:00:00    10          2        0.0  43.646594 -79.38974
2019-05-01 11:00:00    11          2        0.0  43.646594 -79.38974
2019-05-01 12:00:00    12         

In [23]:
print('RMSE: '+str(np.sqrt(np.mean((yPred.values-yTest.values)**2))))

RMSE: 5999.798552965258


In [24]:
print('R_Squared: '+str(gprMdl.score(xTest,yTest)))

R_Squared: -18.283799079301385


In [25]:
range(2,3)

range(2, 3)

In [26]:
for i in range(2,3):
    print(i)

2
