In [24]:
# Data processing
import pandas as pd
import numpy as np
# SARIMA Model
import statsmodels.api as sm
from statsmodels.tsa.stattools import acf
from statsmodels.tsa.stattools import pacf
from statsmodels.tsa.seasonal import seasonal_decompose

from sklearn.linear_model import LinearRegression, Lasso,Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
import plotly.plotly as py
import plotly.graph_objs as go
# Other
import datetime
import pickle
from plotly_utils.utils import get_plotly_fig

# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Default figure size as (width, height): wide and short.
pylab.rcParams['figure.figsize'] = 18,4

In [17]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE) as a fraction.

    Observations whose true value is zero are excluded, because the
    percentage error is undefined for them.

    Parameters
    ----------
    y_true : array-like
        Observed values (list, tuple, NumPy array or pandas object).
    y_pred : array-like
        Predicted values, one per element of ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|)`` over the non-zero targets
        (0.1 means 10 %), or ``nan`` when no non-zero target remains.
        NaN inputs propagate to the result.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    # Work on flat float arrays so that plain sequences are accepted and
    # pandas objects are compared by position rather than by index label.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.size != y_pred.size:
        raise ValueError(
            'y_true and y_pred must have the same number of elements '
            '(got %d and %d)' % (y_true.size, y_pred.size))

    nonzero = y_true != 0
    if not nonzero.any():
        # Nothing left to average; avoid NumPy's empty-mean warning.
        return float('nan')

    y_true, y_pred = y_true[nonzero], y_pred[nonzero]
    return np.mean(np.abs((y_true - y_pred) / y_true))

In [18]:
# Load the prepared Task 2 frame and discard the three traffic-time columns.
# NOTE(review): unpickling can execute code stored in the file, so this path
# must only ever point at a pickle from a trusted source.
full_df = pd.read_pickle('../datasets/Full_Task2_Dataframe.pkl').drop(
    ['EarlyPeakTrafficTime', 'LatePeakTrafficTime', 'NormalTrafficTime'],
    axis=1)

# Relabel the 16 remaining columns with a two-level header: the group
# (Volume / Weather / Date) on top, the individual field underneath.
full_df.columns = pd.MultiIndex.from_arrays([
    ['Volume'] * 5 + ['Weather'] * 4 + ['Date'] * 7,
    [
        # Volume
        'T1D0', 'T1D1', 'T2D0', 'T3D0', 'T3D1',
        # Weather
        'HeavyRain', 'LightRain', 'Rain', 'Sunny',
        # Date
        'DayOfWeek', 'BeforeNationalDay', 'NationalDayFront(1-4)',
        'NationalDayTail(5-7)', 'Weekend', 'WorkingDay', 'WorkingWeekend',
    ],
])

In [26]:
# Preview the first rows only; rendering the whole frame floods the notebook.
full_df.head(10)

Unnamed: 0_level_0,Volume,Volume,Volume,Volume,Volume,Weather,Weather,Weather,Weather,Date,Date,Date,Date,Date,Date,Date
Unnamed: 0_level_1,T1D0,T1D1,T2D0,T3D0,T3D1,HeavyRain,LightRain,Rain,Sunny,DayOfWeek,BeforeNationalDay,NationalDayFront(1-4),NationalDayTail(5-7),Weekend,WorkingDay,WorkingWeekend
2016-09-19 00:00:00,13.0,0.0,2.0,17.0,0.0,0.0,0.0,0.0,1.0,1.0,0,0,0,0,1,0
2016-09-19 00:20:00,6.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,1.0,1.0,0,0,0,0,1,0
2016-09-19 00:40:00,9.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,1.0,1.0,0,0,0,0,1,0
2016-09-19 01:00:00,10.0,4.0,1.0,17.0,4.0,0.0,0.0,0.0,1.0,1.0,0,0,0,0,1,0
2016-09-19 01:20:00,14.0,14.0,0.0,17.0,5.0,0.0,0.0,0.0,1.0,1.0,0,0,0,0,1,0
2016-09-19 01:40:00,10.0,7.0,2.0,11.0,4.0,0.0,0.0,0.0,1.0,1.0,0,0,0,0,1,0
2016-09-19 02:00:00,7.0,7.0,0.0,11.0,5.0,0.0,0.0,0.0,1.0,1.0,0,0,0,0,1,0
2016-09-19 02:20:00,10.0,7.0,0.0,14.0,4.0,0.0,0.0,0.0,1.0,1.0,0,0,0,0,1,0
2016-09-19 02:40:00,6.0,10.0,0.0,18.0,0.0,0.0,0.0,0.0,1.0,1.0,0,0,0,0,1,0
2016-09-19 03:00:00,9.0,10.0,0.0,10.0,5.0,0.0,0.0,0.0,1.0,1.0,0,0,0,0,1,0


In [31]:
# Keep only the rows inside the two daily time windows, one frame per window.
morning_full_df, afternoon_full_df = (
    full_df.between_time(start, end)
    for start, end in (('6:00', '9:40'), ('15:00', '18:40'))
)

In [32]:
# Split each window by date: rows up to and including 2016-10-17 are used
# for training, rows from 2016-10-18 onwards form the prediction period.
morning_train_df = morning_full_df.loc[:'2016-10-17']
afternoon_train_df = afternoon_full_df.loc[:'2016-10-17']

morning_predict_df = morning_full_df.loc['2016-10-18':]
afternoon_predict_df = afternoon_full_df.loc['2016-10-18':]

In [36]:
# Column names of the lagged predictors, in the same order as the default
# shifts used by `lag_features`.
# NOTE(review): the 'S72' labels suggest a 72-step seasonal lag, but the
# offsets actually applied are 15, 16 and 17 rows - confirm this is intended.
lag = ['AR.L1', 'AR.L2', 'AR.L3', 'S72.L1', 'S72.L2', 'S72.L3']


def lag_features(series, shifts=(1, 2, 3, 15, 16, 17), names=None):
    """Build a frame of lagged copies of ``series``.

    Parameters
    ----------
    series : pandas.Series
        Series to lag; the result keeps its index.
    shifts : sequence of int, optional
        Number of rows each column is shifted by. Defaults to the offsets
        this notebook has always used.
    names : sequence of str, optional
        Column name for each shift. Defaults to the module-level ``lag`` list.

    Returns
    -------
    pandas.DataFrame
        One column per shift. Rows with no earlier observation to borrow
        from hold NaN.

    Raises
    ------
    ValueError
        If ``names`` and ``shifts`` differ in length (``zip`` would otherwise
        drop the surplus entries silently).
    """
    names = lag if names is None else list(names)
    shifts = list(shifts)
    if len(names) != len(shifts):
        raise ValueError(
            'got %d column names for %d shifts' % (len(names), len(shifts)))
    return pd.DataFrame(
        {name: series.shift(step) for name, step in zip(names, shifts)})

In [57]:
# Fit one Ridge regression per volume column on the morning window and print
# its train / test MAPE.
# NOTE(review): `features` is read below but is not defined in any visible
# cell; it has to exist before this cell runs - confirm where it comes from.
# `mapes` and `sums` are initialised here but not updated by this cell.
mapes = []
sums = 0
for td in ['T1D0', 'T1D1', 'T2D0', 'T3D0', 'T3D1']:

    # Lags are taken on the unfiltered series, so a shift counts rows of the
    # full frame rather than rows of the morning window.
    lags = lag_features(full_df.Volume[td])
    td_morning_full_df = morning_train_df.Volume[td].to_frame().join(
        lags).join(full_df.Weather).join(full_df.Date)

    dataset = td_morning_full_df['2016-09-20':'2016-10-17']

    # Chronological split: fit up to 2016-10-13, hold out 2016-10-14 onwards.
    # The tuple is parenthesised so that it can legally span two lines.
    train, test = (
        dataset[:'2016-10-13'].fillna(0),
        dataset['2016-10-14':].fillna(0),
    )

    X_train = train[lag + features]
    y_train = train[td]

    # The hold-out score only uses the 8:00-9:40 rows, whereas the training
    # score below covers every row of the morning window.
    X_test = test[lag + features].between_time('8:00', '9:40')
    y_test = test[td].between_time('8:00', '9:40')

    model = Ridge()
    model.fit(X_train, y_train)
    train_mape = mean_absolute_percentage_error(
        y_train, model.predict(X_train))

    test_mape = mean_absolute_percentage_error(
        y_test, model.predict(X_test))

    print(td)
    print(train_mape)
    print(test_mape)

T1D0
0.19155007930007215
0.17445863591665786
T1D1
0.20771079243088791
0.13006799614708728
T2D0
0.28676250673933523
0.12055097050264611
T3D0
0.1516941905345717
0.10683722119652134
T3D1
0.21601938705643045
0.13408663564839587


In [69]:
# Show the columns from 'AR.L1' through the last one. A label slice over
# columns has to go through .loc; it is not valid inside a list literal.
train.loc[:, 'AR.L1':].head()

In [442]:
# Inspect the hold-out targets left behind by the modelling loop; the recorded
# output is named T3D1, the last column that loop visits.
y_test

2016-10-14 08:00:00     99.0
2016-10-14 08:20:00    134.0
2016-10-14 08:40:00    127.0
2016-10-14 09:00:00    125.0
2016-10-14 09:20:00    100.0
2016-10-14 09:40:00    105.0
2016-10-15 08:00:00     80.0
2016-10-15 08:20:00     80.0
2016-10-15 08:40:00    121.0
2016-10-15 09:00:00     90.0
2016-10-15 09:20:00     97.0
2016-10-15 09:40:00    105.0
2016-10-16 08:00:00     85.0
2016-10-16 08:20:00     80.0
2016-10-16 08:40:00     86.0
2016-10-16 09:00:00    107.0
2016-10-16 09:20:00     90.0
2016-10-16 09:40:00     98.0
2016-10-17 08:00:00    123.0
2016-10-17 08:20:00    147.0
2016-10-17 08:40:00    150.0
2016-10-17 09:00:00    136.0
2016-10-17 09:20:00    137.0
2016-10-17 09:40:00    106.0
Name: T3D1, dtype: float64