In [1]:
#### SOURCE https://alkaline-ml.com/pmdarima/auto_examples/preprocessing/example_date_featurizer.html#sphx-glr-auto-examples-preprocessing-example-date-featurizer-py

import pmdarima as pm
from pmdarima import arima
from pmdarima import model_selection
from pmdarima import pipeline
from pmdarima import preprocessing as ppc
from pmdarima.arima import ADFTest 


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import mean_squared_log_error, mean_squared_error

from statsmodels.tsa.deterministic import CalendarSeasonality


import cabi.etl.load as l
import cabi.etl.transform as t

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

print("pmdarima version: %s" % pm.__version__)



pmdarima version: 1.7.1


In [2]:
def RMSE(y_true, y_pred):
    return mean_squared_error(y_true, y_pred, squared=False)

## Load the Data Select Top Five Most Active ANCs in Either Direction

### Follow Up Ideas **FLAGGED FOLLOW UP**

- Model Checkins/Checkouts by selecting start/ends from trips long
- Model Poisson instead of SARIMA?

In [3]:
counts = l.load_counts_full()

In [4]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)

# 1A/1C has most outflow, 2E/2B/6D have most inflow on average
counts.mean().sort_values()

1A   -0.00436
1C   -0.00305
3C   -0.00168
5E   -0.00108
4C   -0.00073
3E   -0.00072
1D   -0.00063
3B   -0.00051
3F   -0.00045
4D   -0.00024
6B   -0.00021
4B   -0.00013
3D   -0.00012
7B   -0.00011
8E   -0.00010
5B   -0.00010
4A   -0.00010
5A   -0.00008
1B   -0.00008
3G   -0.00005
8B   -0.00005
6A   -0.00003
5C   -0.00002
7E   -0.00002
8C   -0.00002
7C    0.00000
2D    0.00001
7F    0.00003
8D    0.00006
7D    0.00008
5D    0.00010
8A    0.00018
6E    0.00051
2A    0.00055
2F    0.00086
6C    0.00144
2C    0.00197
6D    0.00325
2E    0.00334
2B    0.00340
dtype: float64

In [5]:
bot_five = counts.sum().sort_values().head(5).index
top_five = counts.sum().sort_values().tail(5).index
print(top_five, bot_five)

Index(['6C', '2C', '6D', '2E', '2B'], dtype='object') Index(['1A', '1C', '3C', '5E', '4C'], dtype='object')


In [6]:
model_groups = list(bot_five) + list(top_five)

In [7]:
model_groups

['1A', '1C', '3C', '5E', '4C', '6C', '2C', '6D', '2E', '2B']

In [8]:
bot_five = ['1A', '1C', '3C', '5E', '4C']
top_five = ['6C', '2C', '6D', '2E', '2B']

In [9]:
hourly_groups = counts[model_groups].resample('1H').sum()

In [10]:
hourly_groups = hourly_groups[hourly_groups.index > '2020-06-15']
hourly_groups

Unnamed: 0_level_0,1A,1C,3C,5E,4C,6C,2C,6D,2E,2B
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-06-15 01:00:00,0,1,0,0,1,0,-1,2,0,2
2020-06-15 02:00:00,-1,0,0,0,0,0,1,0,1,0
2020-06-15 03:00:00,0,0,0,0,-1,0,0,0,1,-1
2020-06-15 04:00:00,-1,0,-1,0,0,-1,1,2,0,1
2020-06-15 05:00:00,-2,-2,-1,1,-2,2,0,2,2,-3
...,...,...,...,...,...,...,...,...,...,...
2020-08-01 04:00:00,0,0,0,0,0,0,0,0,0,0
2020-08-01 05:00:00,0,0,0,0,0,0,0,0,0,0
2020-08-01 06:00:00,0,0,0,0,0,0,0,0,0,0
2020-08-01 07:00:00,0,0,0,0,0,0,0,0,0,0


## Create Weekday/Hourly Dummies, Weekly Fourier Features to Backwards Eliminate

In [200]:
wday_dums = DateFeaturizer(column_name='time', with_day_of_month=False)
_, day_dums = wday_dums.fit_transform(hourly_groups['1A'], hourly_groups)
day_dums

Unnamed: 0,1A,1C,3C,5E,4C,6C,2C,6D,2E,2B,DATE-WEEKDAY-0,DATE-WEEKDAY-1,DATE-WEEKDAY-2,DATE-WEEKDAY-3,DATE-WEEKDAY-4,DATE-WEEKDAY-5,DATE-WEEKDAY-6
0,0,1,0,0,1,0,-1,2,0,2,1,0,0,0,0,0,0
1,-1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0
2,0,0,0,0,-1,0,0,0,1,-1,1,0,0,0,0,0,0
3,-1,0,-1,0,0,-1,1,2,0,1,1,0,0,0,0,0,0
4,-2,-2,-1,1,-2,2,0,2,2,-3,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1131,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1132,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1133,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1134,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [202]:
day_dums.drop('DATE-WEEKDAY-2', axis=1, inplace=True)

In [205]:
day_dums = day_dums[day_dums.columns[-6:]]

In [209]:
day_dums.set_index(hourly_groups.index, inplace=True)

In [210]:
day_dums.merge(hourly_dummies)

MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False

In [180]:
hourly_dumgen = CalendarSeasonality('H', 'D')
hourly_dummies = daily_dummies.in_sample(hourly_groups.index)

In [181]:
hourly_dummies

Unnamed: 0_level_0,"s(H=1, period=D)","s(H=2, period=D)","s(H=3, period=D)","s(H=4, period=D)","s(H=5, period=D)","s(H=6, period=D)","s(H=7, period=D)","s(H=8, period=D)","s(H=9, period=D)","s(H=10, period=D)",...,"s(H=15, period=D)","s(H=16, period=D)","s(H=17, period=D)","s(H=18, period=D)","s(H=19, period=D)","s(H=20, period=D)","s(H=21, period=D)","s(H=22, period=D)","s(H=23, period=D)","s(H=24, period=D)"
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-06-15 01:00:00,0.00000,1.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
2020-06-15 02:00:00,0.00000,0.00000,1.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
2020-06-15 03:00:00,0.00000,0.00000,0.00000,1.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
2020-06-15 04:00:00,0.00000,0.00000,0.00000,0.00000,1.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
2020-06-15 05:00:00,0.00000,0.00000,0.00000,0.00000,0.00000,1.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-08-01 04:00:00,0.00000,0.00000,0.00000,0.00000,1.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
2020-08-01 05:00:00,0.00000,0.00000,0.00000,0.00000,0.00000,1.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
2020-08-01 06:00:00,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,1.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
2020-08-01 07:00:00,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,1.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
