In [10]:
import numpy as np 
import pandas as pd
import os
import time

import data 
import utils
import pmdarima as pm

In [11]:
directory = 'output/arima'
data_directory = 'data'
aggregation = 'day'
station = "(02000) cabecera autopista norte"

In [12]:
train, test = data.split_data('data', aggregation = 'day')

# Experiments on selecting the right transformation

The motivation to do this experiments is to be able to select the best transformation to the data. 
We explore log, standarization, and minmax transformations (and combiantion of them). 

Unfortunatly, since transformation will lead to diferent scales, traditional metrics such as AIC, log-likelihood, BIC cannot be compared. To select the best transformation, I take into account the MAPE metric with a 7 days prediction, and the running time. 

For each transformation, we fit a model using pm.auto_arima which selects the best model with give hyperparameters. To compare model with different transformation, we only use the best model of the auto_arima function. For this function, the selection criteria is the AIC. 

I took one station (porta norte) and transformed the data. Regular standarization is the one that has the best performance. The MAPE is 0.13 compared to 0.25 of the second best model (log transformation). Also, compared to the second best model, the running time to fir the model is X3 times faster. 

More research: I would need to figure out if the same transformation has similar results with other stations. For now I have only tested one. 

In [345]:
def normalize(x):
    return (x - np.mean(x))/(np.std(x))

def normalize1(x):
    return (x - np.min(x))/(np.max(x) - np.min(x))

In [346]:
def APE(target, predicted):
    return np.abs((target - predicted) / target)

def mape(target, predicted, axis = None):
    return np.mean(APE(target, predicted), axis = axis)

In [347]:
s_raw = train[station]
s_log = s_raw.transform(np.log1p)
s_log_nor = normalize(s_log)
s_nor = normalize(s_raw)
s_log_nor1 = normalize1(s_log)
s_nor1 = normalize1(s_raw)

In [384]:
# Raw model 
s_data = [s_raw, s_log, s_log_nor, s_nor, s_log_nor1, s_nor1]
models = []
test_size = 7
for s in s_data:
    start = time.time()
    model = pm.auto_arima(s, start_p=2, start_q=2, max_p=7, max_q=7, seasonal=False,
                        stepwise=True, suppress_warnings=True, error_action='ignore', d = 0, 
                        maxiter = 200, out_of_sample_size = test_size, scoring = 'mse')
    models.append(model)
    end = time.time()
    print (end - start)

2.926056146621704
65.68860793113708
2.1380200386047363
20.929821014404297
4.973753213882446
3.901477336883545


Log transformation took 65 seconds, and stadarition took 20 seconds. The are the second and first best models. 

In [396]:
target = np.array(s_raw[-test_size:])
#Model 0
p50_raw = models[0].oob_preds_
print ('Model 0 - Raw MAPE:', mape(target, p50_raw))

#Model 1
p50_log = np.expm1(models[1].oob_preds_)
print ('Model 1 - log MAPE:', mape(target, p50_log))

#Model 2 

#Model 3
p50_nor = (models[3].oob_preds_ * s_raw.std()) + s_raw.mean()
print ('Model 3 - nor MAPE:', mape(target, p50_nor))

#Model 4 
p50_log_nor1 = np.expm1(models[4].oob_preds_ * (s_log.max() - s_log.min()) + s_log.min())
print ('Model 4 - log_nor1 MAPE:', mape(target, p50_log_nor1))

#Model 5
p50_nor1 = (models[5].oob_preds_ * (s_raw.max() - s_raw.min())) + s_raw.min()
print ('Model 5 - nor1 MAPE:', mape(target, p50_nor1))

Model 0 - Raw MAPE: 0.24128880092487925
Model 1 - log MAPE: 0.21380937184363305
Model 3 - nor MAPE: 0.13745761654768238
Model 4 - log_nor1 MAPE: 0.24608887583088443
Model 5 - nor1 MAPE: 0.24122682803419906


## Predicting 20 time steps into the future: 

In [386]:
# Values to keep in mind 
mean = s_raw.mean()
std = s_raw.std()
max_raw = s_raw.max()
min_raw = s_raw.min()

max_log = s_log.mean()
min_log = s_log.mean()
forecast_period = 1

In [388]:
## Measure the forcasting error of 20 predicitions
total_forcast_period = 20
target = test[station][:total_forcast_period]

t_raw = test[station][:total_forcast_period]
t_log = t_raw.transform(np.log1p)
t_log_nor = (t_log - s_log.mean())/(s_log.std())
t_nor = (t_raw - s_raw.mean())/(s_raw.std())
t_log_nor1 = (t_log - s_log.min())/(s_log.max() - s_log.min())
t_nor1 = (t_raw - s_raw.min())/(s_raw.max() - s_raw.min())

In [389]:
def prediction(model, target, forecast_period = 1):
    prediction_list = []
    for t in target:
        p = model.predict(forecast_period)
        prediction_list.append(p)
        model.update(t)
    return np.array(prediction_list)

In [390]:
#Model 0 
p20_raw = prediction(models[0], t_raw, forecast_period = 1)

#Model 1 
p20_log = prediction(models[1], t_log, forecast_period = 1)
p20_log =  np.expm1(p20_log)

#Model 2
# p20_log_nor = prediction(models[2], t_log_nor, forecast_period = 1)
# Ignore model 2

#Model 3
p20_nor = prediction(models[3], t_nor, forecast_period = 1)
p20_nor = (p20_nor*s_raw.std())+ s_raw.mean()

#Model 4 
p20_log_nor1 = prediction(models[4], t_log_nor1, forecast_period = 1)
p20_log_nor1 = np.expm1(p20_log_nor1*(max_log - min_log) + min_log)

#Model 5
p20_nor1 = prediction(models[5], t_nor1, forecast_period = 1)
p20_nor1 = (p20_nor1*(s_raw.max() - s_raw.min())) + (s_raw.min())

# (t_raw - s_raw.min())/(s_raw.max() - s_raw.min())

In [391]:
print ('Model 0 - Raw MAPE:', mape(target, p20_raw.flatten()))
print ('Model 1 - log MAPE:', mape(target, p20_log.flatten()))
print ('Model 3 - nor MAPE:', mape(target, p20_nor.flatten()))
print ('Model 4 - log_nor1 MAPE:', mape(target, p20_log_nor1.flatten()))
print ('Model 5 - nor1 MAPE:', mape(target, p20_nor1.flatten()))

Model 0 - Raw MAPE: 0.35637711614881107
Model 1 - log MAPE: 0.3016776303962018
Model 3 - nor MAPE: 0.25350074254782845
Model 4 - log_nor1 MAPE: 0.38677325679848945
Model 5 - nor1 MAPE: 0.35635911311489044


# Check full results for One station

In [2]:
import numpy as np

In [4]:
a = np.ones(shape = (3,4))

In [7]:
b = a.tolist()

In [9]:
type(b)

list