In [1]:
import pandas as pd
import numpy as np
import datetime 
import matplotlib.pyplot as plt
import pyramid as pm
%matplotlib inline
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt
from pyramid.arima import auto_arima

In [2]:
nights = pd.read_csv('/Users/jinny/Documents/touristcast/datasets/nights_2010-2017.csv', index_col='date', parse_dates=['date'], 
                          usecols=[*range(1, 15)])
temp = pd.read_csv('/Users/jinny/Documents/touristcast/datasets/avgtemp_2010-2017.csv',parse_dates=['date'],index_col='date',usecols=[*range(1, 15)])
daysoff = pd.read_csv('/Users/jinny/Documents/touristcast/datasets/daysoff_2010-2017.csv',parse_dates=['date'],index_col='date',usecols=['date','daysoff'])
gdp = pd.read_csv('/Users/jinny/Documents/touristcast/datasets/regionalGDP_2010-2015.csv',parse_dates=['date'],index_col='date',usecols=[*range(1, 15)])

# df exog
exog_PAC = pd.concat([nights, temp, daysoff, gdp], axis=1)
exog_PAC = exog_PAC.dropna()
exog_PAC = exog_PAC[['avgtemp_PAC','gdp_PAC','daysoff']]

# df with training nights  
nights_PAC = nights[['nights_PAC']][('2010-01-01' <= nights.index) & (nights.index < '2016-01-01')]

# df with testing nights
truth = nights[['nights_PAC']][('2016-01-01' <= nights.index) & (nights.index < '2018-01-01')]

In [3]:
best_sarimax_PAC = auto_arima(y=nights_PAC, exogenous=exog_PAC, start_p=0, start_q=0, max_p=3, max_q=3, m=12,
                    start_P=1, seasonal=True, d=1, D=1, trace=True,
                    error_action='ignore', 
                    suppress_warnings=True,  
                    stepwise=True)

Fit ARIMA: order=(0, 1, 0) seasonal_order=(1, 1, 1, 12); AIC=701.745, BIC=716.287, Fit time=0.834 seconds
Fit ARIMA: order=(0, 1, 0) seasonal_order=(0, 1, 0, 12); AIC=709.629, BIC=720.017, Fit time=0.077 seconds
Fit ARIMA: order=(1, 1, 0) seasonal_order=(1, 1, 0, 12); AIC=699.323, BIC=713.865, Fit time=0.645 seconds
Fit ARIMA: order=(0, 1, 1) seasonal_order=(0, 1, 1, 12); AIC=687.749, BIC=702.292, Fit time=1.096 seconds
Fit ARIMA: order=(0, 1, 1) seasonal_order=(1, 1, 1, 12); AIC=689.126, BIC=705.746, Fit time=1.083 seconds
Fit ARIMA: order=(0, 1, 1) seasonal_order=(0, 1, 0, 12); AIC=698.393, BIC=710.858, Fit time=0.340 seconds
Fit ARIMA: order=(0, 1, 1) seasonal_order=(0, 1, 2, 12); AIC=nan, BIC=nan, Fit time=nan seconds
Fit ARIMA: order=(0, 1, 1) seasonal_order=(1, 1, 2, 12); AIC=nan, BIC=nan, Fit time=nan seconds
Fit ARIMA: order=(1, 1, 1) seasonal_order=(0, 1, 1, 12); AIC=686.361, BIC=702.982, Fit time=1.330 seconds
Fit ARIMA: order=(1, 1, 0) seasonal_order=(0, 1, 1, 12); AIC=694.7

In [4]:
best_sarimax_PAC.params()

array([-3.72584990e-01, -1.85617590e-01,  2.16878419e-02,  2.54127102e+01,
        3.03973865e-01, -9.99496301e-01, -6.66313816e-01,  4.06224335e+03])

In [5]:
best_sarimax_PAC.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,72.0
Model:,"SARIMAX(1, 1, 1)x(0, 1, 1, 12)",Log Likelihood,-335.181
Date:,"Wed, 05 Sep 2018",AIC,686.361
Time:,12:38:42,BIC,702.982
Sample:,0,HQIC,692.849
,- 72,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,-0.3726,0.779,-0.479,0.632,-1.899,1.154
x1,-0.1856,6.272,-0.030,0.976,-12.479,12.107
x2,0.0217,0.013,1.650,0.099,-0.004,0.047
x3,25.4127,8.487,2.994,0.003,8.778,42.047
ar.L1,0.3040,0.174,1.750,0.080,-0.037,0.644
ma.L1,-0.9995,16.065,-0.062,0.950,-32.487,30.488
ma.S.L12,-0.6663,0.296,-2.255,0.024,-1.245,-0.087
sigma2,4062.2433,6.54e+04,0.062,0.950,-1.24e+05,1.32e+05

0,1,2,3
Ljung-Box (Q):,30.68,Jarque-Bera (JB):,7.1
Prob(Q):,0.86,Prob(JB):,0.03
Heteroskedasticity (H):,1.37,Skew:,-0.41
Prob(H) (two-sided):,0.49,Kurtosis:,4.49


#### Calculating the minimum amount of observations for a set of orders/seasonal orders to work

In [6]:
# https://github.com/statsmodels/statsmodels/issues/4465#issuecomment-381004987
p,d,q,P,D,Q,s = 1,1,1,0,1,1,12
min_observations = d + D*s + max([3*q + 1, 3*Q*s + 1, p, P*s]) + 1
min_observations // 12

4

#### SARIMAX Cross Val 

In [20]:
data = nights[['nights_PAC']] # dataframe-type dataset
exog = exog_PAC

all_mae = []
all_rmse = []

for index in range(2011,2018):
    start = '2010-01-01'     # date as string
    end = str(index)+'-01-01'
    predict = str(index+1)+'-01-01'

    train = (start <= data.index) & (data.index < end) 
    test = (end <= data.index) & (data.index < predict)

    sarimax_model = pm.arima.ARIMA(order=(1, 1, 1), seasonal_order=(0, 1, 1, 12)
                                   #,start_params=[0,0,0,0,0,0]
                                   #,start_params=best_sarimax_PAC.params()
                                   ,start_params=[-2.13823161e+00, 2.16036632e-01, -9.99103160e-01, -3.82650803e-01,6.38041336e+03]
                                  )
    
        
    res = sarimax_model.fit(y=data[train],
                       exog=exog[(start <= exog.index) & (exog.index < end)])
    
    #print('params', sarimax_model.params())
    
#     start_str = data[train].index[0].strftime('%Y-%m-%d')
#     end_str = data[train].index[-1].strftime('%Y-%m-%d')  
#     print(start_str, end_str)
    
    forecast = sarimax_model.predict(n_periods=12, exogenous=exog[(end <= exog.index) & (exog.index < predict)])
    
    # --------Calculated error measures for each CV step----------
    rmse_test = np.sqrt(mean_squared_error(data[test], forecast))
    mae_test = mean_absolute_error(data[test], forecast)
    
    all_rmse.append(rmse_test)
    all_mae.append(mae_test)
    
    print('Years of training data:', data[train].index.strftime('%Y').unique().tolist())
    print('Predicted year:', end)
    print('RMSE test:', rmse_test)
    print('MAE test:', mae_test)
    print('-------')

print('Avg MAE for SARIMAX CrossVal:', np.mean(mae_test))
print('Avg RMSE for SARIMAX CrossVal:', np.mean(rmse_test))

Years of training data: ['2010']
Predicted year: 2011-01-01
RMSE test: 328.5735703340593
MAE test: 312.7678991626963
-------




Years of training data: ['2010', '2011']
Predicted year: 2012-01-01
RMSE test: 245.76761197358402
MAE test: 221.80481859114775
-------
Years of training data: ['2010', '2011', '2012']
Predicted year: 2013-01-01
RMSE test: 123.01759900340302
MAE test: 103.16789011722273
-------
Years of training data: ['2010', '2011', '2012', '2013']
Predicted year: 2014-01-01
RMSE test: 89.73044647039566
MAE test: 70.91650482137509
-------
Years of training data: ['2010', '2011', '2012', '2013', '2014']
Predicted year: 2015-01-01
RMSE test: 118.3617376260508
MAE test: 111.6513724827369
-------
Years of training data: ['2010', '2011', '2012', '2013', '2014', '2015']
Predicted year: 2016-01-01
RMSE test: 105.1027466509721
MAE test: 82.3526590341266
-------
Years of training data: ['2010', '2011', '2012', '2013', '2014', '2015', '2016']
Predicted year: 2017-01-01
RMSE test: 144.49859293423174
MAE test: 118.04633366285445
-------
Avg MAE for SARIMAX CrossVal: 118.04633366285445
Avg RMSE for SARIMAX CrossVa