Cross-validation of SARIMAX forecasting of hotel overnight stays ("nuitées dans l'hôtellerie") in Provence-Alpes-Côte d'Azur (PAC)
Set `stepwise=True` in auto_arima.
Include 3 exogenous variables (weather, i.e. average temperature; GDP; holidays, i.e. days off) in the training and test datasets.
Forecast GDP for 2016 and 2017 with auto_arima and include the forecast as an exogenous variable in the training and test datasets.
Datasets: https://github.com/jinnyto/touristcast/tree/datasets/datasets

In [1]:
import pandas as pd
import numpy as np
import datetime 
import matplotlib.pyplot as plt
import pyramid
%matplotlib inline
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt
from pyramid.arima import auto_arima
def seasonal_MASE(truth, forecast, seasonality=1):
    """Return the mean absolute scaled error of ``forecast`` against ``truth``.

    The forecast's mean absolute error is divided by the mean absolute error
    of a seasonal-naive forecast computed on ``truth`` itself, where every
    value is predicted by the value observed ``seasonality`` steps earlier.

    Parameters
    ----------
    truth : array-like
        Observed values (length T along the first axis).
    forecast : array-like
        Predicted values, broadcastable against ``truth``.
    seasonality : int, default 1
        Lag m of the naive forecast; must satisfy ``0 < m < T``.

    Returns
    -------
    float
        Forecast MAE divided by naive MAE. The result is ``inf`` or ``nan``
        when the naive error is zero (for example a constant ``truth``).

    Raises
    ------
    ValueError
        If ``seasonality`` is not strictly between 0 and ``len(truth)``.
    """
    # Work on plain ndarrays: lists have no ``.shape``, and pandas Series
    # would align the two shifted slices on index labels instead of position,
    # which silently corrupts the naive-error term.
    truth = np.asarray(truth)
    forecast = np.asarray(forecast)

    period = truth.shape[0]  # T
    if not 0 < seasonality < period:
        raise ValueError(
            "seasonality must satisfy 0 < seasonality < len(truth); "
            "got seasonality={} with len(truth)={}".format(seasonality, period)
        )

    forecast_errors = np.abs(truth - forecast)
    mean_absolute_forecast_error = np.sum(forecast_errors) / period

    naive_period = period - seasonality  # T - m
    # Pair each observation with the one ``seasonality`` steps before it.
    naive_errors = np.abs(truth[seasonality:] - truth[:naive_period])
    mean_absolute_naive_error = np.sum(naive_errors) / naive_period

    return mean_absolute_forecast_error / mean_absolute_naive_error

In [2]:
# Load the four source tables; each one is indexed by its parsed 'date' column.
# Three of them keep positional columns 1..14, the days-off table keeps named columns.
_wide_columns = list(range(1, 15))

dn = pd.read_csv('./datasets/nights_2010-2017.csv',
                 usecols=_wide_columns, parse_dates=['date'], index_col='date')
dm = pd.read_csv('./datasets/avgtemp_2010-2017.csv',
                 usecols=_wide_columns, parse_dates=['date'], index_col='date')
dh = pd.read_csv('./datasets/daysoff_2010-2017.csv',
                 usecols=['date', 'daysoff'], parse_dates=['date'], index_col='date')
dg = pd.read_csv('./datasets/regionalGDP_2010-2015.csv',
                 usecols=_wide_columns, parse_dates=['date'], index_col='date')

# dr: all tables side by side on the shared date index (rows with gaps are kept).
dr = pd.concat([dn, dm, dh, dg], axis=1)

# df: only the rows that are complete in every column, narrowed to the PAC series.
df = dr.dropna()[['nights_PAC', 'avgtemp_PAC', 'gdp_PAC', 'daysoff']]

# Plain arrays of the target and of the three exogenous regressors.
y = np.array(df['nights_PAC'])
exogenous = np.array(df[['avgtemp_PAC', 'gdp_PAC', 'daysoff']])

In [4]:
# Stepwise order search for a model of the PAC GDP series.
# NOTE(review): m, start_P and D are seasonal settings but seasonal=False is
# passed alongside them - confirm which behaviour is intended.
gdp_search_options = dict(
    start_p=0, max_p=3,
    start_q=0, max_q=3,
    d=1,
    start_P=1, D=1, m=12,
    seasonal=False,
    stepwise=True,
    trace=True,
    error_action='ignore',    # stay silent when a candidate order cannot be fitted
    suppress_warnings=True,   # hide convergence warnings
)
mc_fit = auto_arima(y=np.array(df['gdp_PAC']), **gdp_search_options)
mc_fit.summary()

Fit ARIMA: order=(0, 1, 0); AIC=1139.416, BIC=1143.942, Fit time=0.093 seconds
Fit ARIMA: order=(1, 1, 0); AIC=1141.212, BIC=1148.000, Fit time=0.080 seconds
Fit ARIMA: order=(0, 1, 1); AIC=1141.184, BIC=1147.972, Fit time=0.036 seconds
Fit ARIMA: order=(1, 1, 1); AIC=nan, BIC=nan, Fit time=nan seconds
Total fit time: 0.227 seconds


0,1,2,3
Dep. Variable:,D.y,No. Observations:,71.0
Model:,"ARIMA(0, 1, 0)",Log Likelihood,-567.708
Method:,css,S.D. of innovations,718.346
Date:,"Mon, 03 Sep 2018",AIC,1139.416
Time:,13:35:07,BIC,1143.942
Sample:,1,HQIC,1141.216
,,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,165.7243,85.252,1.944,0.056,-1.367,332.815


In [5]:
# Fit the model returned by the order search to the GDP column of df
# (passed here as a pandas Series rather than the numpy array used for the search).
# Being the last expression of the cell, the call's return value is displayed.
mc_fit.fit(df.gdp_PAC)

ARIMA(callback=None, disp=0, maxiter=50, method=None, order=(0, 1, 0),
   out_of_sample_size=0, scoring='mse', scoring_args={},
   seasonal_order=None, solver='lbfgs', start_params=None,

In [6]:
# Forecast 24 further GDP values and write them into dr for the
# 2016-01-01 .. 2017-12-01 date range, so those rows carry a gdp_PAC value.
gdp_forecast_rows = slice('2016-01-01', '2017-12-01')
next_24 = mc_fit.predict(n_periods=24)
dr.loc[gdp_forecast_rows, 'gdp_PAC'] = next_24

In [27]:
# Expanding-window cross validation: every fold trains on all rows from
# `startyear` up to (not including) 1 January of the fold year and is scored
# on the rows of that fold year. One entry per fold is appended to each list.
mae_fold, mse_fold, rmse_fold, smase_fold = [], [], [], []
startyear = '2010-01-01'
exog_columns = ['avgtemp_PAC', 'gdp_PAC', 'daysoff']

for index in range(2012, 2018):
    nextyear = '{}-01-01'.format(index)
    predictyear = '{}-01-01'.format(index + 1)

    # Split dr by date into the training window and the one-year test window.
    in_train_window = (dr.index >= startyear) & (dr.index < nextyear)
    in_test_window = (dr.index >= nextyear) & (dr.index < predictyear)
    dftrain = dr[in_train_window]
    dftest = dr[in_test_window]

    train_nights = np.array(dftrain['nights_PAC'])
    train_exogenous = np.array(dftrain[exog_columns])
    test_nights = np.array(dftest['nights_PAC'])
    test_exogenous = np.array(dftest[exog_columns])

    # Seasonal (m=12) stepwise order search on this fold's training data.
    tc_fit = auto_arima(
        y=train_nights,
        exogenous=train_exogenous,
        start_p=1, max_p=3,
        start_q=1, max_q=3,
        d=1,
        start_P=1, D=1, m=12,
        seasonal=True,
        stepwise=True,
        trace=True,
        error_action='ignore',    # stay silent when a candidate order cannot be fitted
        suppress_warnings=True,   # hide convergence warnings
    )

    # Forecast 12 steps ahead using the test window's exogenous values.
    tc_future_forecast = tc_fit.predict(n_periods=12, exogenous=test_exogenous)

    # Score the fold.
    mse = mean_squared_error(test_nights, tc_future_forecast)
    mae_fold.append(mean_absolute_error(test_nights, tc_future_forecast))
    mse_fold.append(mse)
    rmse_fold.append(sqrt(mse))
    smase_fold.append(seasonal_MASE(test_nights, tc_future_forecast))

# Report the per-fold scores, one metric after another.
for metric_label, fold_scores in (('MAE:', mae_fold),
                                  ('MSE:', mse_fold),
                                  ('RMSE:', rmse_fold),
                                  ('SMASE:', smase_fold)):
    print(metric_label)
    print(fold_scores)

Fit ARIMA: order=(1, 1, 1) seasonal_order=(1, 1, 1, 12); AIC=nan, BIC=nan, Fit time=nan seconds
Fit ARIMA: order=(0, 1, 0) seasonal_order=(0, 1, 0, 12); AIC=136.489, BIC=138.478, Fit time=0.038 seconds
Fit ARIMA: order=(1, 1, 0) seasonal_order=(1, 1, 0, 12); AIC=nan, BIC=nan, Fit time=nan seconds
Fit ARIMA: order=(0, 1, 1) seasonal_order=(0, 1, 1, 12); AIC=nan, BIC=nan, Fit time=nan seconds
Fit ARIMA: order=(0, 1, 0) seasonal_order=(1, 1, 0, 12); AIC=nan, BIC=nan, Fit time=nan seconds
Fit ARIMA: order=(0, 1, 0) seasonal_order=(0, 1, 1, 12); AIC=nan, BIC=nan, Fit time=nan seconds
Fit ARIMA: order=(0, 1, 0) seasonal_order=(1, 1, 1, 12); AIC=nan, BIC=nan, Fit time=nan seconds
Fit ARIMA: order=(1, 1, 0) seasonal_order=(0, 1, 0, 12); AIC=138.103, BIC=140.491, Fit time=0.252 seconds
Fit ARIMA: order=(0, 1, 1) seasonal_order=(0, 1, 0, 12); AIC=134.866, BIC=137.253, Fit time=0.459 seconds
Fit ARIMA: order=(1, 1, 2) seasonal_order=(0, 1, 0, 12); AIC=134.377, BIC=137.560, Fit time=0.583 seconds


Fit ARIMA: order=(0, 1, 1) seasonal_order=(1, 1, 0, 12); AIC=561.521, BIC=574.472, Fit time=1.153 seconds
Total fit time: 22.085 seconds
Fit ARIMA: order=(1, 1, 1) seasonal_order=(1, 1, 1, 12); AIC=687.799, BIC=706.497, Fit time=1.874 seconds
Fit ARIMA: order=(0, 1, 0) seasonal_order=(0, 1, 0, 12); AIC=709.629, BIC=720.017, Fit time=0.115 seconds
Fit ARIMA: order=(1, 1, 0) seasonal_order=(1, 1, 0, 12); AIC=699.327, BIC=713.870, Fit time=1.229 seconds
Fit ARIMA: order=(0, 1, 1) seasonal_order=(0, 1, 1, 12); AIC=687.750, BIC=702.293, Fit time=1.452 seconds
Fit ARIMA: order=(0, 1, 1) seasonal_order=(1, 1, 1, 12); AIC=689.214, BIC=705.834, Fit time=1.780 seconds
Fit ARIMA: order=(0, 1, 1) seasonal_order=(0, 1, 0, 12); AIC=698.393, BIC=710.858, Fit time=0.506 seconds
Fit ARIMA: order=(0, 1, 1) seasonal_order=(0, 1, 2, 12); AIC=nan, BIC=nan, Fit time=nan seconds
Fit ARIMA: order=(0, 1, 1) seasonal_order=(1, 1, 2, 12); AIC=nan, BIC=nan, Fit time=nan seconds
Fit ARIMA: order=(1, 1, 1) seasonal