In [1]:
import pandas as pd
import numpy as np
import datetime 
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt
from pyramid.arima import auto_arima
from sklearn.externals import joblib
from collections import defaultdict
from itertools import product
import pytemperature
import seaborn as sns

In [2]:
def seasonal_MASE(last_season, this_season, forecast):
    '''
    Parameters
    ----------
    last_season: array containing data for last season (size S)
    
    current_truth: array containing data for current season (size S)
    
    forecast: array containing data predictiing current season (size S)
    '''
    forecast_errors = np.abs(this_season - forecast)
    #print(forecast_errors)
    mean_absolute_forecast_error = np.average(forecast_errors)
    #print('forecast MAE', mean_absolute_forecast_error)
    
    naive = last_season
    naive.index = this_season.index
    naive_errors = np.abs(this_season - naive)
    #print(naive_errors)
    mean_absolute_naive_error = np.average(naive_errors)
    #print('naive MAE', mean_absolute_naive_error)
    
    return mean_absolute_forecast_error / mean_absolute_naive_error

In [3]:
nights =  pd.read_csv('/Users/jinny/Documents/touristcast/exercises/datasets/nights_2010-2017.csv', index_col='date', parse_dates=True, usecols=[*range(1, 15)])
nights.index.freq = 'MS'
temp = pd.read_csv('/Users/jinny/Documents/touristcast/exercises/avgtemp_2010-2019.csv', parse_dates=True, index_col=0)
temp = pytemperature.c2k(temp)
daysoff_1017 = pd.read_csv('/Users/jinny/Documents/touristcast/exercises/datasets/daysoff_2010-2017.csv', parse_dates=True, index_col='date', usecols=[*range(1, 3)])
daysoff_1017.drop(daysoff_1017[(daysoff_1017.index>='2016-01-01')&(daysoff_1017.index<'2018-01-01')].index, inplace=True)
daysoff_1619 = pd.read_csv('/Users/jinny/Documents/touristcast/exercises/daysoff_2016-2019.csv', parse_dates=True, index_col=0)
daysoff_1619.rename(columns={'NbDaysOff': 'daysoff'}, inplace=True)
daysoff = pd.concat([daysoff_1017, daysoff_1619], axis=0)
# exog = pd.concat([temp, daysoff], axis=1)

In [4]:
args = {
    'region': ['CVL', 'IDF', 'NAQ', 'ARA', 'PAC', 'PDL', 
               'BRE', 'OCC', 'COR', 'BFC', 'GES', 'HDF', 'NOR'],
    # periods is a tuple of (training start year, training end year, test year)
    'periods': [('2010-01-01', f'{end_year}-01-01', f'{end_year + 1}-01-01')
                 for end_year in range(2012, 2017)]
}

sarimax_error_df = pd.DataFrame(columns=list(args.keys()) + ['MAE', 'RMSE', 'Max_error', 'MASE'])

for region, periods in product(*args.values()):
    start_year, end_year, predict_year = periods
    # Input data = nights only, same as testing variable
    input_data = nights[[f'nights_{region}']]
    
    # Exog = everything that's not nights
    exog = pd.concat([temp, daysoff], axis=1)
    exog = exog[[f'avgtemp_{region}', 'daysoff']]

    training_period = (input_data.index>=start_year)&(input_data.index<end_year) 
    testing_period = (input_data.index>=end_year)&(input_data.index<predict_year)

    train_data = input_data[training_period]
    test_data = np.array(input_data[testing_period])
    
    exog_train_data = exog[(exog.index>=start_year)&(exog.index<end_year)]
    exog_future_data = exog[(exog.index>=end_year)&(exog.index<predict_year)]
    
    model = auto_arima(y=np.array(train_data), exogenous=np.array(exog_train_data), 
                       start_p=1, start_q=1, max_p=3, max_q=3, start_P=1, d=1, D=1,  
                       m=12, seasonal=True, trace=False,
                       error_action='ignore', suppress_warnings=False, stepwise=True)
    
    future_forecast = model.predict(n_periods=12, exogenous=np.array(exog_future_data))
    forecast = pd.DataFrame(future_forecast, columns=[f'nights_{region}'], index=input_data[testing_period].index)

    # --------Calculated error measures for each CV step----------
    rmse_test = np.sqrt(mean_squared_error(test_data, forecast))
    mae_test = mean_absolute_error(test_data, forecast)
    years_train_data = len(train_data.index.strftime('%Y').unique().tolist())

    test = pd.Series(test_data[:,0],
                     index=input_data[testing_period].index)
    forecast.index = input_data[testing_period].index
    
    mase = seasonal_MASE(input_data[str(int(end_year[:4]) - 1)], input_data[end_year[:4]], forecast)
    max_error = np.max(np.abs(forecast[f'nights_{region}'] - test)) / test[forecast[f'nights_{region}'].idxmax(np.abs(forecast[f'nights_{region}'] - test))]
    new_row = {
        'region': region,
        'periods': years_train_data,
        'MAE': mae_test,
        'RMSE': rmse_test,
        'Max_error': max_error,
        'MASE': mase,
    }
    sarimax_error_df = sarimax_error_df.append(new_row, ignore_index=True)





  params_variance = (residuals[k_params_ma:]**2).mean()
  ret = ret.dtype.type(ret / rcount)








In [5]:
sarimax_error_df.head()

Unnamed: 0,region,periods,MAE,RMSE,Max_error,MASE
0,CVL,2,18.678354,24.058958,0.088551,0.90613
1,CVL,3,22.935219,26.573539,0.065172,0.95863
2,CVL,4,13.389392,16.027405,0.039579,0.733096
3,CVL,5,30.909648,39.100744,0.09778,1.44314
4,CVL,6,10.338523,12.563125,0.037208,0.783964


In [6]:
# sarimax_error_df.to_csv('SARIMAX_error_crossval.csv')