In [1]:
import pandas as pd
import numpy as np
import datetime 
import holidays
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt
%matplotlib inline
import seaborn as sns

import fbprophet
from sklearn.externals import joblib
from collections import defaultdict
from sklearn.externals import joblib
import matplotlib.pyplot as plt
from itertools import product

In [2]:
nights = pd.read_csv('/Users/jinny/Documents/touristcast/exercises/datasets/nights_2010-2017.csv', index_col='date', parse_dates=['date'], usecols=[*range(1, 15)])
nights.index.freq = 'MS'

In [3]:
def seasonal_MASE(last_season, this_season, forecast):
    '''
    Parameters
    ----------
    last_season: array containing data for last season (size S)
    
    current_truth: array containing data for current season (size S)
    
    forecast: array containing data predictiing current season (size S)
    '''
    forecast_errors = np.abs(this_season - forecast)
    #print(forecast_errors)
    mean_absolute_forecast_error = np.average(forecast_errors)
    #print('forecast MAE', mean_absolute_forecast_error)
    
    naive = last_season
    naive.index = this_season.index
    naive_errors = np.abs(this_season - naive)
    #print(naive_errors)
    mean_absolute_naive_error = np.average(naive_errors)
    #print('naive MAE', mean_absolute_naive_error)
    
    return mean_absolute_forecast_error / mean_absolute_naive_error

In [4]:
args = {
    'region': ['CVL', 'IDF', 'NAQ', 'ARA', 'PAC', 
               'PDL', 'BRE', 'OCC', 'COR', 'BFC', 'GES', 'HDF', 'NOR'],
    # periods is a tuple of (training start year, training end year, test year)
    'periods': [('2010-01-01', f'{end_year}-01-01', f'{end_year + 1}-01-01')
                 for end_year in range(2012, 2017)]
}

prophet_error_df = pd.DataFrame(columns=list(args.keys()) + ['MAE', 'RMSE', 'Max_error', 'MASE'])

for region, periods in product(*args.values()):
    start_year, end_year, predict_year = periods
    input_data = nights[[f'nights_{region}']]

    training_period = (input_data.index>=start_year)&(input_data.index<end_year) 
    testing_period = (input_data.index>=end_year)&(input_data.index<predict_year)

    train_data = input_data[training_period]
    test_data = np.array(input_data[testing_period])
    
    fb_holidays = []
    for holiday_year in range(2010, int(predict_year[:4])):
        for date, name in sorted(holidays.FRA(years=holiday_year).items()):
            fb_holidays.append(date.strftime('%Y-%m-%d'))

    # --------Facebook Prophet model----------
    fb = pd.DataFrame()
    fb['ds'] = train_data.index
    fb['y'] = np.array(train_data)
    holidays_exo = pd.DataFrame({  
        'holiday': 'france',
        'ds': pd.to_datetime(fb_holidays),
       })
    fb_prophet = fbprophet.Prophet(yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False,
                                   seasonality_mode='multiplicative', 
                                   seasonality_prior_scale=0.04, changepoint_prior_scale=0.1, 
                                   holidays=holidays_exo, holidays_prior_scale=0.01)
    fb_prophet.fit(fb)
    fb_forecast = fb_prophet.make_future_dataframe(periods=12, freq='M')
    fb_forecast = fb_prophet.predict(fb_forecast)
    future_forecast = np.array(fb_forecast.tail(12).yhat)

    # --------Forecasted results----------
    forecast = pd.DataFrame(future_forecast, columns=[f'nights_{region}'], index=input_data[testing_period].index)

    # --------Calculated error measures for each CV step----------
    rmse_test = np.sqrt(mean_squared_error(test_data, forecast))
    mae_test = mean_absolute_error(test_data, forecast)
    years_train_data = len(train_data.index.strftime('%Y').unique().tolist())

    test = pd.Series(test_data[:,0],
                     index=input_data[testing_period].index)
    forecast.index = input_data[testing_period].index
    
    mase = seasonal_MASE(input_data[str(int(end_year[:4]) - 1)], input_data[end_year[:4]], forecast)
    max_error = np.max(np.abs(forecast[f'nights_{region}'] - test)) / test[forecast[f'nights_{region}'].idxmax(np.abs(forecast[f'nights_{region}'] - test))]
    new_row = {
        'region': region,
        'periods': years_train_data,
        'MAE': mae_test,
        'RMSE': rmse_test,
        'Max_error': max_error,
        'MASE': mase,
    }
    prophet_error_df = prophet_error_df.append(new_row, ignore_index=True)


INFO:fbprophet.forecaster:n_changepoints greater than number of observations.Using 18.0.
  np.linspace(0, hist_size - 1, self.n_changepoints + 1)
  elif np.issubdtype(np.asarray(v).dtype, float):
INFO:fbprophet.forecaster:n_changepoints greater than number of observations.Using 18.0.
INFO:fbprophet.forecaster:n_changepoints greater than number of observations.Using 18.0.
INFO:fbprophet.forecaster:n_changepoints greater than number of observations.Using 18.0.
INFO:fbprophet.forecaster:n_changepoints greater than number of observations.Using 18.0.
INFO:fbprophet.forecaster:n_changepoints greater than number of observations.Using 18.0.
INFO:fbprophet.forecaster:n_changepoints greater than number of observations.Using 18.0.
INFO:fbprophet.forecaster:n_changepoints greater than number of observations.Using 18.0.
INFO:fbprophet.forecaster:n_changepoints greater than number of observations.Using 18.0.
INFO:fbprophet.forecaster:n_changepoints greater than number of observations.Using 18.0.
INF

In [5]:
prophet_error_df.head()

Unnamed: 0,region,periods,MAE,RMSE,Max_error,MASE
0,CVL,2,101.844712,127.349215,0.340169,4.94072
1,CVL,3,16.868236,19.403262,0.052734,0.705046
2,CVL,4,13.165598,16.139994,0.045319,0.720843
3,CVL,5,25.860059,29.848861,0.077722,1.20738
4,CVL,6,13.605493,17.778427,0.053119,1.031696


In [6]:
prophet_error_df.to_csv('Prophet_error_crossval.csv')