<a href="https://colab.research.google.com/github/hafilmr/LearningCodeTrainee/blob/main/Exponential%20Smoothing/Walk_Forward_Validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget nc https://lazyprogrammer.me/course_files/airline_passengers.csv

--2022-02-03 07:59:53--  http://nc/
Resolving nc (nc)... failed: No address associated with hostname.
wget: unable to resolve host address ‘nc’
--2022-02-03 07:59:53--  https://lazyprogrammer.me/course_files/airline_passengers.csv
Resolving lazyprogrammer.me (lazyprogrammer.me)... 104.21.23.210, 172.67.213.166, 2606:4700:3031::6815:17d2, ...
Connecting to lazyprogrammer.me (lazyprogrammer.me)|104.21.23.210|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2036 (2.0K) [text/csv]
Saving to: ‘airline_passengers.csv’


2022-02-03 07:59:53 (45.7 MB/s) - ‘airline_passengers.csv’ saved [2036/2036]

FINISHED --2022-02-03 07:59:53--
Total wall clock time: 0.4s
Downloaded: 1 files, 2.0K in 0s (45.7 MB/s)


In [None]:
!pip install -U statsmodels

Collecting statsmodels
  Downloading statsmodels-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 5.1 MB/s 
Installing collected packages: statsmodels
  Attempting uninstall: statsmodels
    Found existing installation: statsmodels 0.10.2
    Uninstalling statsmodels-0.10.2:
      Successfully uninstalled statsmodels-0.10.2
Successfully installed statsmodels-0.13.1


In [None]:
import pandas as pd
import numpy as numpy
import matplotlib.pyplot as plt

import itertools

from sklearn.metrics import mean_squared_error
from statsmodels.tsa.holtwinters import ExponentialSmoothing

In [None]:
# Load the CSV from the working directory (where wget saved it) instead of a
# hard-coded absolute path, so the notebook also runs outside of Colab.
# The 'Month' column becomes the index and is parsed as dates.
df = pd.read_csv('airline_passengers.csv', index_col='Month', parse_dates=True)

In [None]:
# Set the frequency of the dataframe's 'Month' index to 'MS'
# (the pandas offset alias for month start).
df.index.freq = 'MS'

In [None]:
# Display the dimensions of the loaded data as (rows, columns).
df.shape

(144, 1)

The next step is to set a few parameters for our test: the forecast horizon `h` is 12 and the number of walk-forward steps is 10. The effective validation period, which I've called `Ntest`, is the length of the whole data frame, minus `h`, minus the number of steps, plus one. With 144 rows this gives 144 − 12 − 10 + 1 = 123; in the code below this value is used as the row at which the first training window ends.

You might want to draw this out on paper to make sure it makes sense.

There will be some debugging code in our walk forward function to check this.



In [None]:
# Walk-forward validation settings.
h = 12      # forecast horizon: number of periods predicted at every step
steps = 10  # number of walk-forward steps to validate over

# Length of the whole dataframe, minus the horizon, minus the number of steps,
# plus one.
Ntest = len(df) - (h + steps) + 1

In [None]:
# Candidate values for every model option explored by the grid search.

trend_type_list = ["add", "mul"]
seasonal_type_list = ["add", "mul"]
damped_trend_list = [True, False]
init_method_list = ["estimated", "heuristic", "legacy-heuristic"]
# Box-Cox transform options: True / False switch it on or off.
# NOTE(review): the integer 0 is presumably a fixed lambda of 0 (log transform)
# -- confirm against the statsmodels documentation.
use_boxcox_list = [True, False, 0]

In [None]:
import numpy as np

In [None]:
def walkforward(
    trend_type,
    seasonal_type,
    damped_trend,
    init_method,
    use_boxcox,
    debug=False):
  """Score one Holt-Winters configuration with walk-forward validation.

  Uses the notebook-level globals `df` (dataframe with a 'Passengers'
  column), `h` (forecast horizon) and `Ntest` (first value of the
  training-window end position).  For every split the model is fit on
  `df.iloc[:end_of_train]` and its `h`-step forecast is compared with the
  following `h` rows.

  Args:
    trend_type: passed to ExponentialSmoothing as `trend`.
    seasonal_type: passed to ExponentialSmoothing as `seasonal`.
    damped_trend: passed to ExponentialSmoothing as `damped_trend`.
    init_method: passed to ExponentialSmoothing as `initialization_method`.
    use_boxcox: passed to ExponentialSmoothing as `use_boxcox`.
    debug: when True, print whether the last test window ended on the final
      row of `df` and how many splits were evaluated.

  Returns:
    The mean, over all splits, of the per-split mean squared error.
  """
  # one mean squared error per walk-forward split
  errors = []
  seen_last = False
  steps_completed = 0

  for end_of_train in range(Ntest, len(df) - h + 1):
    # No need to grow the training set manually: slicing the dataframe at the
    # right positions yields the train window and the h rows that follow it.
    train = df.iloc[:end_of_train]
    test = df.iloc[end_of_train:end_of_train + h]

    # sanity check: the final split should end on the last row of df
    if test.index[-1] == df.index[-1]:
      seen_last = True

    steps_completed += 1

    hw = ExponentialSmoothing(
        train['Passengers'],
        initialization_method=init_method,
        trend=trend_type,
        damped_trend=damped_trend,
        seasonal=seasonal_type,
        seasonal_periods=12,
        use_boxcox=use_boxcox)
    res_hw = hw.fit()

    # compute error for the forecast horizon
    fcast = res_hw.forecast(h)
    error = mean_squared_error(test['Passengers'], fcast)
    errors.append(error)

  if debug:
    print('seen_last', seen_last)
    print('steps completed:', steps_completed)

  # Average over every split (the list), not just the last split's error.
  return np.mean(errors)

In [None]:
# test our function
walkforward('add', 'add', False, 'legacy-heuristic', 0, debug=True)
# The arguments of walkforward are, in order:
# walkforward(trend_type, seasonal_type,
#             damped_trend, init_method, use_boxcox, debug=False)

seen_last True
steps completed: 10


1052.5905805666453

In [None]:
# Grid search space: one list of candidate values per walkforward argument,
# kept in the same order as walkforward's positional parameters.
tuple_of_option_list = (
    trend_type_list, seasonal_type_list, damped_trend_list,
    init_method_list, use_boxcox_list,
)

# Preview every combination the grid search will visit.
for x in itertools.product(*tuple_of_option_list):
  print(x)

('add', 'add', True, 'estimated', True)
('add', 'add', True, 'estimated', False)
('add', 'add', True, 'estimated', 0)
('add', 'add', True, 'heuristic', True)
('add', 'add', True, 'heuristic', False)
('add', 'add', True, 'heuristic', 0)
('add', 'add', True, 'legacy-heuristic', True)
('add', 'add', True, 'legacy-heuristic', False)
('add', 'add', True, 'legacy-heuristic', 0)
('add', 'add', False, 'estimated', True)
('add', 'add', False, 'estimated', False)
('add', 'add', False, 'estimated', 0)
('add', 'add', False, 'heuristic', True)
('add', 'add', False, 'heuristic', False)
('add', 'add', False, 'heuristic', 0)
('add', 'add', False, 'legacy-heuristic', True)
('add', 'add', False, 'legacy-heuristic', False)
('add', 'add', False, 'legacy-heuristic', 0)
('add', 'mul', True, 'estimated', True)
('add', 'mul', True, 'estimated', False)
('add', 'mul', True, 'estimated', 0)
('add', 'mul', True, 'heuristic', True)
('add', 'mul', True, 'heuristic', False)
('add', 'mul', True, 'heuristic', 0)
('add

In [None]:
# Exhaustive grid search: keep the configuration with the lowest
# walk-forward score, reporting every improvement as it is found.
best_score, best_options = float('inf'), None

for x in itertools.product(*tuple_of_option_list):
  score = walkforward(*x)

  # skip anything that is not a strict improvement
  if not score < best_score:
    continue

  print("Best Score so far:", score)
  best_score, best_options = score, x

Best Score so far: 265.371721206003
Best Score so far: 238.7749932347182


  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.T @ err
  return err.

You can see that when we run this, we get some warnings about overflows.
This is probably not an issue, since it just means we'll get a bad model for that configuration.

In [None]:
# Report the winning score, then each element of the winning configuration.
print('best score:', best_score)

# best_options is ordered like walkforward's positional arguments
trend_type, seasonal_type, damped_trend, init_method, use_boxcox = best_options
report = (
    ('trend type:', trend_type),
    ('seasonal type:', seasonal_type),
    ('damped trend:', damped_trend),
    ('init method:', init_method),
    ('use_boxcox:', use_boxcox),
)
for label, value in report:
  print(label, value)

best score: 238.7749932347182
trend type: add
seasonal type: add
damped trend: True
init method: estimated
use_boxcox: False
