<a href="https://colab.research.google.com/github/jeremysb1/forecasting/blob/main/sarimax_parameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries and Data

In [4]:
!pip install pmdarima

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pmdarima
  Downloading pmdarima-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pmdarima
Successfully installed pmdarima-2.0.3


In [5]:
# Switch the working directory to the project folder so that the relative
# path 'nyc_data.csv' used when loading the data resolves inside it.
# NOTE(review): assumes Google Drive is already mounted at /content/drive;
# no mount step is visible in this part of the notebook - confirm.
%cd /content/drive/MyDrive/Time Series Forecasting Product

/content/drive/MyDrive/Time Series Forecasting Product


In [6]:
# libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import ParameterGrid
import pmdarima as pm
from pmdarima import model_selection

In [7]:
# Read the NYC demand dataset.
# The first CSV column (dates written as YYYY-MM-DD) is used as the index
# and parsed as dates.
df = pd.read_csv(
    'nyc_data.csv',
    parse_dates=True,
    index_col=0,
)

# preview the first rows
df.head()

Unnamed: 0_level_0,Demand,Easter,Thanksgiving,Christmas,Temperature,Marketing
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-01-01,720.000885,0,0,0,3.68,41.305
2015-01-02,581.276773,0,0,0,4.73,131.574
2015-01-03,754.117039,0,0,0,7.23,162.7
2015-01-04,622.252774,0,0,0,10.96,160.281
2015-01-05,785.373319,0,0,0,6.92,51.077


In [8]:
# Rename the target column 'Demand' to 'y'; all other columns keep their names.
df = df.rename({'Demand': 'y'}, axis='columns')

# display only the header row to confirm the new column name
df.head(0)

Unnamed: 0_level_0,y,Easter,Thanksgiving,Christmas,Temperature,Marketing
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [9]:
# extract regressors
# Everything except the target 'y' is an exogenous regressor. Selecting by
# name instead of by column position keeps the split correct even if the
# column order of the CSV changes.
X = df.drop(columns = ['y'])
X.head(0)

Unnamed: 0_level_0,Easter,Thanksgiving,Christmas,Temperature,Marketing
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


# Stationarity

In [13]:
# Augmented Dickey-Fuller test on the raw target series
from statsmodels.tsa.stattools import adfuller

# element 1 of the returned tuple is the p-value
pvalue = adfuller(df['y'])[1]

# report the outcome at the 5% significance level
print(
    f"The Time Series is stationary. The p-value is {pvalue}"
    if pvalue < 0.05
    else f"The time series is not stationary. The p-value is {pvalue}"
)

The time series is not stationary. The p-value is 0.37677707077291045


In [14]:
# differencing
# Take the first difference once and keep it in a variable so the test below
# reuses it (a bare expression in the middle of a cell is neither displayed
# nor stored). dropna() removes the leading NaN that diff() produces.
y_diff = df.y.diff().dropna()

# test: element 1 of the adfuller result is the p-value
pvalue = adfuller(x = y_diff)[1]

# condition to read test (5% significance level)
if pvalue < 0.05:
  print(f"The Time Series is stationary. The p-value is {pvalue}")
else:
  print(f"The Time Series is not stationary. The p-value is {pvalue}")

The Time Series is stationary. The p-value is 3.3557739456287946e-22


# SARIMAX Model

In [15]:
# model
# Seasonal ARIMA specification: (p, d, q) = (1, 1, 1) and seasonal
# (P, D, Q, m) = (1, 1, 1, 7), i.e. a weekly cycle on daily observations.
# Seasonal period m by data frequency ->
# hourly: 24, daily: 7, weekly: 52, monthly: 12, quarterly: 4
#
# Keyword names matter here: anything pm.ARIMA does not recognise is collected
# into **sarimax_kwargs and forwarded to statsmodels' SARIMAX, so a misspelled
# option never takes effect. The correct spellings are `suppress_warnings`
# (pmdarima) and `enforce_stationarity` (SARIMAX).
# The exogenous regressors are not a constructor option; pmdarima receives
# them through the `X` argument of fit() / cross_val_score().
model = pm.ARIMA(order = (1, 1, 1),
                 seasonal_order = (1, 1, 1, 7),
                 suppress_warnings = True,
                 enforce_stationarity = False)


In [None]:
# cross-validation
# Rolling-origin scheme: the first training window is everything except the
# last 180 observations, each fold forecasts h = 31 steps ahead, and the
# training window grows by step = 16 observations between folds.
cv = model_selection.RollingForecastCV(h = 31,
                                       step = 16,
                                       initial = df.shape[0] - 180)

# The regressors must be handed to cross_val_score explicitly; without X
# every fold is fitted on y alone and the exogenous variables are never used.
# error_score: a fold whose fit fails is scored with this very large number
# instead of raising, so a broken fold shows up as an absurd error value.
cv_score = model_selection.cross_val_score(model,
                                           y = df.y,
                                           X = X,
                                           scoring = 'mean_squared_error',
                                           cv = cv,
                                           verbose = 2,
                                           error_score = 1000000000000000)

In [17]:
# CV performance: RMSE computed as the square root of the mean of the fold scores
error = np.sqrt(np.mean(cv_score))