In [1]:
!pip install sktime

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import statsmodels
#import xgboost as xgb

In [3]:
data = pd.read_csv(r'https://raw.githubusercontent.com/ladyjossy77/hierarchical-forecasting/master/data/clean_data.csv', index_col =0)

In [4]:
Y_data = pd.read_csv(r'https://raw.githubusercontent.com/ladyjossy77/hierarchical-forecasting/master/data/forecast.csv', index_col = 0)

In [5]:
Y_data["dateOfPurchase"] = pd.to_datetime(Y_data["dateOfPurchase"]).dt.to_period('M')

In [6]:
data["dateOfPurchase"] = pd.to_datetime(data["dateOfPurchase"]).dt.to_period('M')

In [7]:
Y_data = Y_data.groupby(["CustomerContinent", "ProductCategory", "dateOfPurchase"]).agg(
                                TotalQuantity = pd.NamedAgg(column = "TotalQuantity", aggfunc=sum),
                                TotalRevenue = pd.NamedAgg(column = "TotalRevenue", aggfunc = sum)
)

In [8]:
data.head()

Unnamed: 0,CustomerContinent,ProductCategory,dateOfPurchase,TotalQuantity,TotalRevenue,AvgPrice,MedPrice,AvgCost,AvgDiscount
0,Africa,Automotive,2016-01,1,961.27,991.0,991.0,307.21,0.03
1,Africa,Automotive,2016-02,22,6736.86,323.0,269.0,195.56,0.05
2,Africa,Automotive,2016-03,10,6633.2,721.0,721.0,591.22,0.08
3,Africa,Automotive,2016-04,23,10619.63,507.75,504.5,258.9825,0.0575
4,Africa,Automotive,2016-05,18,8175.08,479.25,437.5,238.255,0.065


In [9]:
exo_data = data.drop(["TotalQuantity", "TotalRevenue"], axis = 1)

In [10]:
exo_data.head()

Unnamed: 0,CustomerContinent,ProductCategory,dateOfPurchase,AvgPrice,MedPrice,AvgCost,AvgDiscount
0,Africa,Automotive,2016-01,991.0,991.0,307.21,0.03
1,Africa,Automotive,2016-02,323.0,269.0,195.56,0.05
2,Africa,Automotive,2016-03,721.0,721.0,591.22,0.08
3,Africa,Automotive,2016-04,507.75,504.5,258.9825,0.0575
4,Africa,Automotive,2016-05,479.25,437.5,238.255,0.065


In [11]:
# ensuring consistent date intervals for X data
cat_combination = exo_data[["CustomerContinent", "ProductCategory"]].drop_duplicates()
time = data[["dateOfPurchase"]].drop_duplicates()
time["dateOfPurchase"].nunique()
indexes = cat_combination.merge(time, how='cross')
X_data = indexes.merge(exo_data, on = ["CustomerContinent", "ProductCategory", "dateOfPurchase"], how = 'left')

In [12]:
X_data = X_data.groupby(["CustomerContinent", "ProductCategory", "dateOfPurchase"]).agg(
                                AvgPrice = pd.NamedAgg(column = "AvgPrice", aggfunc=sum),
                                MedPrice = pd.NamedAgg(column = "MedPrice", aggfunc = sum),
                                AvgDiscount = pd.NamedAgg(column ="AvgDiscount", aggfunc=np.mean),
                                AvgCost = pd.NamedAgg(column ="AvgCost", aggfunc=np.mean),
)

In [13]:
X_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AvgPrice,MedPrice,AvgDiscount,AvgCost
CustomerContinent,ProductCategory,dateOfPurchase,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Africa,Automotive,2016-01,991.0,991.0,0.03,307.21
Africa,Automotive,2016-02,323.0,269.0,0.05,195.56
Africa,Automotive,2016-03,721.0,721.0,0.08,591.22
Africa,Automotive,2016-04,507.75,504.5,0.0575,258.9825
Africa,Automotive,2016-05,479.25,437.5,0.065,238.255


In [14]:
X_data = X_data.fillna(0.0)

In [15]:
y_train = Y_data[Y_data.index.get_level_values('dateOfPurchase')<"2019-01"]
y_validate = Y_data[Y_data.index.get_level_values('dateOfPurchase')>="2019-01"]

In [16]:
#standardizing data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
y_train.values
index = y_train.index
columns = y_train.columns
scaled_values = scaler.fit_transform(y_train.values)

In [17]:
scaled_y = pd.DataFrame(scaled_values, columns = columns, index = index)
scaled_y.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,TotalQuantity,TotalRevenue
CustomerContinent,ProductCategory,dateOfPurchase,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,Automotive,2016-01,-0.874053,-0.68007
Africa,Automotive,2016-02,1.406147,0.500698
Africa,Automotive,2016-03,0.103175,0.479506
Africa,Automotive,2016-04,1.514727,1.294496
Africa,Automotive,2016-05,0.971823,0.79473


In [18]:
from sktime.forecasting.base import ForecastingHorizon
fh = ForecastingHorizon(
    pd.PeriodIndex(pd.date_range("2019-01", periods= 24, freq="M")), is_relative=False
)
fh

ForecastingHorizon(['2019-01', '2019-02', '2019-03', '2019-04', '2019-05', '2019-06',
             '2019-07', '2019-08', '2019-09', '2019-10', '2019-11', '2019-12',
             '2020-01', '2020-02', '2020-03', '2020-04', '2020-05', '2020-06',
             '2020-07', '2020-08', '2020-09', '2020-10', '2020-11', '2020-12'],
            dtype='period[M]', is_relative=False)

In [19]:
from sktime.forecasting.var import VAR
from sktime.forecasting.varmax import VARMAX
from sktime.forecasting.dynamic_factor import DynamicFactor
from sktime.forecasting.model_selection._tune import ForecastingGridSearchCV
from sktime.forecasting.compose._pipeline import Permute
from sktime.forecasting.compose._pipeline import TransformedTargetForecaster
from sktime.forecasting.compose._hierarchy_ensemble import HierarchyEnsembleForecaster
from sktime.forecasting.compose._ensemble import EnsembleForecaster
import statsmodels


In [20]:
forecasters = [("var", VAR()),
               ("varmax", VARMAX())]
  
forecaster = EnsembleForecaster(forecasters)

In [21]:
#scaling X values before fitting
# 

X_scaler = StandardScaler()
index_X = X_data.index
columns_X = X_data.columns
scaled_values = X_scaler.fit_transform(X_data.values)


In [22]:
scaled_X = pd.DataFrame(scaled_values, columns = columns_X, index = index_X)
scaled_X.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AvgPrice,MedPrice,AvgDiscount,AvgCost
CustomerContinent,ProductCategory,dateOfPurchase,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Africa,Automotive,2016-01,2.164787,2.104211,-0.309387,0.704538
Africa,Automotive,2016-02,-0.120003,-0.296764,0.331049,0.045495
Africa,Automotive,2016-03,1.241294,1.20634,1.291702,2.38098
Africa,Automotive,2016-04,0.511905,0.48638,0.571212,0.419863
Africa,Automotive,2016-05,0.414426,0.263574,0.811375,0.297513


In [23]:
X_train = scaled_X[scaled_X.index.get_level_values('dateOfPurchase')<"2019-01"]
X_validate = scaled_X[scaled_X.index.get_level_values('dateOfPurchase')>="2019-01"]

In [24]:
exo_model_one = forecaster.fit(y = scaled_y, X = X_train, fh =fh)

  warn('Non-stationary starting autoregressive parameters'


In [26]:
predict_exo_one = exo_model_one.predict(X = X_validate, fh=fh)

In [28]:
#model evaluation
predicted_index = predict_exo_one.index
predicted_columns = predict_exo_one.columns

inversed_prediction = scaler.inverse_transform(predict_exo_one)
inversed_prediction
predicted_data = pd.DataFrame(inversed_prediction, index = predicted_index, columns = predicted_columns)
predicted_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,TotalQuantity,TotalRevenue
CustomerContinent,ProductCategory,dateOfPurchase,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,Automotive,2019-01,15.403664,8259.025413
Africa,Automotive,2019-02,14.212673,6620.540456
Africa,Automotive,2019-03,14.308195,6521.802724
Africa,Automotive,2019-04,12.924828,6267.879781
Africa,Automotive,2019-05,15.180075,7188.377262


In [29]:
from sklearn.metrics import r2_score

In [30]:
score_exo_one = r2_score(y_validate, predicted_data)
score_exo_one

0.32072815446216635

In [31]:
from sktime.forecasting.model_selection import SlidingWindowSplitter
from sktime.forecasting.model_selection import SingleWindowSplitter
forecaster_set = [("var", VAR()), 
                  ("varmax", VARMAX())]
forecaster = EnsembleForecaster(forecasters= forecaster_set, weights= [4,10])
# fh = ForecastingHorizon(
#     pd.PeriodIndex(pd.date_range("2019-01", periods= 24, freq="M")), is_relative=False
# )
fh = pd.PeriodIndex(pd.date_range("2019-01", periods= 24, freq="M"))

cv = SingleWindowSplitter(fh = 24)

In [32]:
from sktime.forecasting.model_selection._tune import ForecastingRandomizedSearchCV

In [34]:
param_grid = {
    'var__ic': ['aic', 'fpe', 'hqic'],
    'var__method' :["ols","gls", "gmm"],
    'var__trend':["c", "ct"],
    'varmax__method':["newton","nf","bfgs"],
    'varmax__trend':["n", "c","t", ],
    'weights': [(0.33, 0.67), (0.6, 0.4), (0.5,0.5)],
    'n_jobs':[-1],
    'aggfunc':['mean', 'median']
}

In [35]:
forecasters = [("var", VAR()),
               ("varmax", VARMAX())]
  
forecaster = EnsembleForecaster(forecasters)

In [36]:
from sktime.performance_metrics.forecasting import mean_absolute_percentage_error

In [39]:
rdscv = ForecastingRandomizedSearchCV(forecaster= forecaster, cv=cv, scoring= mean_absolute_percentage_error, param_distributions= param_grid, error_score='raise')

In [40]:
tuned_exo = rdscv.fit(y = scaled_y, X = X_train, fh =fh)

LinAlgError: ignored