In [1]:
!pip install sktime

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import statsmodels
#import xgboost as xgb

In [3]:
# Load the cleaned table; the CSV's first column is used as the row index.
data = pd.read_csv(
    "https://raw.githubusercontent.com/ladyjossy77/hierarchical-forecasting/master/data/clean_data.csv",
    index_col=0,
)

In [4]:
# Load the table that supplies the forecasting targets; the CSV's first column
# is used as the row index.
Y_data = pd.read_csv(
    "https://raw.githubusercontent.com/ladyjossy77/hierarchical-forecasting/master/data/forecast.csv",
    index_col=0,
)

In [5]:
# Parse the purchase dates and collapse them to monthly periods.
# NOTE(review): this overwrites the column in place, so the cell is meant to be
# run once per kernel session.
Y_data["dateOfPurchase"] = (
    pd.to_datetime(Y_data["dateOfPurchase"])
    .dt.to_period(freq="M")
)

In [6]:
# Same monthly-period conversion for the feature table, so both tables share
# one time granularity.
# NOTE(review): this overwrites the column in place, so the cell is meant to be
# run once per kernel session.
data["dateOfPurchase"] = (
    pd.to_datetime(data["dateOfPurchase"])
    .dt.to_period(freq="M")
)

In [7]:
# Roll the targets up to one row per (continent, product category, month).
# The result is indexed by a three-level MultiIndex in that order.
# The aggregator is given as the string "sum" rather than the builtin `sum`:
# pandas maps both to its own groupby sum today, but passing the builtin is
# deprecated and emits a FutureWarning on recent pandas versions.
Y_data = Y_data.groupby(["CustomerContinent", "ProductCategory", "dateOfPurchase"]).agg(
    TotalQuantity=pd.NamedAgg(column="TotalQuantity", aggfunc="sum"),
    TotalRevenue=pd.NamedAgg(column="TotalRevenue", aggfunc="sum"),
)

In [8]:
# Preview the first rows of the cleaned table (targets and features together).
data.head(n=5)

Unnamed: 0,CustomerContinent,ProductCategory,dateOfPurchase,TotalQuantity,TotalRevenue,AvgPrice,MedPrice,AvgCost,AvgDiscount
0,Africa,Automotive,2016-01,1,961.27,991.0,991.0,307.21,0.03
1,Africa,Automotive,2016-02,22,6736.86,323.0,269.0,195.56,0.05
2,Africa,Automotive,2016-03,10,6633.2,721.0,721.0,591.22,0.08
3,Africa,Automotive,2016-04,23,10619.63,507.75,504.5,258.9825,0.0575
4,Africa,Automotive,2016-05,18,8175.08,479.25,437.5,238.255,0.065


In [9]:
# Exogenous features only: remove the two target columns from the cleaned table.
X_data = data.drop(columns=["TotalQuantity", "TotalRevenue"])

In [10]:
# Confirm the target columns are gone and only feature columns remain.
X_data.head(n=5)

Unnamed: 0,CustomerContinent,ProductCategory,dateOfPurchase,AvgPrice,MedPrice,AvgCost,AvgDiscount
0,Africa,Automotive,2016-01,991.0,991.0,307.21,0.03
1,Africa,Automotive,2016-02,323.0,269.0,195.56,0.05
2,Africa,Automotive,2016-03,721.0,721.0,591.22,0.08
3,Africa,Automotive,2016-04,507.75,504.5,258.9825,0.0575
4,Africa,Automotive,2016-05,479.25,437.5,238.255,0.065


In [11]:
# Collapse the features to one row per (continent, product category, month),
# giving the same three-level MultiIndex layout as the aggregated targets.
#
# The price columns hold per-row statistics (an average and a median), so they
# are combined with "mean" / "median" rather than summed: summing would inflate
# the price whenever a key has more than one row, and would be inconsistent with
# how AvgDiscount and AvgCost are combined. For keys that have exactly one row
# the result is identical to the previous sum-based aggregation.
#
# Aggregators are passed as strings; handing pandas the builtin `sum` or
# `np.mean` is deprecated and emits a FutureWarning on recent pandas versions.
X_data = X_data.groupby(["CustomerContinent", "ProductCategory", "dateOfPurchase"]).agg(
    AvgPrice=pd.NamedAgg(column="AvgPrice", aggfunc="mean"),
    MedPrice=pd.NamedAgg(column="MedPrice", aggfunc="median"),
    AvgDiscount=pd.NamedAgg(column="AvgDiscount", aggfunc="mean"),
    AvgCost=pd.NamedAgg(column="AvgCost", aggfunc="mean"),
)

In [12]:
# Preview the aggregated features, now indexed by continent / category / month.
X_data.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AvgPrice,MedPrice,AvgDiscount,AvgCost
CustomerContinent,ProductCategory,dateOfPurchase,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Africa,Automotive,2016-01,991.0,991.0,0.03,307.21
Africa,Automotive,2016-02,323.0,269.0,0.05,195.56
Africa,Automotive,2016-03,721.0,721.0,0.08,591.22
Africa,Automotive,2016-04,507.75,504.5,0.0575,258.9825
Africa,Automotive,2016-05,479.25,437.5,0.065,238.255


In [13]:
# Chronological split on the month level of the index:
# months before 2019-01 are used for training, 2019-01 onward for validation.
y_train = Y_data.loc[Y_data.index.get_level_values("dateOfPurchase") < "2019-01"]
y_validate = Y_data.loc[Y_data.index.get_level_values("dateOfPurchase") >= "2019-01"]

In [14]:
# Standardize the training targets. The scaler is fitted on the training rows
# only, so statistics from the validation period do not leak into it.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# fit_transform is given a bare array and returns one, so the row index and the
# column labels are saved here and re-attached when the frame is rebuilt.
index = y_train.index
columns = y_train.columns
scaled_values = scaler.fit_transform(y_train.values)

In [15]:
# Re-wrap the standardized target array with the original row index and column labels.
scaled_y = pd.DataFrame(data=scaled_values, index=index, columns=columns)
scaled_y.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,TotalQuantity,TotalRevenue
CustomerContinent,ProductCategory,dateOfPurchase,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,Automotive,2016-01,-0.874053,-0.68007
Africa,Automotive,2016-02,1.406147,0.500698
Africa,Automotive,2016-03,0.103175,0.479506
Africa,Automotive,2016-04,1.514727,1.294496
Africa,Automotive,2016-05,0.971823,0.79473


In [16]:
from sktime.forecasting.base import ForecastingHorizon

# Absolute forecasting horizon: 24 consecutive monthly periods, 2019-01 through
# 2020-12. The periods are built directly with `pd.period_range` instead of
# converting a `pd.date_range(..., freq="M")`, which avoids the deprecated "M"
# month-end alias for timestamps on recent pandas and yields the same index.
fh = ForecastingHorizon(
    pd.period_range("2019-01", periods=24, freq="M"), is_relative=False
)
fh

ForecastingHorizon(['2019-01', '2019-02', '2019-03', '2019-04', '2019-05', '2019-06',
             '2019-07', '2019-08', '2019-09', '2019-10', '2019-11', '2019-12',
             '2020-01', '2020-02', '2020-03', '2020-04', '2020-05', '2020-06',
             '2020-07', '2020-08', '2020-09', '2020-10', '2020-11', '2020-12'],
            dtype='period[M]', is_relative=False)

In [17]:
# Forecasters and composition tools. Everything is imported from sktime's
# public packages; underscore-prefixed modules (e.g. `compose._pipeline`,
# `model_selection._tune`) are private and may be moved between releases.
from sktime.forecasting.var import VAR
from sktime.forecasting.varmax import VARMAX
from sktime.forecasting.dynamic_factor import DynamicFactor
from sktime.forecasting.model_selection import ForecastingGridSearchCV
from sktime.forecasting.compose import (
    EnsembleForecaster,
    HierarchyEnsembleForecaster,
    Permute,
    TransformedTargetForecaster,
)
import statsmodels


In [18]:
# Named base models (default hyper-parameters) to be combined by the ensemble.
forecasters = [
    ("var", VAR()),
    ("varmax", VARMAX()),
]

# Ensemble over both base models, using EnsembleForecaster's default settings.
forecaster = EnsembleForecaster(forecasters=forecasters)

In [19]:
# Standardize the exogenous features before fitting, using only the rows
# from before 2019-01 (the same cut-off as the target training split).
X_train = X_data.loc[X_data.index.get_level_values("dateOfPurchase") < "2019-01"]

X_scaler = StandardScaler()
# Labels are kept so the scaled array can be turned back into a frame.
index_X, columns_X = X_train.index, X_train.columns
# NOTE(review): this rebinds `scaled_values`, which the target-scaling cell also
# assigns; re-running earlier cells out of order would pick up the wrong array.
scaled_values = X_scaler.fit_transform(X_train.to_numpy())


In [20]:
# Re-wrap the standardized feature array with the original row index and column labels.
scaled_X = pd.DataFrame(data=scaled_values, index=index_X, columns=columns_X)
scaled_X.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AvgPrice,MedPrice,AvgDiscount,AvgCost
CustomerContinent,ProductCategory,dateOfPurchase,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Africa,Automotive,2016-01,2.262561,2.119519,-1.141758,0.326386
Africa,Automotive,2016-02,-0.806172,-0.989227,-0.238298,-0.469649
Africa,Automotive,2016-03,1.022205,0.956968,1.116892,2.351301
Africa,Automotive,2016-04,0.042553,0.024775,0.100499,-0.017463
Africa,Automotive,2016-05,-0.088373,-0.26371,0.439297,-0.165245


In [None]:
# Fit the ensemble on the standardized targets, with the standardized features
# as exogenous input and the absolute monthly horizon defined earlier.
# NOTE(review): sktime's `fit` is documented to return the estimator itself, so
# `exo_model_one` is presumably the same object as `forecaster` — confirm before
# fitting a second variant on the same instance.
exo_model_one = forecaster.fit(
    y=scaled_y,
    X=scaled_X,
    fh=fh,
)