In [0]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
import itertools

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import regex as re

from prophet import Prophet
from prophet.plot import add_changepoints_to_plot
from prophet.diagnostics import cross_validation
from prophet.diagnostics import performance_metrics
from prophet.plot import plot_cross_validation_metric


from statsmodels.tsa.seasonal import seasonal_decompose
#from learntools.time_series.utils import plot_periodogram, seasonal_plot

import sktime
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.utils.plotting import plot_series
from sktime.performance_metrics.forecasting import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error

Functions

In [0]:
#################
### Functions ###
#################

def get_color(name, number):
    """Return `number` hex colour strings taken from the seaborn palette `name`."""
    palette = sns.color_palette(palette=name, n_colors=number)
    return list(palette.as_hex())

def add_datepart(df, fldname, drop=True, time=False):
    """Expand the date column `fldname` of `df` into calendar feature columns, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    fldname : str
        Name of the date column. A trailing ``date``/``Date`` is stripped to
        build the prefix of the new columns (so ``'date'`` gives no prefix).
    drop : bool, default True
        Drop the original `fldname` column after the features are added.
    time : bool, default False
        Also add ``Hour``, ``Minute`` and ``Second`` columns.

    Returns
    -------
    None
        `df` is mutated; nothing is returned.
    """
    fld = df[fldname]
    # Covers both naive and tz-aware datetime columns; anything else is parsed.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time:
        attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr:
        if n == 'Week':
            # `Series.dt.week` no longer exists in pandas 2.x; the ISO calendar week
            # is the same quantity. Cast away the nullable UInt32 dtype so the column
            # is a plain integer column (float only if missing dates are present).
            week = fld.dt.isocalendar().week
            df[targ_pre + n] = week.astype('float64' if week.isna().any() else 'int64')
        else:
            df[targ_pre + n] = getattr(fld.dt, n.lower())
    # Seconds since the Unix epoch, computed from a timedelta so the result does not
    # depend on the column's datetime resolution (ns / us / s).
    epoch = pd.Timestamp(0, tz='UTC') if fld.dt.tz is not None else pd.Timestamp(0)
    df[targ_pre + 'Elapsed'] = (fld - epoch) // pd.Timedelta(seconds=1)
    if drop:
        df.drop(fldname, axis=1, inplace=True)
    
def holiday(name):
    """Return the de-duplicated (date, flag) rows of `holidays_events` for one locale.

    Reads the notebook-level `holidays_events` frame, keeps the rows whose
    `locale` equals `name`, and renames the `var` column to `name`.
    """
    in_locale = holidays_events.locale == name
    subset = holidays_events.loc[in_locale, ['date', 'var']].reset_index(drop=True)
    subset = subset.rename(columns={'var': name}).drop_duplicates()
    subset.date = subset.date.astype('datetime64[ns]')
    return subset
  
def train_test_plot(train,test,name):
    """Plot the train and test target series of one product family.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a `y` column.
    name : str
        Family name used in the plot titles.

    Draws two sktime `plot_series` figures (train + test, then test alone) and
    prints the two series lengths. Returns None.
    """
    y_train = train.y
    # Work on a copy: re-indexing the Series taken straight from `test` would
    # otherwise change an object that may be shared with the caller's frame.
    y_test = test.y.copy()
    # Shift the test index so it continues after the training index on the x-axis.
    y_test.index = y_test.index + max(y_train.index)
    print(y_train.shape[0], y_test.shape[0])

    # No explicit plt.figure() here: plot_series is called without an `ax`, and the
    # previously pre-created figure was left blank.
    plot_series(y_train, y_test, labels=["y_train", "y_test"], title = f'{name} Train-test plot');
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
    plot_series(y_test, title = f'{name} Test plot');
    plt.show()

########################
### Cross validation ###
########################

def cv_exam(model, test_df, y_test, initial, period, horizon):
    """Forecast the test frame and cross-validate a fitted Prophet model.

    Returns a 4-tuple: the cross-validation frame (with an added `residual`
    column), its performance metrics, the forecast on `test_df`, and `test_df`
    joined to the forecast `yhat` (re-indexed like `y_test`).
    """
    forecast = model.predict(test_df)

    df_cv = cross_validation(model, initial=initial, period=period, horizon=horizon)
    # Metrics are computed before the residual column is attached.
    df_p = performance_metrics(df_cv)
    df_cv['residual'] = df_cv['y'] - df_cv['yhat']

    predictions = forecast[['yhat', 'ds']]
    prop_test = test_df.merge(predictions, on=['ds'], how='inner')
    prop_test.index = y_test.index

    return df_cv, df_p, forecast, prop_test

def perf_vals(dset, meas = 'smape'):
    """Rank models by the mean of one performance metric.

    NOTE(review): a second `perf_vals` is defined later in this file and rebinds
    this name; consider deleting one of the two copies.

    Parameters
    ----------
    dset : pandas.DataFrame
        Frame with a `Model` column and a metric column named `meas`.
    meas : str, default 'smape'
        Metric column to average per model.

    Returns
    -------
    pandas.DataFrame
        One row per model (index `Model`), sorted ascending by mean `meas`
        (models whose mean is NaN sort last). The best model is also printed.
    """
    mean_by_model = dset.groupby(['Model'])[meas].mean().to_frame()
    ranked = mean_by_model.sort_values(by = [meas])

    # idxmin ignores NaN means; the builtin min() over a Series returns NaN when
    # the first value is NaN, which previously led to an IndexError.
    valid = mean_by_model[meas].dropna()
    if valid.empty:
        print(f"No valid {meas} values to rank")
    else:
        print(f"The best model is the {valid.idxmin()} model")

    return ranked

def mod_diag(model, mod_name, test_df, y_train, y_test, name, initial, period, horizon):
    """Run diagnostics for one fitted Prophet model and plot the results.

    Parameters
    ----------
    model : fitted Prophet model
    mod_name : str
        Model label used in printed messages and plot titles.
    test_df : pandas.DataFrame
        Test frame passed to `cv_exam` (must contain `ds` and `y`).
    y_train, y_test : pandas.Series
        Target series used for the train/test and prediction plots.
    name : str
        Product-family name used in messages and titles.
    initial, period, horizon : str
        Cross-validation windows forwarded to `cv_exam`.

    Returns
    -------
    tuple
        ``(df_cv, df_p, forecast, prop_test)`` exactly as returned by `cv_exam`.
    """
    # plot_series is called without an `ax`; the figure that used to be created
    # just before this call stayed blank, so it is no longer created.
    plot_series(y_train, y_test, labels=["y_train", "y_test"], title = f'{name} Train-test plot');

    df_cv, df_p, forecast, prop_test = cv_exam(model, test_df, y_test, initial, period, horizon)
    fig = model.plot(forecast)
    add_changepoints_to_plot(fig.gca(), model, forecast)
    mape_ph = mean_absolute_percentage_error(prop_test['y'], prop_test['yhat'], symmetric=True)
    # RMSE as sqrt(MSE): the `squared=False` keyword is not accepted by recent
    # scikit-learn releases, while this form works on every version.
    mse_ph = np.sqrt(mean_squared_error(prop_test['y'], prop_test['yhat']))
    print(f"The Smape loss value for {name} and {mod_name} model is {mape_ph:.6f}")
    print(f"The RMSE loss value for {name} and {mod_name} model is {mse_ph:.6f}")

    plot_series(prop_test['yhat'], y_test, labels=["y_pred","y_test"], title = f'{name} {mod_name} Test predict-actuals plot')
    plt.show();

    fig = model.plot_components(forecast)
    # Residuals of the cross-validation folds over time, on a fresh figure.
    plt.figure(figsize=(19, 15))
    ax = sns.lineplot(x="ds", y="residual", markers=True, data=df_cv)
    ax.set(xlabel='Dates', ylabel='Residuals')
    plt.show();
    return df_cv, df_p, forecast, prop_test

def perf_vals(dset, meas = 'smape'):
    """Rank models by the mean of one performance metric.

    NOTE(review): this definition duplicates an earlier `perf_vals` in this file
    and is the one in effect, because a later `def` rebinds the name; consider
    deleting one of the two copies.

    Parameters
    ----------
    dset : pandas.DataFrame
        Frame with a `Model` column and a metric column named `meas`.
    meas : str, default 'smape'
        Metric column to average per model.

    Returns
    -------
    pandas.DataFrame
        One row per model (index `Model`), sorted ascending by mean `meas`
        (models whose mean is NaN sort last). The best model is also printed.
    """
    mean_by_model = dset.groupby(['Model'])[meas].mean().to_frame()
    ranked = mean_by_model.sort_values(by = [meas])

    # idxmin ignores NaN means; the builtin min() over a Series returns NaN when
    # the first value is NaN, which previously led to an IndexError.
    valid = mean_by_model[meas].dropna()
    if valid.empty:
        print(f"No valid {meas} values to rank")
    else:
        print(f"The best model is the {valid.idxmin()} model")

    return ranked

def res_data(dset,meas,name,res_dset):
    """Append one family's model ranking (for one metric) as new columns of `res_dset`.

    Parameters
    ----------
    dset : pandas.DataFrame
        Per-model metric frame passed to `perf_vals`.
    meas : str
        Metric column name, e.g. 'smape' or 'rmse'.
    name : str
        Family name used to label the two added columns.
    res_dset : pandas.DataFrame
        Results table accumulated so far.

    Returns
    -------
    pandas.DataFrame
        `res_dset` with a ``'<name> <Meas>'`` and a ``'<name> Model'`` column
        added. If the ranking cannot be built, a message is printed and
        `res_dset` is returned unchanged, so earlier results are not lost.
    """
    try:
        ranked = perf_vals(dset,meas)
        ranked[str(name + ' Model')] = ranked.index
        ranked = ranked.rename(columns = {meas : str(name) + ' ' + str(meas.capitalize())})
        ranked = ranked.reset_index(drop=True)
        combined = pd.concat([res_dset,ranked], axis = 1)
        return combined.reset_index(drop=True)
    except Exception as exc:
        # Best effort: keep going for the other families, but say why this one
        # failed and hand back the table accumulated so far instead of None.
        print('No ' + name + ' data')
        print(f'  ({type(exc).__name__}: {exc})')
        return res_dset

In [0]:
def train_dset(dset,name,tt):
    """Build the Prophet-format frame (`ds`, `y`, regressors) for one product family.

    The rows of `tt` whose `family` equals `name` are left-joined on `date` to the
    extra columns of `dset`; `sales`/`date` become `y`/`ds`, rows are sorted by
    `ds`, and the holiday and calendar flag columns are cast to int.
    """
    unused_cols = ['month','year','day_of_week','day_of_year','sales','family']
    features = dset.drop(unused_cols, axis = 1)
    features.date = features.date.astype('datetime64[ns]')

    family_rows = tt[tt.family == name]
    merged = family_rows.merge(features, on = 'date', how = 'left')
    merged = merged.rename(columns = {'sales':'y', 'date':'ds'}).sort_values('ds')

    merged['y'] = merged['y'].astype(int)
    merged['ds'] = pd.to_datetime(merged['ds'], format='%Y-%m-%d')
    merged = merged.reset_index(drop=True)

    flag_cols = ['National','Regional','Local',
                 'Is_month_end','Is_year_start','Is_month_start',
                 'Is_quarter_start','Is_year_end','Is_quarter_end']
    for flag in flag_cols:
        merged[flag] = merged[flag].astype(int)
    return merged

In [0]:
#######################
### Model framework ###
#######################

def cap_floor(train_dset, test_dset, name, initial, period, horizon, growth):
    """Fit seven Prophet model variants for one family and compare their CV metrics.

    Parameters
    ----------
    train_dset, test_dset : pandas.DataFrame
        Prophet-format frames (`ds`, `y` and the regressor columns). Both are
        modified in place: `cap` and `floor` columns are added.
    name : str
        Product-family name used in messages and plot titles.
    initial, period, horizon : str
        Cross-validation windows forwarded to `mod_diag`.
    growth : str
        Prophet growth mode passed to every model.

    Returns
    -------
    pandas.DataFrame
        The stacked `performance_metrics` frames of all models, with a `Model`
        label column and an integer `days` column derived from `horizon`.

    Notes
    -----
    Reads the notebook-level `holidays` frame and calls `mod_diag`, which draws
    the diagnostic plots for each model.
    """
    # One scalar cap for both frames. Assigning `train_dset['cap']` to the test
    # frame would align on the index and leave NaN wherever the indexes differ.
    cap = max(int(max(train_dset.y) * 1.1),int(max(test_dset.y) * 1.1))
    train_dset['cap'] = cap
    train_dset['floor'] = 0
    test_dset['cap'] = cap
    test_dset['floor'] = 0

    # Regressor columns; the first three are located by name pattern.
    oil = train_dset.filter(regex='dcoilwtico').columns[0]
    hol_flag = train_dset.filter(regex='is_holiday').columns[0]
    promo = train_dset.filter(regex='onpromotion').columns[0]
    locale_flags = ['National', 'Regional', 'Local']
    calendar = ['Is_month_end', 'Year', 'Month', 'Week', 'Dayofweek', 'Dayofyear',
                'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
                'Is_year_end', 'Is_year_start']

    ### Baseline model ###

    m = Prophet(growth = growth)
    m.fit(train_dset)

    ### Strict model ###

    m_strict = Prophet(changepoint_prior_scale=0.01,
                       weekly_seasonality=True,
                       daily_seasonality=False,
                       yearly_seasonality=True,
                       seasonality_mode = 'multiplicative',
                       growth = growth)
    m_strict.fit(train_dset)

    ### Flexible model ###

    m_flex = Prophet(changepoint_prior_scale=0.5, growth = growth)
    m_flex.fit(train_dset)

    ### Holiday model ###

    m_hol = Prophet(holidays=holidays, growth = growth)
    for regressor in [oil] + calendar + [hol_flag, promo]:
        m_hol.add_regressor(regressor)
    m_hol.fit(train_dset)

    ### Regressors ###

    m_reg = Prophet(changepoint_prior_scale=0.01, growth = growth)
    for regressor in [oil, hol_flag, promo] + locale_flags + calendar:
        m_reg.add_regressor(regressor)
    m_reg.fit(train_dset)

    ### Holiday-Regressors-Seasonality ###

    m_hrs = Prophet(weekly_seasonality=True,
                    daily_seasonality=False,
                    yearly_seasonality=True,
                    seasonality_mode = 'multiplicative',
                    changepoint_prior_scale = 0.1,
                    seasonality_prior_scale = 10.0,
                    holidays=holidays,
                    growth = growth)
    for regressor in [oil, promo, hol_flag] + locale_flags + calendar:
        m_hrs.add_regressor(regressor)
    m_hrs.fit(train_dset)

    ### Constructed model: built-in seasonalities off, custom ones added ###

    m_comp = Prophet(growth = growth,
                    seasonality_mode = 'additive',
                    changepoint_prior_scale = 0.1,
                    seasonality_prior_scale = 10.0,
                    holidays_prior_scale = 20.0,
                    weekly_seasonality = False,
                    daily_seasonality = False,
                    yearly_seasonality = False,
                    holidays=holidays,
                    ).add_seasonality(
                        name = 'monthly',
                        period = 30.5,
                        fourier_order = 55
                    ).add_seasonality(
                        name = 'weekly',
                        period = 7,
                        fourier_order = 10,
                        prior_scale = 40
                    ).add_seasonality(
                        name = 'yearly',
                        period = 365.25,
                        fourier_order = 20
                    ).add_seasonality(
                        name = 'quarterly',
                        period = 365.25 / 4,
                        fourier_order = 5,
                        prior_scale = 15
                    ).add_seasonality(
                        name = 'bi-monthly',
                        period = 365.25 / 6,
                        fourier_order = 5,
                        prior_scale = 15
                    )
    for regressor in [oil, promo, hol_flag] + locale_flags + calendar:
        m_comp.add_regressor(regressor)
    m_comp.fit(train_dset)

    y_train = train_dset.y
    # Copy before re-indexing so the caller's frame is never touched.
    y_test = test_dset.y.copy()
    y_test.index = y_test.index + max(y_train.index)
    print(y_train.shape[0], y_test.shape[0])

    # (fitted model, title label, short label stored in the `Model` column)
    diagnostics = [
        (m, 'Baseline model', 'Baseline'),
        (m_strict, 'Strict model', 'Strict'),
        (m_flex, 'Flexible model', 'Flexible'),
        (m_reg, 'Regressor model', 'Regressor'),
        (m_hol, 'Holiday model', 'Holiday'),
        (m_hrs, 'Holiday-Regressor-Seasonal model', 'HRS'),
        (m_comp, 'Constructed model', 'Constructed'),
    ]

    df_all = pd.DataFrame()
    for model, mod_name, label in diagnostics:
        _, df_p, _, _ = mod_diag(model, mod_name, test_dset, y_train, y_test, name, initial, period, horizon)
        df_p['Model'] = label
        df_all = pd.concat([df_all,df_p], axis = 0).reset_index(drop=True)
        print(df_all.shape)

    # Whole days of the horizon; `.astype('timedelta64[D]')` is rejected by pandas 2.x.
    df_all['days'] = df_all['horizon'].dt.days.astype(int)

    plt.figure(figsize=(19, 15))
    ax = sns.lineplot(x="days", y="smape", hue="Model", markers=True, data=df_all)
    ax.set(xlabel='Date', ylabel='SMAPE')
    plt.show();

    return df_all

In [0]:
def all_mod(train_dset, test_dset, params, df_comp, initial, period, horizon, name):
    """Fit the tuned ("Hyperparameter") Prophet model and add it to a comparison table.

    Parameters
    ----------
    train_dset, test_dset : pandas.DataFrame
        Prophet-format frames; both get `cap` and `floor` columns added in place.
    params : dict
        Tuned parameters; the keys `changepoint_prior_scale`,
        `seasonality_prior_scale`, `seasonality_mode` and `growth` are read.
    df_comp : pandas.DataFrame
        Metrics of the previously compared models (with `Model` and `horizon`).
    initial, period, horizon : str
        Cross-validation windows forwarded to `cv_exam`.
    name : str
        Product-family name used in messages and plot titles.

    Returns
    -------
    pandas.DataFrame
        `df_comp` with the tuned model's metrics appended and an integer `days`
        column derived from `horizon`.

    Notes
    -----
    Reads the notebook-level `holidays` frame and calls `cv_exam`.
    """
    # One scalar cap for both frames (avoids index-aligned assignment).
    cap = max(int(max(train_dset.y) * 1.1),int(max(test_dset.y) * 1.1))
    train_dset['cap'] = cap
    train_dset['floor'] = 0
    test_dset['cap'] = cap
    test_dset['floor'] = 0

    y_train = train_dset.y
    # Copy before re-indexing so the caller's frame is never touched.
    y_test = test_dset.y.copy()
    y_test.index = y_test.index + max(y_train.index)
    print(y_train.shape[0], y_test.shape[0])

    ### Fit the model using the best parameters ###

    # Fixed candidate changepoints: 10 evenly spaced dates in each of two windows.
    # NOTE(review): any `changepoints` entry in `params` is not used here.
    m3_changepoints = (
        pd.date_range('2014-08-01', '2015-06-01', periods=10).date.tolist() +
        pd.date_range('2015-08-01', '2016-01-01', periods=10).date.tolist()
    )

    auto_model = Prophet(changepoint_prior_scale = params['changepoint_prior_scale'],
                         seasonality_prior_scale = params['seasonality_prior_scale'],
                         seasonality_mode = params['seasonality_mode'],
                         growth = params['growth'],
                         changepoints = m3_changepoints,
                         holidays = holidays)
    regressors = [
        train_dset.filter(regex='dcoilwtico').columns[0],
        train_dset.filter(regex='is_holiday').columns[0],
        train_dset.filter(regex='onpromotion').columns[0],
        'National', 'Regional', 'Local',
        'Is_month_end', 'Year', 'Month', 'Week', 'Dayofweek', 'Dayofyear',
        'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
        'Is_year_end', 'Is_year_start',
    ]
    for regressor in regressors:
        auto_model.add_regressor(regressor)

    ### Fit the model on the training dataset ###

    auto_model.fit(train_dset)

    ### Cross validation ###
    auto_model_cv, auto_model_p, forecast_auto, prop_test_auto = cv_exam(auto_model, test_dset, y_test, initial, period, horizon)

    fig = auto_model.plot(forecast_auto)
    add_changepoints_to_plot(fig.gca(), auto_model, forecast_auto)

    mape_ph_auto = mean_absolute_percentage_error(prop_test_auto['y'], prop_test_auto['yhat'], symmetric=True)
    # RMSE as sqrt(MSE): `squared=False` is not accepted by recent scikit-learn.
    rmse_auto = np.sqrt(mean_squared_error(prop_test_auto['y'], prop_test_auto['yhat']))

    print(f"The Smape loss value for {name} is {mape_ph_auto:.6f}")
    print(f"The RMSE value is {rmse_auto}")
    plot_series(prop_test_auto['yhat'], y_test, labels=["y_pred", "y_test"], title = f'{name} Hyperparameter Train-test plot');

    plt.figure(figsize=(19, 15))
    ax = sns.lineplot(x="ds", y="residual", markers=True, data=auto_model_cv)
    ax.set(xlabel='Dates', ylabel='Residuals')
    plt.show()

    ### Model performance metrics ###
    auto_model_pm = performance_metrics(auto_model_cv, rolling_window=1)
    print(auto_model_pm)

    auto_model_p['Model'] = 'Hyperparameter'

    df_comp = pd.concat([df_comp,auto_model_p], axis = 0).reset_index(drop=True)
    # Whole days of the horizon; `.astype('timedelta64[D]')` is rejected by pandas 2.x.
    df_comp['days'] = df_comp['horizon'].dt.days.astype(int)

    plt.figure(figsize=(19, 15))
    ax = sns.lineplot(x="days", y="smape", hue="Model", markers=True, data=df_comp)
    ax.set(xlabel='Date', ylabel='SMAPE')
    plt.show();
    return df_comp

In [0]:
def hyper(train_dset, test_dset, initial, period, horizon):
    """Grid-search Prophet parameters by cross-validated SMAPE.

    Parameters
    ----------
    train_dset, test_dset : pandas.DataFrame
        Prophet-format frames; both get `cap` and `floor` columns added in place.
    initial, period, horizon : str
        Cross-validation windows forwarded to prophet's `cross_validation`.

    Returns
    -------
    dict
        The entry of the notebook-level `all_params` list with the lowest SMAPE
        (also printed).

    Notes
    -----
    Iterates over the notebook-level `all_params`; one model is fitted and
    cross-validated (`parallel="processes"`) per parameter combination.
    """
    # One scalar cap for both frames (avoids index-aligned assignment).
    cap = max(int(max(train_dset.y) * 1.1),int(max(test_dset.y) * 1.1))
    train_dset['cap'] = cap
    train_dset['floor'] = 0
    test_dset['cap'] = cap
    test_dset['floor'] = 0

    regressors = [
        train_dset.filter(regex='dcoilwtico').columns[0],
        train_dset.filter(regex='is_holiday').columns[0],
        train_dset.filter(regex='onpromotion').columns[0],
        'National', 'Regional', 'Local',
        'Is_month_end', 'Year', 'Month', 'Week', 'Dayofweek', 'Dayofyear',
        'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
        'Is_year_end', 'Is_year_start',
    ]

    ### SMAPE of each parameter combination, in `all_params` order ###
    mapes = []

    ### Use cross validation to evaluate all parameters ###

    for params in all_params:
        ### Fit a model using one parameter combination ###
        m = Prophet(**params)
        for regressor in regressors:
            m.add_regressor(regressor)
        m.fit(train_dset)

        ### Cross-validation ###
        df_cv = cross_validation(m, initial = initial, period = period, horizon = horizon, parallel="processes")

        ### Model performance: rolling_window=1 gives a single aggregated row ###
        df_p = performance_metrics(df_cv, rolling_window=1)

        ### Save model performance metrics ###
        mapes.append(df_p['smape'].values[0])

    # Find the best parameters; nanargmin so that a combination whose score came
    # out as NaN is never reported as the best one.
    best_params = all_params[np.nanargmin(mapes)]
    print(best_params)
    return best_params

Read in data

In [0]:
# Give Spark the account key for the `ifsandboxstorage` storage account. The key is
# read from the Databricks secret scope at run time rather than being hard-coded.
spark.conf.set("fs.azure.account.key.ifsandboxstorage.dfs.core.windows.net", dbutils.secrets.get(scope="if-databricks-scope", key="if-storage-key"))

In [0]:
# Read the automotive CSV with Spark (header row, inferred schema) and convert to pandas.
path = "abfss://raw@ifsandboxstorage.dfs.core.windows.net/" + "/experimental dataset/store-sales/automotive.csv"
df = (
    spark.read
    .options(header="true", inferSchema="True", delimiter=',')
    .csv(path)
)
automotive = df.toPandas()

In [0]:
# Read the babycare CSV with Spark (header row, inferred schema) and convert to pandas.
path = "abfss://raw@ifsandboxstorage.dfs.core.windows.net/" + "/experimental dataset/store-sales/babycare.csv"
df = (
    spark.read
    .options(header="true", inferSchema="True", delimiter=',')
    .csv(path)
)
babycare = df.toPandas()

In [0]:
# Quick sanity check of the babycare frame: (rows, columns), then the first rows.
print(babycare.shape)
babycare.head()

In [0]:
# Read the beauty CSV with Spark (header row, inferred schema) and convert to pandas.
path = "abfss://raw@ifsandboxstorage.dfs.core.windows.net/" + "/experimental dataset/store-sales/beauty.csv"
df = (
    spark.read
    .options(header="true", inferSchema="True", delimiter=',')
    .csv(path)
)
beauty = df.toPandas()

In [0]:
# Read the beverages CSV with Spark (header row, inferred schema) and convert to pandas.
path = "abfss://raw@ifsandboxstorage.dfs.core.windows.net/" + "/experimental dataset/store-sales/beverages.csv"
df = (
    spark.read
    .options(header="true", inferSchema="True", delimiter=',')
    .csv(path)
)
beverages = df.toPandas()

In [0]:
# Read the books CSV with Spark (header row, inferred schema) and convert to pandas.
path = "abfss://raw@ifsandboxstorage.dfs.core.windows.net/" + "/experimental dataset/store-sales/books.csv"
df = (
    spark.read
    .options(header="true", inferSchema="True", delimiter=',')
    .csv(path)
)
books = df.toPandas()

In [0]:
### Keep only Books rows dated after 2016-10-07 ###
# (presumably the earlier rows are the leading run of zero sales - verify against the data)

books['Date'] = books['Date'].astype('datetime64[ns]')
books = books.loc[books['Date'] > '2016-10-07']

In [0]:
# Tag every family frame with its family name and normalise the date column name.
for _frame, _label in (
    (automotive, 'Automotive'),
    (babycare, 'Babycare'),
    (beauty, 'Beauty'),
    (beverages, 'Beverages'),
    (books, 'Books'),
):
    _frame['family'] = _label
    _frame.rename(columns = {'Date':'date'}, inplace = True)

# Stack the three shared columns of all families, in the order above.
train = pd.concat(
    [_f[['date','sales','family']] for _f in (automotive, babycare, beauty, beverages, books)],
    axis = 0,
)



In [0]:
# Load the per-product test data, stored as a pickle on the DBFS mount.
file = '/dbfs/mnt/experimental dataset/store-sales/store_sales_test_by_product.pickle'

with open(file, 'rb') as f:
    file_bytes = f.read()
    
# NOTE(review): unpickling executes arbitrary code from the file - only load
# pickles from a trusted location.
data = pickle.loads(file_bytes)

test_df = pd.DataFrame()

# `data` is indexed by upper-case family name; stack the five families used here
# and print the running shape after each one.
for name in ['AUTOMOTIVE','BABY CARE','BEAUTY','BEVERAGES','BOOKS']:
    dset = data[name]
    test_df = pd.concat([test_df,dset], axis = 0)    
    print(test_df.shape)
    
# Align the date column name with the training frame.
test_df.rename(columns = {'Date':'date'}, inplace = True)
test_df.head()

In [0]:
# Drop rows without sales, then hold out everything from the first test date onwards.
train = train[train.sales.notna()]
print(min(train.date),max(train.date))
# 2013-01-01 00:00:00 2017-07-06 00:00:00
test_df = test_df[test_df.date.notna()]
print(min(test_df.date),max(test_df.date))
# 2017-06-21 00:00:00 2017-08-15 00:00:00

# First date present in the external test data; it is the train/test boundary.
split_date = min(test_df.date)
test = train[train.date >= split_date]
print(test.shape)
train = train[train.date < split_date]
print(train.shape)


In [0]:
# Read the holidays/events CSV with Spark (header row, inferred schema) and convert to pandas.
file_location = "experimental dataset/store-sales/holidays_events.csv"
csvFile = "abfss://raw@ifsandboxstorage.dfs.core.windows.net/" + file_location
df_raw = (
    spark.read
    .options(header="true", inferSchema='True', delimiter=',')
    .csv(csvFile)
)
holidays_events = df_raw.toPandas()
holidays_events.head()

In [0]:
print(holidays_events.shape)
# Strip any time-of-day component, then store the dates as datetime64 again.
holidays_events.date = pd.to_datetime(holidays_events.date).dt.date
holidays_events.date = holidays_events.date.astype('datetime64[ns]')
holidays_events_nd = holidays_events[['date']].drop_duplicates()
print(holidays_events_nd.shape)

# Cardinality of every column; low-cardinality columns also list their values.
for name in holidays_events.columns:
    n_unique = holidays_events[name].nunique()
    print(f" There are {n_unique} unique {name} ids")
    if n_unique < 25:
        print(list(set(holidays_events[name])))

# Row count per event type.
for name in set(holidays_events['type']):
    print(name, len(holidays_events[holidays_events.type == str(name)]))

holidays_events.head()

In [0]:
# Distinct dates of type 'Holiday' (taken before the filter below), also as dd-mm-YYYY strings.
hol_list = list(set(holidays_events.loc[holidays_events.type == 'Holiday', 'date']))
holi_list = [d.strftime('%d-%m-%Y') for d in hol_list]

print(holidays_events.shape)
# Drop 'Work Day' rows and holidays that were transferred to another date.
keep_rows = (holidays_events.type != 'Work Day')  & (holidays_events.transferred != True)
holidays_events = holidays_events[keep_rows]
holidays_events['var'] = 1
print(holidays_events.shape)

# One (date, flag) frame per locale. `holiday` must still be the helper function
# here; a later cell in this notebook rebinds that name to a DataFrame.
national, regional, local = (holiday(_locale) for _locale in ('National', 'Regional', 'Local'))
print(national.shape,regional.shape,local.shape)


In [0]:
####################
### Remerge data ###
####################

# Normalise dates to midnight datetime64, add calendar features, then join the
# three locale holiday flags onto the training rows.
train.date = pd.to_datetime(train.date).dt.date
train.date = train.date.astype('datetime64[ns]')
print(train.shape)

add_datepart(train, 'date', drop = False)
train['month_date'] = pd.to_datetime(train[['Year', 'Month']].assign(DAY=1))

for _flags in (national, regional, local):
    train = pd.merge(train, _flags, on = 'date', how = 'left')
    print(train.shape)

# Dates without a holiday get flag 0.
for name in ['National','Regional','Local']:
    train[name] = train[name].fillna(0)
print(train.shape)

In [0]:
####################
### Remerge data ###
####################

# Same preparation as for the training rows, applied to the hold-out rows.
test.date = pd.to_datetime(test.date).dt.date
test.date = test.date.astype('datetime64[ns]')
print(test.shape)

add_datepart(test, 'date', drop = False)
test['month_date'] = pd.to_datetime(test[['Year', 'Month']].assign(DAY=1))

for _flags in (national, regional, local):
    test = pd.merge(test, _flags, on = 'date', how = 'left')
    print(test.shape)

# Dates without a holiday get flag 0.
for name in ['National','Regional','Local']:
    test[name] = test[name].fillna(0)
print(test.shape)

Exploratory plots

In [0]:
#######################
### Sales over time ###
#######################

plt.rc('font', size=10)
grid = gridspec.GridSpec(3, 2)
plt.figure(figsize=(19, 15))
plt.subplots_adjust(wspace=0.4, hspace=0.3)

# One panel per product family.
for idx, name in enumerate(set(train.family)):
    ax = plt.subplot(grid[idx])
    dset = train[train.family == name]

    sns.lineplot(x="date", y="sales", markers=True, data=dset, ax=ax)
    ax.set_title(f'{name} Sales plot')
    ax.set(xlabel='Dates', ylabel='Sales')

In [0]:
#########################
### Percentage change ###
#########################

plt.rc('font', size=10)
grid = gridspec.GridSpec(3, 2)
plt.figure(figsize=(19, 15))
plt.subplots_adjust(wspace=0.4, hspace=0.3)

# One panel per product family, one line per year.
for idx, name in enumerate(set(train.family)):
    ax = plt.subplot(grid[idx])
    # `Change` is each day's sales divided by the previous row's sales.
    dset = train[train.family == name].assign(
        Change=lambda d: d.sales.div(d.sales.shift())
    )

    sns.lineplot(x="Dayofyear", y="Change", hue='Year', markers=True, data=dset, ax=ax)
    ax.set_title(f'{name} Percentage change plot')
    ax.set(xlabel='Dates', ylabel='Sales count percentage change')

In [0]:
# Seasonal component of a multiplicative decomposition, one panel per family.
plt.rc('font', size=10)
grid = gridspec.GridSpec(3, 2)
plt.figure(figsize=(19, 15))
plt.subplots_adjust(wspace=0.4, hspace=0.3)

for idx, name in enumerate(set(train.family)):
    ax = plt.subplot(grid[idx])
    # Small offset added to sales before the multiplicative decomposition.
    dset = train[train.family == name].assign(sales=lambda d: d['sales'] + 0.01)
    print(name)
    # Books uses a 31-row period; every other family uses 365.
    period = 31 if name == 'Books' else 365
    result = seasonal_decompose(dset['sales'], model='multiplicative', period=period)
    ax.set_title(f'{name} Seasonal plot')
    result.seasonal.plot(ax=ax)



In [0]:
# Trend component of a multiplicative decomposition, one panel per family.
plt.rc('font', size=10)
grid = gridspec.GridSpec(3, 2)
plt.figure(figsize=(19, 15))
plt.subplots_adjust(wspace=0.4, hspace=0.3)

for idx, name in enumerate(set(train.family)):
    ax = plt.subplot(grid[idx])
    # Small offset added to sales before the multiplicative decomposition.
    dset = train[train.family == name].assign(sales=lambda d: d['sales'] + 0.01)

    # Books uses a 31-row period; every other family uses 365.
    period = 31 if name == 'Books' else 365
    result = seasonal_decompose(dset['sales'], model='multiplicative', period=period)
    ax.set_title(f'{name} Trend plot')
    result.trend.plot(ax=ax)
    


In [0]:
# Full multiplicative decomposition plot for each family.
plt.rc('font', size=10)
grid = gridspec.GridSpec(3, 2)
plt.figure(figsize=(19, 15))
plt.subplots_adjust(wspace=0.4, hspace=0.3)

for idx, name in enumerate(set(train.family)):
    ax = plt.subplot(grid[idx])
    # Small offset added to sales before the multiplicative decomposition.
    dset = train[train.family == name].assign(sales=lambda d: d['sales'] + 0.01)
    # Books uses a 31-row period; every other family uses 365.
    period = 31 if name == 'Books' else 365
    result = seasonal_decompose(dset['sales'], model='multiplicative', period=period)
    # NOTE(review): result.plot() appears to draw into its own figure, so the grid
    # panel `ax` may only receive the title - confirm this is the intended layout.
    result.plot();
    ax.set_title(f'{name} Decomposition plot');
    

In [0]:
######################################
### Align data with Prophet format ###
######################################

# Training frames are built from `train` (the pre-cutoff rows). The name
# `train_df` used here before is not defined anywhere in this notebook.
babycare_train = train_dset(babycare,'Babycare',train)
beauty_train = train_dset(beauty,'Beauty',train)
automotive_train = train_dset(automotive,'Automotive',train)
beverages_train = train_dset(beverages,'Beverages',train)
books_train = train_dset(books,'Books',train)

# Hold-out frames are built the same way from `test`.
babycare_test = train_dset(babycare,'Babycare',test)
beauty_test = train_dset(beauty,'Beauty',test)
automotive_test = train_dset(automotive,'Automotive',test)
beverages_test = train_dset(beverages,'Beverages',test)
books_test = train_dset(books,'Books',test)



In [0]:
# Train/test overview plots for every family.
for _train, _test, _label in (
    (babycare_train, babycare_test, 'Babycare'),
    (beauty_train, beauty_test, 'Beauty'),
    (automotive_train, automotive_test, 'Automotive'),
    (beverages_train, beverages_test, 'Beverages'),
    (books_train, books_test, 'Books'),
):
    train_test_plot(_train, _test, _label)

In [0]:
# Build the `holidays` frame handed to Prophet: public holidays, the 15th of each
# month ("pay day") and the 1st of each month, each with a +/-1 day window.
hdate = holidays_events.date.dt.date
a = list(set(train.month_date.dt.date))
a.sort()

dates = [x.replace(day = 15) for x in a]

# Named `holiday_days` so the `holiday()` helper function is not overwritten.
# `holi_list` holds dd-mm-YYYY strings, so the format is given explicitly;
# otherwise day and month can be swapped for days <= 12.
holiday_days = pd.DataFrame({
  'holiday': 'holiday',
  'ds': pd.to_datetime(list(set(holi_list)), format='%d-%m-%Y'),
  'lower_window': -1,
  'upper_window': 1,
})
pay_day = pd.DataFrame({
  'holiday': 'pay_day',
  'ds': pd.to_datetime(dates),
  'lower_window': -1,
  'upper_window': 1,
})
month_start = pd.DataFrame({
  'holiday': 'month_start',
  'ds': pd.to_datetime(a),
  'lower_window': -1,
  'upper_window': 1,
})
holidays = pd.concat((holiday_days, pay_day, month_start))
# The per-source label is dropped so that a date present in several sources
# collapses to a single row.
holidays = holidays.drop(['holiday'],axis = 1)
print(holidays.shape)
holidays = holidays.drop_duplicates()
print(holidays.shape)

# New Year gets a wider +/-4 day window.
for jan_date in ['2016-01-01','2017-01-01']:
    holidays.loc[holidays.ds == pd.to_datetime(jan_date),'lower_window'] = -4
    holidays.loc[holidays.ds == pd.to_datetime(jan_date),'upper_window'] = 4
# Every remaining row is treated as the same Prophet holiday.
holidays['holiday'] = 'Holiday'

In [0]:
import logging

# Silence the 'cmdstanpy' logger: only CRITICAL records pass, nothing is handed to
# parent loggers, and a no-op handler is attached.
logger = logging.getLogger('cmdstanpy')
logger.setLevel(logging.CRITICAL)
logger.propagate = False
logger.addHandler(logging.NullHandler())

In [0]:
#########################
### Models by dataset ###
#########################

# Cross-validation windows shared by the four families with a long history.
_long_history = dict(initial = '730 days', period = '30 days', horizon = '60 days', growth = 'logistic')

babycare_res = cap_floor(babycare_train, babycare_test, 'Babycare', **_long_history)
beauty_res = cap_floor(beauty_train, beauty_test, 'Beauty', **_long_history)
automotive_res = cap_floor(automotive_train, automotive_test, 'Automotive', **_long_history)
beverages_res = cap_floor(beverages_train, beverages_test, 'Beverages', **_long_history)
# Books was trimmed to a much shorter history, so it uses shorter windows.
books_res = cap_floor(books_train, books_test, 'Books',
                      initial = '200 days', period = '10 days', horizon = '20 days', growth = 'logistic')

 

In [0]:
###############
### Results ###
###############

res_dset = pd.DataFrame()

# Model rankings of every family, first by SMAPE and then by RMSE, side by side.
_family_results = (
    (babycare_res, 'Babycare'),
    (automotive_res, 'Automotive'),
    (beauty_res, 'Beauty'),
    (beverages_res, 'Beverages'),
    (books_res, 'Books'),
)
for _meas in ('smape', 'rmse'):
    for _res, _family in _family_results:
        res_dset = res_data(_res, _meas, _family, res_dset)

res_dset

Hyperparameter tuning on final model

In [0]:
# Holiday dates and label only (no window columns) for the grid search.
holidays_df = pd.DataFrame(holidays[['ds','holiday']])
holidays_df.head()

# Candidate changepoints: 10 evenly spaced dates in each of two windows.
m3_changepoints = (
    # 2014-08-01 .. 2015-06-01
    pd.date_range('2014-08-01', '2015-06-01', periods=10).date.tolist()
    # 2015-08-01 .. 2016-01-01
    + pd.date_range('2015-08-01', '2016-01-01', periods=10).date.tolist()
)

### Set up parameter grid ###

param_grid = {
    'changepoint_prior_scale': [0.001, 0.05, 0.08, 0.5],
    'seasonality_prior_scale': [0.01, 1, 5, 10, 12],
    'seasonality_mode': ['multiplicative','additive'],
    'growth': ['linear','logistic'],
    'changepoints' : [m3_changepoints],
    'weekly_seasonality': [True],
    'yearly_seasonality': [True],
    'daily_seasonality': [False],
    'holidays': [holidays_df]
}

### Generate all combinations of parameters ###

all_params = [
    dict(zip(param_grid.keys(), combo))
    for combo in itertools.product(*param_grid.values())
]

# Only the Babycare search is enabled; the other families are switched off.
#beauty_params = hyper(beauty_train, beauty_test, '730 days', '15 days', '30 days')
babycare_params = hyper(babycare_train,babycare_test, '730 days', '15 days', '30 days')
#books_params = hyper(books_train,books_test, '200 days', '5 days', '10 days')
#automotive_params = hyper(automotive_train, automotive_test, '730 days', '15 days', '30 days')
#beverages_params = hyper(beverages_train, beverages_test, '730 days', '15 days', '30 days')

In [0]:
# Report the tuned parameters of every family whose search has actually been run.
# Referencing a `*_params` name whose hyper() call is disabled would raise NameError,
# so each name is looked up first.
# Growth mode noted next to the earlier print statements: Beauty linear,
# Babycare linear, Books logistic, Automotive logistic, Beverages linear.
for _label, _var in (
    ('Beauty', 'beauty_params'),
    ('Babycare', 'babycare_params'),
    ('Books', 'books_params'),
    ('Automotive', 'automotive_params'),
    ('Beverages', 'beverages_params'),
):
    if _var in globals():
        print(f'{_label} parameters ', globals()[_var])
    else:
        print(f'{_label} parameters ', 'not computed (hyper() call is disabled)')


In [0]:
# Fit the tuned model and append its metrics to the family's comparison table.
# Only Babycare is enabled, matching the only family tuned by hyper() in this notebook.
#beauty_all = all_mod(beauty_train,beauty_test, beauty_params, beauty_res, '730 days', '15 days', '30 days', 'Beauty')    
babycare_all = all_mod(babycare_train, babycare_test, babycare_params, babycare_res, '730 days', '15 days', '30 days', 'Babycare')    
#books_all = all_mod(books_train, books_test, books_params, books_res, '200 days', '5 days', '10 days', 'Books')    
#automotive_all = all_mod(automotive_train, automotive_test, automotive_params, automotive_res, '730 days', '15 days', '30 days', 'Automotive')    
#beverages_all = all_mod(beverages_train, beverages_test, beverages_params, beverages_res, '730 days', '15 days', '30 days', 'Beverages')    


In [0]:
# Rank all models (including the tuned one) by mean SMAPE.
# Only `babycare_all` is produced in this notebook; the other `*_all` tables are
# not computed, so their calls stay disabled.
#perf_vals(beauty_all, meas = 'smape')
#perf_vals(books_all, meas = 'smape')
#perf_vals(automotive_all, meas = 'smape')
#perf_vals(beverages_all, meas = 'smape')
perf_vals(babycare_all, meas = 'smape')