In [37]:
# BASE
# ------------------------------------------------------
import numpy as np
import pandas as pd
import os
import gc
import warnings
from tqdm.auto import tqdm 

# PACF - ACF
# ------------------------------------------------------
import statsmodels.api as sm
from statsmodels.tsa.holtwinters import ExponentialSmoothing, SimpleExpSmoothing 
from sklearn.metrics import root_mean_squared_error, mean_absolute_percentage_error, root_mean_squared_log_error
from statsmodels.graphics.tsaplots import month_plot, quarter_plot, plot_acf, plot_pacf
# DATA VISUALIZATION
# ------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go 

from sklearn.model_selection import ParameterGrid 
# CONFIGURATIONS
# ------------------------------------------------------
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.2f}'.format
warnings.filterwarnings('ignore')
# Custom Code 
#---------------------------------------
from utils import plot_forecasts, plot_acf_plotly
from joblib import Parallel, delayed

In [38]:
df = pd.read_parquet('train.parquet')
# Forecast next 13 days 
periods = 13 
#df['sales'] = df['sales'] + 1 

In [39]:
x = df[["store_nbr", "family"]].drop_duplicates().sample(1)
store = x["store_nbr"].iloc[0]

family = x["family"].iloc[0]

temp = df[(df["store_nbr"] == store) & (df["family"] == family)]
# px.line(temp, x = 'date', y = 'sales', title=f"{family} - {store}")

In [40]:
# Forecast next 13 days 
periods = 13 

temp = temp.set_index('date').sort_index()

In [41]:
# plot_acf_plotly(temp['sales'], nlags=100)

In [42]:
train, test = temp[['sales']].iloc[:-periods],  temp[['sales']].iloc[-periods:]

In [43]:
model_holt = ExponentialSmoothing(endog=train['sales'], trend='add', seasonal='add', seasonal_periods=365).fit()
holt_pred = model_holt.forecast(periods)

In [44]:
rmsle_value = root_mean_squared_log_error(test["sales"].values, holt_pred.values)
rmse = root_mean_squared_error(test["sales"].values , holt_pred.values)


rmsle_value,rmse 

(0.3255328544474165, 267.67082931873574)

In [45]:
# plot_forecasts(temp, holt_pred, test)

In [46]:
def fit_model(param_dict: list, df: pd.DataFrame,store_nbr: str, family: str) -> list:

    train, test = df[["sales"]].iloc[:-periods], df[["sales"]].iloc[-periods:]

    combinations = []

    for param_dict in param_dict:
        try:
            seasonal, seasonal_periods, trend = (
                param_dict["seasonal"],
                param_dict["seasonal_periods"],
                param_dict["trend"],
            )

            if trend != "mul" and seasonal != "mul":
                model_holt = ExponentialSmoothing(
                    endog=train["sales"],
                    trend=trend,
                    seasonal=seasonal,
                    seasonal_periods=seasonal_periods,
                ).fit()
                holt_pred = model_holt.forecast(periods)

                holt_pred = model_holt.forecast(periods).values
                holt_pred = np.maximum(holt_pred, 0)
                actual = test["sales"].values

            else:
                model_holt = ExponentialSmoothing(
                    endog=train["sales"] + 1,
                    trend=trend,
                    seasonal=seasonal,
                    seasonal_periods=seasonal_periods,
                ).fit()
                holt_pred = model_holt.forecast(periods).values
                holt_pred = np.maximum(holt_pred, 0)
                actual = test["sales"].values + 1
        


        
            rmsle = root_mean_squared_log_error(actual, holt_pred)
            rmse = root_mean_squared_error(actual, holt_pred)
            mape = mean_absolute_percentage_error(actual, holt_pred)

            combinations.append(
                {
                    "seasonal": seasonal,
                    "seasonal_periods": seasonal_periods,
                    "trend": trend,
                    "rmse": rmse,
                    "mape": mape,
                    "rmsle": rmsle,
                    'store_nbr': store_nbr, 
                    'family': family 
                }
            )


        except ValueError:
            combinations.append(
                {
                    "seasonal": seasonal,
                    "seasonal_periods": seasonal_periods,
                    "trend": trend,
                    "rmse": None,
                    "mape": None,
                    "rmsle": None,
                    'store_nbr': store_nbr, 
                    'family': family 
                }
            )
            

    return combinations


params = {
    "trend": ["mul", "add"],
    "seasonal": ["mul", "add"],
    "seasonal_periods": [7, 30, 365],
}

params_grid = list(ParameterGrid(params))

In [49]:
def process_combination(store_nbr, family, param_grid): 
    store_combinations_df = df[(df["store_nbr"] == store_nbr) & (df["family"] == family)].dropna()
    store_combinations_df.set_index('date', inplace = True)
    
    
    combinations = fit_model(param_grid, store_combinations_df,store_nbr, family)

    return combinations 


In [None]:
unique_combinations = df[["store_nbr", "family"]].drop_duplicates()

combinations_list = Parallel(n_jobs=6, verbose=10, backend='threading', batch_size='auto')(
    delayed(process_combination)(row["store_nbr"], row["family"], params_grid)
    for _, row in tqdm(unique_combinations.iterrows(), total=len(unique_combinations))
)

  0%|          | 0/1782 [00:00<?, ?it/s]

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:    4.0s
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:    8.0s
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:   13.8s
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:   24.4s
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:   32.0s
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   40.4s
[Parallel(n_jobs=6)]: Done  49 tasks      | elapsed:   52.9s
[Parallel(n_jobs=6)]: Done  60 tasks      | elapsed:  1.1min
[Parallel(n_jobs=6)]: Done  73 tasks      | elapsed:  1.3min
[Parallel(n_jobs=6)]: Done  86 tasks      | elapsed:  1.5min
[Parallel(n_jobs=6)]: Done 101 tasks      | elapsed:  1.8min
[Parallel(n_jobs=6)]: Done 116 tasks      | elapsed:  2.0min
[Parallel(n_jobs=6)]: Done 133 tasks      | elapsed:  2.3min
[Parallel(n_jobs=6)]: Done 150 tasks      | elapsed:  2.6min
[Parallel(n_jobs=6)]: Done 169 tasks      | elapsed:  2.9min
[Para

In [51]:
combinations_list_ = [item for sublist in combinations_list for item in sublist]

In [52]:
holt = pd.DataFrame(combinations_list_)

In [35]:
holt.to_parquet('holt.parquet')

In [23]:
holt = pd.read_parquet('metrics/holt.parquet')

In [24]:
holt = holt.dropna()

In [25]:
# holt.groupby(['seasonal','seasonal_periods','trend']).agg(
#     mean_rmse = ('rmse','mean'), 
#     mean_rmsle = ('rmsle','mean'), 
#     mean_mape = ('mape','mean'), 
# ).reset_index()

In [26]:
import plotly.figure_factory as ff 

In [27]:
ff.create_distplot([holt['rmse']], group_labels=['kde plot'])

ValueError: array must not contain infs or NaNs

In [33]:
holt[np.isinf(holt['rmse'])]

Unnamed: 0,seasonal,seasonal_periods,trend,rmse,mape,rmsle
210,add,7,mul,inf,97010163982673782757392554496357892162994550479...,297.29
5118,add,7,mul,inf,93265497207557793648450733497743542996589060071...,423.74
8682,add,7,mul,inf,54928173686472596747990245676362329492093068576...,573.84
11564,add,30,mul,inf,14277416558117948540703470081245829768576221301...,257.89
17756,add,30,mul,inf,48819957362848878495849561722798108886508603117...,364.02
19374,add,7,mul,inf,37023735034058370600536786227246399079158064199...,640.44
21320,add,30,mul,inf,85290143807203091985090014773165373879337531631...,493.25
