In [63]:
# BASE
# ------------------------------------------------------
import numpy as np
import pandas as pd
import os
import gc
import warnings
from tqdm.auto import tqdm 

# PACF - ACF
# ------------------------------------------------------
import statsmodels.api as sm
from statsmodels.tsa.holtwinters import ExponentialSmoothing, SimpleExpSmoothing 
from sklearn.metrics import root_mean_squared_error, mean_absolute_percentage_error, root_mean_squared_log_error
from statsmodels.graphics.tsaplots import month_plot, quarter_plot, plot_acf, plot_pacf
# DATA VISUALIZATION
# ------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go 

from sklearn.model_selection import ParameterGrid 
# CONFIGURATIONS
# ------------------------------------------------------
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.2f}'.format
warnings.filterwarnings('ignore')
# Custom Code 
#---------------------------------------
from utils import plot_forecasts, plot_acf_plotly
from joblib import Parallel, delayed

In [66]:
df = pd.read_parquet('train.parquet')
# Forecast next 13 days 
periods = 16
#df['sales'] = df['sales'] + 1 

In [88]:
x = df[["store_nbr", "family"]].drop_duplicates().sample(1)
store_nbr = x["store_nbr"].iloc[0]

family = x["family"].iloc[0]

temp = df[(df["store_nbr"] == store_nbr) & (df["family"] == family)]
px.line(temp, x = 'date', y = 'sales', title=f"{family} - {store_nbr}")

In [89]:
# Forecast next 13 days 
periods = 16

temp = temp.set_index('date').sort_index()

In [90]:
# plot_acf_plotly(temp['sales'], nlags=100)

In [91]:
train, test = temp[['sales']].iloc[:-periods],  temp[['sales']].iloc[-periods:]

In [95]:
model_holt = ExponentialSmoothing(endog=train['sales'], trend='add', seasonal='add', seasonal_periods=7).fit()
holt_pred = model_holt.forecast(periods)

In [96]:
rmsle_value = root_mean_squared_log_error(test["sales"].values, holt_pred.values)
rmse = root_mean_squared_error(test["sales"].values , holt_pred.values)


rmsle_value,rmse 

(0.5113357348954999, 9.148987063140561)

In [97]:
plot_forecasts(temp, holt_pred, test)

In [99]:
def fit_model(param_dict: list, df: pd.DataFrame,store_nbr: str, family: str) -> list:

    train, test = df[["sales"]].iloc[:-periods], df[["sales"]].iloc[-periods:]

    combinations = []

    for param_dict in param_dict:
        try:
            seasonal, seasonal_periods, trend = (
                param_dict["seasonal"],
                param_dict["seasonal_periods"],
                param_dict["trend"],
            )

            if trend != "mul" and seasonal != "mul":
                model_holt = ExponentialSmoothing(
                    endog=train["sales"],
                    trend=trend,
                    seasonal=seasonal,
                    seasonal_periods=seasonal_periods,
                ).fit()

                holt_pred = model_holt.forecast(periods).values
                holt_pred = np.maximum(holt_pred, 0)
                actual = test["sales"].values

            else:
                model_holt = ExponentialSmoothing(
                    endog=train["sales"] + 1,
                    trend=trend,
                    seasonal=seasonal,
                    seasonal_periods=seasonal_periods,
                ).fit()
                holt_pred = model_holt.forecast(periods).values
                holt_pred = np.maximum(holt_pred, 0)
                actual = test["sales"].values + 1
        


        
            rmsle = root_mean_squared_log_error(actual, holt_pred)
            rmse = root_mean_squared_error(actual, holt_pred)
            mape = mean_absolute_percentage_error(actual, holt_pred)

            combinations.append(
                {
                    "seasonal": seasonal,
                    "seasonal_periods": seasonal_periods,
                    "trend": trend,
                    "rmse": rmse,
                    "mape": mape,
                    "rmsle": rmsle,
                    'store_nbr': store_nbr, 
                    'family': family 
                }
            )


        except ValueError:
            combinations.append(
                {
                    "seasonal": seasonal,
                    "seasonal_periods": seasonal_periods,
                    "trend": trend,
                    "rmse": None,
                    "mape": None,
                    "rmsle": None,
                    'store_nbr': store_nbr, 
                    'family': family 
                }
            )
            

    return combinations


params = {
    "trend": ["mul", "add"],
    "seasonal": ["mul", "add"],
    "seasonal_periods": [7, 30],
}

params_grid = list(ParameterGrid(params))

In [100]:
def process_combination(store_nbr, family, param_grid): 
    store_combinations_df = df[(df["store_nbr"] == store_nbr) & (df["family"] == family)].dropna()
    store_combinations_df.set_index('date', inplace = True)
    
    
    combinations = fit_model(param_grid, store_combinations_df,store_nbr, family)

    return combinations 


In [12]:
unique_combinations = df[["store_nbr", "family"]].drop_duplicates()

combinations_list = Parallel(n_jobs=-1, verbose=10, backend='loky', batch_size=32)(
    delayed(process_combination)(row["store_nbr"], row["family"], params_grid)
    for _, row in tqdm(unique_combinations.iterrows(), total=len(unique_combinations))
)

  0%|          | 0/1782 [00:00<?, ?it/s]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done 320 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 608 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 960 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 1126 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 1232 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 1307 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 1477 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 1616 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 1633 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 1650 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 1669 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 1688 tasks      | e

In [13]:
combinations_list_ = [item for sublist in combinations_list for item in sublist]

In [14]:
holt = pd.DataFrame(combinations_list_)

In [15]:
holt.to_parquet('metrics/holt.parquet')

In [15]:
holt = pd.read_parquet('metrics/holt.parquet')
holt = holt.replace(to_replace=[np.inf, -np.inf], value=np.nan)
holt = holt.dropna()

holt[['rmse','mape','rmsle']] = holt[['rmse','mape','rmsle']].round(2)

In [16]:
# holt.to_csv('metrics/holt.csv')

In [17]:
holt['rmse_rank'] = holt.groupby(['store_nbr','family'])['rmse'].rank(method='first', ascending=True)
holt['mape_rank'] = holt.groupby(['store_nbr','family'])['mape'].rank(method='first', ascending=True)
holt['rmsle_rank'] = holt.groupby(['store_nbr','family'])['rmsle'].rank(method='first', ascending=True)


In [18]:
holt_rmsle = holt[holt['rmsle_rank'] == 1]

In [19]:
holt_rmsle[['seasonal','seasonal_periods','trend']].value_counts().reset_index()

Unnamed: 0,seasonal,seasonal_periods,trend,count
0,mul,7,mul,455
1,mul,7,add,356
2,add,7,mul,238
3,mul,30,add,179
4,mul,30,mul,177
5,add,7,add,155
6,add,30,mul,140
7,add,30,add,49


In [20]:
holt_rmsle.groupby(['store_nbr']).agg(
    mean_rmsle = ('rmsle','mean'), 
    median_rmsle = ('rmsle','median')
).round(2).reset_index()

Unnamed: 0,store_nbr,mean_rmsle,median_rmsle
0,1,0.46,0.47
1,2,0.4,0.33
2,3,0.37,0.35
3,4,0.39,0.36
4,5,0.36,0.29
5,6,0.34,0.32
6,7,0.31,0.29
7,8,0.37,0.35
8,9,0.48,0.37
9,10,0.38,0.38


In [21]:
holt_rmsle.groupby(['family']).agg(
    mean_family = ('rmsle','mean'), 
    median_family = ('rmsle','median')
).round(2).reset_index()

Unnamed: 0,family,mean_family,median_family
0,AUTOMOTIVE,0.46,0.45
1,BABY CARE,0.14,0.11
2,BEAUTY,0.49,0.47
3,BEVERAGES,0.29,0.28
4,BOOKS,0.08,0.0
5,BREAD/BAKERY,0.26,0.25
6,CELEBRATION,0.54,0.51
7,CLEANING,0.33,0.3
8,DAIRY,0.3,0.3
9,DELI,0.28,0.28


In [22]:
round(holt_rmsle['rmsle'].mean(),2), holt_rmsle['rmsle'].median()

(0.41, 0.36)

In [23]:
round(holt_rmsle['rmse'].mean(),2), holt_rmsle['rmse'].median()

(155.98, 17.93)

In [52]:
test_df = pd.read_csv('test.csv')
test_df['date'] = pd.to_datetime(test_df['date'])

combinations = test_df[["store_nbr", "family"]].drop_duplicates()

In [101]:
def make_predictions(store_nbr, family, test_df, df, holt_rmsle):
    train_temp = df[
        (df["family"] == family) & (df["store_nbr"] == store_nbr)
    ].sort_index()

    test_temp = test_df[
        (test_df["family"] == family) & (test_df["store_nbr"] == store_nbr)
    ].sort_values(by='date').copy()

    train_temp = train_temp.set_index("date")["sales"]

    parameters = holt_rmsle[
        (holt_rmsle["family"] == family) & (holt_rmsle["store_nbr"] == store_nbr)
    ]

    parameters = parameters.to_dict(orient="records")

    if not train_temp.empty and parameters:
        parameters = parameters[0]
        trend = parameters["trend"]
        seasonal = parameters["seasonal"]
        seasonal_periods = parameters["seasonal_periods"]
    else:
        trend = "add"
        seasonal = "add"
        seasonal_periods = 7

    if trend != "mul" and seasonal != "mul":

        model = ExponentialSmoothing(
            endog=train_temp,
            trend=trend,
            seasonal=seasonal,
            seasonal_periods=seasonal_periods,
        ).fit()

        model_pred = model.forecast(periods).values 

    else:

        model = ExponentialSmoothing(
            endog=train_temp + 1,
            trend=trend,
            seasonal=seasonal,
            seasonal_periods=seasonal_periods,
        ).fit()

        model_pred = model.forecast(periods).values 
    test_temp["pred"] = model_pred
    return test_temp 

In [106]:
def make_predictions(store_nbr, family, test_df, df):
    train_temp = df[
        (df["family"] == family) & (df["store_nbr"] == store_nbr)
    ].sort_index()

    test_temp = test_df[
        (test_df["family"] == family) & (test_df["store_nbr"] == store_nbr)
    ].sort_values(by='date').copy()

    train_temp = train_temp.set_index("date")["sales"]


    trend = "add"
    seasonal = "add"
    seasonal_periods = 7


    model = ExponentialSmoothing(
        endog=train_temp,
        trend=trend,
        seasonal=seasonal,
        seasonal_periods=seasonal_periods,
    ).fit()

    model_pred = model.forecast(periods).values 
    test_temp["pred"] = model_pred
    return test_temp 

In [107]:
combinations_processed = Parallel(
    n_jobs=-1, verbose=10, backend="loky", batch_size=128
)(
    delayed(make_predictions)(row["store_nbr"], row["family"], test_df, df)
    for _, row in tqdm(combinations.iterrows(), total=len(combinations))
)

  0%|          | 0/1782 [00:00<?, ?it/s]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 122 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done 212 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done 322 tasks      | elapsed:   28.3s
[Parallel(n_jobs=-1)]: Done 432 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done 562 tasks      | elapsed:   40.5s
[Parallel(n_jobs=-1)]: Done 692 tasks      | elapsed:   47.2s
[Parallel(n_jobs=-1)]: Done 842 tasks      | elapsed:   55.1s
[Parallel(n_jobs=-1)]: Done 992 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1162 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1332 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1522 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1782 out of 1782 | elaps

In [108]:
combinations_processed_ = pd.concat(combinations_processed)


In [109]:
combinations_processed_[combinations_processed_.isna().any(axis = 1)]

Unnamed: 0,id,date,store_nbr,family,onpromotion,pred


In [110]:
combinations_processed_ = combinations_processed_.rename(columns={'pred':'sales'})
combinations_processed_ = combinations_processed_.fillna(1.0)
combinations_processed_[['id','sales']].reset_index(drop = True).to_csv('submission1.csv', index = False)
