In [1]:
# BASE
# ------------------------------------------------------
import numpy as np
import pandas as pd
import os
import gc
import warnings
from tqdm.auto import tqdm 

# PACF - ACF
# ------------------------------------------------------
import statsmodels.api as sm
from statsmodels.tsa.holtwinters import ExponentialSmoothing, SimpleExpSmoothing 
from sklearn.metrics import root_mean_squared_error, mean_absolute_percentage_error, root_mean_squared_log_error
# DATA VISUALIZATION
# ------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go 


# CONFIGURATIONS
# ------------------------------------------------------
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.2f}'.format
warnings.filterwarnings('ignore')
# Custom Code 
#---------------------------------------
from utils import plot_forecasts

In [2]:
df = pd.read_parquet('train.parquet')
# Forecast next 13 days 
periods = 13 

In [3]:
x = df[["store_nbr", "family"]].drop_duplicates().sample(1)
store = x["store_nbr"].iloc[0]

family = x["family"].iloc[0]

temp = df[(df["store_nbr"] == store) & (df["family"] == family)]
px.line(temp, x = 'date', y = 'sales', title=f"{family} - {store}")

In [4]:
# Forecast next 13 days 
periods = 13 

temp = temp.set_index('date').sort_index()

In [5]:
train, test = temp[['sales']].iloc[:-periods],  temp[['sales']].iloc[-periods:]

In [6]:
ses_model = SimpleExpSmoothing(train).fit()
ses_pred = ses_model.forecast(periods)

In [7]:
rmsle_value = root_mean_squared_log_error(test['sales'].values, ses_pred.values)
rmse = root_mean_squared_error(test['sales'].values, ses_pred.values)

In [8]:
plot_forecasts(temp, ses_pred, test)

In [11]:
from joblib import Parallel, delayed

In [None]:
def ses(store_nbr, family):
    forecasting_results = {}
    store_family_df = df[(df["store_nbr"] == store_nbr) & (df["family"] == family)]
    store_family_df.set_index("date").sort_index()

    train, test = (
        store_family_df[["sales"]].iloc[:-periods],
        store_family_df[["sales"]].iloc[-periods:],
    )
    ses_model = SimpleExpSmoothing(train).fit()

    ses_pred = ses_model.forecast(periods)

    rmsle = root_mean_squared_log_error(test["sales"].values, ses_pred.values)
    rmse = root_mean_squared_error(test["sales"].values, ses_pred.values)

    forecasting_results["store_nbr"] = store_nbr
    forecasting_results["family"] = family
    forecasting_results["rmsle"] = rmsle
    forecasting_results["rmse"] = rmse

    return forecasting_results

In [22]:
unique_combinations = df[["store_nbr", "family"]].drop_duplicates().dropna()

combinations_list = Parallel(n_jobs=-1, verbose=10, backend = 'threading')(
    delayed(ses)(row["store_nbr"], row["family"])
    for _, row in tqdm(unique_combinations.iterrows(), total=len(unique_combinations))
)

  0%|          | 0/1782 [00:00<?, ?it/s]

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:   13.7s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   15.3s
[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done 233 tasks      | elaps

In [19]:
ses_results_df = pd.DataFrame(combinations_list)

In [20]:
mean_rmse, mean_rmsle, median_rmse, median_rmsle = (
    ses_results_df["rmse"].mean(),
    ses_results_df["rmsle"].mean(),
    ses_results_df["rmse"].median(),
    ses_results_df["rmsle"].median(),
)


print(
    f"Mean RMSE: {round(mean_rmse,2)}, Mean RMSLE: {round(mean_rmsle,2)}, Median RMSE: {round(median_rmsle,2)}, Median RMSLE: {round(median_rmsle,2)}"
)

Mean RMSE: 135.87, Mean RMSLE: 0.41, Median RMSE: 0.37, Median RMSLE: 0.37


In [21]:
ses_results_df.to_csv('metrics/ses.csv', index=None)