In [1]:
import os
import pickle
import warnings

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import optuna

from pmdarima import auto_arima
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics
from prophet.plot import plot_cross_validation_metric
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing

  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.


# 0. Importing Data

In [2]:
# Load your dataset
merge_df_scaled = pd.read_csv('../raw_data/cleaned_merge_df_top10.csv')
merge_df_scaled['date'] = pd.to_datetime(merge_df_scaled['date'])
merge_df_scaled.set_index('date', inplace=True)

merge_df_scaled.head()
# 382600 rows × 64 columns

Unnamed: 0_level_0,id,item_id,dept_id,cat_id,store_id,state_id,sales,weekday,wday,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2011-01-29,FOODS_2_197_CA_1_validation,FOODS_2_197,FOODS_2,FOODS,CA_1,CA,38,Saturday,1,0,0,0,0,0,0,0,2.98
2011-01-29,FOODS_3_080_CA_1_validation,FOODS_3_080,FOODS_3,FOODS,CA_1,CA,33,Saturday,1,0,0,0,0,0,0,0,1.48
2011-01-29,FOODS_3_090_CA_1_validation,FOODS_3_090,FOODS_3,FOODS,CA_1,CA,107,Saturday,1,0,0,0,0,0,0,0,1.25
2011-01-29,FOODS_3_120_CA_1_validation,FOODS_3_120,FOODS_3,FOODS,CA_1,CA,0,Saturday,1,0,0,0,0,0,0,0,0.0
2011-01-29,FOODS_3_252_CA_1_validation,FOODS_3_252,FOODS_3,FOODS,CA_1,CA,19,Saturday,1,0,0,0,0,0,0,0,1.48


# 1. Defining Model Functions

In [3]:
def perform_prophet(product_data):

    product_data.reset_index(inplace=True,names="date")
    
    prophet_product_df = product_data[["id","date","sales"]]
    prophet_product_df.columns = ["id","ds","y"]
    prophet_product_df['ds'] = pd.to_datetime(prophet_product_df['ds'])
    
    data_train = prophet_product_df.iloc[:-28]
    data_test = prophet_product_df.iloc[-28:]
    X_train = data_train["ds"]
    y_train = data_train["y"]
    X_test = data_test["ds"]
    y_test = data_test["y"]
    
    fbp = Prophet()

    model = fbp.fit(data_train)
    
    predict_placeholder = fbp.make_future_dataframe(28,freq="D")
    
    # Predict on the test data
    y_pred = fbp.predict(predict_placeholder[-28:])
    

    # Calculate and return the error metric for the current fold
    mae = mean_absolute_error(y_test, y_pred["yhat"])
    
    return model, mae

In [4]:
def perform_auto_arima(product_data):
    data_train = product_data.iloc[:-28]
    data_test = product_data.iloc[-28:]
    y_train = data_train["sales"]
    y_test = data_test["sales"]

    # Fit ARIMA model on the training data using auto_arima to find the best (p, d, q)
    model = auto_arima(y_train, start_p=0, start_q=0, max_p=5, max_q=5, d=1,
                       seasonal=True, trace=False, error_action='ignore', 
                       suppress_warnings=True, stepwise=True)
    
    # Predict on the test data
    predictions = model.predict(n_periods=len(y_test))

    # Calculate and return the error metric for the current fold
    mae = mean_absolute_error(y_test, predictions)
    
    return model, mae

In [23]:
def objective_optuna(trial, y_train, y_test):
    
    trend = trial.suggest_categorical('trend', ['add'])
    seasonal = trial.suggest_categorical('seasonal', [None, 'add'])
    seasonal_periods = trial.suggest_categorical('seasonal_periods', [None, 4, 7, 12])
    
    product_results = []

    # Fit Holt-Winters model on the training data
    model = ExponentialSmoothing(y_train, trend=trend, seasonal=seasonal, seasonal_periods=seasonal_periods,freq='D')
    fitted_model = model.fit(optimized=True)

    # Predict on the test data
    predictions = fitted_model.forecast(steps=len(y_test))

    # Calculate and store the error metric
    mae = mean_absolute_error(y_test, predictions)
    product_results.append(mae)

    # Average MAE for this product
    average_mae = np.mean(product_results)
    return average_mae

In [28]:
def perform_exp_smoothing(product_data):
    data_train = product_data.iloc[:-28]
    data_test = product_data.iloc[-28:]
    y_train = data_train["sales"]
    y_test = data_test["sales"]
    # Create a study object
    study = optuna.create_study(direction='minimize')
    
    print(f"Optimizing hyperparameters for product: {id}")
    
    
    # Run the optimization process for the current product
    study.optimize(lambda trial: objective_optuna(trial, y_train, y_test), n_trials=10, n_jobs=-1)

    # Get the best hyperparameters and the corresponding best MAE
    best_params = study.best_params
    best_mae = study.best_value

    # Create the best model with the obtained hyperparameters
    best_model = ExponentialSmoothing(y_train, **best_params).fit()
    
    return best_model, best_mae

# 2. Running all models in a loop to find, for each product, the model with the lowest score

In [29]:
models_list = ["ARIMA","ExponentialSmoothing","Prophet"]

In [None]:
from pmdarima import auto_arima

# Dictionary to store MAE results for each unique time-series identified by id
product_results = {}
average_mae = []

# Iterate over each unique product series identified by id
for id in merge_df_scaled['id'].unique()[:10]:
    print(f"Analyzing product: {id}")
    product_data = merge_df_scaled[merge_df_scaled['id'] == id]

    # Results list for the current product time-series
    results = {}
    best_score = 999.99
    best_model_name = ""



    #Looping all models
    for model_name in models_list:

        if model_name == "ARIMA":
            #TODO: Add 5-fold split here for another loop (or inside the model function?) and then take the average score per model as their mae score
            
            # Fit ARIMA model on the training data using auto_arima to find the best (p, d, q)
            model, mae = perform_auto_arima(product_data)
            results[model_name] = {"mae": mae, "model": model}
            if mae < best_score:
                best_score = mae
                best_model = model
                best_model_name = model_name

        elif model_name == "ExponentialSmoothing":

            # To be built
            model, mae = perform_exp_smoothing(product_data)
            results[model_name] = {"mae": mae, "model": model}
            if mae < best_score:
                best_score = mae
                best_model = model
                best_model_name = model_name

        elif model_name == "Prophet":

            model, mae = perform_prophet(product_data)
            results[model_name] = {"mae": mae, "model": model}
            if mae < best_score:
                best_score = mae
                best_model = model
                best_model_name = model_name


    #Printing results for this product
    print(results)
    print(f"Model results for {id}")
    print(f"Best model: {best_model_name}")
    print(f"Best score: {best_score}")

    average_mae.append(best_score)

    # Store the average MAE for the current product time-series
    product_results[id] = {"best_score": best_score, "best_model": best_model_name, "model": best_model}

    #Store the best model in a pkl file
    filename = f'../models/{id}_model.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(best_model, f)

# Create a DataFrame to store the results
results_df_arima = pd.DataFrame(product_results.items(), columns=['id', 'MAE'])

# Set the 'id' column as the index
results_df_arima.set_index('id', inplace=True)

average_mae = np.mean(average_mae)

print(f"Total average MAE: {average_mae}")


Analyzing product: FOODS_2_197_CA_1_validation


[I 2024-05-09 15:11:29,522] A new study created in memory with name: no-name-0bf505e2-c389-45cf-b0b3-99ea0a2aa926


Optimizing hyperparameters for product: FOODS_2_197_CA_1_validation


[I 2024-05-09 15:11:30,938] Trial 0 finished with value: 9.388827437788668 and parameters: {'trend': 'add', 'seasonal': None, 'seasonal_periods': 4}. Best is trial 0 with value: 9.388827437788668.
[I 2024-05-09 15:11:30,981] Trial 1 finished with value: 9.388827437788668 and parameters: {'trend': 'add', 'seasonal': None, 'seasonal_periods': None}. Best is trial 0 with value: 9.388827437788668.
[I 2024-05-09 15:11:31,029] Trial 4 finished with value: 9.780132669684251 and parameters: {'trend': 'add', 'seasonal': 'add', 'seasonal_periods': 12}. Best is trial 0 with value: 9.388827437788668.
[I 2024-05-09 15:11:31,096] Trial 3 finished with value: 9.388827437788668 and parameters: {'trend': 'add', 'seasonal': None, 'seasonal_periods': 12}. Best is trial 0 with value: 9.388827437788668.
[I 2024-05-09 15:11:31,118] Trial 7 finished with value: 9.388827437788668 and parameters: {'trend': 'add', 'seasonal': None, 'seasonal_periods': 12}. Best is trial 0 with value: 9.388827437788668.
[I 2024-

      with_intercept=False)}, 'ExponentialSmoothing': {'mae': 9.388827437788668, 'model': <statsmodels.tsa.holtwinters.results.HoltWintersResultsWrapper object at 0x7fed7ed37730>}, 'Prophet': {'mae': 11.367010469983766, 'model': <prophet.forecaster.Prophet object at 0x7fed7f936230>}}
Model results for FOODS_2_197_CA_1_validation
Best model: ARIMA
Best score: 8.726633436944372
Analyzing product: FOODS_3_080_CA_1_validation


[I 2024-05-09 15:11:42,277] A new study created in memory with name: no-name-33dec351-89ec-4e9e-9289-ab4d8c48e528


Optimizing hyperparameters for product: FOODS_3_080_CA_1_validation


[I 2024-05-09 15:11:43,612] Trial 4 finished with value: 5.603251743760496 and parameters: {'trend': 'add', 'seasonal': None, 'seasonal_periods': None}. Best is trial 4 with value: 5.603251743760496.
[I 2024-05-09 15:11:43,625] Trial 6 finished with value: 5.603251743760496 and parameters: {'trend': 'add', 'seasonal': None, 'seasonal_periods': 7}. Best is trial 4 with value: 5.603251743760496.
[I 2024-05-09 15:11:43,626] Trial 1 finished with value: 5.603251743760496 and parameters: {'trend': 'add', 'seasonal': None, 'seasonal_periods': 12}. Best is trial 4 with value: 5.603251743760496.
[I 2024-05-09 15:11:43,626] Trial 7 finished with value: 5.603251743760496 and parameters: {'trend': 'add', 'seasonal': None, 'seasonal_periods': 7}. Best is trial 4 with value: 5.603251743760496.
[I 2024-05-09 15:11:43,638] Trial 3 finished with value: 5.603251743760496 and parameters: {'trend': 'add', 'seasonal': None, 'seasonal_periods': 4}. Best is trial 4 with value: 5.603251743760496.
[I 2024-05-

      with_intercept=False)}, 'ExponentialSmoothing': {'mae': 4.851012052269259, 'model': <statsmodels.tsa.holtwinters.results.HoltWintersResultsWrapper object at 0x7fed7fd95c00>}, 'Prophet': {'mae': 4.776500194398421, 'model': <prophet.forecaster.Prophet object at 0x7fed7ff9b370>}}
Model results for FOODS_3_080_CA_1_validation
Best model: Prophet
Best score: 4.776500194398421
Analyzing product: FOODS_3_090_CA_1_validation


[I 2024-05-09 15:12:20,610] A new study created in memory with name: no-name-62e20bef-51b6-4a90-9974-e353d3344d23


Optimizing hyperparameters for product: FOODS_3_090_CA_1_validation


[I 2024-05-09 15:12:20,885] Trial 0 finished with value: 37.241681706408414 and parameters: {'trend': 'add', 'seasonal': None, 'seasonal_periods': 7}. Best is trial 0 with value: 37.241681706408414.
[I 2024-05-09 15:12:21,336] Trial 3 finished with value: 37.241681706408414 and parameters: {'trend': 'add', 'seasonal': None, 'seasonal_periods': 4}. Best is trial 0 with value: 37.241681706408414.
[I 2024-05-09 15:12:21,475] Trial 2 finished with value: 37.241681706408414 and parameters: {'trend': 'add', 'seasonal': None, 'seasonal_periods': None}. Best is trial 0 with value: 37.241681706408414.
[I 2024-05-09 15:12:21,577] Trial 9 finished with value: 37.241681706408414 and parameters: {'trend': 'add', 'seasonal': None, 'seasonal_periods': 4}. Best is trial 0 with value: 37.241681706408414.
[I 2024-05-09 15:12:21,648] Trial 5 finished with value: 37.241681706408414 and parameters: {'trend': 'add', 'seasonal': None, 'seasonal_periods': None}. Best is trial 0 with value: 37.241681706408414.

      with_intercept=False)}, 'ExponentialSmoothing': {'mae': 37.241681706408414, 'model': <statsmodels.tsa.holtwinters.results.HoltWintersResultsWrapper object at 0x7fed7f934430>}, 'Prophet': {'mae': 16.664135902250575, 'model': <prophet.forecaster.Prophet object at 0x7fed7f5e9030>}}
Model results for FOODS_3_090_CA_1_validation
Best model: Prophet
Best score: 16.664135902250575
Analyzing product: FOODS_3_120_CA_1_validation


# ----------- Jonas Cleaned and Optimized until here -------------------

In [None]:
# Convert the results dictionary to a DataFrame
results_df_exp = pd.DataFrame(results_dict).T.reset_index()
results_df_exp.columns = ['id', 'ExpSmoothing_params', 'ExpSmoothing_MAE']

# Set the 'Product ID' column as the index
results_df_exp.set_index('id', inplace=True)
results_df_exp

In [None]:
# Merge the two DataFrames based on the product ID
comparison_df = pd.merge(results_df_arima, results_df_exp, left_index=True, right_index=True, suffixes=('_arima', '_exp'))
comparison_df['Best MAE'] = comparison_df[['ARIMA_MAE', 'ExpSmoothing_MAE']].min(axis=1)
comparison_df['Best Method'] = comparison_df.apply(lambda row: 'ARIMA' if row['Best MAE'] == row['ARIMA_MAE'] else 'Exponential Smoothing', axis=1)

comparison_df
