In [1]:
cd ../

/Users/linafaik/Documents/projects/time-series-forecasting-models


In [2]:
import pandas as pd
import numpy as np
import os

from config import *
from src.data_processing import *
from src.metrics import *
from src.training import *
from src.viz import *

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Set to True to recompute the forecasts and overwrite the cached CSV even if it already exists
force = False

## Data loading

In [4]:
# Read the processed dataset and turn the "date" strings (YYYY-MM-DD) into datetimes
df = pd.read_csv(path_data_processed).assign(
    date=lambda frame: pd.to_datetime(frame["date"], format="%Y-%m-%d")
)

In [5]:
# Preview the first rows of the loaded dataset
df.head()

Unnamed: 0,date,store_id,state_id,sold_quantity,sold_amount,event_type_1,event_type_2,event_sporting,event_cultural,event_national,event_religious
0,2011-01-29,CA_1,CA,4337,10933.16,,,0,0,0,0
1,2011-01-29,CA_2,CA,3494,9101.52,,,0,0,0,0
2,2011-01-29,CA_3,CA,4739,11679.83,,,0,0,0,0
3,2011-01-29,CA_4,CA,1625,4561.59,,,0,0,0,0
4,2011-01-29,TX_1,TX,2556,6586.68,,,0,0,0,0


## Model training

In [6]:
from statsforecast import StatsForecast
from statsforecast.models import AutoARIMA, AutoETS, AutoCES, MSTL, Theta

In [7]:
# Scenario name: used as the output sub-folder and as the suffix of the forecasts file name
name_scenario = "stat_models"

In [8]:
# Location of the cached forecasts for this scenario
output_dir = os.path.join("output", name_scenario)
path = os.path.join(output_dir, f"forecasts_{name_scenario}.csv")

# Recompute when explicitly forced or when no cached file exists yet
if force or not os.path.exists(path):

    # Split the original dataframe into train and test sets
    train_df, test_df = split_train_test(
        df=df,
        horizon=H,                   # Forecasting horizon
        column_date=time_col,        # Column with the time information
        column_id=id_col,            # Column with the series identifier
    )

    # Print the number of rows in each split
    print(f"{len(train_df)} rows for train")
    print(f"{len(test_df)} rows for test")

    # Candidate models, all configured with a weekly (7-step) seasonality
    models = [
        AutoARIMA(season_length=7),     # ARIMA with automatic order selection
        AutoETS(season_length=7),       # Exponential smoothing with automatic selection
        AutoCES(season_length=7),       # Complex Exponential Smoothing
        MSTL(season_length=7),          # Seasonal-trend decomposition using LOESS
        Theta(season_length=7),         # Theta model
    ]

    # Initialize the StatsForecast object with parallel jobs
    sf = StatsForecast(
        models=models,
        freq=freq,          # Frequency of the time series
        n_jobs=-1,          # Use all available CPU cores
    )

    # Fit every model on the training data, keeping only the id, time and target columns
    sf.fit(
        train_df[[id_col, time_col, target_col]],
        id_col=id_col,
        time_col=time_col,
        target_col=target_col,
    )

    # Forecast H steps ahead with 90% prediction intervals
    forecasts_df = sf.predict(h=H, level=[90])

    # Attach the forecasts to the actual test rows for comparison
    test_with_forecasts_df = test_df.merge(forecasts_df, on=[id_col, time_col], how="left")

    # Stack train rows (no forecast values) on top of the enriched test rows
    forecasts_enr_df = pd.concat([train_df, test_with_forecasts_df], axis=0).reset_index(drop=True)

    # Create the output directory if needed and cache the result as CSV
    os.makedirs(output_dir, exist_ok=True)
    forecasts_enr_df.to_csv(path, index=False)

else:
    # Reuse the cached forecasts. A CSV stores dates as text, so the time column is parsed
    # back to datetime; otherwise this branch would return strings while the freshly
    # computed branch works with the datetimes parsed when `df` was loaded.
    forecasts_enr_df = pd.read_csv(path, parse_dates=[time_col])

# Show the last rows of the resulting dataframe
forecasts_enr_df.tail()


Unnamed: 0,date,store_id,state_id,sold_quantity,sold_amount,event_type_1,event_type_2,event_sporting,event_cultural,event_national,...,AutoETS-hi-90,CES,CES-lo-90,CES-hi-90,MSTL,MSTL-lo-90,MSTL-hi-90,Theta,Theta-lo-90,Theta-hi-90
19405,2016-05-18,WI_3,WI,3268,9163.29,,,0,0,0,...,13933.359409,8965.794983,6693.377368,11364.057324,9492.684917,6348.128916,12637.240919,10655.948383,6440.972199,14833.715852
19406,2016-05-19,WI_3,WI,3398,9660.13,,,0,0,0,...,14173.507816,9174.54364,6857.66106,11502.264355,9748.409778,6490.61895,13006.200606,10792.258138,5762.988074,15067.737097
19407,2016-05-20,WI_3,WI,4126,11982.37,,,0,0,0,...,15718.490607,10406.047241,8081.220947,12656.702539,11710.666878,8343.446999,15077.886756,12238.517288,7372.711645,16695.685278
19408,2016-05-21,WI_3,WI,4519,12370.23,,,0,0,0,...,17411.787618,12443.463379,10184.971436,14674.319775,14170.842095,10697.639201,17644.044989,13827.982109,8984.082293,18310.834246
19409,2016-05-22,WI_3,WI,4757,13432.85,,,0,0,0,...,17056.7096,11788.405273,9604.503369,13975.019189,13513.485874,9937.439597,17089.532151,13380.286012,8573.014679,17530.172882


## Results analysis

In [9]:
# Two containers are filled while looping over the models:
# - 'scores' holds the global evaluation metrics per model
# - 'scores_per_ts' holds one DataFrame of metrics per time series (i.e., per ID) for each model
scores = {}
scores_per_ts = []

# Loop through each model's forecast column name
for column in ['AutoARIMA', 'AutoETS', 'CES', 'MSTL', 'Theta']:

    # Keep only the rows that carry a forecast for this model (train rows are NaN)
    forecasts_filtered_df = forecasts_enr_df[forecasts_enr_df[column].notna()]

    # Global metrics: all forecasted rows of all series evaluated together
    scores[column] = evaluate(forecasts_filtered_df[target_col], forecasts_filtered_df[column])

    # Metrics per time series. Only the target and forecast columns are handed to `apply`:
    # this keeps the grouping column out of each group, which recent pandas versions
    # warn about, and avoids passing unrelated columns around.
    scores_per_ts_model_df = (
        forecasts_filtered_df
        .groupby(id_col)[[target_col, column]]
        .apply(lambda group: pd.Series(evaluate(group[target_col], group[column])))
        .reset_index()
    )

    # Add a column to identify the model
    scores_per_ts_model_df["model"] = column

    # Store the result with the model name as first column, followed by the id and metrics
    other_columns = [c for c in scores_per_ts_model_df.columns if c != "model"]
    scores_per_ts.append(scores_per_ts_model_df[["model"] + other_columns])

# Convert the global scores dictionary into a DataFrame, with one row per model
scores_df = pd.DataFrame(scores).T.reset_index().rename(columns={"index": "model"})

# Concatenate all per-time-series score DataFrames into one final table
scores_per_ts_df = pd.concat(scores_per_ts, axis=0).reset_index(drop=True)


  .apply(lambda group: pd.Series(evaluate(group[target_col], group[column])))
  .apply(lambda group: pd.Series(evaluate(group[target_col], group[column])))
  .apply(lambda group: pd.Series(evaluate(group[target_col], group[column])))
  .apply(lambda group: pd.Series(evaluate(group[target_col], group[column])))
  .apply(lambda group: pd.Series(evaluate(group[target_col], group[column])))


In [10]:
# Global metrics, one row per model.
# NOTE(review): the reported RMSE is orders of magnitude above the MAE (roughly MAE squared),
# which looks like an un-rooted MSE — confirm in `evaluate` (presumably src.metrics).
scores_df

Unnamed: 0,model,MAE,RMSE,MAPE,R2,count
0,AutoARIMA,1345.850419,2915555.0,0.091815,0.829123,140.0
1,AutoETS,1181.079193,2331103.0,0.080433,0.863377,140.0
2,CES,1252.440603,2889545.0,0.08459,0.830647,140.0
3,MSTL,1290.309465,2858886.0,0.087434,0.832444,140.0
4,Theta,1216.612549,2508826.0,0.081705,0.852961,140.0


In [11]:
# Metrics to display in the comparison plots (reused by the per-series plot cell)
list_metrics = ["MAPE"]
# Compare the models on the global scores (all series evaluated together)
plot_global_scores(scores_df=scores_df, list_metrics=list_metrics)

In [12]:
# Compare the models on the metrics computed per time series
plot_scores_per_ts(
    scores_per_ts_df,
    column_id=id_col,
    list_metrics=list_metrics,
)

In [16]:
# Pick one time series to inspect. A seeded generator is used so that the same series is
# selected on every run (change the seed to look at another one).
rng = np.random.default_rng(42)
rnd_id = rng.choice(scores_per_ts_df[id_col].unique())

# Plot the forecast of each model for the selected series
for column in ['AutoARIMA', 'AutoETS', 'CES', 'MSTL', 'Theta']:

    print(f"Model: {column}")

    fig = plot_forecast_with_ci(
        forecasts_enr_df,
        column_id=id_col,
        column_date=time_col,
        column_target=target_col,
        column_forecast=column,
        model_name=column,
        uid=rnd_id,
        train_tail=30,
    )
    fig.show()

Model: AutoARIMA



Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.



Model: AutoETS



Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.



Model: CES



Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.



Model: MSTL



Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.



Model: Theta



Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.

