# Setup

In [1]:
# General libraries
import pandas as pd
import numpy as np

# Utilities
from pathlib import Path
import psutil
import gc
import os

# Cuda
import torch

In [2]:
# Warnings
import warnings

# Silence the warning categories that clutter the notebook output
for ignored_category in (DeprecationWarning, UserWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

# Also ignore pandas' PerformanceWarning
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# pandas options: turn off the chained-assignment warning and display all columns
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', None)

In [3]:
# Number of logical CPUs reported by the operating system (may be None if undetermined)
num_cpus = os.cpu_count()

# Display the detected count
print(num_cpus)

12


In [4]:
# Report whether a CUDA device is visible to PyTorch
cuda_message = (
    "CUDA is available. Using GPU."
    if torch.cuda.is_available()
    else "CUDA is not available. Using CPU."
)
print(cuda_message)

CUDA is not available. Using CPU.


# Directory

In [5]:
# When launched from the 'notebooks' or 'src' subfolder, move one level up
# so that relative paths and the `utils` imports resolve from the project directory
current_path = Path.cwd()
launched_from_subfolder = current_path.name in ['notebooks', 'src']
if not launched_from_subfolder:
    print(f"Already in project directory: {os.getcwd()}")
else:
    os.chdir('..')
    print(f"Moved up from {current_path.name} to: {os.getcwd()}")

Moved up from notebooks to: /Users/ignasipascual/GitHub/Forecaster


# Assets

In [6]:
# Import assets
from utils.data_preparation import DataPreparation
from utils.feature_engineering import FeatureEngineering
from utils.create_baselines import CreateBaselines
from utils.forecaster import Forecaster
from utils.evaluator import Evaluator
from utils.auxiliar import *

# Load data

In [7]:
# Location of the example dataset (raw file on GitHub)
url = 'https://raw.githubusercontent.com/ipveka/Forecaster/main/data/example.csv'

# Download the CSV straight into a DataFrame
df_input = pd.read_csv(url)

# Keep a local copy under ./data, creating the folder when it is missing
data_dir = Path('data')
data_dir.mkdir(parents=True, exist_ok=True)
df_input.to_csv(data_dir / 'example.csv', index=False)

# Preview the first rows
df_input.head()

Unnamed: 0,id,date,year_week,product_number,reporterhq_id,prod_category,specs,display_size,segment,sales_units,inventory_units
0,202224-2961,2022-06-18,202224,2961,15,Doraemon,186631,15.6,Gaming,52.0,88.0
1,202225-2961,2022-06-25,202225,2961,15,Doraemon,186631,15.6,Gaming,6.0,88.0
2,202226-2961,2022-07-02,202226,2961,15,Doraemon,186631,15.6,Gaming,60.0,138.0
3,202227-2961,2022-07-09,202227,2961,15,Doraemon,186631,15.6,Gaming,158.0,65.0
4,202228-2961,2022-07-16,202228,2961,15,Doraemon,186631,15.6,Gaming,23.0,30.0


## Subsample


In [None]:
# Subsample: optionally restrict df_input to a chosen set of reporters
use_subsample = True

if use_subsample:
    # Show all distinct reporterhq_ids
    distinct_reporters = df_input['reporterhq_id'].unique()
    print("🟡 Distinct reporterhq_ids:")
    print(distinct_reporters)

    # Choose reporters to keep
    # NOTE(review): the saved output of this cell reports reporterhq_id=[15, 93, 90];
    # it is stale relative to this list — re-run, or confirm the intended selection
    reporters = [15]

    # Count before filtering
    before_rows = df_input.shape[0]
    before_reporters = df_input['reporterhq_id'].nunique()
    print(f"🟢 BEFORE filtering → Rows: {before_rows}, Distinct Reporters: {before_reporters}")

    # Filter ONLY by reporterhq_id. The explicit .copy() yields an independent
    # frame, so the column assignments done in the formatting cell do not
    # operate on a slice of the originally loaded DataFrame.
    df_input = df_input[df_input['reporterhq_id'].isin(reporters)].copy()

    # Count after filtering
    after_rows = df_input.shape[0]
    after_reporters = df_input['reporterhq_id'].nunique()
    print(f"🔵 AFTER filtering (reporterhq_id={reporters}) → Rows: {after_rows}, Distinct Reporters: {after_reporters}")

    # Show sample
    display(df_input.head())

🟡 Distinct reporterhq_ids:
[15 93 90 12 78 24 60  6  3  9 72 21 87 48 96 84 39 27 81 36]
🟢 BEFORE filtering → Rows: 25139, Distinct Reporters: 20
🔵 AFTER filtering (reporterhq_id=[15, 93, 90]) → Rows: 9495, Distinct Reporters: 3


Unnamed: 0,id,date,year_week,product_number,reporterhq_id,prod_category,specs,display_size,segment,sales_units,inventory_units
0,202224-2961,2022-06-18,202224,2961,15,Doraemon,186631,15.6,Gaming,52.0,88.0
1,202225-2961,2022-06-25,202225,2961,15,Doraemon,186631,15.6,Gaming,6.0,88.0
2,202226-2961,2022-07-02,202226,2961,15,Doraemon,186631,15.6,Gaming,60.0,138.0
3,202227-2961,2022-07-09,202227,2961,15,Doraemon,186631,15.6,Gaming,158.0,65.0
4,202228-2961,2022-07-16,202228,2961,15,Doraemon,186631,15.6,Gaming,23.0,30.0


## Formatting

In [9]:
# Lowercase every column name
df_input.columns = df_input.columns.str.lower()

# Column roles: identifiers/categories are kept as strings, signals as numbers
character_cols = ['reporterhq_id', 'product_number', 'prod_category', 'specs', 'display_size', 'segment']
signal_cols = ['sales_units', 'inventory_units']

# Parse dates, coerce the signals to numeric (invalid values become NaN)
# and cast the categorical columns to string
df_input = df_input.assign(
    date=pd.to_datetime(df_input['date']),
    **{signal: pd.to_numeric(df_input[signal], errors='coerce') for signal in signal_cols},
).astype({name: str for name in character_cols})

# Keep only the needed columns, sort by reporter, product and date, and renumber the rows
ordered_cols = character_cols + ['date'] + signal_cols
df_input = (
    df_input[ordered_cols]
    .sort_values(by=['reporterhq_id', 'product_number', 'date'])
    .reset_index(drop=True)
)

# Preview
df_input.head()

Unnamed: 0,reporterhq_id,product_number,prod_category,specs,display_size,segment,date,sales_units,inventory_units
0,15,100674,Conan,158588,15.6,Premium,2022-06-18,0.0,28.0
1,15,100674,Conan,158588,15.6,Premium,2022-06-25,0.0,45.0
2,15,100674,Conan,158588,15.6,Premium,2022-07-02,14.0,68.0
3,15,100674,Conan,158588,15.6,Premium,2022-07-09,8.0,55.0
4,15,100674,Conan,158588,15.6,Premium,2022-07-16,6.0,75.0


## Statistics

In [10]:
# Report the size of the dataset
n_rows, n_cols = df_input.shape
print(f"Number of columns in df_input: {n_cols}")
print(f"Number of rows in df_input: {n_rows}")

# Report how many distinct values each identifier column has
for id_col in ('reporterhq_id', 'product_number'):
    print(f"Number of distinct {id_col}: {df_input[id_col].nunique()}")

Number of columns in df_input: 9
Number of rows in df_input: 9495
Number of distinct reporterhq_id: 3
Number of distinct product_number: 153


In [11]:
# Share of missing cells over the whole DataFrame (rows x columns)
total_rows = len(df_input)
total_cells = total_rows * df_input.shape[1]
total_na = df_input.isna().sum().sum()
percent_na_total = (total_na / total_cells) * 100
print(f"Percentage of NA values: {percent_na_total:.2f}%")

Percentage of NA values: 0.44%


# Data Preparation

In [12]:
# Instantiate the data-preparation helper (DataPreparation is imported from utils.data_preparation)
data_preparation = DataPreparation()

In [13]:
# Arguments of the data-preparation run, gathered in one place
data_prep_args = dict(
    df=df_input,
    group_cols=['reporterhq_id', 'product_number'],
    date_col='date',
    target='inventory_units',
    horizon=13,
    dp_window_size=13,
    n_cutoffs=5,
    complete_dataframe=False,
    smoothing=False,
)

# Prepare the data; the result carries 'cutoff' and 'sample' columns for backtesting
df_data_prepared = data_preparation.run_data_preparation(**data_prep_args)

# Preview
df_data_prepared.head()


DATA PREPARATION

📊 Input Dataset:
   • Rows: 9,495
   • Columns: 9
   • Groups: 213
   • Date column: 'date'
   • Target: 'inventory_units'

🔄 Converting 'date' to datetime format...
   ✓ Date range: 2021-05-15 to 2023-05-06
   ✓ Target > 0 & not NaN: 9,184 records (2021-05-15 to 2023-05-06)

🔍 Auto-detecting frequency...
   ✓ Detected frequency: W-SAT
   ✓ Using specified horizon: 13 periods
   ✓ Using specified smoothing window: 13 periods

⏭️  Skipping date completion (complete_dataframe=False)

📈 Identified 2 signal column(s):
   • sales_units (98.0% non-null)
   • inventory_units (98.1% non-null)

⏭️  Skipping smoothing (smoothing=False)

📅 Creating 5 cutoff(s) for backtesting...
   ✓ Cutoff dates (based on dates with valid target):
      1. 2023-05-06 [LATEST - Used for future forecasting]
      2. 2023-04-01
      3. 2023-03-04
      4. 2023-02-04
      5. 2023-01-07

🔀 Creating train/test splits...
   ✓ Expanded dataset from 9,495 to 47,475 rows
   ✓ Train samples: 37,853 (79

Unnamed: 0,reporterhq_id,product_number,prod_category,specs,display_size,segment,date,sales_units,inventory_units,cutoff,sample
3,15,100674,Conan,158588,15.6,Premium,2022-06-18,0.0,28.0,2023-01-07,train
7,15,100674,Conan,158588,15.6,Premium,2022-06-25,0.0,45.0,2023-01-07,train
11,15,100674,Conan,158588,15.6,Premium,2022-07-02,14.0,68.0,2023-01-07,train
15,15,100674,Conan,158588,15.6,Premium,2022-07-09,8.0,55.0,2023-01-07,train
19,15,100674,Conan,158588,15.6,Premium,2022-07-16,6.0,75.0,2023-01-07,train


# Feature Engineering

In [14]:
# Instantiate the feature-engineering helper (FeatureEngineering is imported from utils.feature_engineering)
feature_engineering = FeatureEngineering()

In [15]:
# Build the model features on the prepared data; 'cutoff' is part of the
# grouping keys, so each backtesting cutoff is treated as its own group
df_with_features = feature_engineering.run_feature_engineering(
    df=df_data_prepared,
    target='inventory_units',
    date_col='date',
    group_cols=['reporterhq_id', 'product_number', 'cutoff'],
    freq='W',
    lags=(4, 13, 26, 52),
    fe_window_size=(4, 13),
    fill_lags=True,
    n_clusters=10,
)


FEATURE ENGINEERING

📊 Input Dataset:
   • Rows: 50,244
   • Columns: 11
   • Existing features: 0
   • Target: 'inventory_units'

📅 Using specified frequency: W
   ✓ Using specified window sizes: (4, 13)
   ✓ Using specified lags: (4, 13, 26, 52)

📈 Identified 2 signal column(s) for feature creation
   • Signal columns: sales_units, inventory_units

🏷️  Encoding 6 categorical column(s)...
   • Columns to encode: reporterhq_id, product_number, prod_category, specs, display_size, segment
     - reporterhq_id: 3 unique values
     - product_number: 153 unique values
     - prod_category: 7 unique values
     - specs: 153 unique values
     - display_size: 7 unique values
     - segment: 3 unique values
   ✓ Created 6 encoded feature(s)

📅 Creating temporal features...
   ✓ Created 26 date-based features
      - Basic: year, quarter, month, week, day, dayofweek
      - Cyclical: sin/cos encodings for seasonality

⏱️  Creating period-based features...
   ✓ Created 3 period feature(s)
    

# Create Baselines

In [16]:
# Grouping keys and date column used when building the baselines
date_column = 'date'
group_columns_cb = ['reporterhq_id', 'product_number', 'cutoff']

# Moving-average baseline: window length and the signals it is computed for
bs_window_size = 13
signal_columns = ['sales_units', 'inventory_units']

# Flag passed to the baseline creation for feature baselines
create_features = False

In [17]:
# Instantiate the baseline builder (CreateBaselines is imported from utils.create_baselines)
create_baselines = CreateBaselines()

In [18]:
# Positional arguments for the moving-average baseline, in the order the call expects
baseline_args = (
    df_with_features,
    group_columns_cb,
    date_column,
    signal_columns,
    bs_window_size,
    create_features,
)

# Add the moving-average baseline columns to the feature table
df_backtesting = create_baselines.create_ma_baseline(*baseline_args)

# Preview
df_backtesting.head()

Unnamed: 0,reporterhq_id,product_number,prod_category,specs,display_size,segment,date,sales_units,inventory_units,cutoff,sample,feature_reporterhq_id,feature_product_number,feature_prod_category,feature_specs,feature_display_size,feature_segment,feature_year,feature_quarter,feature_month,feature_week,feature_day,feature_dayofweek,feature_sin_yearly_1,feature_cos_yearly_1,feature_sin_yearly_2,feature_cos_yearly_2,feature_sin_yearly_3,feature_cos_yearly_3,feature_sin_quarterly_1,feature_cos_quarterly_1,feature_sin_quarterly_2,feature_cos_quarterly_2,feature_sin_monthly_1,feature_cos_monthly_1,feature_sin_monthly_2,feature_cos_monthly_2,feature_sin_weekly_1,feature_cos_weekly_1,feature_sin_weekly_2,feature_cos_weekly_2,feature_sin_weekly_3,feature_cos_weekly_3,feature_periods,feature_periods_expanding,feature_periods_sqrt,sales_units_ma_4,sales_units_ma_13,inventory_units_ma_4,inventory_units_ma_13,sales_units_min_4,sales_units_max_4,sales_units_mean_4,sales_units_min_13,sales_units_max_13,sales_units_mean_13,inventory_units_min_4,inventory_units_max_4,inventory_units_mean_4,inventory_units_min_13,inventory_units_max_13,inventory_units_mean_13,feature_sales_units_lag_4,feature_sales_units_lag_13,feature_sales_units_lag_26,feature_sales_units_lag_52,feature_inventory_units_lag_4,feature_inventory_units_lag_13,feature_inventory_units_lag_26,feature_inventory_units_lag_52,feature_inventory_units_cov,feature_inventory_units_intermittence,feature_inventory_units_cluster,train_weight,horizon,baseline_sales_units_ma_13,baseline_inventory_units_ma_13
0,15,100674,Conan,158588,15.6,Premium,2022-06-18,0.0,28.0,2023-01-07,train,0,0,3,40,4,2,2022,2,6,24,18,5,0.232243,-0.972658,-0.451786,0.892126,0.646623,-0.76281,-0.8061,0.591779,-0.954067,-0.299595,-0.587785,-0.809017,0.951057,0.309017,-0.974928,-0.222521,0.433884,-0.900969,0.781831,0.62349,1.0,1.0,1.0,0.0,0.0,28.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0,28.0,28.0,28.0,28.0,28.0,,,,,,,,,-9.272274,0.0,1,0.034483,0,0.0,28.0
1,15,100674,Conan,158588,15.6,Premium,2022-06-25,0.0,45.0,2023-01-07,train,0,0,3,40,4,2,2022,2,6,25,25,5,0.11372,-0.993513,-0.225964,0.974136,0.335276,-0.94212,-0.440238,0.897881,-0.790563,0.61238,-0.866025,0.5,-0.866025,-0.5,-0.974928,-0.222521,0.433884,-0.900969,0.781831,0.62349,2.0,2.143547,1.414214,0.0,0.0,36.5,36.5,0.0,0.0,0.0,0.0,0.0,0.0,28.0,45.0,36.5,28.0,45.0,36.5,,,,,,,,,-9.272274,0.0,1,0.068966,0,0.0,36.5
2,15,100674,Conan,158588,15.6,Premium,2022-07-02,14.0,68.0,2023-01-07,train,0,0,3,40,4,2,2022,3,7,26,2,5,-0.006451,-0.999979,0.012901,0.999917,-0.019352,-0.999813,0.025801,0.999667,0.051584,0.998669,0.394356,0.918958,0.724793,0.688967,-0.974928,-0.222521,0.433884,-0.900969,0.781831,0.62349,3.0,3.34837,1.732051,4.666667,4.666667,47.0,47.0,0.0,14.0,4.666667,0.0,14.0,4.666667,28.0,68.0,47.0,28.0,68.0,47.0,,,,,,,,,-9.272274,0.0,1,0.103448,0,4.666667,47.0
3,15,100674,Conan,158588,15.6,Premium,2022-07-09,8.0,55.0,2023-01-07,train,0,0,3,40,4,2,2022,3,7,27,9,5,-0.126528,-0.991963,0.251022,0.967981,-0.371481,-0.928441,0.485969,0.873976,0.84945,0.527668,0.968077,-0.250653,-0.485302,-0.874347,-0.974928,-0.222521,0.433884,-0.900969,0.781831,0.62349,4.0,4.594793,2.0,5.5,5.5,49.0,49.0,0.0,14.0,5.5,0.0,14.0,5.5,28.0,68.0,49.0,28.0,68.0,49.0,,,,,,,,,-9.272274,0.0,1,0.137931,0,5.5,49.0
4,15,100674,Conan,158588,15.6,Premium,2022-07-16,6.0,75.0,2023-01-07,train,0,0,3,40,4,2,2022,3,7,28,16,5,-0.244772,-0.969581,0.474653,0.880173,-0.675656,-0.737217,0.835553,0.549409,0.918121,-0.396299,-0.101168,-0.994869,0.201299,0.97953,-0.974928,-0.222521,0.433884,-0.900969,0.781831,0.62349,5.0,5.873095,2.236068,7.0,5.6,60.75,54.2,0.0,14.0,7.0,0.0,14.0,5.6,45.0,75.0,60.75,28.0,75.0,54.2,0.0,,,,28.0,,,,-9.272274,0.0,1,0.172414,0,5.6,54.2


# Forecasting model

## Features

In [19]:
# Collect every column whose name contains the substring "feature"
features = list(filter(lambda name: "feature" in name, df_backtesting.columns))

# Create the default training group: every row gets the same group id
df_backtesting['training_group'] = 1

# Show the selected feature names
features

['feature_reporterhq_id',
 'feature_product_number',
 'feature_prod_category',
 'feature_specs',
 'feature_display_size',
 'feature_segment',
 'feature_year',
 'feature_quarter',
 'feature_month',
 'feature_week',
 'feature_day',
 'feature_dayofweek',
 'feature_sin_yearly_1',
 'feature_cos_yearly_1',
 'feature_sin_yearly_2',
 'feature_cos_yearly_2',
 'feature_sin_yearly_3',
 'feature_cos_yearly_3',
 'feature_sin_quarterly_1',
 'feature_cos_quarterly_1',
 'feature_sin_quarterly_2',
 'feature_cos_quarterly_2',
 'feature_sin_monthly_1',
 'feature_cos_monthly_1',
 'feature_sin_monthly_2',
 'feature_cos_monthly_2',
 'feature_sin_weekly_1',
 'feature_cos_weekly_1',
 'feature_sin_weekly_2',
 'feature_cos_weekly_2',
 'feature_sin_weekly_3',
 'feature_cos_weekly_3',
 'feature_periods',
 'feature_periods_expanding',
 'feature_periods_sqrt',
 'feature_sales_units_lag_4',
 'feature_sales_units_lag_13',
 'feature_sales_units_lag_26',
 'feature_sales_units_lag_52',
 'feature_inventory_units_lag_4',


In [20]:
# How many feature columns were selected
print(f"Number of features: {len(features)}")

Number of features: 46


## Configuration

In [21]:
# Model and training setup
model = 'LGBM'
target_col = 'inventory_units'
group_cols = ['reporterhq_id', 'product_number']
training_group = 'training_group'
default_params = None
use_weights = True

# Hyperparameter tuning settings
tune_hyperparameters = False
search_method = 'halving'
param_distributions = None
scoring_metric = 'neg_root_mean_squared_error'
n_iter = 50

# Feature selection settings
best_features = False
n_best_features = 25

# Outlier handling settings
remove_outliers = False
outlier_column = 'inventory_units'
lower_quantile = 0.025
upper_quantile = 0.975
ts_decomposition = False

# Guardrail settings; the baseline column name is derived from the target
# and the baseline window size defined in the baseline-parameters cell
baseline_col = f'baseline_{target_col}_ma_{bs_window_size}'
use_guardrail = True
guardrail_limit = 2.5

# Parallel execution settings (note: this replaces the num_cpus detected in the setup section)
use_parallel = False
num_cpus = 5

## Run Predictor

In [None]:
# Initialize the Forecaster class with df_backtesting (features + baselines + training_group)
forecaster = Forecaster(df_backtesting)

: 

In [None]:
%%time
# Run the backtesting with the settings from the configuration cell.
# All arguments are passed positionally, so their order must match the
# signature of Forecaster.run_backtesting (not visible in this notebook).
df_with_preds = forecaster.run_backtesting(
    group_cols,
    features,
    default_params,
    training_group,
    target_col,
    model,
    tune_hyperparameters,
    search_method,
    param_distributions,
    scoring_metric,
    n_iter,
    best_features,
    n_best_features,
    remove_outliers,
    outlier_column,
    lower_quantile,
    upper_quantile,
    ts_decomposition,
    baseline_col,
    use_guardrail,
    guardrail_limit,
    use_weights,
    use_parallel,
    num_cpus
)

## Hyperparameters

In [None]:
# Show the hyperparameters behind the fitted models
if tune_hyperparameters:
    # Tuned run: fetch the best hyperparameters found for each cutoff
    best_hyperparams = forecaster.get_best_hyperparams()
    print("Best Hyperparameters for each cutoff:")

    # Convert to DataFrame and display
    params_df = pd.DataFrame(best_hyperparams).T
    display(params_df)

else:
    # No tuning: collect the parameters of every fitted model.
    # The loop variables are deliberately NOT called `training_group` / `model`:
    # those names hold notebook-level configuration that is passed to
    # run_backtesting, and reusing them here would silently overwrite it.
    all_params = []

    for (cutoff_key, group_key), fitted_model in forecaster.models.items():
        params = fitted_model.get_params()
        params.update({
            'cutoff': cutoff_key,
            'training_group': group_key
        })
        all_params.append(params)

    # One row per (cutoff, training_group) pair
    params_df = pd.DataFrame(all_params)
    params_df = params_df.set_index(['cutoff', 'training_group'])
    display(params_df)

## Guardrail

In [None]:
# Share of (reporter, product, cutoff) groups with at least one guardrail == True row
if use_guardrail:
    # Dedicated name on purpose: the notebook-level `group_cols` (without 'cutoff')
    # is an argument of the backtesting call and must not be overwritten here
    guardrail_group_cols = ['reporterhq_id', 'product_number', 'cutoff']

    # For each group, flag whether any of its rows has guardrail == True
    guardrail_groups = (
        df_with_preds.groupby(guardrail_group_cols)['guardrail'].any().reset_index()
    )

    # Total number of groups
    total_groups = guardrail_groups.shape[0]

    # Number of groups where the guardrail flag is True
    true_guardrail_groups_count = guardrail_groups['guardrail'].sum()

    # Percentage of groups with guardrail == True
    percentage_guardrail_true = (true_guardrail_groups_count / total_groups) * 100

    # Show
    print(f"Percentage of groups where guardrail is True: {percentage_guardrail_true:.2f}%")

# Evaluate results

## Feature importance

In [None]:
# Plot the average feature importance across all cutoffs (top 15 features, with std shown)
forecaster.plot_feature_importance(top_n=15, show_std=True, color_scale=True)

# Guardrail

In [None]:
# Assets
# NOTE(review): this import sits mid-notebook; consider moving it next to the
# other `utils` imports in the "# Assets" section so dependencies are visible up front
from utils.guardrail import Guardrail

# Instantiate the guardrail helper
guardrail = Guardrail()

In [None]:
# Build the smart ensemble from the model prediction and the MA baseline.
# The result replaces df_with_preds.
df_with_preds = guardrail.create_smart_ensemble(
    df=df_with_preds,
    group_cols=['reporterhq_id', 'product_number', 'cutoff'],
    pred_col='prediction',
    baseline_col=f'baseline_{target_col}_ma_{bs_window_size}',
    intermittence_threshold=70.0,
    cov_threshold=1.2
)

## Run Evaluator

In [None]:
# Columns holding the actual values and the baseline forecast
actuals_col = 'inventory_units'
baseline_col = 'baseline_inventory_units_ma_13'

# Base prediction columns
preds_col = ['model_prediction', 'prediction']

# Add 'prediction_ensemble' when it is present in the predictions frame.
# The check must look at df_with_preds (the frame handed to the Evaluator);
# df_input is the raw input and never contains prediction columns.
if 'prediction_ensemble' in df_with_preds.columns:
    preds_col.append('prediction_ensemble')

# Initialize the Evaluator class
evaluator = Evaluator(df_with_preds, actuals_col, baseline_col, preds_col)

# Compute the metric table
metric_table = evaluator.create_metric_table()

# Show
metric_table

## Lag metrics

In [None]:
# Metric to compute and the column to break it down by
metric_name, group_col = 'RMSE', 'horizon'

# Horizons to include: 1 through 13
group_filter = range(1, 14)

# Compute the metric for each selected horizon
lag_metrics = evaluator.calculate_grouped_metric(metric_name, group_col, group_filter)

# Show
lag_metrics

# Plot predictions


In [None]:
# Latest cutoff present in the predictions
max_cutoff = df_with_preds['cutoff'].max()

# Keep only the rows of that cutoff and renumber them from zero
is_latest_cutoff = df_with_preds['cutoff'].eq(max_cutoff)
df_filtered = df_with_preds.loc[is_latest_cutoff].reset_index(drop=True)

# Preview
df_filtered.head()

In [None]:
# Columns compared in the plots, and how many groups to plot
baseline_col, target_col = 'baseline_inventory_units_ma_13', 'inventory_units'
top_n_plots = 6

## Plot by Reporter

In [None]:
# Plot by reporter: target vs baseline vs prediction for the top reporters.
# process_and_plot is not defined in this notebook — presumably it comes from
# the `from utils.auxiliar import *` star import; verify.
process_and_plot(
    df_filtered,
    group_col='reporterhq_id',
    baseline_col=baseline_col,
    target_col=target_col,
    top_n=top_n_plots,
    title='Top Reporter Target vs Baseline vs Prediction'
)

## Plot by Product

In [None]:
# Plot by product: target vs baseline vs prediction for the top products.
# process_and_plot is not defined in this notebook — presumably it comes from
# the `from utils.auxiliar import *` star import; verify.
process_and_plot(
    df_filtered,
    group_col='product_number',
    baseline_col=baseline_col,
    target_col=target_col,
    top_n=top_n_plots,
    title='Top Products Target vs Baseline vs Prediction'
)

# Save output

In [None]:
# Export every prediction row without the engineered 'feature_' columns.
# A separate name is used so that df_filtered (the latest-cutoff frame used by
# the plotting cells) is not overwritten with a different dataset.
df_output = df_with_preds.loc[:, ~df_with_preds.columns.str.startswith('feature_')]

# Make sure the output folder exists; to_csv does not create missing directories
os.makedirs('outputs', exist_ok=True)

# Destination file
file_name = 'outputs/predictions.csv'

# Write the dataframe to the CSV file
df_output.to_csv(file_name, index=False)

# Clean

In [None]:
# Run a full garbage collection; the cell output is the number of unreachable objects found
gc.collect()