# Setup

In [1]:
# General libraries
import pandas as pd
import numpy as np

# Utilities
from pathlib import Path
import psutil
import gc
import os

In [2]:
# Warnings
import warnings

# Suppress general warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Suppress the specific PerformanceWarning from pandas
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# Suppress SettingWithCopyWarning
pd.options.mode.chained_assignment = None

# Directory

In [3]:
# Work from the project root: step up one level when started inside a known sub-folder
current_path = Path.cwd()
launched_from_subfolder = current_path.name in ('notebooks', 'src')

if not launched_from_subfolder:
    print(f"Already in project directory: {os.getcwd()}")
else:
    os.chdir('..')
    print(f"Moved up from {current_path.name} to: {os.getcwd()}")

Moved up from notebooks to: /Users/ignasipascual/Documents/GitHub/Forecaster


# Assets

## Import assets

In [4]:
# Project helper classes used by the data-preparation, feature-engineering and baseline sections
from utils.data_preparation import DataPreparation
from utils.feature_engineering import FeatureEngineering
from utils.create_baselines import CreateBaselines
# NOTE(review): wildcard import — the cells shown in this notebook only reference names that
# are imported explicitly, so it is unclear what this line provides; consider importing the
# required names explicitly.
from utils.auxiliar import *

# Load data

In [5]:
# Remote location of the training CSV (raw file served by GitHub)
url = 'https://raw.githubusercontent.com/jordisc97/Kaggle_HackUPC/main/train_v_2_kaggle_23.csv'

# Read the remote file straight into a DataFrame
df_input = pd.read_csv(filepath_or_buffer=url)

# Preview the first rows
df_input.head()

Unnamed: 0,id,date,year_week,product_number,reporterhq_id,prod_category,specs,display_size,segment,sales_units,inventory_units
0,202224-2961,2022-06-18,202224,2961,15,Doraemon,186631,15.6,Gaming,52.0,88.0
1,202225-2961,2022-06-25,202225,2961,15,Doraemon,186631,15.6,Gaming,6.0,88.0
2,202226-2961,2022-07-02,202226,2961,15,Doraemon,186631,15.6,Gaming,60.0,138.0
3,202227-2961,2022-07-09,202227,2961,15,Doraemon,186631,15.6,Gaming,158.0,65.0
4,202228-2961,2022-07-16,202228,2961,15,Doraemon,186631,15.6,Gaming,23.0,30.0


## Formatting

In [6]:
# Column groups handled in this cell
character_cols = ['reporterhq_id', 'product_number', 'prod_category', 'specs', 'display_size', 'segment']
signal_cols = ['sales_units', 'inventory_units']

# Lower-case every header so the names above match
df_input = df_input.rename(columns=str.lower)

# Re-type in one pass: parse the dates, cast the descriptive columns to string and
# coerce the signal columns to numeric (values that cannot be parsed become NaN)
df_input = df_input.assign(
    date=pd.to_datetime(df_input['date']),
    **{name: df_input[name].astype(str) for name in character_cols},
    **{name: pd.to_numeric(df_input[name], errors='coerce') for name in signal_cols},
)

# Keep the descriptive columns, the date and the signals (in that order), then
# order the rows by reporter, product and date
df_input = (
    df_input
    .loc[:, character_cols + ['date'] + signal_cols]
    .sort_values(by=['reporterhq_id', 'product_number', 'date'])
)

# Preview the first rows
df_input.head()

Unnamed: 0,reporterhq_id,product_number,prod_category,specs,display_size,segment,date,sales_units,inventory_units
913,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0
914,12,10857,Doraemon,108304,15.6,Gaming,2021-02-06,8.0,48.0
915,12,10857,Doraemon,108304,15.6,Gaming,2021-02-13,39.0,72.0
916,12,10857,Doraemon,108304,15.6,Gaming,2021-02-20,0.0,138.0
917,12,10857,Doraemon,108304,15.6,Gaming,2021-02-27,69.0,87.0


## Statistics

In [7]:
# Overall dimensions of the formatted input
row_count, column_count = df_input.shape
print(f"Number of columns in df_input: {column_count}")
print(f"Number of rows in df_input: {row_count}")

# Cardinality of the two identifier columns
distinct_counts = df_input[['reporterhq_id', 'product_number']].nunique()
print(f"Number of distinct reporterhq_id: {distinct_counts['reporterhq_id']}")
print(f"Number of distinct product_number: {distinct_counts['product_number']}")

Number of columns in df_input: 9
Number of rows in df_input: 25139
Number of distinct reporterhq_id: 20
Number of distinct product_number: 233


In [8]:
# Share of missing cells across the whole frame (rows x columns)
total_rows = len(df_input)
cell_count = total_rows * df_input.shape[1]
total_na = df_input.isna().sum().sum()
percent_na_total = (total_na / cell_count) * 100
print(f"Percentage of NA values: {percent_na_total:.2f}%")

Percentage of NA values: 0.44%


# Data Preparation

In [9]:
# Parameters for data preparation
# Key columns passed to the data-preparation helpers below
group_columns = ['reporterhq_id', 'product_number']
date_column = 'date'
# NOTE(review): `target` is reassigned in the feature-engineering parameters before any cell
# shown here reads it, so this value appears unused — confirm.
target = 'inventory_units'
# Numeric columns passed to smoothing()
signal_columns = ['sales_units', 'inventory_units']
# Window size passed to smoothing()
dp_window_size = 13
# Horizon passed to add_horizon_last_cutoff()
horizon = 13
# Frequency passed to add_horizon_last_cutoff().
# NOTE(review): `freq` is later reassigned to 'W' in the feature-engineering parameters;
# re-running the horizon cell after that point would use 'W' instead of 'W-SAT'.
freq = 'W-SAT'

In [10]:
# Instantiate the data-preparation helper; its methods are called in the cells below
data_preparation = DataPreparation()

## Smoothing

In [11]:
# Run the smoothing step, passing the group keys, the date column, the signal columns and
# the window size. The preview shows two added columns: filled_sales_units and
# filled_inventory_units.
df_smoothed = data_preparation.smoothing(df_input, group_columns, date_column, signal_columns, dp_window_size)

# Preview the first rows
df_smoothed.head()

Unnamed: 0,reporterhq_id,product_number,prod_category,specs,display_size,segment,date,sales_units,inventory_units,filled_sales_units,filled_inventory_units
913,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,52.0
914,12,10857,Doraemon,108304,15.6,Gaming,2021-02-06,8.0,48.0,8.0,48.0
915,12,10857,Doraemon,108304,15.6,Gaming,2021-02-13,39.0,72.0,39.0,72.0
916,12,10857,Doraemon,108304,15.6,Gaming,2021-02-20,0.0,138.0,0.0,138.0
917,12,10857,Doraemon,108304,15.6,Gaming,2021-02-27,69.0,87.0,69.0,87.0


## Backtesting

In [12]:
# Number of cutoffs passed to get_first_dates_last_n_months()
n_cutoffs = 4

# Candidate cutoffs from the helper, plus the most recent date found in the data
cutoff_list = data_preparation.get_first_dates_last_n_months(df_smoothed, date_column, n_cutoffs)
latest_date = df_smoothed[date_column].max()
cutoff_list.append(latest_date)

# De-duplicate and order chronologically through a one-column frame
cutoff_df = (
    pd.DataFrame({date_column: cutoff_list})
    .drop_duplicates()
    .sort_values(by=date_column, ascending=True)
    .reset_index(drop=True)
)

# Back to a plain list for the backtesting helper
cutoff_list_sorted = cutoff_df[date_column].tolist()

# Display the distinct cutoffs in ascending order
print(cutoff_list_sorted)

[Timestamp('2023-02-04 00:00:00'), Timestamp('2023-03-04 00:00:00'), Timestamp('2023-04-01 00:00:00'), Timestamp('2023-05-06 00:00:00')]


In [13]:
# Build the backtesting frame from the smoothed data and the sorted cutoff list.
# The preview shows two added columns, `cutoff` and `sample`, with the same input row
# repeated once per cutoff.
df_backtesting = data_preparation.create_backtesting_df(df_smoothed, date_column, cutoff_list_sorted)

# Preview the first rows
df_backtesting.head()

Unnamed: 0,reporterhq_id,product_number,prod_category,specs,display_size,segment,date,sales_units,inventory_units,filled_sales_units,filled_inventory_units,cutoff,sample
0,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,52.0,2023-02-04,train
1,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,52.0,2023-03-04,train
2,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,52.0,2023-04-01,train
3,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,52.0,2023-05-06,train
4,12,10857,Doraemon,108304,15.6,Gaming,2021-02-06,8.0,48.0,8.0,48.0,2023-02-04,train


In [14]:
# Dimensions of the backtesting frame
row_count, column_count = df_backtesting.shape
print(f"Number of columns in df_backtesting: {column_count}")
print(f"Number of rows in df_backtesting: {row_count}")

Number of columns in df_backtesting: 13
Number of rows in df_backtesting: 100556


## Fill horizon

In [15]:
# Extend the backtesting frame with the forecast horizon, passing the group keys, date column,
# horizon length and frequency ('W-SAT' at this point in the notebook).
# The row count printed in the next cell is larger than before this call; the column set is unchanged.
df_backtesting_with_horizon = data_preparation.add_horizon_last_cutoff(df_backtesting, group_columns, date_column, horizon, freq)

# Preview the first rows
df_backtesting_with_horizon.head()

Unnamed: 0,reporterhq_id,product_number,prod_category,specs,display_size,segment,date,sales_units,inventory_units,filled_sales_units,filled_inventory_units,cutoff,sample
0,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,52.0,2023-02-04,train
3,12,10857,Doraemon,108304,15.6,Gaming,2021-02-06,8.0,48.0,8.0,48.0,2023-02-04,train
6,12,10857,Doraemon,108304,15.6,Gaming,2021-02-13,39.0,72.0,39.0,72.0,2023-02-04,train
9,12,10857,Doraemon,108304,15.6,Gaming,2021-02-20,0.0,138.0,0.0,138.0,2023-02-04,train
12,12,10857,Doraemon,108304,15.6,Gaming,2021-02-27,69.0,87.0,69.0,87.0,2023-02-04,train


In [16]:
# Dimensions after the horizon rows were added
row_count, column_count = df_backtesting_with_horizon.shape
print(f"Number of columns in df_backtesting_with_horizon: {column_count}")
print(f"Number of rows in df_backtesting_with_horizon: {row_count}")

Number of columns in df_backtesting_with_horizon: 13
Number of rows in df_backtesting_with_horizon: 107134


In [17]:
# Snapshot of system memory as reported by psutil
memory = psutil.virtual_memory()

# Express each figure in GB (1024 ** 3 bytes)
bytes_per_gb = 1024 ** 3
total_memory = memory.total / bytes_per_gb
available_memory = memory.available / bytes_per_gb
used_memory = memory.used / bytes_per_gb

# Report the three figures with two decimals
print(
    f"Total Memory: {total_memory:.2f} GB\n"
    f"Available Memory: {available_memory:.2f} GB\n"
    f"Used Memory: {used_memory:.2f} GB"
)

Total Memory: 16.00 GB
Available Memory: 5.36 GB
Used Memory: 5.82 GB


# Feature Engineering

In [18]:
# Parameters for feature engineering
# Group keys extended with the backtesting cutoff column
group_columns_fe = group_columns + ['cutoff']
date_column = 'date'
# Replaces the 'inventory_units' value assigned in the data-preparation parameters
target = 'filled_inventory_units'
# 'filled_' prefixed onto each name currently held in signal_columns
signal_columns_fe = ['filled_' + column for column in signal_columns]
lower_level_group = 'product_number'
# Window sizes passed to create_ma_features() and create_moving_stats()
fe_window_size = [4, 13]
# Lag sizes and fill flag passed to create_lag_features()
lags = [13, 52]
fill_lags = True
# NOTE(review): this overwrites the 'W-SAT' value used by add_horizon_last_cutoff(); re-running
# that earlier cell after this one would use 'W' instead — consider a separate name.
freq = 'W'

# Number of clusters
n_groups = 15

In [19]:
# Instantiate the feature-engineering helper; its methods are called in the cells below
feature_engineering = FeatureEngineering()

## Create encodings

In [20]:
# Categorical candidates are the object-dtype columns of the frame
object_columns = df_backtesting_with_horizon.select_dtypes(include='object').columns

# Leave out the 'sample' column, which is not encoded
categorical_columns = [name for name in object_columns if name != 'sample']

# Display the resulting list
categorical_columns

['reporterhq_id',
 'product_number',
 'prod_category',
 'specs',
 'display_size',
 'segment']

In [21]:
# Encode the categorical columns listed above.
# The preview shows one added integer column per input column, named feature_<column>.
df_backtesting_with_categories = feature_engineering.create_encoded_features(df_backtesting_with_horizon, categorical_columns)

# Preview the first rows
df_backtesting_with_categories.head()

Unnamed: 0,reporterhq_id,product_number,prod_category,specs,display_size,segment,date,sales_units,inventory_units,filled_sales_units,filled_inventory_units,cutoff,sample,feature_reporterhq_id,feature_product_number,feature_prod_category,feature_specs,feature_display_size,feature_segment
0,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,52.0,2023-02-04,train,0,7,4,9,6,1
3,12,10857,Doraemon,108304,15.6,Gaming,2021-02-06,8.0,48.0,8.0,48.0,2023-02-04,train,0,7,4,9,6,1
6,12,10857,Doraemon,108304,15.6,Gaming,2021-02-13,39.0,72.0,39.0,72.0,2023-02-04,train,0,7,4,9,6,1
9,12,10857,Doraemon,108304,15.6,Gaming,2021-02-20,0.0,138.0,0.0,138.0,2023-02-04,train,0,7,4,9,6,1
12,12,10857,Doraemon,108304,15.6,Gaming,2021-02-27,69.0,87.0,69.0,87.0,2023-02-04,train,0,7,4,9,6,1


## Period features

In [22]:
# Add period features per group (group keys include the cutoff).
# The preview shows three added columns: feature_periods, feature_periods_expanding
# and feature_periods_sqrt.
df_backtesting_with_periods = feature_engineering.create_periods_feature(df_backtesting_with_categories, group_columns_fe, date_column, target)

# Preview the first rows
df_backtesting_with_periods.head()

Unnamed: 0,reporterhq_id,product_number,prod_category,specs,display_size,segment,date,sales_units,inventory_units,filled_sales_units,...,sample,feature_reporterhq_id,feature_product_number,feature_prod_category,feature_specs,feature_display_size,feature_segment,feature_periods,feature_periods_expanding,feature_periods_sqrt
0,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,...,train,0,7,4,9,6,1,1.0,1.0,1.0
1,12,10857,Doraemon,108304,15.6,Gaming,2021-02-06,8.0,48.0,8.0,...,train,0,7,4,9,6,1,2.0,2.143547,1.414214
2,12,10857,Doraemon,108304,15.6,Gaming,2021-02-13,39.0,72.0,39.0,...,train,0,7,4,9,6,1,3.0,3.34837,1.732051
3,12,10857,Doraemon,108304,15.6,Gaming,2021-02-20,0.0,138.0,0.0,...,train,0,7,4,9,6,1,4.0,4.594793,2.0
4,12,10857,Doraemon,108304,15.6,Gaming,2021-02-27,69.0,87.0,69.0,...,train,0,7,4,9,6,1,5.0,5.873095,2.236068


## Calendar features

In [23]:
# Add calendar features derived from the date column; `freq` is 'W' at this point
# (set in the feature-engineering parameters). The preview shows added columns such as
# feature_cos_weekly_3 and the feature_*_until_end_of_* counters.
df_with_dates = feature_engineering.create_date_features(df_backtesting_with_periods, date_column, freq)

# Preview the first rows
df_with_dates.head()

Unnamed: 0,reporterhq_id,product_number,prod_category,specs,display_size,segment,date,sales_units,inventory_units,filled_sales_units,...,feature_cos_weekly_3,feature_days_until_end_of_month,feature_days_until_end_of_week,feature_days_until_end_of_quarter,feature_days_until_end_of_year,feature_weeks_until_end_of_month,feature_weeks_until_end_of_quarter,feature_weeks_until_end_of_year,feature_months_until_end_of_quarter,feature_months_until_end_of_year
0,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,...,0.62349,1,1,60,335,0.142857,8.571429,47.857143,2.0,11.0
1,12,10857,Doraemon,108304,15.6,Gaming,2021-02-06,8.0,48.0,8.0,...,0.62349,22,1,53,328,3.142857,7.571429,46.857143,1.0,10.0
2,12,10857,Doraemon,108304,15.6,Gaming,2021-02-13,39.0,72.0,39.0,...,0.62349,15,1,46,321,2.142857,6.571429,45.857143,1.0,10.0
3,12,10857,Doraemon,108304,15.6,Gaming,2021-02-20,0.0,138.0,0.0,...,0.62349,8,1,39,314,1.142857,5.571429,44.857143,1.0,10.0
4,12,10857,Doraemon,108304,15.6,Gaming,2021-02-27,69.0,87.0,69.0,...,0.62349,1,1,32,307,0.142857,4.571429,43.857143,1.0,10.0


## Moving average

In [24]:
# Add moving-average features for each filled signal and each window in fe_window_size.
# The preview shows the added columns named <signal>_ma_<window>.
df_with_ma = feature_engineering.create_ma_features(df_with_dates, group_columns_fe, signal_columns_fe, fe_window_size)

# Preview the first rows
df_with_ma.head()

Unnamed: 0,reporterhq_id,product_number,prod_category,specs,display_size,segment,date,sales_units,inventory_units,filled_sales_units,...,feature_days_until_end_of_year,feature_weeks_until_end_of_month,feature_weeks_until_end_of_quarter,feature_weeks_until_end_of_year,feature_months_until_end_of_quarter,feature_months_until_end_of_year,filled_sales_units_ma_4,filled_sales_units_ma_13,filled_inventory_units_ma_4,filled_inventory_units_ma_13
0,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,...,335,0.142857,8.571429,47.857143,2.0,11.0,0.0,0.0,52.0,52.0
1,12,10857,Doraemon,108304,15.6,Gaming,2021-02-06,8.0,48.0,8.0,...,328,3.142857,7.571429,46.857143,1.0,10.0,4.0,4.0,50.0,50.0
2,12,10857,Doraemon,108304,15.6,Gaming,2021-02-13,39.0,72.0,39.0,...,321,2.142857,6.571429,45.857143,1.0,10.0,15.666667,15.666667,57.333333,57.333333
3,12,10857,Doraemon,108304,15.6,Gaming,2021-02-20,0.0,138.0,0.0,...,314,1.142857,5.571429,44.857143,1.0,10.0,11.75,11.75,77.5,77.5
4,12,10857,Doraemon,108304,15.6,Gaming,2021-02-27,69.0,87.0,69.0,...,307,0.142857,4.571429,43.857143,1.0,10.0,29.0,23.2,86.25,79.4


## Moving stats

In [25]:
# Add moving min/max statistics for each filled signal and each window in fe_window_size.
# The preview shows the added columns named <signal>_min_<window> and <signal>_max_<window>.
df_with_min_max = feature_engineering.create_moving_stats(df_with_ma, group_columns_fe, signal_columns_fe, fe_window_size)

# Preview the first rows
df_with_min_max.head()

Unnamed: 0,reporterhq_id,product_number,prod_category,specs,display_size,segment,date,sales_units,inventory_units,filled_sales_units,...,filled_inventory_units_ma_4,filled_inventory_units_ma_13,filled_sales_units_min_4,filled_sales_units_max_4,filled_sales_units_min_13,filled_sales_units_max_13,filled_inventory_units_min_4,filled_inventory_units_max_4,filled_inventory_units_min_13,filled_inventory_units_max_13
0,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,...,52.0,52.0,0.0,0.0,0.0,0.0,52.0,52.0,52.0,52.0
1,12,10857,Doraemon,108304,15.6,Gaming,2021-02-06,8.0,48.0,8.0,...,50.0,50.0,0.0,8.0,0.0,8.0,48.0,52.0,48.0,52.0
2,12,10857,Doraemon,108304,15.6,Gaming,2021-02-13,39.0,72.0,39.0,...,57.333333,57.333333,0.0,39.0,0.0,39.0,48.0,72.0,48.0,72.0
3,12,10857,Doraemon,108304,15.6,Gaming,2021-02-20,0.0,138.0,0.0,...,77.5,77.5,0.0,39.0,0.0,39.0,48.0,138.0,48.0,138.0
4,12,10857,Doraemon,108304,15.6,Gaming,2021-02-27,69.0,87.0,69.0,...,86.25,79.4,0.0,69.0,0.0,69.0,48.0,138.0,48.0,138.0


## Generate lags

In [26]:
# Every column whose name begins with 'filled_' is lagged
is_filled_column = df_with_min_max.columns.str.startswith('filled_')
lag_columns = df_with_min_max.columns[is_filled_column].tolist()

# Build the lag features for each selected column and each lag size
df_with_lags = feature_engineering.create_lag_features(
    df_with_min_max,
    group_columns_fe,
    date_column,
    lag_columns,
    lags,
    fill_lags,
)

# Preview the first rows
df_with_lags.head()

Unnamed: 0,reporterhq_id,product_number,prod_category,specs,display_size,segment,date,sales_units,inventory_units,filled_sales_units,...,feature_filled_sales_units_max_13_lag_13,feature_filled_sales_units_max_13_lag_52,feature_filled_inventory_units_min_4_lag_13,feature_filled_inventory_units_min_4_lag_52,feature_filled_inventory_units_max_4_lag_13,feature_filled_inventory_units_max_4_lag_52,feature_filled_inventory_units_min_13_lag_13,feature_filled_inventory_units_min_13_lag_52,feature_filled_inventory_units_max_13_lag_13,feature_filled_inventory_units_max_13_lag_52
0,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,...,,,,,,,,,,
1,12,10857,Doraemon,108304,15.6,Gaming,2021-02-06,8.0,48.0,8.0,...,,,,,,,,,,
2,12,10857,Doraemon,108304,15.6,Gaming,2021-02-13,39.0,72.0,39.0,...,,,,,,,,,,
3,12,10857,Doraemon,108304,15.6,Gaming,2021-02-20,0.0,138.0,0.0,...,,,,,,,,,,
4,12,10857,Doraemon,108304,15.6,Gaming,2021-02-27,69.0,87.0,69.0,...,,,,,,,,,,


## Get Cov ratio

In [27]:
# Add the "cov" feature for the target per group (group keys include the cutoff).
# The preview shows one added column, feature_filled_inventory_units_cov, holding the
# same value on every displayed row of the group.
df_with_cov = feature_engineering.create_cov(df_with_lags, group_columns_fe, target)

# Preview the first rows
df_with_cov.head()

Unnamed: 0,reporterhq_id,product_number,prod_category,specs,display_size,segment,date,sales_units,inventory_units,filled_sales_units,...,feature_filled_sales_units_max_13_lag_52,feature_filled_inventory_units_min_4_lag_13,feature_filled_inventory_units_min_4_lag_52,feature_filled_inventory_units_max_4_lag_13,feature_filled_inventory_units_max_4_lag_52,feature_filled_inventory_units_min_13_lag_13,feature_filled_inventory_units_min_13_lag_52,feature_filled_inventory_units_max_13_lag_13,feature_filled_inventory_units_max_13_lag_52,feature_filled_inventory_units_cov
0,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,...,,,,,,,,,,0.627141
1,12,10857,Doraemon,108304,15.6,Gaming,2021-02-06,8.0,48.0,8.0,...,,,,,,,,,,0.627141
2,12,10857,Doraemon,108304,15.6,Gaming,2021-02-13,39.0,72.0,39.0,...,,,,,,,,,,0.627141
3,12,10857,Doraemon,108304,15.6,Gaming,2021-02-20,0.0,138.0,0.0,...,,,,,,,,,,0.627141
4,12,10857,Doraemon,108304,15.6,Gaming,2021-02-27,69.0,87.0,69.0,...,,,,,,,,,,0.627141


## ID combinations

In [28]:
# Add distinct-combination counts for the lower-level group ('product_number').
# The preview shows two added columns: feature_distinct_product_number_reporterhq_id and
# feature_distinct_product_number_cutoff.
df_with_combinations = feature_engineering.create_distinct_combinations(df_with_cov, group_columns_fe, lower_level_group)

# Preview the first rows
df_with_combinations.head()

Unnamed: 0,reporterhq_id,product_number,prod_category,specs,display_size,segment,date,sales_units,inventory_units,filled_sales_units,...,feature_filled_inventory_units_min_4_lag_52,feature_filled_inventory_units_max_4_lag_13,feature_filled_inventory_units_max_4_lag_52,feature_filled_inventory_units_min_13_lag_13,feature_filled_inventory_units_min_13_lag_52,feature_filled_inventory_units_max_13_lag_13,feature_filled_inventory_units_max_13_lag_52,feature_filled_inventory_units_cov,feature_distinct_product_number_reporterhq_id,feature_distinct_product_number_cutoff
0,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,...,,,,,,,,0.627141,2,4
1,12,10857,Doraemon,108304,15.6,Gaming,2021-02-06,8.0,48.0,8.0,...,,,,,,,,0.627141,2,4
2,12,10857,Doraemon,108304,15.6,Gaming,2021-02-13,39.0,72.0,39.0,...,,,,,,,,0.627141,2,4
3,12,10857,Doraemon,108304,15.6,Gaming,2021-02-20,0.0,138.0,0.0,...,,,,,,,,0.627141,2,4
4,12,10857,Doraemon,108304,15.6,Gaming,2021-02-27,69.0,87.0,69.0,...,,,,,,,,0.627141,2,4


## Add clusters

In [29]:
# Add quantile-based cluster features for each filled signal, using n_groups clusters.
# The preview shows the added columns named feature_<signal>_cluster.
# NOTE(review): this call passes `group_columns` (without 'cutoff') whereas the preceding
# feature steps pass `group_columns_fe` — confirm this is intentional.
df_with_clusters = feature_engineering.create_quantile_clusters(df_with_combinations, group_columns, signal_columns_fe, n_groups)

# Preview the first rows
df_with_clusters.head()

Unnamed: 0,reporterhq_id,product_number,prod_category,specs,display_size,segment,date,sales_units,inventory_units,filled_sales_units,...,feature_filled_inventory_units_max_4_lag_52,feature_filled_inventory_units_min_13_lag_13,feature_filled_inventory_units_min_13_lag_52,feature_filled_inventory_units_max_13_lag_13,feature_filled_inventory_units_max_13_lag_52,feature_filled_inventory_units_cov,feature_distinct_product_number_reporterhq_id,feature_distinct_product_number_cutoff,feature_filled_sales_units_cluster,feature_filled_inventory_units_cluster
0,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,...,,,,,,0.627141,2,4,5,10
1,12,10857,Doraemon,108304,15.6,Gaming,2021-02-06,8.0,48.0,8.0,...,,,,,,0.627141,2,4,5,10
2,12,10857,Doraemon,108304,15.6,Gaming,2021-02-13,39.0,72.0,39.0,...,,,,,,0.627141,2,4,5,10
3,12,10857,Doraemon,108304,15.6,Gaming,2021-02-20,0.0,138.0,0.0,...,,,,,,0.627141,2,4,5,10
4,12,10857,Doraemon,108304,15.6,Gaming,2021-02-27,69.0,87.0,69.0,...,,,,,,0.627141,2,4,5,10


In [30]:
# Add history-based cluster features; the preview shows the added columns named
# feature_<signal>_history_cluster.
# NOTE(review): the result is assigned back to `df_with_clusters`, so re-running this cell
# feeds it a frame that already contains these columns — confirm the helper tolerates that.
df_with_clusters = feature_engineering.create_history_clusters(df_with_clusters, group_columns, signal_columns_fe, n_groups)

# Preview the first rows
df_with_clusters.head()

Unnamed: 0,reporterhq_id,product_number,prod_category,specs,display_size,segment,date,sales_units,inventory_units,filled_sales_units,...,feature_filled_inventory_units_min_13_lag_52,feature_filled_inventory_units_max_13_lag_13,feature_filled_inventory_units_max_13_lag_52,feature_filled_inventory_units_cov,feature_distinct_product_number_reporterhq_id,feature_distinct_product_number_cutoff,feature_filled_sales_units_cluster,feature_filled_inventory_units_cluster,feature_filled_sales_units_history_cluster,feature_filled_inventory_units_history_cluster
0,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,...,,,,0.627141,2,4,5,10,4,12
1,12,10857,Doraemon,108304,15.6,Gaming,2021-02-06,8.0,48.0,8.0,...,,,,0.627141,2,4,5,10,4,12
2,12,10857,Doraemon,108304,15.6,Gaming,2021-02-13,39.0,72.0,39.0,...,,,,0.627141,2,4,5,10,4,12
3,12,10857,Doraemon,108304,15.6,Gaming,2021-02-20,0.0,138.0,0.0,...,,,,0.627141,2,4,5,10,4,12
4,12,10857,Doraemon,108304,15.6,Gaming,2021-02-27,69.0,87.0,69.0,...,,,,0.627141,2,4,5,10,4,12


In [31]:
# Add intermittence-based cluster features; the preview shows the added columns named
# feature_intermittence_<signal>_cluster.
# NOTE(review): the result is assigned back to `df_with_clusters`, so re-running this cell
# feeds it a frame that already contains these columns — confirm the helper tolerates that.
df_with_clusters = feature_engineering.create_intermittence_clusters(df_with_clusters, group_columns, signal_columns_fe, n_groups)

# Preview the first rows
df_with_clusters.head()

Unnamed: 0,reporterhq_id,product_number,prod_category,specs,display_size,segment,date,sales_units,inventory_units,filled_sales_units,...,feature_filled_inventory_units_max_13_lag_52,feature_filled_inventory_units_cov,feature_distinct_product_number_reporterhq_id,feature_distinct_product_number_cutoff,feature_filled_sales_units_cluster,feature_filled_inventory_units_cluster,feature_filled_sales_units_history_cluster,feature_filled_inventory_units_history_cluster,feature_intermittence_filled_sales_units_cluster,feature_intermittence_filled_inventory_units_cluster
0,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,...,,0.627141,2,4,5,10,4,12,9,1
1,12,10857,Doraemon,108304,15.6,Gaming,2021-02-06,8.0,48.0,8.0,...,,0.627141,2,4,5,10,4,12,9,1
2,12,10857,Doraemon,108304,15.6,Gaming,2021-02-13,39.0,72.0,39.0,...,,0.627141,2,4,5,10,4,12,9,1
3,12,10857,Doraemon,108304,15.6,Gaming,2021-02-20,0.0,138.0,0.0,...,,0.627141,2,4,5,10,4,12,9,1
4,12,10857,Doraemon,108304,15.6,Gaming,2021-02-27,69.0,87.0,69.0,...,,0.627141,2,4,5,10,4,12,9,1


## Train weights

In [32]:
# Add per-row training weights; the preview shows one added column, train_weight, whose
# value increases with the date on the displayed rows.
# NOTE(review): grouped by `group_columns` (without 'cutoff') — confirm this is intentional.
df_with_train_weights = feature_engineering.create_train_weights(df_with_clusters, group_columns)

# Preview the first rows
df_with_train_weights.head()

Unnamed: 0,reporterhq_id,product_number,prod_category,specs,display_size,segment,date,sales_units,inventory_units,filled_sales_units,...,feature_filled_inventory_units_cov,feature_distinct_product_number_reporterhq_id,feature_distinct_product_number_cutoff,feature_filled_sales_units_cluster,feature_filled_inventory_units_cluster,feature_filled_sales_units_history_cluster,feature_filled_inventory_units_history_cluster,feature_intermittence_filled_sales_units_cluster,feature_intermittence_filled_inventory_units_cluster,train_weight
0,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,...,0.627141,2,4,5,10,4,12,9,1,0.008547
1,12,10857,Doraemon,108304,15.6,Gaming,2021-02-06,8.0,48.0,8.0,...,0.627141,2,4,5,10,4,12,9,1,0.017094
2,12,10857,Doraemon,108304,15.6,Gaming,2021-02-13,39.0,72.0,39.0,...,0.627141,2,4,5,10,4,12,9,1,0.025641
3,12,10857,Doraemon,108304,15.6,Gaming,2021-02-20,0.0,138.0,0.0,...,0.627141,2,4,5,10,4,12,9,1,0.034188
4,12,10857,Doraemon,108304,15.6,Gaming,2021-02-27,69.0,87.0,69.0,...,0.627141,2,4,5,10,4,12,9,1,0.042735


## Add Fcst Lag

In [33]:
# Add the forecast-lag number; the preview shows one added column, fcst_lag, equal to 0 on
# the displayed rows.
# NOTE(review): grouped by `group_columns` (without 'cutoff') — confirm this is intentional.
df_with_fcst_lag = feature_engineering.create_fcst_lag_number(df_with_train_weights, group_columns, date_column)

# Preview the first rows
df_with_fcst_lag.head()

Unnamed: 0,reporterhq_id,product_number,prod_category,specs,display_size,segment,date,sales_units,inventory_units,filled_sales_units,...,feature_distinct_product_number_reporterhq_id,feature_distinct_product_number_cutoff,feature_filled_sales_units_cluster,feature_filled_inventory_units_cluster,feature_filled_sales_units_history_cluster,feature_filled_inventory_units_history_cluster,feature_intermittence_filled_sales_units_cluster,feature_intermittence_filled_inventory_units_cluster,train_weight,fcst_lag
0,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,...,2,4,5,10,4,12,9,1,0.008547,0
118,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,...,2,4,5,10,4,12,9,1,0.008547,0
236,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,...,2,4,5,10,4,12,9,1,0.008547,0
354,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,...,2,4,5,10,4,12,9,1,0.008547,0
1,12,10857,Doraemon,108304,15.6,Gaming,2021-02-06,8.0,48.0,8.0,...,2,4,5,10,4,12,9,1,0.017094,0


# Create baselines

In [None]:
# Parameters for baseline creation
group_columns_cb = group_columns + ['cutoff']
date_column = 'date'

# MA Baseline parameters
signal_columns = ['filled_sales_units', 'filled_inventory_units']
bs_window_size = 13

In [35]:
# Prepare class
create_baselines = CreateBaselines()

## MA Baseline

In [36]:
# Call the function
df_with_ma_baseline = create_baselines.create_ma_baseline(df_with_fcst_lag, group_columns_cb, date_column, signal_columns, bs_window_size)

# Show
df_with_ma_baseline.head()

Unnamed: 0,reporterhq_id,product_number,prod_category,specs,display_size,segment,date,sales_units,inventory_units,filled_sales_units,...,feature_filled_sales_units_history_cluster,feature_filled_inventory_units_history_cluster,feature_intermittence_filled_sales_units_cluster,feature_intermittence_filled_inventory_units_cluster,train_weight,fcst_lag,feature_baseline_filled_sales_units_ma_13_ma_13,feature_baseline_filled_inventory_units_ma_13_ma_13,baseline_filled_sales_units_ma_13_ma_13,baseline_filled_inventory_units_ma_13_ma_13
0,12,10857,Doraemon,108304,15.6,Gaming,2021-01-30,0.0,52.0,0.0,...,4,12,9,1,0.008547,0,0.0,52.0,,
1,12,10857,Doraemon,108304,15.6,Gaming,2021-02-06,8.0,48.0,8.0,...,4,12,9,1,0.017094,0,2.0,51.0,,
2,12,10857,Doraemon,108304,15.6,Gaming,2021-02-13,39.0,72.0,39.0,...,4,12,9,1,0.025641,0,6.555556,53.111111,,
3,12,10857,Doraemon,108304,15.6,Gaming,2021-02-20,0.0,138.0,0.0,...,4,12,9,1,0.034188,0,7.854167,59.208333,,
4,12,10857,Doraemon,108304,15.6,Gaming,2021-02-27,69.0,87.0,69.0,...,4,12,9,1,0.042735,0,10.923333,63.246667,,


# Save output

In [37]:
# Flag each column in which every value is missing
all_missing = df_with_ma_baseline.isna().all()

# Collect the names of the flagged columns
na_columns = all_missing[all_missing].index.tolist()

# Report them
print("Columns full of NaN values:", na_columns)

Columns full of NaN values: []


In [38]:
# Limit every float column to two decimals before export
# NOTE(review): this also rounds `train_weight`, whose previewed values are of the order
# 0.01–0.04 — confirm that two decimals keep enough precision for the weights.
float_cols = df_with_ma_baseline.select_dtypes(include='float').columns
rounded_values = df_with_ma_baseline[float_cols].round(2)
df_with_ma_baseline[float_cols] = rounded_values

# Summarise dtypes and memory footprint
df_with_ma_baseline.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107134 entries, 0 to 107133
Columns: 112 entries, reporterhq_id to baseline_filled_inventory_units_ma_13_ma_13
dtypes: datetime64[ns](2), float64(78), int32(5), int64(20), object(7)
memory usage: 89.5+ MB


In [39]:
# Snapshot of system memory as reported by psutil
memory = psutil.virtual_memory()

# Express each figure in GB (1024 ** 3 bytes)
bytes_per_gb = 1024 ** 3
total_memory = memory.total / bytes_per_gb
available_memory = memory.available / bytes_per_gb
used_memory = memory.used / bytes_per_gb

# Report the three figures with two decimals
print(
    f"Total Memory: {total_memory:.2f} GB\n"
    f"Available Memory: {available_memory:.2f} GB\n"
    f"Used Memory: {used_memory:.2f} GB"
)

Total Memory: 16.00 GB
Available Memory: 4.88 GB
Used Memory: 6.16 GB


In [40]:
# Output location, relative to the project directory selected at the top of the notebook
file_name = 'data/df_backtesting.csv'

# Make sure the target folder exists so the write does not fail on a fresh checkout
Path(file_name).parent.mkdir(parents=True, exist_ok=True)

# Write the dataframe to the CSV file, without the index column
df_with_ma_baseline.to_csv(file_name, index=False)

In [41]:
# Snapshot of system memory as reported by psutil
memory = psutil.virtual_memory()

# Express each figure in GB (1024 ** 3 bytes)
bytes_per_gb = 1024 ** 3
total_memory = memory.total / bytes_per_gb
available_memory = memory.available / bytes_per_gb
used_memory = memory.used / bytes_per_gb

# Report the three figures with two decimals
print(
    f"Total Memory: {total_memory:.2f} GB\n"
    f"Available Memory: {available_memory:.2f} GB\n"
    f"Used Memory: {used_memory:.2f} GB"
)

Total Memory: 16.00 GB
Available Memory: 4.88 GB
Used Memory: 6.15 GB


# Clean

In [42]:
# Trigger a full garbage collection; the returned count is displayed as the cell output.
# NOTE(review): no DataFrame is deleted beforehand, so the intermediates created above remain
# referenced — confirm whether a `del` of those frames was intended.
gc.collect()

0