# Setup

In [1]:
# General libraries
import pandas as pd
import numpy as np
import warnings
import psutil
import gc
import os

# Data preparation
from itertools import product

# Plots
import matplotlib.pyplot as plt

# Sklearn
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor

# Sklearn
from sklearn.preprocessing import LabelEncoder

In [2]:
# Suppress warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Suppress the specific PerformanceWarning
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# Suppress SettingWithCopyWarning
pd.options.mode.chained_assignment = None

# Directory

In [3]:
# Set cd to parent directory
os.chdir("/Users/ignasipascual/Documents/GitHub/LightGBM-Forecaster")
cwd = os.getcwd()
print("Current Working Directory:", cwd)

Current Working Directory: /Users/ignasipascual/Documents/GitHub/LightGBM-Forecaster


In [4]:
# Set source
source = 'local'

# Print files in directory
if source == 'gdrive':
    from google.colab import drive
    drive.mount('/content/drive')
    os.chdir('/content/drive/MyDrive/Forecasting')
    print("Listing files from Google Drive 'Forecasting' directory:")
else:
    print("Listing files from the current local directory:")

# List files in the specified directory
files = os.listdir()

# Print each file
for file in files:
    print(file)

Listing files from the current local directory:
.DS_Store
LICENSE
requirements.txt
utils
docs
README.md
submissions
.git
data
notebooks


# Assets

## Import assets

In [5]:
# Import assets
from utils.data_preparation import DataPreparation
from utils.feature_engineering import FeatureEngineering
from utils.create_baselines import CreateBaselines
from utils.auxiliar import *

# Load data

In [6]:
# Load data for all phases
df_prices_0 = pd.read_csv('Data/Phase 0 - Price.csv')
df_sales_0 = pd.read_csv('Data/Phase 0 - Sales.csv')
df_prices_1 = pd.read_csv('Data/Phase 1 - Price.csv')
df_sales_1 = pd.read_csv('Data/Phase 1 - Sales.csv')

# Perform a left join on 'Client', 'Warehouse', and 'Product' for prices and sales
df_prices = pd.merge(df_prices_0, df_prices_1, on=['Client', 'Warehouse', 'Product'], how='left')
df_sales = pd.merge(df_sales_0, df_sales_1, on=['Client', 'Warehouse', 'Product'], how='left')

## Join tables

In [7]:
# Unpivot dataset
df_prices_unpiv = unpivot_data(df_prices, id_vars=['Client', 'Warehouse', 'Product'], var_name='Date', value_name='Price')
df_sales_unpiv = unpivot_data(df_sales, id_vars=['Client', 'Warehouse', 'Product'], var_name='Date', value_name='Sales')

# Merge the two dataframes
df_input = df_sales_unpiv.merge(df_prices_unpiv, how = 'left', on = ['Client', 'Warehouse', 'Product', 'Date'])

# Round to 2 decimals
df_input['Sales'] = df_input['Sales'].round(2)
df_input['Price'] = df_input['Price'].round(2)

# Display the unpivoted dataframe
df_input.head()

Unnamed: 0,Client,Warehouse,Product,Date,Sales,Price
0,0,1,367,2020-07-06,7.0,10.9
1,0,1,639,2020-07-06,0.0,
2,0,1,655,2020-07-06,21.0,21.34
3,0,1,1149,2020-07-06,7.0,11.48
4,0,1,1485,2020-07-06,0.0,


## Filter scope

In [8]:
# Filter for one client
clients = range(1, 7)

# Apply filter
df_input = df_input[df_input['Client'].isin(clients)]

# Show
df_input.head()

Unnamed: 0,Client,Warehouse,Product,Date,Sales,Price
930,1,36,6101,2020-07-06,0.0,
931,1,36,6795,2020-07-06,0.0,
932,1,36,11992,2020-07-06,0.0,
933,1,40,7628,2020-07-06,0.0,
934,1,46,81,2020-07-06,0.0,


## Formatting

In [9]:
# Convert all column names to lowercase
df_input.columns = df_input.columns.str.lower()

# Date format
df_input['date'] = pd.to_datetime(df_input['date'])

# Convert each specified column to string
character_cols = ['client', 'warehouse', 'product']
for col in character_cols:
    df_input[col] = df_input[col].astype(str)

# Convert signal columns to numeric, coercing errors to NaN
signal_cols = ['sales', 'price']
for col in signal_cols:
    df_input[col] = pd.to_numeric(df_input[col], errors='coerce')

# Create id column
df_input['id_column'] = df_input['client'] + '/' + df_input['warehouse'] + '/' + df_input['product']

# Select the specified columns
df_input = df_input[['client', 'warehouse', 'product', 'id_column', 'date', 'sales', 'price']]

# Sort by 'id_column' and 'date' in ascending order
df_input = df_input.sort_values(by=['id_column', 'date'])

# Show
df_input.head()

Unnamed: 0,client,warehouse,product,id_column,date,sales,price
1573,1,151,10060,1/151/10060,2020-07-06,0.0,
16626,1,151,10060,1/151/10060,2020-07-13,0.0,
31679,1,151,10060,1/151/10060,2020-07-20,0.0,
46732,1,151,10060,1/151/10060,2020-07-27,0.0,
61785,1,151,10060,1/151/10060,2020-08-03,0.0,


## Statistics

In [10]:
# Print the number of rows and columns
print(f"Number of columns in df_input: {df_input.shape[1]}")
print(f"Number of rows in df_input: {df_input.shape[0]}")

# Print the number of distinct values for each specified column
print(f"Number of distinct clients: {df_input['client'].nunique()}")
print(f"Number of distinct warehouses: {df_input['warehouse'].nunique()}")
print(f"Number of distinct products: {df_input['product'].nunique()}")
print(f"Number of distinct id_column values: {df_input['id_column'].nunique()}")

Number of columns in df_input: 7
Number of rows in df_input: 537288
Number of distinct clients: 6
Number of distinct warehouses: 37
Number of distinct products: 2378
Number of distinct id_column values: 2936


In [11]:
# Calculate the percentage of NA values in the entire DataFrame
total_rows = len(df_input)
total_na = df_input.isna().sum().sum()
percent_na_total = (total_na / (total_rows * df_input.shape[1])) * 100
print(f"Overall percentage of NA values: {percent_na_total:.2f}%")

Overall percentage of NA values: 10.25%


# Data preparation

In [12]:
# Parameters for data preparation
group_columns = ['client', 'warehouse', 'product']
date_column = 'date'
target = 'sales'
horizon = 13
freq = 'W'

In [13]:
# Init class
data_preparation = DataPreparation()

## Smoothing

In [14]:
# Parameters
signal_columns = ['sales', 'price']
window_size = 13

# Call the function
df_smoothed = data_preparation.smoothing(df_input, group_columns, date_column, signal_columns, window_size)

# Show
df_smoothed.head()

Unnamed: 0,client,warehouse,product,id_column,date,sales,price,filled_sales,filled_price
1573,1,151,10060,1/151/10060,2020-07-06,0.0,,0.0,
16626,1,151,10060,1/151/10060,2020-07-13,0.0,,0.0,
31679,1,151,10060,1/151/10060,2020-07-20,0.0,,0.0,
46732,1,151,10060,1/151/10060,2020-07-27,0.0,,0.0,
61785,1,151,10060,1/151/10060,2020-08-03,0.0,,0.0,


## Backtesting

In [15]:
# Parameters
n_cutoffs = 6

# Get the cutoff list
cutoff_list = data_preparation.get_first_dates_last_n_months(df_smoothed, date_column, n_cutoffs)

# Append the latest date to the list
latest_date = df_smoothed[date_column].max()
cutoff_list.append(latest_date)

# Convert list to DataFrame to use sort_values
cutoff_df = pd.DataFrame(cutoff_list, columns=[date_column])

# Drop duplicates and sort the DataFrame in ascending order
cutoff_df = cutoff_df.drop_duplicates().sort_values(by=date_column, ascending=True).reset_index(drop=True)

# Convert back to list
cutoff_list_sorted = cutoff_df[date_column].tolist()

# Show the sorted, distinct cutoff list
print(cutoff_list_sorted)

[Timestamp('2023-11-06 00:00:00'), Timestamp('2023-12-04 00:00:00'), Timestamp('2024-01-01 00:00:00')]


In [16]:
# Parameters
date_column = 'date'

# Call the function
df_backtesting = data_preparation.create_backtesting_df(df_smoothed, date_column, cutoff_list_sorted)

# Show
df_backtesting.head()

Unnamed: 0,client,warehouse,product,id_column,date,sales,price,filled_sales,filled_price,cutoff,sample
0,1,151,10060,1/151/10060,2020-07-06,0.0,,0.0,,2023-11-06,train
1,1,151,10060,1/151/10060,2020-07-13,0.0,,0.0,,2023-11-06,train
2,1,151,10060,1/151/10060,2020-07-20,0.0,,0.0,,2023-11-06,train
3,1,151,10060,1/151/10060,2020-07-27,0.0,,0.0,,2023-11-06,train
4,1,151,10060,1/151/10060,2020-08-03,0.0,,0.0,,2023-11-06,train


In [17]:
# Print the number of rows and columns
print(f"Number of columns in df_backtesting: {df_backtesting.shape[1]}")
print(f"Number of rows in df_backtesting: {df_backtesting.shape[0]}")

Number of columns in df_backtesting: 11
Number of rows in df_backtesting: 1611864


## Fill horizon

In [18]:
# Parameters
group_columns = ['client', 'warehouse', 'product']
date_column = 'date'
horizon = 13
freq = 'W-MON'

# Call the function
df_backtesting_with_horizon = data_preparation.add_horizon_last_cutoff(df_backtesting, group_columns, date_column, horizon, freq)

# Show
df_backtesting_with_horizon.head()

Unnamed: 0,client,warehouse,product,id_column,date,sales,price,filled_sales,filled_price,cutoff,sample
0,1,151,10060,1/151/10060,2020-07-06,0.0,,0.0,,2023-11-06,train
1,1,151,10060,1/151/10060,2020-07-13,0.0,,0.0,,2023-11-06,train
2,1,151,10060,1/151/10060,2020-07-20,0.0,,0.0,,2023-11-06,train
3,1,151,10060,1/151/10060,2020-07-27,0.0,,0.0,,2023-11-06,train
4,1,151,10060,1/151/10060,2020-08-03,0.0,,0.0,,2023-11-06,train


In [19]:
# Print the number of rows and columns
print(f"Number of columns in df_backtesting_with_horizon: {df_backtesting_with_horizon.shape[1]}")
print(f"Number of rows in df_backtesting_with_horizon: {df_backtesting_with_horizon.shape[0]}")

Number of columns in df_backtesting_with_horizon: 11
Number of rows in df_backtesting_with_horizon: 1650032


In [20]:
# Get memory details
memory = psutil.virtual_memory()

# Convert from bytes to GB
total_memory = memory.total / (1024 ** 3)
available_memory = memory.available / (1024 ** 3)
used_memory = memory.used / (1024 ** 3)

# Print memory details
print(f"Total Memory: {total_memory:.2f} GB")
print(f"Available Memory: {available_memory:.2f} GB")
print(f"Used Memory: {used_memory:.2f} GB")

Total Memory: 16.00 GB
Available Memory: 5.78 GB
Used Memory: 7.41 GB


# Feature Engineering

In [21]:
# Parameters for feature engineering
group_columns = ['client', 'warehouse', 'product', 'cutoff']
date_column = 'date'
target = 'filled_sales'
horizon = 13
freq = 'W'

# Clusters
n_groups = 10

In [22]:
# Init class
feature_engineering = FeatureEngineering()

## Create encodings

In [23]:
# Find categorial columns
categorical_columns = df_backtesting_with_horizon.select_dtypes(include='object').columns.tolist()

# Exclude 'sample_column' from the list
categorical_columns = [col for col in categorical_columns if col != 'sample']

# Show
categorical_columns

['client', 'warehouse', 'product', 'id_column']

In [24]:
# Apply the function
df_backtesting_with_categories = feature_engineering.create_encoded_features(df_backtesting_with_horizon, categorical_columns)

# Show
df_backtesting_with_categories.head()

Unnamed: 0,client,warehouse,product,id_column,date,sales,price,filled_sales,filled_price,cutoff,sample,feature_client,feature_warehouse,feature_product,feature_id_column
0,1,151,10060,1/151/10060,2020-07-06,0.0,,0.0,,2023-11-06,train,0,7,14,0
1,1,151,10060,1/151/10060,2020-07-13,0.0,,0.0,,2023-11-06,train,0,7,14,0
2,1,151,10060,1/151/10060,2020-07-20,0.0,,0.0,,2023-11-06,train,0,7,14,0
3,1,151,10060,1/151/10060,2020-07-27,0.0,,0.0,,2023-11-06,train,0,7,14,0
4,1,151,10060,1/151/10060,2020-08-03,0.0,,0.0,,2023-11-06,train,0,7,14,0


## Period features

In [25]:
# Apply the function
df_backtesting_with_periods = feature_engineering.create_periods_feature(df_backtesting_with_categories, group_columns, date_column, target)

# Show
df_backtesting_with_periods.head()

Unnamed: 0,client,warehouse,product,id_column,date,sales,price,filled_sales,filled_price,cutoff,sample,feature_client,feature_warehouse,feature_product,feature_id_column,feature_periods,feature_periods_expanding,feature_periods_sqrt
0,1,151,10060,1/151/10060,2020-07-06,0.0,,0.0,,2023-11-06,train,0,7,14,0,0.0,0.0,0.0
1,1,151,10060,1/151/10060,2020-07-13,0.0,,0.0,,2023-11-06,train,0,7,14,0,0.0,0.0,0.0
2,1,151,10060,1/151/10060,2020-07-20,0.0,,0.0,,2023-11-06,train,0,7,14,0,0.0,0.0,0.0
3,1,151,10060,1/151/10060,2020-07-27,0.0,,0.0,,2023-11-06,train,0,7,14,0,0.0,0.0,0.0
4,1,151,10060,1/151/10060,2020-08-03,0.0,,0.0,,2023-11-06,train,0,7,14,0,0.0,0.0,0.0


## Calendar features

In [26]:
# Call the function
df_with_dates = feature_engineering.create_date_features(df_backtesting_with_periods, date_column, freq)

# Show
df_with_dates.head()

Unnamed: 0,client,warehouse,product,id_column,date,sales,price,filled_sales,filled_price,cutoff,...,feature_periods_expanding,feature_periods_sqrt,feature_year,feature_quarter,feature_month,feature_week,feature_weeks_until_next_end_of_quarter,feature_weeks_until_end_of_year,feature_months_until_next_end_of_quarter,feature_months_until_end_of_year
0,1,151,10060,1/151/10060,2020-07-06,0.0,,0.0,,2023-11-06,...,0.0,0.0,2020,3,7,28,12,25,2,5
1,1,151,10060,1/151/10060,2020-07-13,0.0,,0.0,,2023-11-06,...,0.0,0.0,2020,3,7,29,11,24,2,5
2,1,151,10060,1/151/10060,2020-07-20,0.0,,0.0,,2023-11-06,...,0.0,0.0,2020,3,7,30,10,23,2,5
3,1,151,10060,1/151/10060,2020-07-27,0.0,,0.0,,2023-11-06,...,0.0,0.0,2020,3,7,31,9,22,2,5
4,1,151,10060,1/151/10060,2020-08-03,0.0,,0.0,,2023-11-06,...,0.0,0.0,2020,3,8,32,8,21,1,4


## Moving average

In [27]:
# Parameters
signal_columns = ['filled_sales', 'filled_price']
window_sizes = [4, 13]

# Call the function
df_with_ma = feature_engineering.create_ma_features(df_with_dates, group_columns, signal_columns, window_sizes)

# Show
df_with_ma.head()

Unnamed: 0,client,warehouse,product,id_column,date,sales,price,filled_sales,filled_price,cutoff,...,feature_month,feature_week,feature_weeks_until_next_end_of_quarter,feature_weeks_until_end_of_year,feature_months_until_next_end_of_quarter,feature_months_until_end_of_year,filled_sales_ma_4,filled_sales_ma_13,filled_price_ma_4,filled_price_ma_13
0,1,151,10060,1/151/10060,2020-07-06,0.0,,0.0,,2023-11-06,...,7,28,12,25,2,5,0.0,0.0,,
1,1,151,10060,1/151/10060,2020-07-13,0.0,,0.0,,2023-11-06,...,7,29,11,24,2,5,0.0,0.0,,
2,1,151,10060,1/151/10060,2020-07-20,0.0,,0.0,,2023-11-06,...,7,30,10,23,2,5,0.0,0.0,,
3,1,151,10060,1/151/10060,2020-07-27,0.0,,0.0,,2023-11-06,...,7,31,9,22,2,5,0.0,0.0,,
4,1,151,10060,1/151/10060,2020-08-03,0.0,,0.0,,2023-11-06,...,8,32,8,21,1,4,0.0,0.0,,


## Moving stats

In [28]:
# Define parameters
signal_columns = ['filled_sales', 'filled_price']
window_sizes = [13]

# Apply the function
df_with_min_max = feature_engineering.create_moving_stats(df_with_ma, group_columns, signal_columns, window_sizes)

# Display the result
df_with_min_max.head()

Unnamed: 0,client,warehouse,product,id_column,date,sales,price,filled_sales,filled_price,cutoff,...,feature_months_until_next_end_of_quarter,feature_months_until_end_of_year,filled_sales_ma_4,filled_sales_ma_13,filled_price_ma_4,filled_price_ma_13,filled_sales_min_13,filled_sales_max_13,filled_price_min_13,filled_price_max_13
0,1,151,10060,1/151/10060,2020-07-06,0.0,,0.0,,2023-11-06,...,2,5,0.0,0.0,,,0.0,0.0,,
1,1,151,10060,1/151/10060,2020-07-13,0.0,,0.0,,2023-11-06,...,2,5,0.0,0.0,,,0.0,0.0,,
2,1,151,10060,1/151/10060,2020-07-20,0.0,,0.0,,2023-11-06,...,2,5,0.0,0.0,,,0.0,0.0,,
3,1,151,10060,1/151/10060,2020-07-27,0.0,,0.0,,2023-11-06,...,2,5,0.0,0.0,,,0.0,0.0,,
4,1,151,10060,1/151/10060,2020-08-03,0.0,,0.0,,2023-11-06,...,1,4,0.0,0.0,,,0.0,0.0,,


## Generate lags

In [29]:
# Parameters
signal_columns = ['filled_sales', 'filled_price', 'filled_sales_ma_4', 'filled_price_ma_4', 'filled_sales_ma_13', 'filled_price_ma_13',
                  'filled_sales_min_13', 'filled_sales_max_13']
lags = [13, 52]

# Create lag features in the DataFrame
df_with_lags = feature_engineering.create_lag_features(df_with_min_max, group_columns, date_column, signal_columns, lags, horizon)

# Show
df_with_lags.head()

Unnamed: 0,client,warehouse,product,id_column,date,sales,price,filled_sales,filled_price,cutoff,...,feature_filled_price_ma_4_lag_13,feature_filled_price_ma_4_lag_52,feature_filled_sales_ma_13_lag_13,feature_filled_sales_ma_13_lag_52,feature_filled_price_ma_13_lag_13,feature_filled_price_ma_13_lag_52,feature_filled_sales_min_13_lag_13,feature_filled_sales_min_13_lag_52,feature_filled_sales_max_13_lag_13,feature_filled_sales_max_13_lag_52
0,1,151,10060,1/151/10060,2020-07-06,0.0,,0.0,,2023-11-06,...,,,,,,,,,,
1,1,151,10060,1/151/10060,2020-07-13,0.0,,0.0,,2023-11-06,...,,,,,,,,,,
2,1,151,10060,1/151/10060,2020-07-20,0.0,,0.0,,2023-11-06,...,,,,,,,,,,
3,1,151,10060,1/151/10060,2020-07-27,0.0,,0.0,,2023-11-06,...,,,,,,,,,,
4,1,151,10060,1/151/10060,2020-08-03,0.0,,0.0,,2023-11-06,...,,,,,,,,,,


## Get Cov ratio

In [30]:
# Parameters
value_columns = ['filled_sales']

# Call the function
df_with_cov = feature_engineering.create_cov(df_with_lags, group_columns, value_columns)

# Show
df_with_cov.head()

Unnamed: 0,client,warehouse,product,id_column,date,sales,price,filled_sales,filled_price,cutoff,...,feature_filled_price_ma_4_lag_52,feature_filled_sales_ma_13_lag_13,feature_filled_sales_ma_13_lag_52,feature_filled_price_ma_13_lag_13,feature_filled_price_ma_13_lag_52,feature_filled_sales_min_13_lag_13,feature_filled_sales_min_13_lag_52,feature_filled_sales_max_13_lag_13,feature_filled_sales_max_13_lag_52,feature_filled_sales_cov
0,1,151,10060,1/151/10060,2020-07-06,0.0,,0.0,,2023-11-06,...,,,,,,,,,,10.127639
1,1,151,10060,1/151/10060,2020-07-13,0.0,,0.0,,2023-11-06,...,,,,,,,,,,10.127639
2,1,151,10060,1/151/10060,2020-07-20,0.0,,0.0,,2023-11-06,...,,,,,,,,,,10.127639
3,1,151,10060,1/151/10060,2020-07-27,0.0,,0.0,,2023-11-06,...,,,,,,,,,,10.127639
4,1,151,10060,1/151/10060,2020-08-03,0.0,,0.0,,2023-11-06,...,,,,,,,,,,10.127639


## Add combinations

In [31]:
# Parameters
lower_level_group = 'product'

# Call the function
df_with_combinations = feature_engineering.create_distinct_combinations(df_with_cov, lower_level_group, group_columns)

# Show
df_with_combinations.head()

Unnamed: 0,client,warehouse,product,id_column,date,sales,price,filled_sales,filled_price,cutoff,...,feature_filled_price_ma_13_lag_13,feature_filled_price_ma_13_lag_52,feature_filled_sales_min_13_lag_13,feature_filled_sales_min_13_lag_52,feature_filled_sales_max_13_lag_13,feature_filled_sales_max_13_lag_52,feature_filled_sales_cov,feature_distinct_product_client,feature_distinct_product_warehouse,feature_distinct_product_cutoff
0,1,151,10060,1/151/10060,2020-07-06,0.0,,0.0,,2023-11-06,...,,,,,,,10.127639,1,1,3
1,1,151,10060,1/151/10060,2020-07-13,0.0,,0.0,,2023-11-06,...,,,,,,,10.127639,1,1,3
2,1,151,10060,1/151/10060,2020-07-20,0.0,,0.0,,2023-11-06,...,,,,,,,10.127639,1,1,3
3,1,151,10060,1/151/10060,2020-07-27,0.0,,0.0,,2023-11-06,...,,,,,,,10.127639,1,1,3
4,1,151,10060,1/151/10060,2020-08-03,0.0,,0.0,,2023-11-06,...,,,,,,,10.127639,1,1,3


## Add clusters

In [32]:
# Parameters
value_columns = ['filled_sales', 'filled_price']

# Call the function
df_with_clusters = feature_engineering.create_quantile_clusters(df_with_combinations, group_columns, value_columns, n_groups)

# Show
df_with_clusters.head()

Unnamed: 0,client,warehouse,product,id_column,date,sales,price,filled_sales,filled_price,cutoff,...,feature_filled_sales_min_13_lag_13,feature_filled_sales_min_13_lag_52,feature_filled_sales_max_13_lag_13,feature_filled_sales_max_13_lag_52,feature_filled_sales_cov,feature_distinct_product_client,feature_distinct_product_warehouse,feature_distinct_product_cutoff,feature_filled_sales_cluster,feature_filled_price_cluster
0,1,151,10060,1/151/10060,2020-07-06,0.0,,0.0,,2023-11-06,...,,,,,10.127639,1,1,3,2,10
1,1,151,10060,1/151/10060,2020-07-13,0.0,,0.0,,2023-11-06,...,,,,,10.127639,1,1,3,2,10
2,1,151,10060,1/151/10060,2020-07-20,0.0,,0.0,,2023-11-06,...,,,,,10.127639,1,1,3,2,10
3,1,151,10060,1/151/10060,2020-07-27,0.0,,0.0,,2023-11-06,...,,,,,10.127639,1,1,3,2,10
4,1,151,10060,1/151/10060,2020-08-03,0.0,,0.0,,2023-11-06,...,,,,,10.127639,1,1,3,2,10


In [33]:
# Parameters
value_columns = ['feature_periods']

# Call the function
df_with_clusters = feature_engineering.create_history_clusters(df_with_clusters, group_columns, value_columns, n_groups)

# Show
df_with_clusters.head()

Unnamed: 0,client,warehouse,product,id_column,date,sales,price,filled_sales,filled_price,cutoff,...,feature_filled_sales_min_13_lag_52,feature_filled_sales_max_13_lag_13,feature_filled_sales_max_13_lag_52,feature_filled_sales_cov,feature_distinct_product_client,feature_distinct_product_warehouse,feature_distinct_product_cutoff,feature_filled_sales_cluster,feature_filled_price_cluster,feature_periods_history_cluster
0,1,151,10060,1/151/10060,2020-07-06,0.0,,0.0,,2023-11-06,...,,,,10.127639,1,1,3,2,10,1
1,1,151,10060,1/151/10060,2020-07-13,0.0,,0.0,,2023-11-06,...,,,,10.127639,1,1,3,2,10,1
2,1,151,10060,1/151/10060,2020-07-20,0.0,,0.0,,2023-11-06,...,,,,10.127639,1,1,3,2,10,1
3,1,151,10060,1/151/10060,2020-07-27,0.0,,0.0,,2023-11-06,...,,,,10.127639,1,1,3,2,10,1
4,1,151,10060,1/151/10060,2020-08-03,0.0,,0.0,,2023-11-06,...,,,,10.127639,1,1,3,2,10,1


In [34]:
# Parameters
value_columns = ['filled_sales']

# Call the function
df_with_clusters = feature_engineering.create_intermittence_clusters(df_with_clusters, group_columns, value_columns, n_groups)

# Show
df_with_clusters.head()

Unnamed: 0,client,warehouse,product,id_column,date,sales,price,filled_sales,filled_price,cutoff,...,feature_filled_sales_max_13_lag_13,feature_filled_sales_max_13_lag_52,feature_filled_sales_cov,feature_distinct_product_client,feature_distinct_product_warehouse,feature_distinct_product_cutoff,feature_filled_sales_cluster,feature_filled_price_cluster,feature_periods_history_cluster,feature_intermittence_filled_sales_cluster
0,1,151,10060,1/151/10060,2020-07-06,0.0,,0.0,,2023-11-06,...,,,10.127639,1,1,3,2,10,1,8
1,1,151,10060,1/151/10060,2020-07-13,0.0,,0.0,,2023-11-06,...,,,10.127639,1,1,3,2,10,1,8
2,1,151,10060,1/151/10060,2020-07-20,0.0,,0.0,,2023-11-06,...,,,10.127639,1,1,3,2,10,1,8
3,1,151,10060,1/151/10060,2020-07-27,0.0,,0.0,,2023-11-06,...,,,10.127639,1,1,3,2,10,1,8
4,1,151,10060,1/151/10060,2020-08-03,0.0,,0.0,,2023-11-06,...,,,10.127639,1,1,3,2,10,1,8


## Train weights

In [35]:
# Parameters
feature_periods_col = 'feature_periods'
train_weight_type = 'linear'

# Call the function
df_with_train_weights = feature_engineering.create_train_weights(df_with_clusters, group_columns, feature_periods_col, train_weight_type)

# Show
df_with_train_weights.head()

Unnamed: 0,client,warehouse,product,id_column,date,sales,price,filled_sales,filled_price,cutoff,...,feature_filled_sales_max_13_lag_52,feature_filled_sales_cov,feature_distinct_product_client,feature_distinct_product_warehouse,feature_distinct_product_cutoff,feature_filled_sales_cluster,feature_filled_price_cluster,feature_periods_history_cluster,feature_intermittence_filled_sales_cluster,train_weight
0,1,151,10060,1/151/10060,2020-07-06,0.0,,0.0,,2023-11-06,...,,10.127639,1,1,3,2,10,1,8,0.0
1,1,151,10060,1/151/10060,2020-07-13,0.0,,0.0,,2023-11-06,...,,10.127639,1,1,3,2,10,1,8,0.0
2,1,151,10060,1/151/10060,2020-07-20,0.0,,0.0,,2023-11-06,...,,10.127639,1,1,3,2,10,1,8,0.0
3,1,151,10060,1/151/10060,2020-07-27,0.0,,0.0,,2023-11-06,...,,10.127639,1,1,3,2,10,1,8,0.0
4,1,151,10060,1/151/10060,2020-08-03,0.0,,0.0,,2023-11-06,...,,10.127639,1,1,3,2,10,1,8,0.0


# Create baselines

In [36]:
# Parameters for baseline creation
group_columns = ['client', 'warehouse', 'product', 'cutoff']
date_column = 'date'

In [37]:
# Prepare class
create_baselines = CreateBaselines()

## MA Baseline

In [38]:
# Parameters
signal_columns = ['filled_sales_ma_13', 'filled_price_ma_13']
window_size = 13

# Call the function
df_with_ma_baseline = create_baselines.create_ma_baseline(df_with_train_weights, group_columns, date_column, signal_columns, window_size)

# Show
df_with_ma_baseline.head()

Unnamed: 0,client,warehouse,product,id_column,date,sales,price,filled_sales,filled_price,cutoff,...,feature_distinct_product_cutoff,feature_filled_sales_cluster,feature_filled_price_cluster,feature_periods_history_cluster,feature_intermittence_filled_sales_cluster,train_weight,feature_baseline_filled_sales_ma_13,feature_baseline_filled_price_ma_13,baseline_filled_sales_ma_13,baseline_filled_price_ma_13
0,1,151,10060,1/151/10060,2020-07-06,0.0,,0.0,,2023-11-06,...,3,2,10,1,8,0.0,0.0,,,
1,1,151,10060,1/151/10060,2020-07-13,0.0,,0.0,,2023-11-06,...,3,2,10,1,8,0.0,0.0,,,
2,1,151,10060,1/151/10060,2020-07-20,0.0,,0.0,,2023-11-06,...,3,2,10,1,8,0.0,0.0,,,
3,1,151,10060,1/151/10060,2020-07-27,0.0,,0.0,,2023-11-06,...,3,2,10,1,8,0.0,0.0,,,
4,1,151,10060,1/151/10060,2020-08-03,0.0,,0.0,,2023-11-06,...,3,2,10,1,8,0.0,0.0,,,


## ML Baseline

In [39]:
# Parameters
group_columns = ['feature_filled_price_cluster', 'cutoff']
signal_columns = ['filled_price']
feature_columns = ['feature_client', 'feature_warehouse', 'feature_product', 'feature_periods', 'feature_year', 'feature_month', 'feature_week']

# Call the function
df_with_ml_baseline = create_baselines.create_lgbm_baseline(df_with_ma_baseline, group_columns, date_column, signal_columns, feature_columns)

# Show
df_with_ml_baseline.head()

Unnamed: 0,client,warehouse,product,id_column,date,sales,price,filled_sales,filled_price,cutoff,...,feature_filled_price_cluster,feature_periods_history_cluster,feature_intermittence_filled_sales_cluster,train_weight,feature_baseline_filled_sales_ma_13,feature_baseline_filled_price_ma_13,baseline_filled_sales_ma_13,baseline_filled_price_ma_13,baseline_filled_price_lgbm,feature_baseline_filled_price_lgbm
0,1,235,10012,1/235/10012,2020-07-06,0.0,,0.0,,2023-11-06,...,1,1,7,0.0,0.0,,,,,
1,1,235,11931,1/235/11931,2020-07-06,0.0,,0.0,,2023-11-06,...,1,1,6,0.0,0.0,,,,,
2,1,235,12201,1/235/12201,2020-07-06,0.0,,0.0,,2023-11-06,...,1,1,5,0.0,0.0,,,,,
3,1,235,13692,1/235/13692,2020-07-06,0.0,,0.0,,2023-11-06,...,1,1,8,0.0,0.0,,,,,
4,1,235,13720,1/235/13720,2020-07-06,0.0,,0.0,,2023-11-06,...,1,1,5,0.0,0.0,,,,,


# Save output

In [40]:
# Check if any column is completely filled with NaN values
na_columns = df_with_ml_baseline.columns[df_with_ml_baseline.isna().all()].tolist()

# Show
print("Columns full of NaN values:", na_columns)

Columns full of NaN values: []


In [41]:
# Round all float columns to 2 decimal places
float_cols = df_with_ml_baseline.select_dtypes(include=['float']).columns
df_with_ml_baseline[float_cols] = df_with_ml_baseline[float_cols].round(2)

# Show
df_with_ml_baseline.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1650032 entries, 0 to 1650031
Data columns (total 65 columns):
 #   Column                                      Non-Null Count    Dtype         
---  ------                                      --------------    -----         
 0   client                                      1650032 non-null  object        
 1   warehouse                                   1650032 non-null  object        
 2   product                                     1650032 non-null  object        
 3   id_column                                   1611864 non-null  object        
 4   date                                        1650032 non-null  datetime64[ns]
 5   sales                                       1611864 non-null  float64       
 6   price                                       455553 non-null   float64       
 7   filled_sales                                1611864 non-null  float64       
 8   filled_price                                792879 non-null   

In [42]:
# Get memory details
memory = psutil.virtual_memory()

# Convert from bytes to GB
total_memory = memory.total / (1024 ** 3)
available_memory = memory.available / (1024 ** 3)
used_memory = memory.used / (1024 ** 3)

# Print memory details
print(f"Total Memory: {total_memory:.2f} GB")
print(f"Available Memory: {available_memory:.2f} GB")
print(f"Used Memory: {used_memory:.2f} GB")

Total Memory: 16.00 GB
Available Memory: 3.82 GB
Used Memory: 5.38 GB


In [43]:
# Create the filename with the phase appended
file_name = f'Data/df_backtesting.csv'

# Write the dataframe to the CSV file
df_with_ml_baseline.to_csv(file_name, index=False)

In [44]:
# Get memory details
memory = psutil.virtual_memory()

# Convert from bytes to GB
total_memory = memory.total / (1024 ** 3)
available_memory = memory.available / (1024 ** 3)
used_memory = memory.used / (1024 ** 3)

# Print memory details
print(f"Total Memory: {total_memory:.2f} GB")
print(f"Available Memory: {available_memory:.2f} GB")
print(f"Used Memory: {used_memory:.2f} GB")

Total Memory: 16.00 GB
Available Memory: 3.87 GB
Used Memory: 5.47 GB


# Clean

In [45]:
# Garbage collection
gc.collect()

0