# Setup

In [1]:
# General libraries
import pandas as pd
import numpy as np
import warnings
from datetime import datetime
import gc
import os

# Plots
from matplotlib import pyplot as plt

# Plots
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Pandas display: do not truncate wide frames, show every column
pd.options.display.max_columns = None

In [2]:
# Suppress warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Suppress the specific PerformanceWarning
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# Suppress SettingWithCopyWarning
pd.options.mode.chained_assignment = None

In [3]:
# Timestamp of this run, captured once
current_time = datetime.now()

# Minute-resolution tag (YYYY_MM_DD_HH_MM) intended for filenames
formatted_date = f"{current_time:%Y_%m_%d_%H_%M}"

# Show
formatted_date

'2024_10_28_01_33'

# Directory

In [4]:
# Move to the project root so that relative paths used in this notebook resolve.
# The location can be overridden with the LIGHTGBM_FORECASTER_ROOT environment
# variable; when it is unset, the original hard-coded path is used, so the
# default behaviour is unchanged.
project_root = os.environ.get(
    "LIGHTGBM_FORECASTER_ROOT",
    "/Users/ignasipascual/Documents/GitHub/LightGBM-Forecaster",
)
os.chdir(project_root)

# Report where we ended up
cwd = os.getcwd()
print("Current Working Directory:", cwd)

Current Working Directory: /Users/ignasipascual/Documents/GitHub/LightGBM-Forecaster


In [5]:
# Execution environment: 'local', or 'gdrive' to mount Google Drive from Colab
source = 'local'

# Announce which directory is about to be listed; for 'gdrive', mount Drive
# and switch to the 'Forecasting' folder first
if source != 'gdrive':
    print("Listing files from the current local directory:")
else:
    # Imported lazily: google.colab is only needed in this branch
    from google.colab import drive
    drive.mount('/content/drive')
    os.chdir('/content/drive/MyDrive/Forecasting')
    print("Listing files from Google Drive 'Forecasting' directory:")

# Snapshot the entries of the current working directory
files = os.listdir()

# One entry per line
for file in files:
    print(file)

Listing files from the current local directory:
.DS_Store
LICENSE
requirements.txt
utils
docs
README.md
.gitignore
submissions
.git
data
notebooks


# Assets

## Import assets

In [6]:
# Import assets
from utils.auxiliar import *

# Load data

In [7]:
# Path of the predictions CSV, relative to the current working directory.
# NOTE(review): the local directory listing shows a lowercase 'data' folder, so
# 'Data/' presumably relies on a case-insensitive filesystem — confirm.
file_name = 'Data/df_lightgbm_preds.csv'

# Load the file into a DataFrame
df_input = pd.read_csv(file_name)

# Show
df_input.head()

Unnamed: 0,client,warehouse,product,date,sales,price,filled_sales,filled_price,cutoff,sample,feature_client,feature_warehouse,feature_product,feature_periods,feature_periods_expanding,feature_periods_sqrt,feature_year,feature_quarter,feature_month,feature_week,feature_weeks_until_next_end_of_quarter,feature_weeks_until_end_of_year,feature_months_until_next_end_of_quarter,feature_months_until_end_of_year,filled_sales_ma_4,filled_sales_ma_13,filled_price_ma_4,filled_price_ma_13,filled_sales_min_13,filled_sales_max_13,filled_price_min_13,filled_price_max_13,feature_filled_sales_lag_13,feature_filled_sales_lag_52,feature_filled_price_lag_13,feature_filled_price_lag_52,feature_filled_sales_ma_4_lag_13,feature_filled_sales_ma_4_lag_52,feature_filled_price_ma_4_lag_13,feature_filled_price_ma_4_lag_52,feature_filled_sales_ma_13_lag_13,feature_filled_sales_ma_13_lag_52,feature_filled_price_ma_13_lag_13,feature_filled_price_ma_13_lag_52,feature_filled_sales_min_13_lag_13,feature_filled_sales_min_13_lag_52,feature_filled_sales_max_13_lag_13,feature_filled_sales_max_13_lag_52,feature_filled_sales_cov,feature_distinct_product_client,feature_distinct_product_warehouse,feature_distinct_product_cutoff,feature_filled_sales_cluster,feature_filled_price_cluster,feature_feature_periods_history_cluster,feature_intermittence_filled_sales_cluster,train_weight,fcst_lag,feature_baseline_filled_sales_ma_13,feature_baseline_filled_price_ma_13,baseline_filled_sales_ma_13,baseline_filled_price_ma_13,baseline_filled_price_lgbm,feature_baseline_filled_price_lgbm,training_group,guardrail,prediction
0,1,151,1241,2020-07-06,0.0,,0.0,,2023-09-04,train,0,7,426,0.0,0.0,0.0,2020,3,7,28,12,25,2,5,0.0,0.0,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,1,2,5,1,0,1,15,,,0.0,,,,,,1,False,
1,1,182,13789,2020-07-06,0.0,,0.0,,2023-09-04,train,0,8,677,0.0,0.0,0.0,2020,3,7,28,12,25,2,5,0.0,0.0,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,1,2,5,1,0,1,15,,,0.0,,,,,,1,False,
2,1,182,3145,2020-07-06,0.0,,0.0,,2023-09-04,train,0,8,1104,0.0,0.0,0.0,2020,3,7,28,12,25,2,5,0.0,0.0,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,1,1,5,1,0,1,15,,,0.0,,,,,,1,False,
3,1,182,3864,2020-07-06,0.0,,0.0,,2023-09-04,train,0,8,1223,0.0,0.0,0.0,2020,3,7,28,12,25,2,5,0.0,0.0,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,1,2,5,1,0,1,15,,,0.0,,,,,,1,False,
4,1,182,3971,2020-07-06,0.0,,0.0,,2023-09-04,train,0,8,1243,0.0,0.0,0.0,2020,3,7,28,12,25,2,5,0.0,0.0,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,1,3,5,1,0,1,15,,,0.0,,,,,,1,False,


# Prepare results

In [8]:
# Cast the key columns to string so they can be concatenated into a series id
character_cols = ['client', 'warehouse', 'product']
for col in character_cols:
    df_input[col] = df_input[col].astype(str)

# Series identifier: client/warehouse/product
df_input['id_column'] = df_input['client'] + '/' + df_input['warehouse'] + '/' + df_input['product']

# Parse the date columns
for date_col in ('date', 'cutoff'):
    df_input[date_col] = pd.to_datetime(df_input[date_col])

# Length of the trailing window used to decide whether an id is still active.
# NOTE(review): the value is 52 weeks; the previous comments and variable name
# said "6 weeks" but the code used 52 — confirm 52 is the intended window.
ACTIVITY_WINDOW_WEEKS = 52

# Latest date in the dataset and the start of the activity window
max_date = df_input['date'].max()
cutoff_date = max_date - pd.Timedelta(weeks=ACTIVITY_WINDOW_WEEKS)

# Rows that fall strictly after the window start
recent_data = df_input[df_input['date'] > cutoff_date]

# Total sales per id inside the window
sales_summary = recent_data.groupby('id_column')['sales'].sum().reset_index()

# Inactive ids: total sales in the window is 0 or lower.
# Ids with no rows inside the window do not appear in sales_summary and are
# therefore NOT flagged as inactive.
inactive_ids = sales_summary.loc[sales_summary['sales'] <= 0, 'id_column']

# Drop the inactive ids. The explicit copy makes df_filtered an independent
# frame, so later column assignments do not act on a slice of df_input.
df_filtered = df_input[~df_input['id_column'].isin(inactive_ids)].copy()

# Show
df_filtered.head()

Unnamed: 0,client,warehouse,product,date,sales,price,filled_sales,filled_price,cutoff,sample,feature_client,feature_warehouse,feature_product,feature_periods,feature_periods_expanding,feature_periods_sqrt,feature_year,feature_quarter,feature_month,feature_week,feature_weeks_until_next_end_of_quarter,feature_weeks_until_end_of_year,feature_months_until_next_end_of_quarter,feature_months_until_end_of_year,filled_sales_ma_4,filled_sales_ma_13,filled_price_ma_4,filled_price_ma_13,filled_sales_min_13,filled_sales_max_13,filled_price_min_13,filled_price_max_13,feature_filled_sales_lag_13,feature_filled_sales_lag_52,feature_filled_price_lag_13,feature_filled_price_lag_52,feature_filled_sales_ma_4_lag_13,feature_filled_sales_ma_4_lag_52,feature_filled_price_ma_4_lag_13,feature_filled_price_ma_4_lag_52,feature_filled_sales_ma_13_lag_13,feature_filled_sales_ma_13_lag_52,feature_filled_price_ma_13_lag_13,feature_filled_price_ma_13_lag_52,feature_filled_sales_min_13_lag_13,feature_filled_sales_min_13_lag_52,feature_filled_sales_max_13_lag_13,feature_filled_sales_max_13_lag_52,feature_filled_sales_cov,feature_distinct_product_client,feature_distinct_product_warehouse,feature_distinct_product_cutoff,feature_filled_sales_cluster,feature_filled_price_cluster,feature_feature_periods_history_cluster,feature_intermittence_filled_sales_cluster,train_weight,fcst_lag,feature_baseline_filled_sales_ma_13,feature_baseline_filled_price_ma_13,baseline_filled_sales_ma_13,baseline_filled_price_ma_13,baseline_filled_price_lgbm,feature_baseline_filled_price_lgbm,training_group,guardrail,prediction,id_column
0,1,151,1241,2020-07-06,0.0,,0.0,,2023-09-04,train,0,7,426,0.0,0.0,0.0,2020,3,7,28,12,25,2,5,0.0,0.0,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,1,2,5,1,0,1,15,,,0.0,,,,,,1,False,,1/151/1241
1,1,182,13789,2020-07-06,0.0,,0.0,,2023-09-04,train,0,8,677,0.0,0.0,0.0,2020,3,7,28,12,25,2,5,0.0,0.0,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,1,2,5,1,0,1,15,,,0.0,,,,,,1,False,,1/182/13789
2,1,182,3145,2020-07-06,0.0,,0.0,,2023-09-04,train,0,8,1104,0.0,0.0,0.0,2020,3,7,28,12,25,2,5,0.0,0.0,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,1,1,5,1,0,1,15,,,0.0,,,,,,1,False,,1/182/3145
3,1,182,3864,2020-07-06,0.0,,0.0,,2023-09-04,train,0,8,1223,0.0,0.0,0.0,2020,3,7,28,12,25,2,5,0.0,0.0,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,1,2,5,1,0,1,15,,,0.0,,,,,,1,False,,1/182/3864
4,1,182,3971,2020-07-06,0.0,,0.0,,2023-09-04,train,0,8,1243,0.0,0.0,0.0,2020,3,7,28,12,25,2,5,0.0,0.0,,,0.0,0.0,,,,,,,,,,,,,,,,,,,,1,3,5,1,0,1,15,,,0.0,,,,,,1,False,,1/182/3971


In [9]:
# How many distinct ids exist, and how many of them were flagged inactive
total_ids = df_input['id_column'].nunique()
inactive_count = inactive_ids.nunique()

# Share of inactive ids in percent; falls back to 0 when there are no ids at all
if total_ids > 0:
    inactive_percentage = (inactive_count / total_ids) * 100
else:
    inactive_percentage = 0

# Report
print(f"Total number of unique ids: {total_ids}")
print(f"Number of inactive ids: {inactive_count}")
print(f"Percentage of inactive ids: {inactive_percentage:.2f}%")

Total number of unique ids: 2936
Number of inactive ids: 198
Percentage of inactive ids: 6.74%


In [10]:
# Floor negative values at 0 in the baseline and prediction columns
for floored_col in ('baseline_filled_sales_ma_13', 'prediction'):
    df_filtered[floored_col] = df_filtered[floored_col].clip(lower=0)

# Prepare submission

In [11]:
# Column whose values feed the submission
value_to_pivot = 'prediction'

# Capitalised names for the three key columns: Client, Warehouse, Product
key_renames = {key: key.capitalize() for key in ('client', 'warehouse', 'product')}

# Build the submission table and rename its key columns.
# prepare_submission is not defined in this notebook — presumably it comes
# from the utils.auxiliar star import; verify.
submission_df = prepare_submission(df_filtered, value_to_pivot).rename(columns=key_renames)

# Show
submission_df.head()

Latest cutoff date selected: 2024-01-01 00:00:00


Unnamed: 0,Client,Warehouse,Product,2024-01-08,2024-01-15,2024-01-22,2024-01-29,2024-02-05,2024-02-12,2024-02-19,2024-02-26,2024-03-04,2024-03-11,2024-03-18,2024-03-25,2024-04-01
0,1,151,10060,0.201816,0.21133,0.207915,0.205235,0.202422,0.199386,0.201566,0.201714,0.201637,0.201637,0.204046,0.211857,0.241658
1,1,151,12377,0.682513,0.670457,0.676705,0.676705,0.672245,0.63326,0.570758,0.665877,0.61408,0.606041,0.606098,0.606098,0.577526
2,1,151,1241,0.076988,0.04,0.043325,0.042059,0.042358,0.04,0.04,0.04,0.04,0.056937,0.056733,0.051479,0.04
3,1,151,5519,2.222952,2.534646,2.534356,2.414274,2.226729,2.342935,2.337138,2.325676,2.283056,2.283056,2.328879,2.328879,2.50437
4,1,151,7358,2.076746,2.107105,2.063094,1.839081,2.014947,2.243422,2.107458,2.007666,1.827871,1.856384,1.574977,1.607884,1.736298


# Final checks

In [12]:
# List the submission columns: a heading line followed by the column names
print("Columns in the submission DataFrame:", submission_df.columns.tolist(), sep="\n")

Columns in the submission DataFrame:
['Client', 'Warehouse', 'Product', '2024-01-08', '2024-01-15', '2024-01-22', '2024-01-29', '2024-02-05', '2024-02-12', '2024-02-19', '2024-02-26', '2024-03-04', '2024-03-11', '2024-03-18', '2024-03-25', '2024-04-01']


In [13]:
# Columns other than the three key columns are counted as predictions
n_key_columns = 3
non_key_columns_count = len(submission_df.columns) - n_key_columns
print(f"Number of predictions: {non_key_columns_count}")

Number of predictions: 13


# Save output

In [14]:
# Short tag identifying the model behind this submission
final_model = 'ml'

In [15]:
# Output path built from the model tag and the run timestamp.
# NOTE(review): the local directory listing shows a lowercase 'submissions'
# folder, so 'Submissions/' presumably relies on a case-insensitive
# filesystem — confirm.
filename = f'Submissions/submission_{final_model}_{formatted_date}.csv'

# Create the target folder if it is missing so the write does not fail
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Write the submission without the DataFrame index
submission_df.to_csv(filename, index=False)