In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.dates import DateFormatter
from pathlib import Path
from src.utils import constants
from src.eda import eda
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from src.models import demand_forecasting
from src.models import evaluate
from src.utils import utils



# Demand Forecasting

Our linear approach to understanding how demand changes with price has a variety of limitations. Let's try to address some of them with a non-linear methodology, in which we fit a two-stage classifier/regressor to model the intermittent demand.

In the first stage we predict the binary occurrence/non-occurrence of demand, and in the second stage we regress the likely quantity given occurrence (similar to Croston's method).

As with the elasticity model, we'll model the demand of each product/subcategory/category at different time aggregations.

In [2]:
# Raw CSV locations, all resolved against the project's raw-data directory.
PRODUCT_PATH = constants.DATA_RAW_DIR / 'products.csv'
INVENTORY_PATH = constants.DATA_RAW_DIR / 'inventory.csv'
TRANSACTIONS_PATH = constants.DATA_RAW_DIR / 'transactions.csv'

# Load the three tables, asking pandas to parse the date-like columns while reading.
products = pd.read_csv(PRODUCT_PATH, parse_dates=["launch_date"])
transactions = pd.read_csv(TRANSACTIONS_PATH, parse_dates=["timestamp"])
inventory = pd.read_csv(INVENTORY_PATH, parse_dates=["date", "restock_date"])

# Convert explicitly as well, so these columns are datetime no matter what
# read_csv inferred for them.
products = products.assign(launch_date=pd.to_datetime(products['launch_date']))
transactions = transactions.assign(
    timestamp=pd.to_datetime(transactions['timestamp']),
    # Rows with no recorded promotion type get an explicit 'No Promotion' label.
    promotion_type=transactions['promotion_type'].fillna('No Promotion'),
)
inventory = inventory.assign(date=pd.to_datetime(inventory['date']))


In [3]:
# Build a single string key of the form "<category_id>_<subcategory_id>" so that
# a subcategory can be referenced by one column.
products['subcategory_identifier'] = (
    products["category_id"]
    .astype(str)
    .str.cat(products["subcategory_id"].astype(str), sep="_")
)

In [4]:
# Product-level forecaster with freq='D' (the predictions shown in the next cell have one
# row per product_id and calendar date).
# No `intermittent` argument is passed, so DemandForecaster's default applies. The training
# log under this cell shows a binary LightGBM fit followed by a regression fit on the
# positive rows only — presumably the two-stage occurrence/quantity model described in the
# introduction (TODO confirm the default value of `intermittent`).

# Columns handed to forecaster.split() as model inputs.
# NOTE(review): in the test predictions displayed in the next cell, `platform` and
# `promotion_type` are 'none' exactly on the zero-demand rows, and pred_prob is ~0 or ~1
# on every visible row. These columns (and possibly `price`, which differs sharply between
# demand and no-demand rows) look like they leak the occurrence target — confirm how
# build_panel() fills them for periods without transactions before trusting the scores.
features = [
    "base_cost","quality_score","avg_competitor_price","price",      # price is included as a model input
    "stock_level","days_in_stock","restock_quantity",
    "month","dow","woy",
    "category_id","brand_id","supplier_id",
    "is_seasonal","is_promotion","promotion_type","platform"
]

forecaster = demand_forecasting.DemandForecaster(
    products, transactions, inventory,
    group_key='product_id', 
    freq='D'
)
# 1) Build the modelling panel (kept in a notebook variable for inspection)
df_panel = forecaster.build_panel()
# 2) Split: `features` is passed explicitly; test_split=0.2 presumably holds out 20% of the
#    panel as the test set — TODO confirm whether the split is chronological
forecaster.split(features, test_split=0.2)
# Expose the split frames stored on the forecaster for ad-hoc inspection
df_train, df_test = forecaster.df_train, forecaster.df_test
# 3) Fit (cv_splits=5 is forwarded to DemandForecaster.fit), then persist the fitted object
forecaster.fit(cv_splits=5)
utils.save_forecaster(forecaster, constants.MODEL_DIR / 'product_by_day_demand_forecaster.pkl')
# 4) Predict on test
product_by_day_results_df = forecaster.predict()      # called with no arguments; presumably scores the internal test split

# 5) Evaluate
# NOTE: `metrics` is reassigned by the later subcategory-level modelling cells, so inspect
# it before running them if the product-level scores are needed.
metrics = forecaster.evaluate()


[LightGBM] [Info] Number of positive: 37326, number of negative: 255174
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004612 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1238
[LightGBM] [Info] Number of data points in the train set: 292500, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000818 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1232
[LightGBM] [Info] Number of data points in the train set: 37326, number of used features: 22
[LightGBM] [Info] Start training from score 1.166426


In [5]:
# Inspect the product-by-day test predictions: actual demand_qty next to the model outputs
# pred_prob, pred_occurrence, pred_qty and forecast (pandas truncates the rendered frame).
product_by_day_results_df

Unnamed: 0,product_id,period,demand_qty,is_promotion,price,promotion_type,platform,occurrence,stock_level,days_in_stock,...,is_seasonal,launch_date,subcategory_identifier,month,dow,woy,pred_prob,pred_occurrence,pred_qty,forecast
0,PROD00001,2023-08-18,0.0,0.0,143.998778,none,none,0.0,179,594,...,0,2021-10-16,4_2,8,4,33,0.000026,0,3.231069,0.000000
1,PROD00001,2023-09-03,0.0,0.0,143.998778,none,none,0.0,200,610,...,0,2021-10-16,4_2,9,6,35,0.000026,0,3.604229,0.000000
2,PROD00001,2023-09-13,3.0,0.0,71.560000,No Promotion,Mobile App,1.0,178,620,...,0,2021-10-16,4_2,9,2,37,0.999974,1,3.116535,3.116535
3,PROD00001,2023-08-14,0.0,0.0,143.998778,none,none,0.0,200,590,...,0,2021-10-16,4_2,8,0,33,0.000026,0,3.540977,0.000000
4,PROD00001,2023-12-27,1.0,0.0,80.990000,No Promotion,Web,1.0,187,725,...,0,2021-10-16,4_2,12,2,52,0.999974,1,3.059199,3.059199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72495,PROD00500,2023-10-12,0.0,0.0,154.516278,none,none,0.0,147,649,...,1,2020-04-28,1_2,10,3,41,0.000026,0,3.497060,0.000000
72496,PROD00500,2023-12-13,0.0,0.0,154.516278,none,none,0.0,183,711,...,1,2020-04-28,1_2,12,2,50,0.000026,0,3.543694,0.000000
72497,PROD00500,2023-12-29,3.0,0.0,94.320000,No Promotion,Web,1.0,184,727,...,1,2020-04-28,1_2,12,4,52,0.999974,1,3.282776,3.282776
72498,PROD00500,2023-09-08,0.0,0.0,154.516278,none,none,0.0,175,615,...,1,2020-04-28,1_2,9,4,36,0.000026,0,3.524054,0.000000


In [6]:
# Subcategory-level forecaster with freq='W' and intermittent=False. The training log under
# this cell shows a single regression fit (no binary stage).
# NOTE: this cell rebinds `features`, `forecaster`, `df_panel`, `df_train`, `df_test` and
# `metrics` from the product-level cell, and its `results_df`/`metrics` are in turn
# overwritten by the subcategory-by-day cell that follows.

# Same inputs as the product-level list minus "category_id".
# NOTE(review): with weekly periods a day-of-week feature ("dow") is presumably constant —
# confirm it carries any signal at this aggregation.
features = [
    "base_cost","quality_score","avg_competitor_price","price",      # price is included as a model input
    "stock_level","days_in_stock","restock_quantity",
    "month","dow","woy", "brand_id","supplier_id",
    "is_seasonal","is_promotion","promotion_type","platform"
]

forecaster = demand_forecasting.DemandForecaster(
    products, transactions, inventory,
    group_key='subcategory_identifier',  # the "<category_id>_<subcategory_id>" string key built earlier on `products`
    freq='W',
    intermittent= False
)
# 1) Build the modelling panel (kept in a notebook variable for inspection)
df_panel = forecaster.build_panel()
# 2) Split: `features` is passed explicitly; test_split=0.2 presumably holds out 20% of the
#    panel as the test set — TODO confirm whether the split is chronological
forecaster.split(features, test_split=0.2)
# Expose the split frames stored on the forecaster for ad-hoc inspection
df_train, df_test = forecaster.df_train, forecaster.df_test
# 3) Fit
forecaster.fit(cv_splits=5)

# Persist the fitted forecaster under MODEL_DIR
utils.save_forecaster(forecaster, constants.MODEL_DIR / 'subcat_by_week_demand_forecaster.pkl')
# 4) Predict on test
results_df = forecaster.predict()      # called with no arguments; presumably scores the internal test split

# 5) Evaluate
metrics = forecaster.evaluate()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000163 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 571
[LightGBM] [Info] Number of data points in the train set: 1700, number of used features: 12
[LightGBM] [Info] Start training from score 4.264997


In [7]:
# Subcategory-level forecaster with freq='D' and intermittent=False. The training log under
# this cell shows a single regression fit (no binary stage).
# NOTE: `results_df` and `metrics` assigned here replace the values produced by the
# subcategory-by-week cell; only the pickled weekly forecaster survives from that run.

# Same feature list as the subcategory-by-week cell (no "category_id").
features = [
    "base_cost","quality_score","avg_competitor_price","price",      # price is included as a model input
    "stock_level","days_in_stock","restock_quantity",
    "month","dow","woy", "brand_id","supplier_id",
    "is_seasonal","is_promotion","promotion_type","platform"
]

forecaster = demand_forecasting.DemandForecaster(
    products, transactions, inventory,
    group_key='subcategory_identifier',  # the "<category_id>_<subcategory_id>" string key built earlier on `products`
    freq='D',
    intermittent= False
)
# 1) Build the modelling panel (kept in a notebook variable for inspection)
df_panel = forecaster.build_panel()
# 2) Split: `features` is passed explicitly; test_split=0.2 presumably holds out 20% of the
#    panel as the test set — TODO confirm whether the split is chronological
forecaster.split(features, test_split=0.2)
# Expose the split frames stored on the forecaster for ad-hoc inspection
df_train, df_test = forecaster.df_train, forecaster.df_test
# 3) Fit
forecaster.fit(cv_splits=5)

# Persist the fitted forecaster under MODEL_DIR
utils.save_forecaster(forecaster, constants.MODEL_DIR / 'subcat_by_day_demand_forecaster.pkl')
# 4) Predict on test
results_df = forecaster.predict()      # called with no arguments; presumably scores the internal test split

# 5) Evaluate
metrics = forecaster.evaluate()

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000471 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 764
[LightGBM] [Info] Number of data points in the train set: 11269, number of used features: 20
[LightGBM] [Info] Start training from score 2.364060


In [8]:
# Show the metrics from the most recent evaluate() call. `metrics` is reassigned in every
# modelling cell, so after a top-to-bottom run this is the subcategory-by-day model only;
# the product-by-day and subcategory-by-week scores are not retained.
metrics

{'rmse': 5.4169687640731015}