In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb


In [12]:
import joblib
joblib.dump(model, "../Data/lgbm_model.pkl") 

['../Data/lgbm_model.pkl']

In [8]:
pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [3]:
dataset = pd.read_csv("../Data/data/demand.csv")
dataset['date'] = pd.to_datetime(dataset['date'])
dataset.sort_values(['sku','warehouse','date'], inplace=True)

In [4]:
dataset['day_of_the_week'] = dataset['date'].dt.dayofweek
dataset['lag_1'] = dataset.groupby(['sku', 'warehouse'])['demand_units'].shift(1)
dataset['rolling_7'] = dataset.groupby(['sku', 'warehouse'])['demand_units'].transform(lambda x: x.shift(1).rolling(7).mean())
dataset.dropna(inplace=True)

In [5]:
#TrainingModel

Features  = ['rolling_7', 'day_of_the_week', 'lag_1']
X = dataset[Features]
y = dataset['demand_units']

X_train, X_test, y_train, y_test = train_test_split(X,y, shuffle=False, test_size=0.3)
model = lgb.LGBMRegressor(learning_rate=0.1, random_state=42, n_estimators=600, max_depth=6, min_child_samples=25)
model.fit(X_train,y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006883 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 517
[LightGBM] [Info] Number of data points in the train set: 325780, number of used features: 3
[LightGBM] [Info] Start training from score 276.169768


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,6
,learning_rate,0.1
,n_estimators,600
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [6]:
y_pred = model.predict(X_test)
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

RMSE: 16.19969698772664


In [7]:
print("Demand stats:")
print(dataset['demand_units'].describe())


Demand stats:
count    465400.000000
mean        282.152018
std         123.763783
min           0.000000
25%         178.000000
50%         286.000000
75%         388.000000
max         567.000000
Name: demand_units, dtype: float64


In [8]:
def forecast_tomorrow(sku_id, dataset, model):
    sku_df = dataset[dataset['sku'] == sku_id].sort_values('date').copy()
    if len(sku_df) < 8:
        return "Not enough data"
    
    last_row = sku_df.iloc[-1]
    last_date = last_row['date'] + pd.Timedelta(days=1)
    
    features = {
        'rolling_7': sku_df['demand_units'].shift(1).rolling(7).mean().iloc[-1],
        'day_of_week': last_date.dayofweek,
        'lag_1': last_row['demand_units'],
    }
    
    X_next = pd.DataFrame([features])
    pred = model.predict(X_next)[0]
    
    return f"Predicted demand for {sku_id} on {last_date.date()}: {pred:.2f} units"



In [9]:
Necessary_demand = forecast_tomorrow("P1079", dataset, model)
print(Necessary_demand)

Predicted demand for P1079 on 2025-07-23: 457.88 units


In [11]:
#Saving forecast to input into our Optimizer 
forecast_df = X_test.copy()
forecast_df['predicted_demand'] = y_pred
forecast_df['sku'] = dataset.loc[X_test.index, 'sku'].values
forecast_df['warehouse'] = dataset.loc[X_test.index, 'warehouse'].values
forecast_df['date'] = dataset.loc[X_test.index, 'date'].values

forecast_df = forecast_df[['sku', 'warehouse', 'date', 'predicted_demand']]
forecast_df.to_csv("../data/forecasted_demand.csv", index=False)
