In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df=pd.read_csv('retail_store_inventory.csv')

df.head(10)

df.drop(columns=['Category','Region'], inplace=True)

df.drop(columns=['Holiday/Promotion'], inplace=True)

df.isnull().sum()

df.dropna(inplace=True)

df.head()

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Feature and target separation
X = df.drop(['Discount', 'Demand Forecast'], axis=1)
y_discount = df['Discount']
y_demand = df['Demand Forecast']

# Identify categorical and numerical columns
categorical_cols = ['Store ID', 'Product ID', 'Weather Condition', 'Seasonality']
numerical_cols = ['Inventory Level', 'Units Sold', 'Units Ordered', 'Price', 'Competitor Pricing']

# Define preprocessor (only fit once)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Train-test split
X_train, X_test, y_train_d, y_test_d = train_test_split(X, y_discount, test_size=0.2, random_state=42)
_, _, y_train_f, y_test_f = train_test_split(X, y_demand, test_size=0.2, random_state=42)

# Fit preprocessor on the entire training data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Use the same transformed data for both targets
X_train_d = X_train_transformed
X_train_f = X_train_transformed
X_test_d = X_test_transformed
X_test_f = X_test_transformed

X_train_d.toarray()

X_test_d.shape

from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Function to evaluate models
def evaluate_model(y_true, y_pred, model_name):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    accuracy = 100 - mape
    print(f"{model_name} - MAE: {mae:.2f}, RMSE: {rmse:.2f}, R²: {r2:.4f}, Accuracy: {accuracy:.2f}%")

# Base models for stacking
base_models = [
    ('lgbm', LGBMRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, subsample=0.8)),
    ('xgb', XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, subsample=0.8)),
    ('rf', RandomForestRegressor(n_estimators=200, max_depth=10)),
    ('gb', GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=5))
]

# Meta-model (Linear Regression for final prediction)
meta_model = LinearRegression()

# Stacking Regressors for Demand and Discount
demand_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)
discount_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)


In [2]:
df


Unnamed: 0,Date,Store ID,Product ID,Inventory Level,Units Sold,Units Ordered,Demand Forecast,Price,Discount,Weather Condition,Competitor Pricing,Seasonality,Base Cost
0,2022-01-01,S001,P0001,231,127,55,135.47,33.50,20,Rainy,29.69,Autumn,25.1250
1,2022-01-01,S001,P0002,204,150,66,144.04,63.01,20,Sunny,66.16,Autumn,47.2575
2,2022-01-01,S001,P0003,102,65,51,74.02,27.99,10,Sunny,31.32,Summer,20.9925
3,2022-01-01,S001,P0004,469,61,164,62.18,32.72,10,Cloudy,34.74,Autumn,24.5400
4,2022-01-01,S001,P0005,166,14,135,9.26,73.64,0,Sunny,68.95,Summer,55.2300
...,...,...,...,...,...,...,...,...,...,...,...,...,...
73095,2024-01-01,S005,P0016,96,8,127,18.46,73.73,20,Snowy,72.45,Winter,55.2975
73096,2024-01-01,S005,P0017,313,51,101,48.43,82.57,10,Cloudy,83.78,Autumn,61.9275
73097,2024-01-01,S005,P0018,278,36,151,39.65,11.11,10,Rainy,10.91,Winter,8.3325
73098,2024-01-01,S005,P0019,374,264,21,270.52,53.14,20,Rainy,55.80,Spring,39.8550


In [3]:
demand_model.fit(X_train_f, y_train_f)
discount_model.fit(X_train_d, y_train_d)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002094 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1265
[LightGBM] [Info] Number of data points in the train set: 58480, number of used features: 38
[LightGBM] [Info] Start training from score 141.776563
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000257 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1265
[LightGBM] [Info] Number of data points in the train set: 46784, number of used features: 38
[LightGBM] [Info] Start training from score 141.474872




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000789 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1265
[LightGBM] [Info] Number of data points in the train set: 46784, number of used features: 38
[LightGBM] [Info] Start training from score 141.926765




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000725 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1266
[LightGBM] [Info] Number of data points in the train set: 46784, number of used features: 38
[LightGBM] [Info] Start training from score 141.956782




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000291 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1264
[LightGBM] [Info] Number of data points in the train set: 46784, number of used features: 38
[LightGBM] [Info] Start training from score 141.763652




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000270 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1265
[LightGBM] [Info] Number of data points in the train set: 46784, number of used features: 38
[LightGBM] [Info] Start training from score 141.760745




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002813 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1265
[LightGBM] [Info] Number of data points in the train set: 58480, number of used features: 38
[LightGBM] [Info] Start training from score 10.000427
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001125 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1265
[LightGBM] [Info] Number of data points in the train set: 46784, number of used features: 38
[LightGBM] [Info] Start training from score 10.000107
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002059 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1265
[LightGBM] [Info] Number of data points in the train s



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001019 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1266
[LightGBM] [Info] Number of data points in the train set: 46784, number of used features: 38
[LightGBM] [Info] Start training from score 9.980335








[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001558 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1264
[LightGBM] [Info] Number of data points in the train set: 46784, number of used features: 38
[LightGBM] [Info] Start training from score 10.036230




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001865 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1265
[LightGBM] [Info] Number of data points in the train set: 46784, number of used features: 38
[LightGBM] [Info] Start training from score 9.986213




0,1,2
,estimators,"[('lgbm', ...), ('xgb', ...), ...]"
,final_estimator,LinearRegression()
,cv,
,n_jobs,
,passthrough,False
,verbose,0

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,5
,learning_rate,0.05
,n_estimators,200
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,loss,'squared_error'
,learning_rate,0.05
,n_estimators,200
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,5
,min_impurity_decrease,0.0

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [4]:
y_demand_pred = demand_model.predict(X_test_f)
y_discount_pred = discount_model.predict(X_test_d)

# Evaluate models
evaluate_model(y_test_f, y_demand_pred, 'Ensemble Demand Forecast Model')
evaluate_model(y_test_d, y_discount_pred, 'Ensemble Discount Model')



Ensemble Demand Forecast Model - MAE: 7.49, RMSE: 8.68, R²: 0.9937, Accuracy: 57.46%
Ensemble Discount Model - MAE: 5.96, RMSE: 7.04, R²: -0.0001, Accuracy: -inf%


In [5]:
import joblib


In [6]:
joblib.dump(demand_model, 'demand_forecast_model.pkl')
joblib.dump(discount_model, 'discount_prediction_model.pkl')

print("Models saved successfully!")

import joblib

# Load trained models
demand_model = joblib.load('demand_forecast_model.pkl')
discount_model = joblib.load('discount_prediction_model.pkl')


Models saved successfully!


In [7]:
import pandas as pd

data = pd.DataFrame([
    {
        'Date': '01-01-2023',
        'Store ID': 'S001',
        'Product ID': 'P0001',
        'Inventory Level': 231.0,
        'Units Sold': 127.0,
        'Units Ordered': 55.0,
        'Price': 33.50,
        'Weather Condition': 'Rainy',

        'Competitor Pricing': 29.69,
        'Seasonality': 'Autumn'
    },
    {
        'Date': '01-01-2023',
        'Store ID': 'S001',
        'Product ID': 'P0002',
        'Inventory Level': 204.0,
        'Units Sold': 150.0,
        'Units Ordered': 66.0,
        'Price': 63.01,
        'Weather Condition': 'Sunny',

        'Competitor Pricing': 66.16,
        'Seasonality': 'Autumn'
    }
])

# Drop target columns if present
input_features = data.drop(columns=['Demand_Forecast', 'Discount'], errors='ignore')

print(input_features)

input_features=preprocessor.transform(input_features)
predicted_demand=demand_model.predict(input_features)
predicted_discount=discount_model.predict(input_features)

preprocessor.feature_names_in_

         Date Store ID Product ID  Inventory Level  Units Sold  Units Ordered  \
0  01-01-2023     S001      P0001            231.0       127.0           55.0   
1  01-01-2023     S001      P0002            204.0       150.0           66.0   

   Price Weather Condition  Competitor Pricing Seasonality  
0  33.50             Rainy               29.69      Autumn  
1  63.01             Sunny               66.16      Autumn  




array(['Date', 'Store ID', 'Product ID', 'Inventory Level', 'Units Sold',
       'Units Ordered', 'Price', 'Weather Condition',
       'Competitor Pricing', 'Seasonality', 'Base Cost'], dtype=object)

In [8]:
import joblib
joblib.dump(preprocessor, 'preprocessor.pkl')


['preprocessor.pkl']