## STEP 1:  IMPORT LIBRARIES

In [72]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

## STEP 2: LOAD THE PREPARED DATA

In [73]:
df = pd.read_csv("../Data/final_prepared.csv", parse_dates=["date"])
print(df.head())

  store_id       date product_id      product_name_x  units_sold  revenue  \
0     S001 2025-01-05       P001     Coca-Cola 500ml          33  1689.58   
1     S001 2025-01-05       P002         Pepsi 500ml          25  2394.15   
2     S001 2025-01-05       P003     Lays Chips 100g          15   407.39   
3     S001 2025-01-05       P004  Doritos Nacho 100g          44  2055.15   
4     S001 2025-01-05       P005   Oreo Biscuit 120g           6    73.59   

      cost  price_x      product_name_y   category  price_y    store_name  \
0  1091.23    51.20     Coca-Cola 500ml  Beverages       35  Mumbai Store   
1  1529.87    95.77         Pepsi 500ml  Beverages       34  Mumbai Store   
2   291.21    27.16     Lays Chips 100g     Snacks       20  Mumbai Store   
3  1596.89    46.71  Doritos Nacho 100g     Snacks       25  Mumbai Store   
4    57.43    12.26   Oreo Biscuit 120g   Biscuits       30  Mumbai Store   

         city  store_type  week  month  profit  profit_margin  \
0  Metro 

## STEP 3: Select Features and Target

In [74]:
numerical_features = ["revenue", "cost", "profit", "profit_margin", "revenue_per_unit"]

categorical_features = ["store_type", "category", "city", "month", "week"]

target = "units_sold"

X = df[numerical_features + categorical_features]
y = df[target]

## STEP 4: Train_Test_Split

In [75]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print(x_train.shape)
print(x_test.shape)

(192, 10)
(48, 10)


## STEP 5: Built Preprocessing + Model Pipeline

In [76]:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numerical_features),
    ]
)

model = RandomForestRegressor(
    n_estimators=120,
    max_depth=8,
    random_state=42,
)

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model)
])

## STEP 6: Train Model

In [77]:
pipeline.fit(x_train, y_train)
print("Model Training Completed")

Model Training Completed


## STEP 7: Evaluate Model

In [78]:
y_pred = pipeline.predict(x_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Model Performance Metrics:")
print(f"MAE  : {mae:.3f}")
print(f"RMSE  : {rmse:.3f}")
print(f"R*2  : {r2:.3f}")

Model Performance Metrics:
MAE  : 1.753
RMSE  : 2.417
R*2  : 0.954


In [85]:
import os
os.makedirs("../src/models", exist_ok=True)

import joblib
joblib.dump(pipeline, "../src/models/pipeline_units.pkl")

print(" Pipeline saved successfully!")


 Pipeline saved successfully!
