Sales Forecast (not time series) + weather data updated

Start modelling: Linear, Polynomial, XGBoost, Neural Networks; then the data needs to be manipulated (no sales duplicate data); after that, time series can be done

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the merged weather / store subcluster / holiday / sales table
df = pd.read_csv(r" ... Master Weather + Store Subcluster + Holiday + Sales.csv")

# ───────────────────────────────────────────────────────────────────────────────
# 1) Preparing data for regression
# ───────────────────────────────────────────────────────────────────────────────
# Categorical columns to label-encode
cat_cols = ['Store_No','Name', 'State', 'Day', 'CODE (subcluster 1)',
            'CODE FY26 1 (subcluster 2)', 'CODE FY26 2 (subcluster 3)', 'Rain?','Public Holiday']

# Fill missing values in these columns only
df[cat_cols] = df[cat_cols].fillna("Missing")

# Encode: a fresh encoder per column, fitted on the string form of its values
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

# Making sure PH, Days From Holiday, Puasa Count are all integers.
# Each column is cast from its OWN values; indexing with `col` (left over from
# the encoding loop above) would copy the encoded 'Public Holiday' column over
# all three and silently destroy the other two features.
# NOTE(review): astype(int) raises on NaN / non-numeric entries — confirm that
# 'Days From Holiday' and 'Puasa Count' are clean in the CSV.
object_cols = ["Public Holiday", "Days From Holiday","Puasa Count"]
for cols in object_cols:
    df[cols] = df[cols].astype(int)

# Using a numeric placeholder (0) instead of the "No Sales" string
df[['Net_Amount', 'TC']] = df[['Net_Amount', 'TC']].replace("No Sales", 0).astype(float)

# Dropping datetime columns
df = df.drop(columns=['Date','Opening_Date'])  # if not useful

# Feature matrix: everything except the two targets
X = df.drop(['Net_Amount', 'TC'], axis=1)

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# 2) Implementing Polynomial Regression
# ───────────────────────────────────────────────────────────────────────────────
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd

# Two targets are predicted jointly: net sales amount and TC
y = df[['Net_Amount','TC']]

# Continuous columns that get standardized (the label-encoded columns are left as-is)
numeric_cols = [
    'Average Daily Temperature (°C)',
    'Days From Holiday',
    'Puasa Count',
    'Days_after_Opening'
]

# Split before scaling so the scaler never sees the test rows (no leakage)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# StandardScaler: fit on the training rows only, then apply the same
# transformation to the test rows
std_scaler = StandardScaler()
X_train_std = X_train.copy()
X_test_std  = X_test.copy()

X_train_std[numeric_cols] = std_scaler.fit_transform(X_train[numeric_cols])
X_test_std[numeric_cols]  = std_scaler.transform(X_test[numeric_cols])

# Polynomial regression pipeline, wrapped so one model is fitted per target column
degree = 2  # You can tune this
poly_model = make_pipeline(PolynomialFeatures(degree=degree), LinearRegression())
multi_model = MultiOutputRegressor(poly_model)

# Fit on the standardized features; the scaled frames were prepared above for
# exactly this purpose and must be used for both fitting and prediction
multi_model.fit(X_train_std, y_train)

# Performance on the held-out (standardized) test rows
y_pred = multi_model.predict(X_test_std)

for i, col in enumerate(y.columns):
    y_true = y_test.iloc[:, i]
    y_hat  = y_pred[:, i]

    rmse = np.sqrt(mean_squared_error(y_true, y_hat))
    r2   = r2_score(y_true, y_hat)

    print(f"{col} → RMSE: {rmse:.2f} | R²: {r2:.3f}")


# my laptop can't run gridsearchcv cuz it takes too much memory
# multi_base = MultiOutputRegressor(base_pipeline)

# # 3) Define a neg‐RMSE scorer (so lower RMSE is “better” for GridSearch)
# rmse_scorer = make_scorer(
#     lambda yt, yp: np.sqrt(mean_squared_error(yt, yp)),
#     greater_is_better=False
# )

# # 4) Parameter grid: tuning degree of polynomial (and intercept if you like)
# param_grid = {
#     'estimator__polynomialfeatures__degree':       [1, 2, 3, 4],
#     'estimator__linearregression__fit_intercept':  [True, False]
# }

# # 5) Set up GridSearchCV
# grid = GridSearchCV(
#     estimator = multi_base,
#     param_grid = param_grid,
#     scoring    = {'rmse': rmse_scorer, 'r2': 'r2'},
#     refit      = 'rmse',   # after CV, refit on whole train set using best RMSE
#     cv         = 5,
#     verbose    = 1,
#     n_jobs     = -1
# )

# # 6) Run grid search
# grid.fit(X_train, y_train)

# # 7) Evaluate best model on the test set
# best_model = grid.best_estimator_
# y_pred_gs  = best_model.predict(X_test)

# print("Best parameters:", grid.best_params_)
# print("\n=== Performance on TEST set ===")
# for i, col in enumerate(y.columns):
#     rmse = np.sqrt(mean_squared_error(y_test.iloc[:, i], y_pred_gs[:, i]))
#     r2   = r2_score(y_test.iloc[:, i],       y_pred_gs[:, i])
#     print(f"{col:12s} → RMSE: {rmse:8.2f} | R²: {r2:.3f}")

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# 3) XGBOOST Regressor for Net_Amount
# ───────────────────────────────────────────────────────────────────────────────
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import pandas as pd

# Target for this cell: net sales amount
y = df['Net_Amount']

# 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline model with hand-picked settings
model_net = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
)
model_net.fit(X_train, y_train)

# Baseline predictions and hold-out metrics
y_pred = model_net.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"[Net_Amount] RMSE: {rmse}")
print(f"[Net_Amount] R-squared: {r2}")
print(f"[Net_Amount] MAE: {mae}")

# Hyperparameter grid (3 * 3 * 2 * 2 * 2 = 72 combinations)
params = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
}

# Exhaustive search with 3-fold CV, ranked by negated RMSE
grid = GridSearchCV(
    estimator=xgb.XGBRegressor(objective='reg:squarederror'),
    param_grid=params,
    scoring='neg_root_mean_squared_error',
    cv=3,
)
grid.fit(X_train, y_train)

# Re-score the hold-out rows with the best estimator found
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"[Net_Amount] Tuned RMSE: {rmse}")
print(f"[Net_Amount] Tuned R²: {r2}")
print(f"[Net_Amount] Tuned MAE: {mae}")
print(f"[Net_Amount] Target Std Dev: {y.std()}")

# Relative errors: RMSE against the test-set spread, MAE against the overall target mean
std_dev = y_test.std()
ratio_sd = rmse / std_dev
ratio_mean = mae / y.mean()

print(f"[Net_Amount] RMSE / SD ratio: {ratio_sd:.2f}")
print(f"[Net_Amount] MAE / mean ratio: {ratio_mean:.2f}")

# Interpretation for RMSE (each branch is only reached when the earlier ones failed)
if ratio_sd < 0.5:
    print("🟢 Excellent RMSE")
elif ratio_sd < 0.75:
    print("🟢 Good RMSE")
elif ratio_sd <= 1.0:
    print("🟡 Acceptable RMSE, but can be improved")
else:
    print("🔴 Poor RMSE – model performs worse than mean prediction")

# Interpretation for R-squared
if r2 >= 0.9:
    print("🟢 Excellent R²")
elif r2 >= 0.75:
    print("🟢 Good R²")
elif r2 >= 0.5:
    print("🟡 Acceptable R², but can be improved")
else:
    print("🔴 Weak R² – model needs improvement")

# Interpretation for MAE
if ratio_mean < 0.05:
    print("🟢 Excellent MAE")
elif ratio_mean < 0.10:
    print("🟢 Good MAE")
elif ratio_mean < 0.20:
    print("🟡 Acceptable MAE, but can be improved")
elif ratio_mean <= 0.30:
    print("🔴 Weak MAE, high prediction error")
else:
    print("🔴 Poor MAE – likely underfitting/noisy data")

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# 3) XGBOOST Regressor for TC
# ───────────────────────────────────────────────────────────────────────────────

# Target for this cell: TC
y = df['TC']

# 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline model with hand-picked settings
model_tc = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
)
model_tc.fit(X_train, y_train)

# Baseline predictions and hold-out metrics
y_pred = model_tc.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"[TC] RMSE: {rmse}")
print(f"[TC] R-squared: {r2}")
print(f"[TC] MAE: {mae}")

# Hyperparameter tuning; `params` is the grid defined in the Net_Amount XGBoost cell
grid = GridSearchCV(
    estimator=xgb.XGBRegressor(objective='reg:squarederror'),
    param_grid=params,
    scoring='neg_root_mean_squared_error',
    cv=3,
)
grid.fit(X_train, y_train)

# Re-score the hold-out rows with the best estimator found
best_net = grid.best_estimator_
y_pred = best_net.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"[TC] Tuned RMSE: {rmse}")
print(f"[TC] Tuned R²: {r2}")
print(f"[TC] Tuned MAE: {mae}")
print(f"[TC] Target Std Dev: {y.std()}")

# Relative errors: RMSE against the test-set spread, MAE against the overall target mean
std_dev = y_test.std()
ratio = rmse / std_dev
ratio_mean = mae / y.mean()

print(f"[TC] RMSE / SD ratio: {ratio:.2f}")
print(f"[TC] MAE / mean ratio: {ratio_mean:.2f}")

# Interpretation for RMSE (each branch is only reached when the earlier ones failed)
if ratio < 0.5:
    print("🟢 Excellent RMSE")
elif ratio < 0.75:
    print("🟢 Good RMSE")
elif ratio <= 1.0:
    print("🟡 Acceptable RMSE, but can be improved")
else:
    print("🔴 Poor RMSE – model performs worse than mean prediction")

# Interpretation for R-squared
if r2 >= 0.9:
    print("🟢 Excellent R²")
elif r2 >= 0.75:
    print("🟢 Good R²")
elif r2 >= 0.5:
    print("🟡 Acceptable R², but can be improved")
else:
    print("🔴 Weak R² – model needs improvement")

# Interpretation for MAE
if ratio_mean < 0.05:
    print("🟢 Excellent MAE")
elif ratio_mean < 0.10:
    print("🟢 Good MAE")
elif ratio_mean < 0.20:
    print("🟡 Acceptable MAE, but can be improved")
elif ratio_mean <= 0.30:
    print("🔴 Weak MAE, high prediction error")
else:
    print("🔴 Poor MAE – likely underfitting/noisy data")

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# 3) LightGBM for Net Amount (1st)
# ───────────────────────────────────────────────────────────────────────────────
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# FEATURES: set Store_No, CODE FY26 1 2 3, Name, state, day as categories


# Target for this cell: net sales amount
y = df['Net_Amount']

# 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline LightGBM model on the label-encoded features
model_net = LGBMRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=7,
)
model_net.fit(X_train, y_train)

# Baseline predictions and hold-out metrics
y_pred = model_net.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"[Net_Amount] MAE: {mae:.2f}")
print(f"[Net_Amount] RMSE: {rmse:.2f}")
print(f"[Net_Amount] R²: {r2:.2f}")

# Hyperparameter grid (2 * 2 * 2 * 2 = 16 combinations)
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [5, 7],
    'subsample': [0.8, 1.0],
}

# Exhaustive search with 3-fold CV, ranked by negated MAE, using all cores
grid = GridSearchCV(
    estimator=LGBMRegressor(),
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Re-score the hold-out rows with the best estimator found
best_net = grid.best_estimator_
y_pred = best_net.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"[Net_Amount] Best Params: {grid.best_params_}")
print(f"[Net_Amount] Tuned MAE: {mae:.2f}")
print(f"[Net_Amount] Tuned RMSE: {rmse:.2f}")
print(f"[Net_Amount] Tuned R²: {r2:.2f}")

print(f"[Net_Amount] Target Std Dev: {y.std()}")

# Relative errors: RMSE against the test-set spread, MAE against the overall target mean
std_dev = y_test.std()
ratio = rmse / std_dev
ratio_mean = mae / y.mean()

print(f"[Net_Amount] RMSE / SD ratio: {ratio:.2f}")
print(f"[Net_Amount] MAE / mean ratio: {ratio_mean:.2f}")

# Interpretation for RMSE (each branch is only reached when the earlier ones failed)
if ratio < 0.5:
    print("🟢 Excellent RMSE")
elif ratio < 0.75:
    print("🟢 Good RMSE")
elif ratio <= 1.0:
    print("🟡 Acceptable RMSE, but can be improved")
else:
    print("🔴 Poor RMSE – model performs worse than mean prediction")

# Interpretation for R-squared
if r2 >= 0.9:
    print("🟢 Excellent R²")
elif r2 >= 0.75:
    print("🟢 Good R²")
elif r2 >= 0.5:
    print("🟡 Acceptable R², but can be improved")
else:
    print("🔴 Weak R² – model needs improvement")

# Interpretation for MAE
if ratio_mean < 0.05:
    print("🟢 Excellent MAE")
elif ratio_mean < 0.10:
    print("🟢 Good MAE")
elif ratio_mean < 0.20:
    print("🟡 Acceptable MAE, but can be improved")
elif ratio_mean <= 0.30:
    print("🔴 Weak MAE, high prediction error")
else:
    print("🔴 Poor MAE – likely underfitting/noisy data")

import matplotlib.pyplot as plt

import pandas as pd

# Pair every feature with the tuned model's importance score, sorted ascending
importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': best_net.feature_importances_}
).sort_values('Importance')

# Horizontal bar chart of the sorted importances
plt.figure(figsize=(10, 6))
plt.barh(y=importance_df['Feature'], width=importance_df['Importance'])
plt.title('Sorted Feature Importance - Net_Amount', fontsize=14, fontweight='bold')
plt.xlabel('Importance', fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# 3) LightGBM for Net Amount (2nd)
# ───────────────────────────────────────────────────────────────────────────────
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Target for this cell: net sales amount
y = df['Net_Amount']

# 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Columns handed to LightGBM as categorical features
cat_cols = [
    'Store_No',
    'CODE (subcluster 1)',
    'CODE FY26 1 (subcluster 2)',
    'CODE FY26 2 (subcluster 3)',
    'Name',
    'State',
    'Day',
]

# Switch those columns to pandas 'category' dtype in both splits
for frame in (X_train, X_test):
    for col in cat_cols:
        frame[col] = frame[col].astype('category')

categorical_features = list(cat_cols)

# Baseline model; the test split is also passed as the evaluation set
model_net = LGBMRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=7,
)  # try to change this also
model_net.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    categorical_feature=categorical_features,
)

# Baseline predictions and hold-out metrics
y_pred = model_net.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"[Net_Amount] MAE: {mae:.2f}")
print(f"[Net_Amount] RMSE: {rmse:.2f}")
print(f"[Net_Amount] R²: {r2:.2f}")

# Hyperparameter grid (2 * 3 * 2 * 2 * 3 * 2 = 144 combinations)
param_grid = {
    'num_leaves': [31, 50],
    'max_depth': [-1, 10, 20],
    'learning_rate': [0.05, 0.1],
    'n_estimators': [100, 200],
    'reg_lambda': [0.0, 0.5, 1.0],
    'min_child_samples': [20, 50],
}

# Exhaustive search with 3-fold CV, ranked by negated MAE, using all cores
grid = GridSearchCV(
    estimator=LGBMRegressor(),
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Re-score the hold-out rows with the best estimator found
best_net = grid.best_estimator_
y_pred = best_net.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"[Net_Amount] Best Params: {grid.best_params_}")
print(f"[Net_Amount] Tuned MAE: {mae:.2f}")
print(f"[Net_Amount] Tuned RMSE: {rmse:.2f}")
print(f"[Net_Amount] Tuned R²: {r2:.2f}")

print(f"[Net_Amount] Target Std Dev: {y.std()}")

# Relative errors: RMSE against the test-set spread, MAE against the overall target mean
std_dev = y_test.std()
ratio = rmse / std_dev
ratio_mean = mae / y.mean()

print(f"[Net_Amount] RMSE / SD ratio: {ratio:.2f}")
print(f"[Net_Amount] MAE / mean ratio: {ratio_mean:.2f}")

# Interpretation for RMSE (each branch is only reached when the earlier ones failed)
if ratio < 0.5:
    print("🟢 Excellent RMSE")
elif ratio < 0.75:
    print("🟢 Good RMSE")
elif ratio <= 1.0:
    print("🟡 Acceptable RMSE, but can be improved")
else:
    print("🔴 Poor RMSE – model performs worse than mean prediction")

# Interpretation for R-squared
if r2 >= 0.9:
    print("🟢 Excellent R²")
elif r2 >= 0.75:
    print("🟢 Good R²")
elif r2 >= 0.5:
    print("🟡 Acceptable R², but can be improved")
else:
    print("🔴 Weak R² – model needs improvement")

# Interpretation for MAE
if ratio_mean < 0.05:
    print("🟢 Excellent MAE")
elif ratio_mean < 0.10:
    print("🟢 Good MAE")
elif ratio_mean < 0.20:
    print("🟡 Acceptable MAE, but can be improved")
elif ratio_mean <= 0.30:
    print("🔴 Weak MAE, high prediction error")
else:
    print("🔴 Poor MAE – likely underfitting/noisy data")

import matplotlib.pyplot as plt

import pandas as pd

# Pair every feature with the tuned model's importance score, sorted ascending
importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': best_net.feature_importances_}
).sort_values('Importance')

# Horizontal bar chart of the sorted importances
plt.figure(figsize=(10, 6))
plt.barh(y=importance_df['Feature'], width=importance_df['Importance'])
plt.title('Sorted Feature Importance - Net_Amount', fontsize=14, fontweight='bold')
plt.xlabel('Importance', fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# 3) LightGBM for Net Amount drop Store_No (3rd)
# ───────────────────────────────────────────────────────────────────────────────
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
# Normalise the column names: every run of non-word characters becomes one '_'.
# This renames df's columns in place, so code that still uses the original
# names (e.g. 'CODE (subcluster 1)') needs section 1 to be rerun first.
df.columns = df.columns.str.replace(r'[^\w]+', '_', regex=True)

# Features: everything except the two targets and Store_No
X = df.drop(['Net_Amount', 'TC','Store_No'], axis=1)
y = df['Net_Amount']

# 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Columns to hand to LightGBM as categorical features. The names are the
# sanitised forms produced by the rename above (note the trailing '_' left by
# the closing parenthesis); only names actually present in X are kept.
cat_cols = [col for col in X.columns if col in ['Name', 'State', 'Day', 'CODE_subcluster_1_',
                                                'CODE_FY26_1_subcluster_2_','CODE_FY26_2_subcluster_3_']]

for col in cat_cols:
    X_train[col] = X_train[col].astype('category')
    X_test[col] = X_test[col].astype('category')
    # Give the test column the training categories so both splits share one
    # category set; test values not seen in training become NaN
    X_test[col] = X_test[col].cat.set_categories(X_train[col].cat.categories)

categorical_features = [f"{col}" for col in cat_cols]

# Baseline model; the test split is also passed as the evaluation set
model_net = LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=7) # try to change this also
model_net.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    categorical_feature=categorical_features
)

# Baseline predictions and hold-out metrics
y_pred = model_net.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"[Net_Amount] MAE: {mae:.2f}")
print(f"[Net_Amount] RMSE: {rmse:.2f}")
print(f"[Net_Amount] R²: {r2:.2f}")

# Hyperparameter grid (2 * 3 * 2 * 2 * 3 * 2 = 144 combinations)
param_grid = {
    'num_leaves': [31, 50],
    'max_depth': [-1, 10, 20],
    'learning_rate': [0.05, 0.1],
    'n_estimators': [100, 200],
    'reg_lambda': [0.0, 0.5, 1.0],
    'min_child_samples': [20, 50]
}

# Grid search: 3-fold CV ranked by negated MAE; the categorical columns are
# set on the base estimator so every candidate receives them
base_model = LGBMRegressor()
base_model.set_params(categorical_feature=cat_cols)
grid = GridSearchCV(base_model, param_grid, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
grid.fit(X_train, y_train)

# Re-score the hold-out rows with the best estimator found
best_net = grid.best_estimator_
y_pred = best_net.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"[Net_Amount] Best Params: {grid.best_params_}")
print(f"[Net_Amount] Tuned MAE: {mae:.2f}")
print(f"[Net_Amount] Tuned RMSE: {rmse:.2f}")
print(f"[Net_Amount] Tuned R²: {r2:.2f}")

print(f"[Net_Amount] Target Std Dev: {y.std()}")

# Relative errors: RMSE against the test-set spread, MAE against the overall target mean
std_dev = y_test.std()
ratio = rmse / std_dev
ratio_mean = mae / y.mean()

print(f"[Net_Amount] RMSE / SD ratio: {ratio:.2f}")
print(f"[Net_Amount] MAE / mean ratio: {ratio_mean:.2f}")

# Interpretation for RMSE
if ratio < 0.5:
    print("🟢 Excellent RMSE")
elif ratio < 0.75:
    print("🟢 Good RMSE")
elif ratio <= 1.0:
    print("🟡 Acceptable RMSE, but can be improved")
else:
    print("🔴 Poor RMSE – model performs worse than mean prediction")

# Interpretation for R-squared
if r2 >= 0.9:
    print("🟢 Excellent R²")
elif r2 >= 0.75:
    print("🟢 Good R²")
elif r2 >= 0.5:
    print("🟡 Acceptable R², but can be improved")
else:
    print("🔴 Weak R² – model needs improvement")

# Interpretation for MAE
if ratio_mean < 0.05:
    print("🟢 Excellent MAE")
elif ratio_mean < 0.10:
    print("🟢 Good MAE")
elif ratio_mean < 0.20:
    print("🟡 Acceptable MAE, but can be improved")
elif ratio_mean <= 0.30:
    print("🔴 Weak MAE, high prediction error")
else:
    print("🔴 Poor MAE – likely underfitting/noisy data")

import matplotlib.pyplot as plt

import pandas as pd

# Pair every feature with the tuned model's importance score, sorted ascending
importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': best_net.feature_importances_}
).sort_values('Importance')

# Horizontal bar chart of the sorted importances
plt.figure(figsize=(10, 6))
plt.barh(y=importance_df['Feature'], width=importance_df['Importance'])
plt.title('Sorted Feature Importance - Net_Amount', fontsize=14, fontweight='bold')
plt.xlabel('Importance', fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# 3) LightGBM for Net Amount drop Store_No, Rain?, Days From Holiday, Puasa Count (4th)
# ───────────────────────────────────────────────────────────────────────────────
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
# Normalise the column names: every run of non-word characters becomes one '_'.
# This renames df's columns in place, so code that still uses the original
# names (e.g. 'CODE (subcluster 1)') needs section 1 to be rerun first.
df.columns = df.columns.str.replace(r'[^\w]+', '_', regex=True)

# Features: drop the two targets plus Store_No, Rain?, Days From Holiday and
# Puasa Count (referenced here by their sanitised names)
X = df.drop(['Net_Amount', 'TC','Store_No','Rain_', 'Days_From_Holiday','Puasa_Count'], axis=1)
y = df['Net_Amount']

# 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Columns to hand to LightGBM as categorical features. The names are the
# sanitised forms produced by the rename above (note the trailing '_' left by
# the closing parenthesis); only names actually present in X are kept.
cat_cols = [col for col in X.columns if col in ['Name', 'State', 'Day', 'CODE_subcluster_1_',
                                                'CODE_FY26_1_subcluster_2_','CODE_FY26_2_subcluster_3_']]

for col in cat_cols:
    X_train[col] = X_train[col].astype('category')
    X_test[col] = X_test[col].astype('category')
    # Give the test column the training categories so both splits share one
    # category set; test values not seen in training become NaN
    X_test[col] = X_test[col].cat.set_categories(X_train[col].cat.categories)

categorical_features = [f"{col}" for col in cat_cols]

# Baseline model; the test split is also passed as the evaluation set
model_net = LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=7) # try to change this also
model_net.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    categorical_feature=categorical_features
)

# Baseline predictions and hold-out metrics
y_pred = model_net.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"[Net_Amount] MAE: {mae:.2f}")
print(f"[Net_Amount] RMSE: {rmse:.2f}")
print(f"[Net_Amount] R²: {r2:.2f}")

# Hyperparameter grid (2 * 3 * 2 * 2 * 3 * 2 = 144 combinations)
param_grid = {
    'num_leaves': [31, 50],
    'max_depth': [-1, 10, 20],
    'learning_rate': [0.05, 0.1],
    'n_estimators': [100, 200],
    'reg_lambda': [0.0, 0.5, 1.0],
    'min_child_samples': [20, 50]
}

# Grid search: 3-fold CV ranked by negated MAE; the categorical columns are
# set on the base estimator so every candidate receives them
base_model = LGBMRegressor()
base_model.set_params(categorical_feature=cat_cols)
grid = GridSearchCV(base_model, param_grid, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
grid.fit(X_train, y_train)

# Re-score the hold-out rows with the best estimator found
best_net = grid.best_estimator_
y_pred = best_net.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"[Net_Amount] Best Params: {grid.best_params_}")
print(f"[Net_Amount] Tuned MAE: {mae:.2f}")
print(f"[Net_Amount] Tuned RMSE: {rmse:.2f}")
print(f"[Net_Amount] Tuned R²: {r2:.2f}")

print(f"[Net_Amount] Target Std Dev: {y.std()}")

# Relative errors: RMSE against the test-set spread, MAE against the overall target mean
std_dev = y_test.std()
ratio = rmse / std_dev
ratio_mean = mae / y.mean()

print(f"[Net_Amount] RMSE / SD ratio: {ratio:.2f}")
print(f"[Net_Amount] MAE / mean ratio: {ratio_mean:.2f}")

# Interpretation for RMSE
if ratio < 0.5:
    print("🟢 Excellent RMSE")
elif ratio < 0.75:
    print("🟢 Good RMSE")
elif ratio <= 1.0:
    print("🟡 Acceptable RMSE, but can be improved")
else:
    print("🔴 Poor RMSE – model performs worse than mean prediction")

# Interpretation for R-squared
if r2 >= 0.9:
    print("🟢 Excellent R²")
elif r2 >= 0.75:
    print("🟢 Good R²")
elif r2 >= 0.5:
    print("🟡 Acceptable R², but can be improved")
else:
    print("🔴 Weak R² – model needs improvement")

# Interpretation for MAE
if ratio_mean < 0.05:
    print("🟢 Excellent MAE")
elif ratio_mean < 0.10:
    print("🟢 Good MAE")
elif ratio_mean < 0.20:
    print("🟡 Acceptable MAE, but can be improved")
elif ratio_mean <= 0.30:
    print("🔴 Weak MAE, high prediction error")
else:
    print("🔴 Poor MAE – likely underfitting/noisy data")

import matplotlib.pyplot as plt

import pandas as pd

# Pair every feature with the tuned model's importance score, sorted ascending
importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': best_net.feature_importances_}
).sort_values('Importance')

# Horizontal bar chart of the sorted importances
plt.figure(figsize=(10, 6))
plt.barh(y=importance_df['Feature'], width=importance_df['Importance'])
plt.title('Sorted Feature Importance - Net_Amount', fontsize=14, fontweight='bold')
plt.xlabel('Importance', fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# Little note: need to rerun 1) since I removed Store No for draft 3

# ───────────────────────────────────────────────────────────────────────────────
# 4) LightGBM for TC (1st)
# ───────────────────────────────────────────────────────────────────────────────
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Target for this cell: TC
y = df['TC']

# 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline LightGBM model
model_net = LGBMRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=7,
)
model_net.fit(X_train, y_train)

# Baseline predictions and hold-out metrics
y_pred = model_net.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"[TC] MAE: {mae:.2f}")
print(f"[TC] RMSE: {rmse:.2f}")
print(f"[TC] R²: {r2:.2f}")

# Hyperparameter grid (2 * 2 * 2 * 2 = 16 combinations)
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [5, 7],
    'subsample': [0.8, 1.0],
}

# Exhaustive search with 3-fold CV, ranked by negated MAE, using all cores
grid = GridSearchCV(
    estimator=LGBMRegressor(),
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Re-score the hold-out rows with the best estimator found
best_tc = grid.best_estimator_
y_pred = best_tc.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"[TC] Best Params: {grid.best_params_}")
print(f"[TC] Tuned MAE: {mae:.2f}")
print(f"[TC] Tuned RMSE: {rmse:.2f}")
print(f"[TC] Tuned R²: {r2:.2f}")

print(f"[TC] Target Std Dev: {y.std()}")

# Relative errors: RMSE against the test-set spread, MAE against the overall target mean
std_dev = y_test.std()
ratio = rmse / std_dev
ratio_mean = mae / y.mean()

print(f"[TC] RMSE / SD ratio: {ratio:.2f}")
print(f"[TC] MAE / mean ratio: {ratio_mean:.2f}")

# Interpretation for RMSE (each branch is only reached when the earlier ones failed)
if ratio < 0.5:
    print("🟢 Excellent RMSE")
elif ratio < 0.75:
    print("🟢 Good RMSE")
elif ratio <= 1.0:
    print("🟡 Acceptable RMSE, but can be improved")
else:
    print("🔴 Poor RMSE – model performs worse than mean prediction")

# Interpretation for R-squared
if r2 >= 0.9:
    print("🟢 Excellent R²")
elif r2 >= 0.75:
    print("🟢 Good R²")
elif r2 >= 0.5:
    print("🟡 Acceptable R², but can be improved")
else:
    print("🔴 Weak R² – model needs improvement")

# Interpretation for MAE
if ratio_mean < 0.05:
    print("🟢 Excellent MAE")
elif ratio_mean < 0.10:
    print("🟢 Good MAE")
elif ratio_mean < 0.20:
    print("🟡 Acceptable MAE, but can be improved")
elif ratio_mean <= 0.30:
    print("🔴 Weak MAE, high prediction error")
else:
    print("🔴 Poor MAE – likely underfitting/noisy data")

import matplotlib.pyplot as plt

# Feature importance
import pandas as pd

# Pair every feature with the tuned TC model's importance score, sorted ascending
importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': best_tc.feature_importances_}
).sort_values('Importance')

# Horizontal bar chart of the sorted importances
plt.figure(figsize=(10, 6))
plt.barh(y=importance_df['Feature'], width=importance_df['Importance'])
plt.title('Sorted Feature Importance - TC', fontsize=14, fontweight='bold')
plt.xlabel('Importance', fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# 4) LightGBM for TC (2nd)
# ───────────────────────────────────────────────────────────────────────────────
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Target for this cell: TC
y = df['TC']

# 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Change some variables to category type so that categorical_features can detect and use those categories
cat_cols = [
    'Store_No', 
    'CODE (subcluster 1)', 
    'CODE FY26 1 (subcluster 2)', 
    'CODE FY26 2 (subcluster 3)', 
    'Name', 
    'State', 
    'Day'
]

for col in cat_cols:
    X_train[col] = X_train[col].astype('category')
    X_test[col] = X_test[col].astype('category')

categorical_features = [f"{col}" for col in cat_cols]

# Baseline model; the test split is also passed as the evaluation set
model_net = LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=7) # try to change this also
model_net.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    categorical_feature=categorical_features
)

# Baseline predictions and hold-out metrics
y_pred = model_net.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# These are TC metrics, so they carry the [TC] label like the rest of the cell
print(f"[TC] MAE: {mae:.2f}")
print(f"[TC] RMSE: {rmse:.2f}")
print(f"[TC] R²: {r2:.2f}")

# Hyperparameter grid (2 * 3 * 2 * 2 * 3 * 2 = 144 combinations)
param_grid = {
    'num_leaves': [31, 50],
    'max_depth': [-1, 10, 20],
    'learning_rate': [0.05, 0.1],
    'n_estimators': [100, 200],
    'reg_lambda': [0.0, 0.5, 1.0],
    'min_child_samples': [20, 50]
}

# Grid search: 3-fold CV ranked by negated MAE, using all cores
grid = GridSearchCV(LGBMRegressor(), param_grid, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
grid.fit(X_train, y_train)

# Re-score the hold-out rows with the best estimator found
best_tc = grid.best_estimator_
y_pred = best_tc.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"[TC] Best Params: {grid.best_params_}")
print(f"[TC] Tuned MAE: {mae:.2f}")
print(f"[TC] Tuned RMSE: {rmse:.2f}")
print(f"[TC] Tuned R²: {r2:.2f}")

print(f"[TC] Target Std Dev: {y.std()}")

# Relative errors: RMSE against the test-set spread, MAE against the overall target mean
std_dev = y_test.std()
ratio = rmse / std_dev
ratio_mean = mae / y.mean()

print(f"[TC] RMSE / SD ratio: {ratio:.2f}")
print(f"[TC] MAE / mean ratio: {ratio_mean:.2f}")

# Interpretation for RMSE
if ratio < 0.5:
    print("🟢 Excellent RMSE")
elif ratio < 0.75:
    print("🟢 Good RMSE")
elif ratio <= 1.0:
    print("🟡 Acceptable RMSE, but can be improved")
else:
    print("🔴 Poor RMSE – model performs worse than mean prediction")

# Interpretation for R-squared
if r2 >= 0.9:
    print("🟢 Excellent R²")
elif r2 >= 0.75:
    print("🟢 Good R²")
elif r2 >= 0.5:
    print("🟡 Acceptable R², but can be improved")
else:
    print("🔴 Weak R² – model needs improvement")

# Interpretation for MAE
if ratio_mean < 0.05:
    print("🟢 Excellent MAE")
elif ratio_mean < 0.10:
    print("🟢 Good MAE")
elif ratio_mean < 0.20:
    print("🟡 Acceptable MAE, but can be improved")
elif ratio_mean <= 0.30:
    print("🔴 Weak MAE, high prediction error")
else:
    print("🔴 Poor MAE – likely underfitting/noisy data")

import matplotlib.pyplot as plt

import pandas as pd

# One row per feature with the tuned model's importance score, ordered from
# least to most important.
importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': best_tc.feature_importances_}
)
importance_df = importance_df.sort_values('Importance')

# Horizontal bar chart of the sorted importances.
plt.figure(figsize=(10, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'])
plt.xlabel('Importance', fontsize=12)
plt.title('Sorted Feature Importance - TC', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# 12) LightGBM for TC drop Store_No (3rd)
# ───────────────────────────────────────────────────────────────────────────────
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
# Normalise column names: every run of non-word characters becomes a single
# "_" (e.g. "CODE (subcluster 1)" -> "CODE_subcluster_1_").
df.columns = df.columns.str.replace(r'[^\w]+', '_', regex=True)
X = df.drop(['Net_Amount', 'TC','Store_No'], axis=1)
y = df['TC']

# Split, then take explicit copies so the dtype conversions below modify
# independent frames instead of triggering pandas' SettingWithCopyWarning.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.copy()
X_test = X_test.copy()

# Columns LightGBM should treat as categorical. The names must be the
# sanitised ones produced above; any name absent from X is simply skipped.
cat_cols = [col for col in X.columns if col in ['Name', 'State', 'Day', 'CODE_subcluster_1_',
                                                'CODE_FY26_1_subcluster_2_','CODE_FY26_2_subcluster_3_']]

for col in cat_cols:
    X_train[col] = X_train[col].astype('category')
    # Give the test column exactly the training categories so both frames
    # share one category set; values unseen in training become NaN.
    X_test[col] = X_test[col].astype('category').cat.set_categories(X_train[col].cat.categories)

categorical_features = list(cat_cols)

# Baseline model with fixed hyper-parameters.
# NOTE(review): the variable is named model_net although this cell predicts
# TC; it overwrites any earlier model_net in the notebook session.
model_net = LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=7) # try to change this also
model_net.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    categorical_feature=categorical_features
)

# Predict
y_pred = model_net.predict(X_test)

# Metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"[TC] MAE: {mae:.2f}")
print(f"[TC] RMSE: {rmse:.2f}")
print(f"[TC] R²: {r2:.2f}")

# Hyper-parameter search space: 2 * 3 * 2 * 2 * 3 * 2 = 144 combinations,
# each scored with 3-fold cross-validation below.
param_grid = {
    'num_leaves': [31, 50],
    'max_depth': [-1, 10, 20],
    'learning_rate': [0.05, 0.1],
    'n_estimators': [100, 200],
    'reg_lambda': [0.0, 0.5, 1.0],
    'min_child_samples': [20, 50]
}

# Grid search. The categorical columns are forwarded to every underlying
# LGBMRegressor.fit call through GridSearchCV.fit, the same way the baseline
# model above receives them. Setting `categorical_feature` as an estimator
# parameter is not the supported route for this option.
base_model = LGBMRegressor()
grid = GridSearchCV(base_model, param_grid, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
grid.fit(X_train, y_train, categorical_feature=cat_cols)

# Best model
best_tc = grid.best_estimator_
y_pred = best_tc.predict(X_test)

# Hold-out metrics for the tuned model
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"[TC] Best Params: {grid.best_params_}")
print(f"[TC] Tuned MAE: {mae:.2f}")
print(f"[TC] Tuned RMSE: {rmse:.2f}")
print(f"[TC] Tuned R²: {r2:.2f}")

# Spread of the full target column, printed for reference.
print(f"[TC] Target Std Dev: {y.std()}")

# Normalised errors: RMSE relative to the test-split standard deviation and
# MAE relative to the mean of the full target.
# NOTE(review): the std printed above is over all of y, but the ratio uses
# y_test.std(); likewise MAE (test split) is divided by y.mean() (all rows).
std_dev = y_test.std()
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
ratio = rmse / std_dev
ratio_mean = mae / y.mean()

print(f"[TC] RMSE / SD ratio: {ratio:.2f}")
print(f"[TC] MAE / mean ratio: {ratio_mean:.2f}")

# RMSE verdict. Bands are tested in ascending order, so each branch only
# needs its upper bound.
if ratio < 0.5:
    print("🟢 Excellent RMSE")
elif ratio < 0.75:
    print("🟢 Good RMSE")
elif ratio <= 1.0:
    print("🟡 Acceptable RMSE, but can be improved")
else:
    print("🔴 Poor RMSE – model performs worse than mean prediction")

# R² verdict. Bands are tested in descending order, so each branch only
# needs its lower bound.
if r2 >= 0.9:
    print("🟢 Excellent R²")
elif r2 >= 0.75:
    print("🟢 Good R²")
elif r2 >= 0.5:
    print("🟡 Acceptable R², but can be improved")
else:
    print("🔴 Weak R² – model needs improvement")

# MAE verdict (MAE as a fraction of the target mean).
if ratio_mean < 0.05:
    print("🟢 Excellent MAE")
elif ratio_mean < 0.10:
    print("🟢 Good MAE")
elif ratio_mean < 0.20:
    print("🟡 Acceptable MAE, but can be improved")
elif ratio_mean <= 0.30:
    print("🔴 Weak MAE, high prediction error")
else:
    print("🔴 Poor MAE – likely underfitting/noisy data")

import matplotlib.pyplot as plt

import pandas as pd

# One row per feature with the tuned model's importance score, ordered from
# least to most important.
importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': best_tc.feature_importances_}
)
importance_df = importance_df.sort_values('Importance')

# Horizontal bar chart of the sorted importances.
plt.figure(figsize=(10, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'])
plt.xlabel('Importance', fontsize=12)
plt.title('Sorted Feature Importance - TC', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# 12) LightGBM for TC drop Store_No, Rain?, Days From Holiday, Puasa Count (4th)
# ───────────────────────────────────────────────────────────────────────────────
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
# Normalise column names: every run of non-word characters becomes a single
# "_" (e.g. "Rain?" -> "Rain_", "Days From Holiday" -> "Days_From_Holiday").
df.columns = df.columns.str.replace(r'[^\w]+', '_', regex=True)
X = df.drop(['Net_Amount', 'TC','Store_No','Rain_', 'Days_From_Holiday','Puasa_Count'], axis=1)
y = df['TC']

# Split, then take explicit copies so the dtype conversions below modify
# independent frames instead of triggering pandas' SettingWithCopyWarning.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.copy()
X_test = X_test.copy()

# Columns LightGBM should treat as categorical. The names must be the
# sanitised ones produced above; any name absent from X is simply skipped.
cat_cols = [col for col in X.columns if col in ['Name', 'State', 'Day', 'CODE_subcluster_1_',
                                                'CODE_FY26_1_subcluster_2_','CODE_FY26_2_subcluster_3_']]

for col in cat_cols:
    X_train[col] = X_train[col].astype('category')
    # Give the test column exactly the training categories so both frames
    # share one category set; values unseen in training become NaN.
    X_test[col] = X_test[col].astype('category').cat.set_categories(X_train[col].cat.categories)

categorical_features = list(cat_cols)

# Baseline model with fixed hyper-parameters.
# NOTE(review): the variable is named model_net although this cell predicts
# TC; it overwrites any earlier model_net in the notebook session.
model_net = LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=7) # try to change this also
model_net.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    categorical_feature=categorical_features
)

# Predict
y_pred = model_net.predict(X_test)

# Metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"[TC] MAE: {mae:.2f}")
print(f"[TC] RMSE: {rmse:.2f}")
print(f"[TC] R²: {r2:.2f}")

# Hyper-parameter search space: 2 * 3 * 2 * 2 * 3 * 2 = 144 combinations,
# each scored with 3-fold cross-validation below.
param_grid = {
    'num_leaves': [31, 50],
    'max_depth': [-1, 10, 20],
    'learning_rate': [0.05, 0.1],
    'n_estimators': [100, 200],
    'reg_lambda': [0.0, 0.5, 1.0],
    'min_child_samples': [20, 50]
}

# Grid search. The categorical columns are forwarded to every underlying
# LGBMRegressor.fit call through GridSearchCV.fit, the same way the baseline
# model above receives them. Setting `categorical_feature` as an estimator
# parameter is not the supported route for this option.
base_model = LGBMRegressor()
grid = GridSearchCV(base_model, param_grid, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
grid.fit(X_train, y_train, categorical_feature=cat_cols)

# Best model
best_tc = grid.best_estimator_
y_pred = best_tc.predict(X_test)

# Hold-out metrics for the tuned model
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"[TC] Best Params: {grid.best_params_}")
print(f"[TC] Tuned MAE: {mae:.2f}")
print(f"[TC] Tuned RMSE: {rmse:.2f}")
print(f"[TC] Tuned R²: {r2:.2f}")

# Spread of the full target column, printed for reference.
print(f"[TC] Target Std Dev: {y.std()}")

# Normalised errors: RMSE relative to the test-split standard deviation and
# MAE relative to the mean of the full target.
# NOTE(review): the std printed above is over all of y, but the ratio uses
# y_test.std(); likewise MAE (test split) is divided by y.mean() (all rows).
std_dev = y_test.std()
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
ratio = rmse / std_dev
ratio_mean = mae / y.mean()

print(f"[TC] RMSE / SD ratio: {ratio:.2f}")
print(f"[TC] MAE / mean ratio: {ratio_mean:.2f}")

# RMSE verdict. Bands are tested in ascending order, so each branch only
# needs its upper bound.
if ratio < 0.5:
    print("🟢 Excellent RMSE")
elif ratio < 0.75:
    print("🟢 Good RMSE")
elif ratio <= 1.0:
    print("🟡 Acceptable RMSE, but can be improved")
else:
    print("🔴 Poor RMSE – model performs worse than mean prediction")

# R² verdict. Bands are tested in descending order, so each branch only
# needs its lower bound.
if r2 >= 0.9:
    print("🟢 Excellent R²")
elif r2 >= 0.75:
    print("🟢 Good R²")
elif r2 >= 0.5:
    print("🟡 Acceptable R², but can be improved")
else:
    print("🔴 Weak R² – model needs improvement")

# MAE verdict (MAE as a fraction of the target mean).
if ratio_mean < 0.05:
    print("🟢 Excellent MAE")
elif ratio_mean < 0.10:
    print("🟢 Good MAE")
elif ratio_mean < 0.20:
    print("🟡 Acceptable MAE, but can be improved")
elif ratio_mean <= 0.30:
    print("🔴 Weak MAE, high prediction error")
else:
    print("🔴 Poor MAE – likely underfitting/noisy data")

import matplotlib.pyplot as plt

import pandas as pd

# One row per feature with the tuned model's importance score, ordered from
# least to most important.
importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': best_tc.feature_importances_}
)
importance_df = importance_df.sort_values('Importance')

# Horizontal bar chart of the sorted importances.
plt.figure(figsize=(10, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'])
plt.xlabel('Importance', fontsize=12)
plt.title('Sorted Feature Importance - TC', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
    # ───────────────────────────────────────────────────────────────────────────────
    # 13) Neural Networks (Feed Forward)
    # ───────────────────────────────────────────────────────────────────────────────
    from sklearn.preprocessing import LabelEncoder, StandardScaler
    from sklearn.model_selection import train_test_split
    from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout, BatchNormalization
    from tensorflow.keras.models import Model
    from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
    from tensorflow.keras.optimizers import Adam
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    import re
    import numpy as np 
    import matplotlib.pyplot as plt

    ### Data preprocessing
    # Targets as a 2-column array: column 0 = Net_Amount, column 1 = TC.
    y = df[['Net_Amount','TC']].values

    def _resolve_col(name):
        """Return the column of ``df`` matching *name*, raw or sanitised.

        Earlier cells rewrite ``df.columns`` in place (non-word runs become
        "_"), so the raw spelling may no longer exist when this cell runs.
        """
        if name in df.columns:
            return name
        sanitised = re.sub(r'[^\w]+', '_', name)
        if sanitised in df.columns:
            return sanitised
        raise KeyError(f"Column {name!r} (or {sanitised!r}) not found in df")

    # Label-encode categoricals in place
    cat_cols = [_resolve_col(c) for c in [
        'Store_No','Name','State','Day',
        'CODE (subcluster 1)','CODE FY26 1 (subcluster 2)',
        'CODE FY26 2 (subcluster 3)','Rain?','Public Holiday'
    ]]
    df[cat_cols] = df[cat_cols].fillna("Missing")
    for col in cat_cols:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    # Define numeric cols (still in df)
    numeric_cols = [_resolve_col(c) for c in [
        'Days From Holiday','Puasa Count',
        'Days_after_Opening','Average Daily Temperature (°C)'
    ]]

    # Split the *full* DataFrame so you keep both cat & num
    train_df, test_df, y_train, y_test = train_test_split(
        df, y, test_size=0.2, random_state=42
    )

    # Fit scaler on training numerics only, then transform both splits
    scaler = StandardScaler()
    X_train_num = scaler.fit_transform(train_df[numeric_cols])
    X_test_num  = scaler.transform(test_df[numeric_cols])

    # Extract each categorical column as its own array (one per model input)
    X_train_cat = [train_df[col].values for col in cat_cols]
    X_test_cat  = [test_df[col].values  for col in cat_cols]

    # Combine into the list the model expects: categoricals first, numerics last
    X_train_input = X_train_cat + [X_train_num]
    X_test_input  = X_test_cat  + [X_test_num]

    ### Building the Embedding + Dense Model
    # ========== Define Inputs ==========
    # Number of distinct codes per categorical column (embedding vocabulary size)
    n_uniques = {col: df[col].nunique() for col in cat_cols}

    def clean(col):
        """Return *col* with each run of characters outside A-Z, a-z, 0-9, "_"
        collapsed into a single underscore (e.g. "Rain?" becomes "Rain_").

        Used below to build the Keras layer names.
        """
        layer_safe = re.sub(pattern=r'[^A-Za-z0-9_]+', repl='_', string=col)
        return layer_safe

    # One scalar Input -> Embedding -> Flatten branch per categorical column.
    cat_inputs, cat_embeds = [], []

    for col in cat_cols:
        safe = clean(col)
        vocab_size = n_uniques[col]
        # Embedding width grows with log2 of the vocabulary size.
        embed_dim = int(np.log2(vocab_size)) + 1

        inp = Input(shape=(1,), name=f"{safe}_in")
        emb = Embedding(input_dim=vocab_size, output_dim=embed_dim, name=f"{safe}_emb")(inp)

        cat_inputs.append(inp)
        cat_embeds.append(Flatten(name=f"{safe}_flat")(emb))

    # Single input carrying all scaled numeric features.
    numeric_input = Input(shape=(X_train_num.shape[1],), name='numeric_in')

    # Model inputs: categoricals first, numerics last (same order as X_*_input).
    all_inputs = [*cat_inputs, numeric_input]

    # ========== Concatenate Embeddings + Numeric Inputs ==========
    x = Concatenate()([*cat_embeds, numeric_input])

    # ========== Network Architecture ==========
    # Two hidden stages of Dense -> BatchNorm -> Dropout (widths 128, then 64),
    # followed by a 32-unit layer and a 2-unit linear output.
    for units in (128, 64):
        x = Dense(units, activation='relu')(x)
        x = BatchNormalization()(x)
        x = Dropout(0.5)(x)

    x = Dense(32, activation='relu')(x)
    output = Dense(2)(x)  # Predict Net_Amount and TC

    # ========== Build Model ==========
    model = Model(inputs=all_inputs, outputs=output)
    model.compile(
        loss='mse',
        optimizer=Adam(learning_rate=0.0005),
        metrics=['mae'],
    )

    # Halve the learning rate after 3 epochs without val_loss improvement,
    # never going below 1e-6.
    lr_scheduler = ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1
    )

    # Stop after 5 epochs without a val_loss improvement of at least 0.0001
    # and roll back to the best weights seen.
    early_stop = EarlyStopping(
        monitor='val_loss',
        mode='min',
        min_delta=0.0001,
        patience=5,
        restore_best_weights=True,
    )

    # Train and keep the history for plotting.
    # NOTE(review): the test split doubles as validation data here, so early
    # stopping / LR scheduling are tuned on the same rows that are later
    # reported as test metrics — consider a separate validation split.
    history = model.fit(
        x=X_train_input,
        y=y_train,
        validation_data=(X_test_input, y_test),
        batch_size=32,
        epochs=50,
        callbacks=[early_stop, lr_scheduler],
        verbose=1,
    )

    ### Evaluation
    # 1) Combined loss / MAE over both targets, as reported by Keras
    loss, mae = model.evaluate(X_test_input, y_test, verbose=0)
    print(f"\nTest MSE: {loss:.2f}, Test MAE: {mae:.2f}")

    # 2) Per-target columns: index 0 = Net_Amount, index 1 = TC
    predictions = model.predict(X_test_input)
    true_net, true_tc = (y_test[:, k] for k in (0, 1))
    pred_net, pred_tc = (predictions[:, k] for k in (0, 1))

    # 3) Metric helper
    def summarize(y_true, y_pred, name):
        """Print RMSE, MAE and R² for one target.

        RMSE is shown next to the standard deviation of ``y_true`` and MAE
        next to its mean, so the errors can be read against the target scale.

        Args:
            y_true: observed values for the target.
            y_pred: predicted values, same length as ``y_true``.
            name: label printed in the section header.
        """
        mse  = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        mae  = mean_absolute_error(y_true, y_pred)
        r2   = r2_score(y_true, y_pred)
        std  = np.std(y_true)
        mean = np.mean(y_true)
        print(f"\n— {name} —")
        print(f"RMSE : {rmse:.2f} (vs STD {std:.2f})")
        # Space after "Mean" added so the value is not glued to the label.
        print(f"MAE  : {mae:.2f} (vs Mean {mean:.2f})")
        print(f"R²   : {r2:.3f}")

    # Per-target metric report, Net_Amount first, then TC.
    for target_name, actual, predicted in (
        ("Net_Amount", true_net, pred_net),
        ("TC", true_tc, pred_tc),
    ):
        summarize(actual, predicted, target_name)


    ### 4) Loss Curve Plot ###
    # Training vs validation loss per epoch, taken from the fit history.
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Val   Loss')
    plt.title('Training vs Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('MSE Loss')
    plt.legend()
    plt.show()




Changes:
1. Putting Net_Amount and TC on the same scale so that MSE loss won't be dominated by larger-magnitude target (Net_Amount)
2. Setting two task-specific heads - after a shared trunk the network branches, letting each target learn features that matter only to it, reducing negative interference
3. L2 weight regularisation - penalising large weights, discouraging complex co-adaptations and reducing over-fitting
4. Lower dropout (0.30) + tighter early-stopping patience (3) - still regularises the network, but lets the model converge faster and stops training sooner once validation loss stalls
5. Quicker ReduceLROnPlateau - halves the learning rate after just 2 stagnant epochs, helping the optimiser settle into a better minimum once progress plateaus
6. Interpretation - for easy interpretation of values

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# 13) Neural Networks (Feed-Forward) – Improved version with interpretation
# ───────────────────────────────────────────────────────────────────────────────
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import (Input, Embedding, Flatten, Concatenate,
                                     Dense, Dropout, BatchNormalization)
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import re, numpy as np, matplotlib.pyplot as plt

# ── 1) Data preprocessing ─────────────────────────────────────────────────────
# Targets as a 2-column array: column 0 = Net_Amount, column 1 = TC.
y = df[['Net_Amount', 'TC']].values


def _resolve_col(name):
    """Return the column of ``df`` matching *name*, raw or sanitised.

    Earlier cells rewrite ``df.columns`` in place (non-word runs become "_"),
    so the raw spelling may no longer exist when this cell runs.
    """
    if name in df.columns:
        return name
    sanitised = re.sub(r'[^\w]+', '_', name)
    if sanitised in df.columns:
        return sanitised
    raise KeyError(f"Column {name!r} (or {sanitised!r}) not found in df")


cat_cols = [_resolve_col(c) for c in
            ['Store_No','Name','State','Day',
             'CODE (subcluster 1)','CODE FY26 1 (subcluster 2)',
             'CODE FY26 2 (subcluster 3)','Rain?','Public Holiday']]

# Label-encode the categoricals in place (missing values get their own code).
df[cat_cols] = df[cat_cols].fillna("Missing")
for c in cat_cols:
    df[c] = LabelEncoder().fit_transform(df[c].astype(str))

numeric_cols = [_resolve_col(c) for c in
                ['Days From Holiday','Puasa Count',
                 'Days_after_Opening','Average Daily Temperature (°C)']]

train_df, test_df, y_train, y_test = train_test_split(
    df, y, test_size=0.20, random_state=42)

# Scale numeric features; the scaler is fitted on the training split only.
scaler_X = StandardScaler()
X_train_num = scaler_X.fit_transform(train_df[numeric_cols])
X_test_num  = scaler_X.transform(test_df[numeric_cols])

# One array per categorical column (one model input each).
X_train_cat = [train_df[c].values for c in cat_cols]
X_test_cat  = [test_df[c].values  for c in cat_cols]

# Standardise both targets (fitted on the training split only) so neither
# dominates the MSE loss; predictions are inverse-transformed later.
scaler_y       = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train)
y_test_scaled  = scaler_y.transform(y_test)

# ── 2) Model definition ───────────────────────────────────────────────────────
def clean(col):
    """Return *col* with each run of characters outside A-Z, a-z, 0-9, "_"
    collapsed into a single underscore (e.g. "Rain?" becomes "Rain_").
    """
    layer_safe = re.sub(pattern=r'[^A-Za-z0-9_]+', repl='_', string=col)
    return layer_safe

# Vocabulary size per categorical column (number of distinct encoded values).
n_uniques = {c: df[c].nunique() for c in cat_cols}

# One scalar Input -> Embedding -> Flatten branch per categorical column.
cat_inputs = []
cat_embeds = []
for col in cat_cols:
    safe    = clean(col)
    vocab   = n_uniques[col]
    emb_dim = int(np.log2(vocab)) + 1  # width grows with log2 of the vocabulary
    inp     = Input(shape=(1,), name=f"{safe}_in")
    emb_layer = Embedding(
        input_dim=vocab,
        output_dim=emb_dim,
        name=f"{safe}_emb",
    )(inp)
    flat = Flatten(name=f"{safe}_flat")(emb_layer)
    cat_inputs.append(inp)
    cat_embeds.append(flat)

# Numeric features enter through one dense input, appended after the
# categorical inputs / embeddings.
numeric_input = Input(shape=(X_train_num.shape[1],), name='numeric_in')
all_inputs    = [*cat_inputs, numeric_input]
x = Concatenate()([*cat_embeds, numeric_input])

# Shared trunk: two Dense -> BatchNorm -> Dropout stages (widths 128, then 64),
# each Dense layer carrying an L2 penalty of 1e-4 on its kernel.
for width in (128, 64):
    x = Dense(width, activation='relu', kernel_regularizer=l2(1e-4))(x)
    x = BatchNormalization()(x)
    x = Dropout(0.30)(x)

# ── Two task-specific heads on top of the shared trunk ───────────────────────
# Net_Amount head
h1      = Dense(32, activation='relu', kernel_regularizer=l2(1e-4))(x)
out_net = Dense(1, name='net')(h1)
# TC head
h2      = Dense(32, activation='relu', kernel_regularizer=l2(1e-4))(x)
out_tc  = Dense(1, name='tc')(h2)

model = Model(inputs=all_inputs, outputs=[out_net, out_tc])
model.compile(
    loss=['mse','mse'],      # one entry per output, in output order
    metrics=['mae','mae'],
    optimizer=Adam(learning_rate=5e-4),
)

# Halve the learning rate after 2 epochs without val_loss improvement,
# never going below 1e-6.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6, verbose=1)

# Stop after 3 epochs without val_loss improvement and restore the best weights.
early_stop = EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)

# ── 3) Training ───────────────────────────────────────────────────────────────
# Inputs follow the model's input order: categoricals first, numerics last.
X_train_input = [*X_train_cat, X_train_num]
X_test_input  = [*X_test_cat, X_test_num]

# Targets are fed per head, keyed by the output layer names 'net' and 'tc'.
# NOTE(review): the test split doubles as validation data, so early stopping
# and LR scheduling are tuned on the rows later reported as test metrics.
history = model.fit(
    x=X_train_input,
    y={'net': y_train_scaled[:,0], 'tc': y_train_scaled[:,1]},
    validation_data=(
        X_test_input,
        {'net': y_test_scaled[:,0], 'tc': y_test_scaled[:,1]},
    ),
    batch_size=32,
    epochs=50,
    callbacks=[early_stop, reduce_lr],
    verbose=1,
)

# ── 4) Predictions & inverse-scale ───────────────────────────────────────────
# Join the per-head predictions side by side, then undo the target scaling so
# metrics are computed in original units.
scaled_preds = model.predict(X_test_input)
preds_matrix = np.hstack(scaled_preds)
predictions  = scaler_y.inverse_transform(preds_matrix)
true_net, true_tc = (y_test[:, k] for k in (0, 1))
pred_net, pred_tc = (predictions[:, k] for k in (0, 1))

# ── 5) Summary + interpretation ─────────────────────────────────────────────
def interpret_and_print(y_true, y_pred, name):
    """Print MAE, RMSE and R² for one target, each followed by a verdict line.

    MAE is judged as a fraction of the mean of ``y_true``, RMSE as a fraction
    of its standard deviation, and R² on its absolute value.

    Args:
        y_true: observed values for the target.
        y_pred: predicted values, same length as ``y_true``.
        name: label printed in the section header.
    """
    mean, std_y = np.mean(y_true), np.std(y_true)
    mae  = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2   = r2_score(y_true, y_pred)
    mae_ratio  = mae / mean
    rmse_ratio = rmse / std_y

    # Header
    print(f"\n=== {name} ===")
    print(f"MAE  : {mae:8.2f}   (mean = {mean:8.2f}, MAE/mean = {mae_ratio:.2f})")
    print(f"RMSE : {rmse:8.2f}   (std  = {std_y:8.2f}, RMSE/std  = {rmse_ratio:.2f})")
    print(f"R²   : {r2:8.3f}")

    # Each table lists (threshold, message) pairs in the order they are tried;
    # the first matching band wins and the fallback covers everything else.
    mae_bands = (
        (0.10, "🔵 Excellent MAE (<10% of mean)"),
        (0.20, "🟢 Good MAE     (<20% of mean)"),
        (0.30, "🟡 Acceptable MAE(<30% of mean)"),
    )
    rmse_bands = (
        (0.50, "🔵 Excellent RMSE (<0.5 σ)"),
        (0.75, "🟢 Good RMSE     (<0.75 σ)"),
        (1.00, "🟡 Acceptable RMSE(<1.0 σ)"),
    )
    r2_bands = (
        (0.90, "🔵 Excellent R² (≥0.90)"),
        (0.75, "🟢 Good R²      (0.75–0.90)"),
        (0.50, "🟡 Acceptable R²(0.50–0.75)"),
    )

    # MAE-to-mean: lower is better
    print(next((msg for limit, msg in mae_bands if mae_ratio < limit),
               "🔴 Poor MAE     (>30% of mean)"))

    # RMSE-to-std: lower is better
    print(next((msg for limit, msg in rmse_bands if rmse_ratio < limit),
               "🔴 Poor RMSE     (>1.0 σ)"))

    # R²: higher is better
    print(next((msg for floor, msg in r2_bands if r2 >= floor),
               "🔴 Weak R²      (<0.50)"))

# Run for each target
for _label, _actual, _predicted in (("Net_Amount", true_net, pred_net),
                                    ("TC", true_tc, pred_tc)):
    interpret_and_print(_actual, _predicted, _label)

# ── 6) Loss-curve plot ───────────────────────────────────────────────────────
# Total training vs validation loss per epoch, taken from the fit history.
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val   Loss')
plt.title('Training vs Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Scaled MSE Loss')
plt.legend()
plt.show()