In [1]:
# Version-proof model comparison + final training (no `squared=` keyword anywhere)
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [15]:
# Read the preprocessed training and test tables from the working directory.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ("train_modified.csv", "test_modified.csv")
)

In [16]:
# Separate the regression target from the feature columns.
y = train["Item_Outlet_Sales"]
X = train.drop(columns="Item_Outlet_Sales")

In [20]:
# Candidate regressors, keyed by the name printed in the comparison report.
# Only CatBoost is currently enabled; the other candidates are left here,
# commented out, so they can be switched back on for a wider comparison.
models = {
    # "RandomForest": RandomForestRegressor(n_estimators=400, random_state=42),
    # "XGBoost": XGBRegressor(
    #     n_estimators=600, learning_rate=0.03, max_depth=8,
    #     subsample=0.8, colsample_bytree=0.8,
    #     objective='reg:squarederror', random_state=42,
    # ),
    # "LightGBM": LGBMRegressor(
    #     n_estimators=500, learning_rate=0.02, max_depth=-1, random_state=42,
    # ),
    "CatBoost": CatBoostRegressor(
        iterations=500,
        learning_rate=0.02,
        depth=7,
        loss_function='RMSE',
        verbose=0,
        random_state=42,
    ),
}

In [21]:
def evaluate_model_cv(model, X, y, n_splits=5, random_state=42):
    """Score ``model`` with shuffled K-fold cross-validation.

    The same ``model`` object is passed to ``fit`` once per fold; no copy
    of it is made.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    X, y : feature table and target; both are sliced positionally via
        ``.iloc``.
    n_splits : number of folds handed to ``KFold``.
    random_state : seed handed to ``KFold`` for the shuffle.

    Returns
    -------
    dict
        ``RMSE_mean``, ``RMSE_std``, ``R2_mean``, ``MAE_mean`` and
        ``MAPE_mean`` (expressed in percent), each aggregated over the
        folds with ``np.mean`` / ``np.std``.
    """
    tiny = 1e-9  # added to the MAPE denominator to avoid dividing by exactly zero
    per_fold = {"rmse": [], "r2": [], "mae": [], "mape": []}
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for fit_rows, holdout_rows in folds.split(X):
        model.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        actual = y.iloc[holdout_rows]
        predicted = model.predict(X.iloc[holdout_rows])

        pct_error = np.abs((actual - predicted) / (actual + tiny))

        # RMSE is taken as sqrt(MSE) so no `squared=` kwarg is required.
        per_fold["rmse"].append(np.sqrt(mean_squared_error(actual, predicted)))
        per_fold["r2"].append(r2_score(actual, predicted))
        per_fold["mae"].append(mean_absolute_error(actual, predicted))
        per_fold["mape"].append(np.mean(pct_error) * 100)

    return {
        "RMSE_mean": np.mean(per_fold["rmse"]),
        "RMSE_std": np.std(per_fold["rmse"]),
        "R2_mean": np.mean(per_fold["r2"]),
        "MAE_mean": np.mean(per_fold["mae"]),
        "MAPE_mean": np.mean(per_fold["mape"]),
    }




In [22]:
# Evaluate every candidate with K-fold CV, keep the one with the lowest mean
# RMSE, refit it on the full training set and write test-set predictions.
N_SPLITS = 5  # single source of truth for the fold count (header + CV call)

if not models:
    # Without this guard, min() below fails with an opaque
    # "arg is an empty sequence" message when every candidate is disabled.
    raise ValueError("`models` is empty: enable at least one candidate model.")

results = {}
print(f"\n=========== Model comparison ({N_SPLITS}-fold CV) ===========\n")
for name, model in models.items():
    print(f"Evaluating {name} ...")
    metrics = evaluate_model_cv(model, X, y, n_splits=N_SPLITS)
    results[name] = metrics
    print(f"RMSE: {metrics['RMSE_mean']:.2f} ± {metrics['RMSE_std']:.2f}")
    print(f"R²  : {metrics['R2_mean']:.4f}")
    print(f"MAE : {metrics['MAE_mean']:.2f}")
    print(f"MAPE: {metrics['MAPE_mean']:.2f}%")
    print("---------------------------------------------------")

# pick best by RMSE (lower is better)
best_model_name = min(results, key=lambda x: results[x]["RMSE_mean"])
best_model = models[best_model_name]
print(f"\n🏆 BEST MODEL (by RMSE): {best_model_name}\n")

# retrain best model on full training set
print("Retraining best model on full training set...")
best_model.fit(X, y)

# predict on test
print("Predicting test_modified.csv ...")
# Select exactly the training feature columns, in training order. This keeps
# the cell re-runnable: the prediction column written into `test` below is
# never fed back to the model as a feature on a second execution.
test_preds = best_model.predict(test[X.columns])
test["Item_Outlet_Sales"] = test_preds
test.to_csv("final_submission.csv", index=False)
print("Saved: final_submission.csv")



Evaluating CatBoost ...
RMSE: 1081.11 ± 35.00
R²  : 0.5980
MAE : 756.23
MAPE: 55.62%
---------------------------------------------------

🏆 BEST MODEL (by RMSE): CatBoost

Retraining best model on full training set...
Predicting test_modified.csv ...
Saved: final_submission.csv
