In [1]:
import pandas as pd
import numpy as np
import time
import warnings
warnings.simplefilter('ignore')

# Cuml's TargetEncoder (GPU‐accelerated)
from cuml.preprocessing import TargetEncoder

# Models and utilities
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
import gc
import xgboost as xgb
import lightgbm as lgb
from xgboost import XGBRegressor, callback
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import StackingRegressor
from tqdm.auto import tqdm
import optuna
from sklearn.preprocessing import StandardScaler
import pandas.api.types as ptypes

In [2]:
!nvidia-smi

Tue Feb 11 13:07:01 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   41C    P8              9W /   70W |       3MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                      

In [3]:
class TqdmCallback(callback.TrainingCallback):
    def __init__(self, total):
        self.pbar = tqdm(total=total, desc="XGB Training", unit="iter")

    def after_iteration(self, model, epoch, evals_log):
        if evals_log and "validation_0" in evals_log and "rmse" in evals_log["validation_0"]:
            current_rmse = evals_log["validation_0"]["rmse"][-1]
            self.pbar.set_postfix(rmse=f"{current_rmse:.4f}")
        self.pbar.update(1)
        return False  # Continue training

    def after_training(self, model):
        self.pbar.close()
        return model

In [4]:
class XGBPipeline:
    def __init__(self, train: pd.DataFrame, test: pd.DataFrame, target: str,
                features: list, cats: list, te_params: dict = None,
                sample_frac: float = 0.5, n_trials:int = 1, random_state: int = 42):
        self.train = train.copy()
        self.test = test.copy()
        self.target = target
        self.features = features
        self.cats = cats
        self.sample_frac = sample_frac
        self.random_state = random_state
        self.n_trials = n_trials

        if te_params is None:
            te_params = {'n_folds': 25, 'smooth': 20, 'split_method': 'random', 'stat': 'mean'}
        self.te_params = te_params
        self.TE = TargetEncoder(**self.te_params)

        self.best_params_xgb = None
        self.best_params_cat = None
        self.best_cv_rmse_xgb = None
        self.best_cv_rmse_cat = None
        self.model_xgb = None
        self.model_lgbm = None
        self.model_cat = None
        self.stacking_model = None
        self.best_iteration_xgb = None
        self.best_iteration_cat = None
        self.all_features = None

        self.metrics_log = []

    def log(self, message: str):
        print(message)
        self.metrics_log.append(message)

    def save_metrics_log(self, filename: str = 'training_logs.txt'):
        with open(filename, 'w') as f:
            for message in self.metrics_log:
                f.write(message + '\n')
        self.log(f"Metrics log saved to {filename}")

    def preprocess_data(self):
        start_time = time.time()
        
        #target encoding all columns
        for col in self.features:
            self.train[f'TE_{col}'] = self.TE.fit_transform(self.train[col], self.train[self.target])
            self.test[f'TE_{col}'] = self.TE.transform(self.test[col])
        
        #setting categorical columns properly
        self.train[self.cats] = self.train[self.cats].fillna('missing').astype('category')
        self.test[self.cats] = self.test[self.cats].fillna('missing').astype('category')

        self.all_features = self.features + [f'TE_{col}' for col in self.features]

        #num_cols = [col for col in self.features if self.train[col].nunique() > 15]

        if len(self.cats) * len(self.features) < 50:
            for cat_col in self.cats:
                for num_col in self.features:
                    col_name = f'{cat_col}_x_{num_col}'
                    self.train[col_name] = self.train[cat_col].astype('str') + '_' + self.train[num_col].astype('str')
                    self.test[col_name] = self.test[cat_col].astype('str') + '_' + self.test[num_col].astype('str')
                    self.train[col_name] = self.train[col_name].cat.code
                    self.test[col_name] = self.test[col_name].cat.code
                    self.all_features.append(col_name)

        #self.all_features += self.cats

        for col in self.all_features:
            if not ptypes.is_numeric_dtype(self.train[col]):
                self.train[col] = self.train[col].astype('category').cat.codes
                self.test[col] = self.test[col].astype('category').cat.codes

        for col in self.all_features:
            median_val = self.train[col].median()
            self.train[col] = self.train[col].fillna(median_val)
            self.test[col] = self.test[col].fillna(median_val)

        scaler = StandardScaler()
        self.train[self.all_features] = scaler.fit_transform(self.train[self.all_features])
        self.test[self.all_features] = scaler.transform(self.test[self.all_features])

        elapsed = time.time() - start_time

        self.log(f'Preprocessing done with elapsed time: {elapsed:.2f} sec')

    def hyperparameter_tuning_xgb(self):
        self.log('Starting hyperparameter tuning with Optuna...')

        start_time = time.time()

        train_sample = self.train.sample(frac=self.sample_frac, random_state=self.random_state)

        def objective(trial):
            params = {
                "max_depth": trial.suggest_int("max_depth", 3, 10),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
                "min_child_weight": trial.suggest_int("min_child_weight", 1, 100),
                "subsample": trial.suggest_float("subsample", 0.5, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
                "n_estimators": trial.suggest_int("n_estimators", 500, 1500),
                "reg_alpha": trial.suggest_float("reg_alpha", 0, 1),
                "reg_lambda": trial.suggest_float("reg_lambda", 0, 1)
            }

            cv = KFold(n_splits=3, shuffle=True, random_state=self.random_state)
            cv_scores = []
            for train_idx, val_idx in cv.split(train_sample):
                X_train_cv, y_train_cv = train_sample.iloc[train_idx][self.all_features], train_sample.iloc[train_idx][self.target]
                X_val_cv, y_val_cv = train_sample.iloc[val_idx][self.all_features], train_sample.iloc[val_idx][self.target]
    
                model = XGBRegressor(
                        tree_method="gpu_hist",
                        random_state=self.random_state,
                        **params
                    )
    
                model.fit(
                        X_train_cv, y_train_cv,
                        eval_set=[(X_val_cv, y_val_cv)],
                        eval_metric="rmse",
                        early_stopping_rounds=50,
                        verbose=False
                    )
    
                preds = model.predict(X_val_cv, iteration_range=(0, model.best_iteration + 1))
                rmse = np.sqrt(mean_squared_error(y_val_cv, preds))
                cv_scores.append(rmse)
            return np.mean(cv_scores)

        study = optuna.create_study(direction="minimize")
        study.optimize(objective, n_trials=self.n_trials)
        self.best_params_xgb = study.best_trial.params
        self.best_cv_rmse_xgb = study.best_value
        self.log(f"Hyperparameter tuning for XGB complete. Best params: {self.best_params_xgb}")
        self.log(f"Best CV RMSE: {self.best_cv_rmse_xgb}")

        elapsed = time.time() - start_time

        self.log(f'Hyperparameter tuning for XGB done with elapsed time: {elapsed:.2f} sec')

    def hyperparameter_tuning_cat(self):
        self.log('Starting hyperparameter tuning with Optuna...')

        start_time = time.time()

        train_sample = self.train.sample(frac=self.sample_frac, random_state=self.random_state)

        def objective(trial):
            params = {'iterations': trial.suggest_int('iterations', 50, 300),
              'depth': trial.suggest_int('depth', 4, 10),
              'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
              'random_strength': trial.suggest_int('random_strength', 0, 100),
              'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
              'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
              'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter'])}

            params["eval_metric"] = "RMSE"
            params["task_type"] = "GPU"

            cv = KFold(n_splits=3, shuffle=True, random_state=self.random_state)
            cv_scores = []
            for train_idx, val_idx in cv.split(train_sample):
                X_train_cv, y_train_cv = train_sample.iloc[train_idx][self.all_features], train_sample.iloc[train_idx][self.target]
                X_val_cv, y_val_cv = train_sample.iloc[val_idx][self.all_features], train_sample.iloc[val_idx][self.target]
    
                model = CatBoostRegressor(
                        random_state=self.random_state,
                        **params
                    )
    
                model.fit(
                        X_train_cv, y_train_cv,
                        eval_set=[(X_val_cv, y_val_cv)],
                        early_stopping_rounds=50,
                        verbose=False
                    )
                best_iteration = model.get_best_iteration()
                preds = model.predict(X_val_cv, ntree_end=best_iteration + 1)
                rmse = np.sqrt(mean_squared_error(y_val_cv, preds))
                cv_scores.append(rmse)
            return np.mean(cv_scores)

        study = optuna.create_study(direction="minimize")
        study.optimize(objective, n_trials=self.n_trials)
        self.best_params_cat = study.best_trial.params
        self.best_cv_rmse_cat = study.best_value
        self.log(f"Hyperparameter tuning for cat complete. Best params: {self.best_params_cat}")
        self.log(f"Best CV RMSE: {self.best_cv_rmse_cat}")

        elapsed = time.time() - start_time

        self.log(f'Hyperparameter tuning for cat done with elapsed time: {elapsed:.2f} sec')

    

    def train_final_models(self, early_stopping_rounds:int = 50):
        start_time = time.time()
        X_train, X_val, y_train, y_val = train_test_split(
            self.train[self.all_features], self.train[self.target],
            test_size=0.2, random_state=self.random_state
        )
        #Training XGB
        self.log("Training XGBoost...")
        self.model_xgb = XGBRegressor(
            tree_method="gpu_hist",
            random_state=self.random_state,
            **self.best_params_xgb
        )
        tqdm_cb_xgb = TqdmCallback(total=self.model_xgb.get_params()["n_estimators"])
        self.model_xgb.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric="rmse",
            early_stopping_rounds=early_stopping_rounds,
            callbacks=[tqdm_cb_xgb],
            verbose=False
        )

        self.best_iteration_xgb = self.model_xgb.best_iteration
        preds_xgb = self.model_xgb.predict(X_val, iteration_range=(0, self.best_iteration_xgb + 1))
        rmse_xgb = np.sqrt(mean_squared_error(y_val, preds_xgb))
        self.log(f"XGBoost hold-out RMSE: {rmse_xgb:.4f} (Best Iteration: {self.best_iteration_xgb})")       
        xgb.rabit.finalize()
        gc.collect()

        #Training LGB
        self.log("Training LightGBM...")
        # Use GPU acceleration for LightGBM by adding the device parameter
        lgb_params = self.best_params_xgb.copy()
        lgb_params["device"] = "gpu"
        self.model_lgbm = lgb.LGBMRegressor(random_state=self.random_state, **lgb_params)
        self.model_lgbm.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='rmse',
            callbacks=[lgb.early_stopping(early_stopping_rounds)]
        )
        preds_lgbm = self.model_lgbm.predict(X_val)
        rmse_lgbm = np.sqrt(mean_squared_error(y_val, preds_lgbm))
        self.log(f"LightGBM hold-out RMSE: {rmse_lgbm:.4f}")
        lgb.reset_parameter()
        gc.collect()

        #Training cat
        self.log("Training catboost Regression...")
        self.model_cat = CatBoostRegressor(
            task_type="CPU",
            random_state=self.random_state,
            **self.best_params_cat,
            silent=False
        )
        self.model_cat.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            early_stopping_rounds=early_stopping_rounds,
            verbose=False
        )

        self.best_iteration_cat = self.model_cat.get_best_iteration()
        preds_cat = self.model_cat.predict(X_val, ntree_end=self.best_iteration_cat + 1)
        rmse_cat = np.sqrt(mean_squared_error(y_val, preds_cat))
        self.log(f"CatBoost hold-out RMSE: {rmse_cat:.4f} (Best Iteration: {self.best_iteration_cat})")
        gc.collect()

        # stacking ensembles
        self.log("Training stacking ensemble...")
        self.stacking_model = StackingRegressor(
            estimators=[
                ('xgb', self.model_xgb),
                ('lgbm', self.model_lgbm),
                ('cat', self.model_cat)
            ],
        final_estimator=Ridge(alpha=1.0),
        cv=KFold(n_splits=5, shuffle=True, random_state=self.random_state),
        n_jobs=-1,
        passthrough=True
        )
        self.stacking_model.fit(self.train[self.all_features], self.train[self.target])
        stacking_preds = self.stacking_model.predict(X_val)
        rmse_stack = np.sqrt(mean_squared_error(y_val, stacking_preds))
        self.log(f"Stacking Ensemble hold-out RMSE: {rmse_stack:.4f}")

        elapsed = time.time() - start_time
        self.log(f"Final model training complete. (Time taken: {elapsed:.2f} sec)")

    def predict_test(self):
        """
        Use the stacking ensemble to predict on the test set.
        """
        start_time = time.time()
        test_preds = self.stacking_model.predict(self.test[self.all_features])
        elapsed = time.time() - start_time
        self.log(f"Test prediction complete. (Time taken: {elapsed:.2f} sec)")
        return test_preds

    def save_submission(self, predictions, filename="sub_with_cat.csv"):
        start_time = time.time()
        sub = pd.DataFrame({ "id": self.test.index, self.target: predictions })
        sub.to_csv(filename, index=False)
        elapsed = time.time() - start_time
        self.log(f"Submission saved to {filename}. (Time taken: {elapsed:.2f} sec)")

    def run_pipeline(self):
        overall_start = time.time()
        steps = [
            ("Preprocessing Data", self.preprocess_data),
            ("Hyperparameter Tuning XGB", self.hyperparameter_tuning_xgb),
            ("Hyperparameter Tuning CAT", self.hyperparameter_tuning_cat),
            ("Training Final Models", self.train_final_models),
        ]
        
        with tqdm(total=len(steps), desc="Pipeline Steps", unit="step") as pbar:
            self.log("Starting pipeline execution...")
            for step_name, step_func in steps:
                self.log(f"Starting step: {step_name}")
                step_func()
                pbar.update(1)
                self.log(f"Completed step: {step_name}")
        
        self.log("Predicting Test Set...")
        predictions = self.predict_test()
        self.log("Saving Submission...")
        self.save_submission(predictions)
        self.save_metrics_log()
        
        overall_elapsed = time.time() - overall_start
        self.log("Pipeline execution complete.")
        self.log(f"Total pipeline time: {overall_elapsed:.2f} sec")            

In [5]:
if __name__ == "__main__":
    # Load the datasets.
    train = pd.read_csv("/kaggle/input/playground-series-s5e2/train.csv", index_col='id')
    train_extra = pd.read_csv("/kaggle/input/playground-series-s5e2/training_extra.csv", index_col='id')
    train = pd.concat([train, train_extra], axis=0, ignore_index=True)
    test = pd.read_csv("/kaggle/input/playground-series-s5e2/test.csv", index_col='id')

    # Define target and feature columns.
    target = "Price"
    features = [col for col in train.columns if col != target]
    
    # Define categorical columns (example: all columns except Price and Weight Capacity)
    cats = [col for col in train.columns if col not in [target, "Weight Capacity (kg)"]]
    
    # Initialize and run the pipeline.
    pipeline = XGBPipeline(train=train, test=test, target=target, features=features, cats=cats, n_trials=20)
    pipeline.run_pipeline()

Pipeline Steps:   0%|          | 0/4 [00:00<?, ?step/s]

Starting pipeline execution...
Starting step: Preprocessing Data
Preprocessing done with elapsed time: 16.14 sec
Completed step: Preprocessing Data
Starting step: Hyperparameter Tuning XGB
Starting hyperparameter tuning with Optuna...


[I 2025-02-11 13:07:27,962] A new study created in memory with name: no-name-618a6e0d-13e6-4b5d-a44d-c62344d1636b
[I 2025-02-11 13:07:50,513] Trial 0 finished with value: 38.64462384426037 and parameters: {'max_depth': 5, 'learning_rate': 0.039064931247989765, 'min_child_weight': 27, 'subsample': 0.9988387461317092, 'colsample_bytree': 0.9162675436811903, 'n_estimators': 1348, 'reg_alpha': 0.6939494127788651, 'reg_lambda': 0.09971202015881075}. Best is trial 0 with value: 38.64462384426037.
[I 2025-02-11 13:08:19,697] Trial 1 finished with value: 38.64522548801581 and parameters: {'max_depth': 6, 'learning_rate': 0.019303597875226192, 'min_child_weight': 99, 'subsample': 0.7032404541249047, 'colsample_bytree': 0.8928867851984766, 'n_estimators': 1449, 'reg_alpha': 0.9312793529235397, 'reg_lambda': 0.31483643447108833}. Best is trial 0 with value: 38.64462384426037.
[I 2025-02-11 13:08:46,609] Trial 2 finished with value: 38.643992885906215 and parameters: {'max_depth': 4, 'learning_rat

Hyperparameter tuning for XGB complete. Best params: {'max_depth': 4, 'learning_rate': 0.03865685860825404, 'min_child_weight': 5, 'subsample': 0.7556269705169771, 'colsample_bytree': 0.8047425498887735, 'n_estimators': 769, 'reg_alpha': 0.707179451641986, 'reg_lambda': 0.4843437108098305}
Best CV RMSE: 38.643992885906215
Hyperparameter tuning for XGB done with elapsed time: 498.45 sec
Completed step: Hyperparameter Tuning XGB
Starting step: Hyperparameter Tuning CAT
Starting hyperparameter tuning with Optuna...


[I 2025-02-11 13:15:46,291] A new study created in memory with name: no-name-75d08865-a6cc-44ee-94f6-d2dae2f1cb77
[I 2025-02-11 13:16:01,836] Trial 0 finished with value: 38.6731916241865 and parameters: {'iterations': 195, 'depth': 9, 'learning_rate': 0.015894890727746157, 'random_strength': 48, 'bagging_temperature': 0.03256703921058675, 'od_type': 'IncToDec'}. Best is trial 0 with value: 38.6731916241865.
[I 2025-02-11 13:16:17,646] Trial 1 finished with value: 38.725630709277645 and parameters: {'iterations': 285, 'depth': 10, 'learning_rate': 0.0854842305044913, 'random_strength': 33, 'bagging_temperature': 20.61760773944071, 'od_type': 'IncToDec'}. Best is trial 0 with value: 38.6731916241865.
[I 2025-02-11 13:16:25,622] Trial 2 finished with value: 38.73597006634366 and parameters: {'iterations': 79, 'depth': 4, 'learning_rate': 0.12710994152118293, 'random_strength': 84, 'bagging_temperature': 24.83233097822276, 'od_type': 'Iter'}. Best is trial 0 with value: 38.6731916241865.


Hyperparameter tuning for cat complete. Best params: {'iterations': 229, 'depth': 5, 'learning_rate': 0.1702767698454954, 'random_strength': 43, 'bagging_temperature': 0.09060215385672248, 'od_type': 'Iter'}
Best CV RMSE: 38.64393786165989
Hyperparameter tuning for cat done with elapsed time: 223.88 sec
Completed step: Hyperparameter Tuning CAT
Starting step: Training Final Models
Training XGBoost...


XGB Training:   0%|          | 0/769 [00:00<?, ?iter/s]

XGBoost hold-out RMSE: 38.6400 (Best Iteration: 768)
Training LightGBM...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1450
[LightGBM] [Info] Number of data points in the train set: 3195454, number of used features: 18
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 18 dense feature groups (60.95 MB) transferred to GPU in 0.057171 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 81.361311
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[762]	valid_0's rmse: 38.6377	valid_0's l2: 1492.87
LightGBM hold-out RMSE: 38.6377
Training catboost Regression...
CatBoost hold-out RMSE: 38.6391 (Best Iteration: 228)
Training stacking ensemble...
Stacking Ensemble hold-out RMSE: 38.6155
Final m

In [6]:
# tree_method = "hist", device = "cuda"