## 1. Import libraries

In [3]:
import xgboost as xgb
print(xgb.__version__)
import numpy as np
import pandas as pd
import optuna
import mlflow
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import mlflow.xgboost

3.1.3


## 2. Load processed datasets

In [4]:
train_df = pd.read_csv("/Users/leduongkhoa/Regression_MachineLearning_End2End/data/processed/encoded_train.csv")
eval_df = pd.read_csv("/Users/leduongkhoa/Regression_MachineLearning_End2End/data/processed/encoded_eval.csv")

In [5]:
target = "price"
X_train, y_train = train_df.drop(columns=[target]), train_df[target]
X_eval, y_eval = eval_df.drop(columns=[target]), eval_df[target]

print("Train shape: ", X_train.shape, y_train.shape)
print("Eval shape: ", X_eval.shape, y_eval.shape)

Train shape:  (578878, 39) (578878,)
Eval shape:  (148697, 39) (148697,)


## 3. Define Optuna objective function and MLflow

In [6]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "random_state": 42,
        "n_jobs": -1,
        "tree_method": "hist"
    }

    with mlflow.start_run(nested=True):
        model = XGBRegressor(**params)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_eval)
        rmse = float(np.sqrt(mean_squared_error(y_eval, y_pred)))
        mae = float(mean_absolute_error(y_eval, y_pred))
        r2 = float(r2_score(y_eval, y_pred))

        # log parameters and metrics to MLflow
        mlflow.log_params(params)
        mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})

    return rmse

## 4. Run Optuna with MLflow

In [7]:
mlflow.set_tracking_uri("/Users/leduongkhoa/Regression_MachineLearning_End2End/mlruns")
mlflow.set_experiment("XGBoost_Hyperparameter_Tuning")

study = optuna.create_study(direction="minimize", study_name="XGBoost_Hyperparameter_Tuning")
study.optimize(objective, n_trials=15)

print(f"Best params: {study.best_trial.params}")

  return FileStore(store_uri, store_uri)
2026/02/05 21:25:50 INFO mlflow.tracking.fluent: Experiment with name 'XGBoost_Hyperparameter_Tuning' does not exist. Creating a new experiment.
[32m[I 2026-02-05 21:25:50,500][0m A new study created in memory with name: XGBoost_Hyperparameter_Tuning[0m
[32m[I 2026-02-05 21:26:05,640][0m Trial 0 finished with value: 83366.14424146038 and parameters: {'n_estimators': 440, 'max_depth': 5, 'learning_rate': 0.010635112726326936, 'subsample': 0.5976154599632177, 'colsample_bytree': 0.8566501889270856, 'min_child_weight': 5, 'gamma': 4.354451368269224, 'reg_alpha': 0.36546550061165844, 'reg_lambda': 1.1218879006531735e-05}. Best is trial 0 with value: 83366.14424146038.[0m
[32m[I 2026-02-05 21:26:40,190][0m Trial 1 finished with value: 67663.46003956138 and parameters: {'n_estimators': 751, 'max_depth': 8, 'learning_rate': 0.05404603338274638, 'subsample': 0.8722726049026706, 'colsample_bytree': 0.5605552743117814, 'min_child_weight': 6, 'gamm

Best params: {'n_estimators': 996, 'max_depth': 10, 'learning_rate': 0.028310362529526935, 'subsample': 0.8928352731168709, 'colsample_bytree': 0.5052593737039531, 'min_child_weight': 8, 'gamma': 0.751548299603698, 'reg_alpha': 5.307817810496198e-08, 'reg_lambda': 7.5453068305914215}


## 5. Train the model with the best params and log into MLflow

In [8]:
best_params = study.best_trial.params
best_model = XGBRegressor(**best_params)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_eval)

rmse = float(np.sqrt(mean_squared_error(y_eval, y_pred)))
mae = float(mean_absolute_error(y_eval, y_pred))
r2 = float(r2_score(y_eval, y_pred))

print(f"Best Model Performance on Eval Set:\nRMSE: {rmse}\nMAE: {mae}\nR2: {r2}")

with mlflow.start_run(run_name="Best_XGBoost_Model"):
    mlflow.log_params(best_params)
    mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})
    mlflow.xgboost.log_model(best_model, artifact_path="model")



Best Model Performance on Eval Set:
RMSE: 68632.15363535204
MAE: 30543.184797070266
R2: 0.9635907364673416
