In [1]:
import xgboost as xgb
print(xgb.__version__)

3.1.2


In [2]:
import sys, xgboost as xgb
print(sys.executable)        # should point to .../.venv/bin/python
print(xgb.__version__)       # should print 3.0.4
print(xgb.__file__)          # should live under .../.venv/...

f:\AI Projects\Regression Project\.venv\Scripts\python.exe
3.1.2
f:\AI Projects\Regression Project\.venv\Lib\site-packages\xgboost\__init__.py


In [3]:

# ==============================================
# 1. Imports
# ==============================================
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import optuna
import mlflow
import mlflow.xgboost

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# ==============================================
# 2. Load processed datasets
# ==============================================

train_df = pd.read_csv(r"F:\AI Projects\Regression Project\data\processed\feature_engineered_train.csv")
eval_df = pd.read_csv(r"F:\AI Projects\Regression Project\data\processed\feature_engineered_eval.csv")

# Define target + features
target = "price"
X_train, y_train = train_df.drop(columns=[target]), train_df[target]
X_eval, y_eval = eval_df.drop(columns=[target]), eval_df[target]

print(f'Train shape: {X_train.shape}')
print(f'Eval shape: {X_eval.shape}')

Train shape: (974026, 39)
Eval shape: (250982, 39)


In [20]:
# ==============================================
# 3. Define Optuna objective function with MLflow
# ==============================================

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
        "max_depth": trial.suggest_int("max_depth", 3,10),
         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "random_state": 42,
        "n_jobs": -1,
        "tree_method": "hist",
    }
    with mlflow.start_run(nested =True):
        model = XGBRegressor(**params)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_eval)
        rmse = float(mean_squared_error(y_eval, y_pred))
        mae  = float(mean_absolute_error(y_eval, y_pred))
        r2 = float(r2_score(y_eval, y_pred))

        # Log hyperparameters + metrics
        mlflow.log_params(params)
        mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2_score": r2})

    return rmse

In [21]:
# ==============================================
# 4. Run Optuna study with MLflow
# ==============================================
# Force MLflow to always use the root project mlruns folder

mlflow.set_tracking_uri("file:///f:/AI Projects/Regression Project/mlruns")
mlflow.set_experiment("xgboost_optuna_housing")

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=15)

print("Best params:", study.best_trial.params)

2025/12/24 19:37:53 INFO mlflow.tracking.fluent: Experiment with name 'xgboost_optuna_housing' does not exist. Creating a new experiment.
[I 2025-12-24 19:37:53,957] A new study created in memory with name: no-name-649e6c54-f9b4-4fd6-9a9c-587d1091fd63
[I 2025-12-24 19:40:04,174] Trial 0 finished with value: 8998278045.45576 and parameters: {'n_estimators': 778, 'max_depth': 3, 'learning_rate': 0.12168983441150313, 'subsample': 0.5886078348143937, 'colsample_bytree': 0.5361421349580469, 'min_child_weight': 9, 'gamma': 1.6623662889606634, 'reg_alpha': 0.00021479813095301415, 'reg_lambda': 9.252693351189392e-08}. Best is trial 0 with value: 8998278045.45576.
[I 2025-12-24 19:41:21,387] Trial 1 finished with value: 10702732503.523022 and parameters: {'n_estimators': 480, 'max_depth': 3, 'learning_rate': 0.11459014999805744, 'subsample': 0.9832575274858409, 'colsample_bytree': 0.5348239635537748, 'min_child_weight': 6, 'gamma': 3.5522770280785725, 'reg_alpha': 9.170486180661228e-05, 'reg_la

Best params: {'n_estimators': 213, 'max_depth': 5, 'learning_rate': 0.24077048518116537, 'subsample': 0.5019119386130505, 'colsample_bytree': 0.9862641504647774, 'min_child_weight': 10, 'gamma': 2.798177678426236, 'reg_alpha': 0.024612582488105463, 'reg_lambda': 1.8470932131300654e-08}


In [23]:
# ==============================================
# 5. Train final model with best params and log to MLflow
# ==============================================
# Train final model with best params
best_params = study.best_trial.params

best_model = XGBRegressor(**best_params)
best_model.fit(X_train, y_train)

# Predictions
y_pred = best_model.predict(X_eval)

# Metrics
mae  = mean_absolute_error(y_eval, y_pred)
rmse = np.sqrt(mean_squared_error(y_eval, y_pred))
r2   = r2_score(y_eval, y_pred)

print("Final tuned model performance:")
print("MAE:", mae)
print("RMSE:", rmse)
print("R²:", r2)

# Log final model to MLflow (CORRECT)
with mlflow.start_run(run_name="best_xgboost_model"):
    mlflow.log_params(best_params)
    mlflow.log_metrics({
        "rmse": rmse,
        "mae": mae,
        "r2": r2
    })
    mlflow.sklearn.log_model(best_model, artifact_path="model")




Final tuned model performance:
MAE: 43253.01521893009
RMSE: 98997.21387541447
R²: 0.9523168801785092


