In [10]:
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna

from sklearn.metrics import mean_squared_error
from optuna.integration import XGBoostPruningCallback
from sklearn.model_selection import KFold, cross_val_score

In [2]:
X_train = pd.read_csv("../01_data/02_processed/production/X_train_nonlinear.csv")
y_train = pd.read_csv("../01_data/02_processed/production/y_train.csv").squeeze()

X_test = pd.read_csv("../01_data/02_processed/production/X_test_nonlinear.csv")
y_test = pd.read_csv("../01_data/02_processed/production/y_test.csv").squeeze()

print(X_train.shape, X_test.shape)

(34754, 26) (8689, 26)


In [14]:
def objective(trial):

    params = {
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.25),
        "n_estimators": trial.suggest_int("n_estimators", 400, 1200),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 5),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 5),
        "random_state": 42,
        "n_jobs": -1,
        "tree_method": "hist"
    }

    model = xgb.XGBRegressor(**params)

    cv = KFold(n_splits=3, shuffle=True, random_state=42)

    scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=cv,
        scoring="neg_root_mean_squared_error",
        n_jobs=-1
    )

    return -scores.mean()


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=120, timeout=600)

[32m[I 2026-02-26 16:43:36,361][0m A new study created in memory with name: no-name-9003217c-123b-44ee-adab-01099472eedb[0m
[32m[I 2026-02-26 16:43:38,192][0m Trial 0 finished with value: 23.490358352661133 and parameters: {'max_depth': 5, 'learning_rate': 0.14563063805133183, 'n_estimators': 931, 'min_child_weight': 6, 'subsample': 0.835071825545822, 'colsample_bytree': 0.6348920358456533, 'gamma': 4.644666543660408, 'reg_alpha': 1.0918860003043662, 'reg_lambda': 4.359236753260648}. Best is trial 0 with value: 23.490358352661133.[0m
[32m[I 2026-02-26 16:43:38,844][0m Trial 1 finished with value: 23.727603276570637 and parameters: {'max_depth': 3, 'learning_rate': 0.18236121160149749, 'n_estimators': 471, 'min_child_weight': 10, 'subsample': 0.9507142672818957, 'colsample_bytree': 0.6025954161854915, 'gamma': 4.305472904257272, 'reg_alpha': 4.747532951484455, 'reg_lambda': 3.9940743299879804}. Best is trial 0 with value: 23.490358352661133.[0m
[32m[I 2026-02-26 16:43:40,955]

In [15]:
print("Optuna_CV_RMSE:", study.best_value)

print("Optuna_Best_Params:")
for k, v in study.best_params.items():
    print(f"{k}: {v}")

best_params = study.best_params

Optuna_CV_RMSE: 22.082878748575848
Optuna_Best_Params:
max_depth: 9
learning_rate: 0.010060639856494418
n_estimators: 781
min_child_weight: 9
subsample: 0.9489683862116686
colsample_bytree: 0.8242906843023347
gamma: 1.312471976818824
reg_alpha: 3.5732574140884252
reg_lambda: 4.255828376719713


In [16]:
final_model = xgb.XGBRegressor(
    **best_params,
    random_state=42,
    n_jobs=-1,
    tree_method="hist"
)

final_model.fit(X_train, y_train)


# 예측
train_pred = final_model.predict(X_train)
test_pred  = final_model.predict(X_test)

train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
test_rmse  = np.sqrt(mean_squared_error(y_test, test_pred))

print(f"Train_RMSE : {train_rmse:.3f}")
print(f"Test_RMSE : {test_rmse:.3f}")

Train_RMSE : 19.927
Test_RMSE : 21.997
