In [12]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import optuna

In [2]:
X, y = fetch_california_housing(return_X_y=True, as_frame=False)
print("Dataset shape:", X.shape)

Dataset shape: (20640, 8)


In [3]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

def objective(trial: optuna.Trial) -> float:
    n_estimators = trial.suggest_int("n_estimators", 100, 1000, step=100)
    max_depth = trial.suggest_int("max_depth", 5, 35)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    max_features = trial.suggest_float("max_features", 0.2, 1.0, step=0.1)

    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        n_jobs=-1,
        random_state=42
    )

    scores = []
    for train_idx, valid_idx in kf.split(X):
        X_train, X_valid = X[train_idx], X[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

        rf.fit(X_train, y_train)
        preds = rf.predict(X_valid)
        mse = mean_squared_error(y_valid, preds)
        scores.append(mse)

        trial.report(np.mean(scores), step=len(scores))
        if trial.should_prune():
            raise optuna.TrialPruned()

    return np.mean(scores)

In [4]:
study = optuna.create_study(
    study_name="rf_regression_opt",
    direction="minimize",  # minimize MSE
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(),
)
study.optimize(objective, n_trials=50, n_jobs=-1)

[I 2025-11-12 08:29:11,860] A new study created in memory with name: rf_regression_opt
[I 2025-11-12 08:29:34,566] Trial 1 finished with value: 0.25521564462996793 and parameters: {'n_estimators': 100, 'max_depth': 33, 'min_samples_split': 18, 'min_samples_leaf': 4, 'max_features': 0.6000000000000001}. Best is trial 1 with value: 0.25521564462996793.
[I 2025-11-12 08:29:54,765] Trial 8 finished with value: 0.5778870721101559 and parameters: {'n_estimators': 700, 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 8, 'max_features': 0.2}. Best is trial 1 with value: 0.25521564462996793.
[I 2025-11-12 08:30:09,445] Trial 4 finished with value: 0.25633679397839815 and parameters: {'n_estimators': 200, 'max_depth': 16, 'min_samples_split': 17, 'min_samples_leaf': 2, 'max_features': 0.8}. Best is trial 1 with value: 0.25521564462996793.
[I 2025-11-12 08:30:21,326] Trial 5 finished with value: 0.26737656304002266 and parameters: {'n_estimators': 300, 'max_depth': 19, 'min_samples_spl

In [7]:
print("Best trial:")
print(f"  Value (MSE): {study.best_value}")
print(f"  Params: {study.best_params}")

Best trial:
  Value (MSE): 0.2350993214394105
  Params: {'n_estimators': 400, 'max_depth': 25, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 0.4}


In [None]:
best_params = study.best_params
final_rf = RandomForestRegressor(
    **best_params,
    n_jobs=-1,
    random_state=42
)

# Huấn luyện model trên TOÀN BỘ dữ liệu X và y
final_rf.fit(X, y)

In [13]:
y_pred = final_rf.predict(X)
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)

In [14]:
print("\nFinal model performance on the entire dataset:")
print(f"  Mean Absolute Error (MAE): {mae}")
print(f"  Mean Squared Error (MSE): {mse}")
print(f"  R^2 Score: {r2}")


Final model performance on the entire dataset:
  Mean Absolute Error (MAE): 0.12324952756442711
  Mean Squared Error (MSE): 0.03635377507902487
  R^2 Score: 0.9726981586224783


In [11]:
from optuna.importance import get_param_importances
importances = get_param_importances(study)
print("\nParam importances:")
for p, imp in importances.items():
    print(f"{p:25s}: {imp:.3f}")


Param importances:
max_depth                : 0.364
max_features             : 0.304
min_samples_split        : 0.241
min_samples_leaf         : 0.075
n_estimators             : 0.016
