In [1]:
import pandas as pd
from scipy.stats import loguniform

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

In [2]:
# Fetch the California housing features and target as pandas objects.
data, target = fetch_california_housing(as_frame=True, return_X_y=True)

# Multiply the target by 100 so that it is expressed in k$.
target *= 100

# Hold out a test set; the fixed seed keeps the split identical across runs.
(data_train, data_test, target_train, target_test) = train_test_split(
    data, target, random_state=0
)

In [3]:
# The number of features is the number of columns of the feature frame.
n_features = data.shape[1]
print(f"In this case, n_features={n_features}")

In this case, n_features=8


In [4]:
# Candidate values explored by the randomized search for the random forest.
# `None` is one of the candidates for both `max_features` and
# `max_leaf_nodes`.
param_distributions = {
    "max_features": [1, 2, 3, 5, None],
    "max_leaf_nodes": [10, 100, 1000, None],
    "min_samples_leaf": [1, 2, 5, 10, 20, 50, 100],
}
search_cv = RandomizedSearchCV(
    # Seed the forest as well: the search's own `random_state` only fixes
    # which candidates are drawn, not the randomness inside each fitted
    # forest. Without it the scores below change on every run.
    RandomForestRegressor(n_jobs=2, random_state=0),
    param_distributions=param_distributions,
    scoring="neg_mean_absolute_error",
    n_iter=10,
    random_state=0,
    n_jobs=2,
)
search_cv.fit(data_train, target_train)

# Build a summary table with one row per sampled candidate. The scorer is a
# negated error, so the sign of the mean score is flipped to read it as a
# mean absolute error (in k$, like the target).
columns = [f"param_{name}" for name in param_distributions]
columns += ["mean_test_error", "std_test_error"]
cv_results = pd.DataFrame(search_cv.cv_results_).assign(
    mean_test_error=lambda df: -df["mean_test_score"],
    std_test_error=lambda df: df["std_test_score"],
)
# Show the candidates from lowest to highest cross-validated error.
cv_results[columns].sort_values(by="mean_test_error")

Unnamed: 0,param_max_features,param_max_leaf_nodes,param_min_samples_leaf,mean_test_error,std_test_error
3,2.0,,2,33.923535,0.572301
0,2.0,1000.0,10,36.788562,0.541615
7,,,20,37.386687,0.39314
4,5.0,100.0,2,39.985603,0.679112
8,,100.0,10,40.557408,0.48924
6,,1000.0,50,40.875012,0.478954
9,1.0,100.0,2,49.533779,0.95596
2,1.0,100.0,1,49.890311,0.692592
5,1.0,,100,54.322665,0.817516
1,3.0,10.0,10,55.001764,0.746558


In [5]:
# The search was configured with a negated-error scorer, so flip the sign of
# the test score to report a positive error.
test_score = search_cv.score(data_test, target_test)
error = -test_score
message = (
    f"On average, our random forest regressor makes an error of {error:.2f} k$"
)
print(message)

On average, our random forest regressor makes an error of 33.56 k$


In [6]:
# Search space for the histogram gradient boosting model: a discrete list of
# tree sizes and a learning rate sampled log-uniformly between 0.01 and 1.
param_distributions = {
    "max_leaf_nodes": [2, 5, 10, 20, 50, 100],
    "learning_rate": loguniform(0.01, 1),
}
search_cv = RandomizedSearchCV(
    # Seed the estimator as well: with early stopping enabled it holds out an
    # internal validation split, which is drawn at random unless
    # `random_state` is set. Without it the scores below change between runs.
    HistGradientBoostingRegressor(
        max_iter=1000, early_stopping=True, random_state=0
    ),
    param_distributions=param_distributions,
    scoring="neg_mean_absolute_error",
    n_iter=20,
    random_state=0,
    n_jobs=2,
)
search_cv.fit(data_train, target_train)

# Build a summary table with one row per sampled candidate. The scorer is a
# negated error, so the sign of the mean score is flipped to read it as a
# mean absolute error (in k$, like the target).
columns = [f"param_{name}" for name in param_distributions]
columns += ["mean_test_error", "std_test_error"]
cv_results = pd.DataFrame(search_cv.cv_results_).assign(
    mean_test_error=lambda df: -df["mean_test_score"],
    std_test_error=lambda df: df["std_test_score"],
)
# Show the candidates from lowest to highest cross-validated error.
cv_results[columns].sort_values(by="mean_test_error")

Unnamed: 0,param_max_leaf_nodes,param_learning_rate,mean_test_error,std_test_error
0,100,0.125207,30.68806,0.370626
19,50,0.01864,30.688383,0.453431
16,20,0.067503,31.381214,0.820853
18,10,0.081715,32.281624,0.293779
3,10,0.176656,32.605956,0.172063
17,10,0.023587,32.714589,0.153414
12,100,0.39978,33.279988,0.296523
2,5,0.122961,33.319372,0.518559
7,5,0.061034,33.415712,0.398194
14,5,0.145895,33.513197,0.419145


In [7]:
# The search was configured with a negated-error scorer, so flip the sign of
# the test score to report a positive error.
test_score = search_cv.score(data_test, target_test)
error = -test_score
message = f"On average, our HGBT regressor makes an error of {error:.2f} k$"
print(message)

On average, our HGBT regressor makes an error of 29.73 k$
