In [1]:
import numpy as np

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import cross_validate
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import HistGradientBoostingRegressor

In [2]:
# Load the California housing dataset as pandas objects (features, target).
data, target = fetch_california_housing(
    as_frame=True,
    return_X_y=True,
)
# Rescale the target so that it is expressed in k$.
target = target * 100

In [3]:
# Baseline: gradient boosting with 200 trees, trained on the raw features.
# random_state is pinned so that repeated runs of this cell produce the same
# scores, consistent with the seeded HistGradientBoostingRegressor that this
# notebook compares against.
gradient_boosting = GradientBoostingRegressor(n_estimators=200, random_state=0)

# Cross-validate with the default splitting strategy. The scorer returns the
# *negated* mean absolute error, so the reporting code flips the sign back.
cv_results_gbdt = cross_validate(
    gradient_boosting,
    data,
    target,
    scoring="neg_mean_absolute_error",
    n_jobs=2,
)

In [4]:
# Summarise the cross-validation of the plain gradient boosting model:
# error (sign flipped back from the negated scorer) and average timings.
gbdt_test_scores = cv_results_gbdt["test_score"]
gbdt_mean_fit_time = cv_results_gbdt["fit_time"].mean()
gbdt_mean_score_time = cv_results_gbdt["score_time"].mean()

print("Gradient Boosting Decision Tree")
print(
    "Mean absolute error via cross-validation: "
    f"{-gbdt_test_scores.mean():.3f} ± "
    f"{gbdt_test_scores.std():.3f} k$"
)
print(f"Average fit time: {gbdt_mean_fit_time:.3f} seconds")
print(f"Average score time: {gbdt_mean_score_time:.3f} seconds")

Gradient Boosting Decision Tree
Mean absolute error via cross-validation: 46.442 ± 2.915 k$
Average fit time: 11.234 seconds
Average score time: 0.006 seconds


In [5]:
# Bin every feature into up to 256 quantile-based bins, encoded as ordinal
# bin indices.
discretizer = KBinsDiscretizer(
    strategy="quantile",
    encode="ordinal",
    n_bins=256,
)
# Fit on the full dataset, then transform it; the binned array is shown as
# the cell output.
data_trans = discretizer.fit(data).transform(data)
data_trans



array([[249.,  39., 231., ...,  83., 162.,  30.],
       [248.,  19., 203., ...,  28., 161.,  30.],
       [242.,  49., 249., ..., 125., 160.,  29.],
       ...,
       [ 17.,  15., 126., ...,  49., 200.,  82.],
       [ 23.,  16., 136., ...,  29., 200.,  77.],
       [ 53.,  14., 130., ...,  93., 199.,  81.]])

In [6]:
# Number of distinct bin indices actually present in each column of the
# binned data.
[
    np.unique(data_trans[:, col_idx]).size
    for col_idx in range(data_trans.shape[1])
]

[256, 50, 256, 253, 256, 256, 207, 235]

In [7]:
# Same 200-tree gradient boosting model, but fed pre-binned features: the
# discretizer is a pipeline step, so binning is part of the estimator that
# cross_validate fits on each split.
# random_state is pinned so that repeated runs of this cell produce the same
# scores, consistent with the seeded HistGradientBoostingRegressor that this
# notebook compares against.
# NOTE: this rebinds `gradient_boosting` and `cv_results_gbdt`, replacing the
# objects assigned by the earlier un-binned baseline cell.
gradient_boosting = make_pipeline(
    discretizer,
    GradientBoostingRegressor(n_estimators=200, random_state=0),
)

# The scorer returns the *negated* mean absolute error, so the reporting code
# flips the sign back.
cv_results_gbdt = cross_validate(
    gradient_boosting,
    data,
    target,
    scoring="neg_mean_absolute_error",
    n_jobs=2,
)

In [8]:
# Summarise the cross-validation of the discretizer + gradient boosting
# pipeline: error (sign flipped back from the negated scorer) and timings.
binned_gbdt_test_scores = cv_results_gbdt["test_score"]
binned_gbdt_mean_fit_time = cv_results_gbdt["fit_time"].mean()
binned_gbdt_mean_score_time = cv_results_gbdt["score_time"].mean()

print("Gradient Boosting Decision Tree with KBinsDiscretizer")
print(
    "Mean absolute error via cross-validation: "
    f"{-binned_gbdt_test_scores.mean():.3f} ± "
    f"{binned_gbdt_test_scores.std():.3f} k$"
)
print(f"Average fit time: {binned_gbdt_mean_fit_time:.3f} seconds")
print(f"Average score time: {binned_gbdt_mean_score_time:.3f} seconds")

Gradient Boosting Decision Tree with KBinsDiscretizer
Mean absolute error via cross-validation: 45.764 ± 2.029 k$
Average fit time: 3.405 seconds
Average score time: 0.008 seconds


In [9]:
# Histogram-based gradient boosting with 200 boosting iterations and a fixed
# seed.
histogram_gradient_boosting = HistGradientBoostingRegressor(
    random_state=0,
    max_iter=200,
)

# Cross-validate with the same scorer and parallelism as the other models;
# the scorer returns the negated mean absolute error.
cv_results_hgbdt = cross_validate(
    estimator=histogram_gradient_boosting,
    X=data,
    y=target,
    n_jobs=2,
    scoring="neg_mean_absolute_error",
)

In [10]:
# Summarise the cross-validation of the histogram gradient boosting model:
# error (sign flipped back from the negated scorer) and average timings.
hgbdt_test_scores = cv_results_hgbdt["test_score"]
hgbdt_mean_fit_time = cv_results_hgbdt["fit_time"].mean()
hgbdt_mean_score_time = cv_results_hgbdt["score_time"].mean()

print("Histogram Gradient Boosting Decision Tree")
print(
    "Mean absolute error via cross-validation: "
    f"{-hgbdt_test_scores.mean():.3f} ± "
    f"{hgbdt_test_scores.std():.3f} k$"
)
print(f"Average fit time: {hgbdt_mean_fit_time:.3f} seconds")
print(f"Average score time: {hgbdt_mean_score_time:.3f} seconds")

Histogram Gradient Boosting Decision Tree
Mean absolute error via cross-validation: 43.758 ± 2.694 k$
Average fit time: 0.816 seconds
Average score time: 0.013 seconds
