In [1]:
import pandas as pd
import seaborn as sns
from scipy.stats import loguniform

from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Read the census table and separate the column to predict from the features.
target_column = "class"
adult_census = pd.read_csv("datasets/adult-census.csv")
target = adult_census[target_column]

# The features are every column except the target and "education-num".
dropped_columns = [target_column, "education-num"]
data = adult_census.drop(columns=dropped_columns)
data.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
4,18,?,Some-college,Never-married,?,Own-child,White,Female,0,0,30,United-States


In [3]:
# Hold out a test set; the fixed seed keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    random_state=42,
)

In [4]:
# Columns with object dtype are the ones routed through the ordinal encoder.
categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(data)

# Categories that were not seen during fit are encoded as -1.
categorical_preprocessor = OrdinalEncoder(
    unknown_value=-1, handle_unknown="use_encoded_value"
)

# Encode the selected columns; every other column is passed through as-is.
categorical_step = (
    "cat_preprocessor",
    categorical_preprocessor,
    categorical_columns,
)
preprocessor = ColumnTransformer([categorical_step], remainder="passthrough")

# The step names below are the prefixes used in the hyperparameter search keys.
hgb_classifier = HistGradientBoostingClassifier(
    max_leaf_nodes=4, random_state=42
)
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", hgb_classifier),
    ]
)
model

In [5]:
class loguniform_int:
    """Log-uniform distribution whose samples are cast to integers.

    Wraps ``loguniform(a, b)`` and exposes an ``rvs`` method, so an instance
    can be used as an entry of a ``param_distributions`` mapping.
    """

    def __init__(self, a, b):
        # Frozen continuous distribution that does the actual sampling.
        self._distribution = loguniform(a, b)

    def rvs(self, *args, **kwargs):
        """Draw samples and cast them to ``int``.

        All positional and keyword arguments are forwarded unchanged to the
        wrapped distribution's ``rvs``.
        """
        samples = self._distribution.rvs(*args, **kwargs)
        return samples.astype(int)

In [6]:
# Search space for the classifier step: real-valued hyperparameters are drawn
# from a log-uniform distribution, integer-valued ones from its integer variant.
param_distributions = {
    "classifier__l2_regularization": loguniform(1e-6, 1e3),
    "classifier__learning_rate": loguniform(0.001, 10),
    "classifier__max_leaf_nodes": loguniform_int(2, 256),
    "classifier__min_samples_leaf": loguniform_int(1, 100),
    "classifier__max_bins": loguniform_int(2, 255),
}

model_random_search = RandomizedSearchCV(
    model,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    verbose=1,
    # Seed the candidate sampling so a fresh "Restart & Run All" draws the
    # same hyperparameter combinations and reports the same best model.
    random_state=42,
)
model_random_search.fit(X_train, y_train)

# Score the refitted best candidate on the held-out test set.
accuracy = model_random_search.score(X_test, y_test)
print(f"The test accuracy score of the best model is {accuracy:.2f}")

print("The best parameters are:")
print(model_random_search.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
The test accuracy score of the best model is 0.87
The best parameters are:
{'classifier__l2_regularization': 133.0367865872085, 'classifier__learning_rate': 0.29581799674425313, 'classifier__max_bins': 136, 'classifier__max_leaf_nodes': 3, 'classifier__min_samples_leaf': 8}


In [7]:
# Columns to keep: one "param_<name>" column per searched hyperparameter,
# followed by the cross-validation score summary.
score_columns = ["mean_test_score", "std_test_score", "rank_test_score"]
column_results = [f"param_{name}" for name in param_distributions]
column_results = column_results + score_columns

# Best-scoring candidates first.
all_cv_results = pd.DataFrame(model_random_search.cv_results_)
cv_results = all_cv_results[column_results].sort_values(
    by="mean_test_score", ascending=False
)


def shorten_param(param_name):
    """Return the part of ``param_name`` after its last ``"__"``.

    Names that contain no ``"__"`` are returned unchanged.
    """
    if "__" not in param_name:
        return param_name
    *_, short_name = param_name.rsplit("__", 1)
    return short_name


# Label each column with only the part of its name after the last "__".
cv_results = cv_results.rename(columns=shorten_param)
cv_results

Unnamed: 0,l2_regularization,learning_rate,max_leaf_nodes,min_samples_leaf,max_bins,mean_test_score,std_test_score,rank_test_score
8,133.036787,0.295818,3,8,136,0.858126,0.002475,1
6,0.003466,0.027608,17,85,2,0.801589,0.003232,2
1,3e-06,0.008736,5,32,115,0.798122,0.002207,3
3,0.000496,2.070986,162,41,27,0.797412,0.005929,4
0,3.208514,0.00599,6,10,61,0.796156,0.003565,5
5,723.535018,5.764361,94,1,20,0.772788,0.005676,6
4,401.659159,0.016206,10,17,4,0.765281,0.005793,7
2,3e-06,0.003474,4,2,2,0.758947,1.3e-05,8
9,0.023296,0.001028,116,21,4,0.758947,1.3e-05,8
7,34.148505,2.866186,3,56,15,0.714176,0.005068,10


In [8]:
# Kept for reference (not executed here): a larger search with 500 candidates,
# presumably the one that produced the CSV file loaded in the next cell.
#
# model_random_search = RandomizedSearchCV(
#     model, param_distributions=param_distributions, n_iter=500,
#     n_jobs=2, cv=5)
# model_random_search.fit(X_train, y_train)
# cv_results = pd.DataFrame(model_random_search.cv_results_)
# cv_results.to_csv("results/randomized_search_results.csv")

In [9]:
# Load search results stored on disk; the first CSV column is the row index.
results_path = "results/randomized_search_results.csv"
cv_results = pd.read_csv(results_path, index_col=0)

# Same column selection and short labels as before, best candidates first.
selected_results = cv_results[column_results].rename(columns=shorten_param)
selected_results.sort_values("mean_test_score", ascending=False)

Unnamed: 0,l2_regularization,learning_rate,max_leaf_nodes,min_samples_leaf,max_bins,mean_test_score,std_test_score,rank_test_score
208,0.011775,0.076653,24,2,155,0.871393,0.001588,1
343,0.000404,0.244503,15,15,229,0.871339,0.002741,2
21,4.994918,0.077047,53,7,192,0.870793,0.001993,3
328,2.036232,0.224702,28,49,236,0.869837,0.000808,4
327,4.733808,0.036786,61,5,241,0.869673,0.002417,5
...,...,...,...,...,...,...,...,...
232,0.000097,9.976823,28,5,3,0.448205,0.253714,496
413,0.000001,8.828574,64,1,144,0.448205,0.253714,497
344,0.000003,7.091079,5,1,95,0.448205,0.253714,497
200,0.000444,6.236325,2,2,30,0.344629,0.207156,499
