In [1]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_selector as selector
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Read the census table, then separate the feature columns from the column
# to predict.
adult_census = pd.read_csv("datasets/adult-census.csv")
target_column = "class"

# "education-num" is removed from the features along with the target
# (presumably because it duplicates the "education" column - TODO confirm).
columns_to_drop = [target_column, "education-num"]
X = adult_census.drop(columns=columns_to_drop)
y = adult_census[target_column]

In [3]:
# Columns stored with the ``object`` dtype are the ones handled as
# categorical features.
categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(X)

# Ordinal encoding; any category that was not seen during ``fit`` is mapped
# to the sentinel value -1 instead of raising an error.
categorical_preprocessor = OrdinalEncoder(
    unknown_value=-1,
    handle_unknown="use_encoded_value",
)

# Encode the categorical columns and forward every remaining column
# unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat_preprocessor", categorical_preprocessor, categorical_columns),
    ],
    remainder="passthrough",
)

In [4]:
# Baseline pipeline: preprocessing followed by a gradient-boosting classifier
# with a fixed seed and a hand-picked ``max_leaf_nodes``.
classifier = HistGradientBoostingClassifier(max_leaf_nodes=4, random_state=42)
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", classifier),
    ]
)
model

In [5]:
# 5-fold cross-validation of the pipeline with its fixed (untuned)
# hyperparameters; this is the reference score for the tuned model.
cv_results = pd.DataFrame(cross_validate(model, X, y, cv=5))
baseline_scores = cv_results["test_score"]

print(
    "Generalization score without hyperparameters tuning:\n"
    f"{baseline_scores.mean():.3f} ± {baseline_scores.std():.3f}"
)

Generalization score without hyperparameters tuning:
0.863 ± 0.003


In [6]:
# Candidate values for the classifier step; the ``classifier__`` prefix
# routes each parameter to the pipeline step named "classifier".
param_grid = dict(
    classifier__learning_rate=(0.05, 0.5),
    classifier__max_leaf_nodes=(10, 30),
)

# Exhaustive search over the grid, scored with 2-fold cross-validation and
# run on two workers.
model_grid_search = GridSearchCV(
    estimator=model, param_grid=param_grid, cv=2, n_jobs=2
).fit(X, y)

# Keep only the columns needed to compare the candidates.
summary_columns = [
    "param_classifier__learning_rate",
    "param_classifier__max_leaf_nodes",
    "mean_test_score",
    "std_test_score",
    "rank_test_score",
]
cv_results = pd.DataFrame(model_grid_search.cv_results_)
cv_results[summary_columns]

Unnamed: 0,param_classifier__learning_rate,param_classifier__max_leaf_nodes,mean_test_score,std_test_score,rank_test_score
0,0.05,10,0.864195,6.1e-05,4
1,0.05,30,0.87091,6.1e-05,1
2,0.5,10,0.869743,0.000532,2
3,0.5,30,0.866058,0.001515,3


In [7]:
# Display the hyperparameter combination selected by the grid search fitted
# above.
model_grid_search.best_params_

{'classifier__learning_rate': 0.05, 'classifier__max_leaf_nodes': 30}

In [8]:
# Keep 20% of the samples aside as a test set; the seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Run the grid search again using the training part only, then score the
# fitted search on the held-out samples.
accuracy = model_grid_search.fit(X_train, y_train).score(X_test, y_test)
print(f"Accuracy on test set: {accuracy:.3f}")

Accuracy on test set: 0.877


In [9]:
# Nested cross-validation: the whole grid search is cross-validated, so the
# hyperparameters are tuned again inside each of the 5 outer folds. The
# fitted search of every fold is kept for later inspection.
cv_results = pd.DataFrame(
    cross_validate(
        estimator=model_grid_search,
        X=X,
        y=y,
        cv=5,
        n_jobs=2,
        return_estimator=True,
    )
)
cv_test_scores = cv_results["test_score"]

score_mean = cv_test_scores.mean()
score_std = cv_test_scores.std()
print(
    "Generalization score with hyperparameters tuning:\n"
    f"{score_mean:.3f} ± {score_std:.3f}"
)

Generalization score with hyperparameters tuning:
0.871 ± 0.003


In [10]:
# Report the hyperparameters selected by the fitted search of each outer
# fold, numbering the folds from 1.
for fold_number, fold_search in enumerate(cv_results["estimator"], start=1):
    print(
        f"Best hyperparameters for fold #{fold_number}:\n"
        f"{fold_search.best_params_}"
    )

Best hyperparameters for fold #1:
{'classifier__learning_rate': 0.05, 'classifier__max_leaf_nodes': 30}
Best hyperparameters for fold #2:
{'classifier__learning_rate': 0.05, 'classifier__max_leaf_nodes': 30}
Best hyperparameters for fold #3:
{'classifier__learning_rate': 0.05, 'classifier__max_leaf_nodes': 30}
Best hyperparameters for fold #4:
{'classifier__learning_rate': 0.05, 'classifier__max_leaf_nodes': 30}
Best hyperparameters for fold #5:
{'classifier__learning_rate': 0.05, 'classifier__max_leaf_nodes': 30}
