The goal is to write an exhaustive search that finds the combination of
hyperparameters maximizing the model's generalization performance.

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split

# Name of the column to predict; every other retained column is a feature.
target_name = "class"

# Load the adult census dataset (path is relative to the notebook's directory).
adult_census = pd.read_csv("../datasets/adult-census.csv")

# Separate the target from the features. "education-num" is dropped as well
# NOTE(review): presumably because it duplicates another column -- confirm.
target = adult_census[target_name]
data = adult_census.drop(columns=[target_name, "education-num"])

# Hold out 25% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
data_train, data_test, target_train, target_test = train_test_split(
    data,
    target,
    train_size=0.75,
    random_state=42,
)

### Creating Pipeline

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

# Encode categories as integers; a category not seen during fit is mapped to
# -1 instead of raising an error.
categorical_preprocessor = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)

# Apply the encoder to object-dtype columns only and pass the remaining
# columns through unchanged. sparse_threshold=0 makes the output always dense.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "cat_preprocessor",
            categorical_preprocessor,
            selector(dtype_include=object),
        ),
    ],
    remainder="passthrough",
    sparse_threshold=0,
)

# Full predictive pipeline: preprocessing followed by the gradient-boosting
# classifier (seeded for reproducibility).
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", HistGradientBoostingClassifier(random_state=42)),
    ]
)


### Finding the best `learning_rate` and `max_leaf_nodes` parameters

In [5]:
from sklearn.model_selection import cross_validate

# Manual grid search: evaluate every (learning_rate, max_leaf_nodes) pair with
# cross-validation on the training set and remember the best-scoring pair.
#
# Start from -inf rather than 0 so that no arbitrary floor is imposed on the
# score: as soon as one combination yields a real (non-NaN) mean score,
# `best_params` is populated and the following cell can index into it.
best_score = float("-inf")
best_params = {}

for lr in [1e-2, 1e-1, 1, 10]:
    for mln in [3, 10, 30]:
        # set_params mutates the shared pipeline in place; after the loop the
        # pipeline holds the LAST combination tried, not the best one.
        model.set_params(
            classifier__learning_rate=lr,
            classifier__max_leaf_nodes=mln,
        )
        # Only the training split is used here; the test split stays untouched
        # until the final evaluation.
        cv_results = cross_validate(model, data_train, target_train, cv=2)
        scores = cv_results["test_score"]
        mean_score = scores.mean()
        print(f"lr={lr} and mln={mln}:\n"
              f"{mean_score:.3f} ± {scores.std():.3f}")
        # Strict comparison: on a tie the earliest combination is kept.
        if mean_score > best_score:
            best_score = mean_score
            best_params = {'learning-rate': lr, 'max leaf nodes': mln}
print(f"Best score: {best_score:.3f} \nBest parameters: {best_params}!")
            

lr=0.01 and mln=3:
0.797 ± 0.001
lr=0.01 and mln=10:
0.818 ± 0.001
lr=0.01 and mln=30:
0.843 ± 0.003
lr=0.1 and mln=3:
0.853 ± 0.001
lr=0.1 and mln=10:
0.866 ± 0.000
lr=0.1 and mln=30:
0.868 ± 0.001
lr=1 and mln=3:
0.859 ± 0.000
lr=1 and mln=10:
0.861 ± 0.002
lr=1 and mln=30:
0.856 ± 0.000
lr=10 and mln=3:
0.283 ± 0.004
lr=10 and mln=10:
0.263 ± 0.006
lr=10 and mln=30:
0.288 ± 0.051
Best score: 0.868 
Best parameters: {'learning-rate': 0.1, 'max leaf nodes': 30}!



### Refitting the model with the best parameters and computing the score on the test set


In [7]:
# Configure the pipeline with the best combination found by the search and
# refit it on the whole training set (set_params returns the estimator itself,
# so the calls can be chained).
model.set_params(
    classifier__learning_rate=best_params['learning-rate'],
    classifier__max_leaf_nodes=best_params['max leaf nodes'],
).fit(data_train, target_train)

# Score on the held-out test set, which was not used by the search above.
test_score = model.score(data_test, target_test)

print(f"Test score after the parameter tuning: {test_score:.3f}")

Test score after the parameter tuning: 0.879
