In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
import matplotlib.pyplot as plt
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline

import time as tm

In [2]:
# Load the raw training table.
train = pd.read_csv('/data/temp/train.csv')

# Columns left with their parsed dtype; every other column (including
# 'survived', 'name' and 'ticket') is cast to pandas "category".
numemrical_columns = ['age', 'fare']
train = train.astype(
    {col: "category" for col in train.columns if col not in numemrical_columns}
)

# Feature table: everything except the target and the 'name'/'ticket' columns.
X = train.drop(columns=['survived', 'name', 'ticket'])
# Target values as a NumPy array.
y = train['survived'].to_numpy()

In [3]:
# One boolean per column of X, in column order: True where the column's
# dtype prints as "category".
categorical_mask = [str(X[col].dtype) == 'category' for col in X.columns]
print(categorical_mask)

[True, True, False, True, True, False, True, True]


In [4]:
# Encoder that maps categories unseen during fit to NaN at transform time.
ordinal_encoder = OrdinalEncoder(
    unknown_value=np.nan,
    handle_unknown="use_encoded_value",
)

# Preview the encoded matrix; the result is displayed, not stored.
ordinal_encoder.fit_transform(X)

In [5]:
# Gradient-boosting classifier; categorical_mask flags which input columns
# are to be handled as categorical features.
estimator = HistGradientBoostingClassifier(
    categorical_features=categorical_mask,
)

# Keep the encoded matrix in its own variable so X stays the original
# DataFrame. Rebinding X to the encoder's output made this cell
# non-idempotent and handed a nameless array to the pipeline fitted later,
# so its `feature_names_in_` attribute was missing after Restart & Run All.
X_encoded = ordinal_encoder.fit_transform(X)
estimator.fit(X_encoded, y)

In [6]:
# Chain encoder -> classifier. make_pipeline names each step after its
# lowercased class name ('ordinalencoder', 'histgradientboostingclassifier').
hist_ordinal = make_pipeline(ordinal_encoder, estimator)

In [7]:
# Show every parameter name of the pipeline, including the nested
# `<step>__<param>` names usable as grid-search keys.
hist_ordinal.get_params(deep=True).keys()

dict_keys(['memory', 'steps', 'verbose', 'ordinalencoder', 'histgradientboostingclassifier', 'ordinalencoder__categories', 'ordinalencoder__dtype', 'ordinalencoder__handle_unknown', 'ordinalencoder__unknown_value', 'histgradientboostingclassifier__categorical_features', 'histgradientboostingclassifier__early_stopping', 'histgradientboostingclassifier__l2_regularization', 'histgradientboostingclassifier__learning_rate', 'histgradientboostingclassifier__loss', 'histgradientboostingclassifier__max_bins', 'histgradientboostingclassifier__max_depth', 'histgradientboostingclassifier__max_iter', 'histgradientboostingclassifier__max_leaf_nodes', 'histgradientboostingclassifier__min_samples_leaf', 'histgradientboostingclassifier__monotonic_cst', 'histgradientboostingclassifier__n_iter_no_change', 'histgradientboostingclassifier__random_state', 'histgradientboostingclassifier__scoring', 'histgradientboostingclassifier__tol', 'histgradientboostingclassifier__validation_fraction', 'histgradientboo

Parameter reference for `HistGradientBoostingClassifier`: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html#sklearn.ensemble.HistGradientBoostingClassifier

In [8]:
# Candidate values for the classifier step of the pipeline; each keyword is
# `<step name>__<parameter>` and becomes a key of the grid dictionary.
param_grid = dict(
    histgradientboostingclassifier__learning_rate=[0.1, 0.01],
    histgradientboostingclassifier__max_depth=[2, 10],
    histgradientboostingclassifier__l2_regularization=[0.0, 0.5],
    histgradientboostingclassifier__max_leaf_nodes=[30, 50],
)

In [9]:
# Exhaustive search over param_grid for the pipeline, scored by ROC AUC
# using 5-fold cross-validation.
gbm = GridSearchCV(
    estimator=hist_ordinal,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
)

In [10]:
# Time the grid search with perf_counter: it is monotonic and meant for
# measuring elapsed intervals, whereas time.time() follows the wall clock
# and can jump if the system clock is adjusted.
start_time = tm.perf_counter()

gbm.fit(X, y)

# Elapsed seconds for the full grid search.
running_time = tm.perf_counter() - start_time

In [11]:
# Report the measured duration in seconds.
print('running time: %f seconds' % running_time)

running time: 22.470004 seconds


In [12]:
# Mean cross-validated score of the best parameter combination, in the
# metric given as `scoring` when `gbm` was built.
gbm.best_score_

0.86443268296186

In [13]:
# Keep a handle on the estimator selected by the search.
gbm_best = gbm.best_estimator_

In [14]:
# Column names seen when the selected estimator was fitted; this attribute
# only exists when the fit input carried column names (e.g. a DataFrame).
gbm_best.feature_names_in_

array(['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'cabin',
       'embarked'], dtype=object)