In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
pd.set_option('display.max_columns', None)

In [2]:
tr_features = pd.read_csv('data/train_features.csv')
tr_labels = pd.read_csv('data/train_labels.csv')

te_features = pd.read_csv('data/test_features.csv')
te_labels = pd.read_csv('data/test_labels.csv')

In [3]:
def print_results(results):
    print(f'BEST PARAMS: {results.best_params_}\n')

    means = sorted(results.cv_results_['mean_test_score'], reverse=True)
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print(f'{round(mean,3)} (+/- {round(std * 2, 3)}) for {params}')

In [4]:
# gb = GradientBoostingClassifier()
# gbscores = cross_val_score(gb, tr_features, tr_labels.values.ravel(), cv=5, n_jobs=16)

# print(gbscores)
# gb.fit(tr_features, tr_labels.values.ravel())

gb = HistGradientBoostingClassifier()
gbscores = cross_val_score(gb, tr_features, tr_labels.values.ravel(), cv=5, n_jobs=-1)

print(gbscores)
gb.fit(tr_features, tr_labels.values.ravel())

[0.37947344 0.39240107 0.38845972 0.38562195 0.38987861]


In [5]:
# gbparams = {
#     'loss': ['log_loss', 'deviance', 'exponential'],
#     'n_estimators': [10, 50],
#     'criterion': ['friedman_mse', 'squared_error'],
#     'max_features': ['sqrt', 'log2', None],
#     'warm_start': [True, False]
# }
# cv = GridSearchCV(gb, gbparams, cv=5, n_jobs=16)
# cv.fit(tr_features, tr_labels.values.ravel())

gbparams = {
    'learning_rate': [0.1,0.5,1.0],
    'max_iter': [100,200,500],
    'max_leaf_nodes': [16,31,None],
    'max_depth': [100,500,None],
    'min_samples_leaf': [20,50,100],
    'l2_regularization': [0,1]
}

cv = GridSearchCV(gb, gbparams, cv=5, n_jobs=-1)
cv.fit(tr_features, tr_labels.values.ravel())

print_results(cv)

BEST PARAMS: {'l2_regularization': 0, 'learning_rate': 0.1, 'max_depth': 100, 'max_iter': 100, 'max_leaf_nodes': 16, 'min_samples_leaf': 50}

0.396 (+/- 0.017) for {'l2_regularization': 0, 'learning_rate': 0.1, 'max_depth': 100, 'max_iter': 100, 'max_leaf_nodes': 16, 'min_samples_leaf': 20}
0.396 (+/- 0.014) for {'l2_regularization': 0, 'learning_rate': 0.1, 'max_depth': 100, 'max_iter': 100, 'max_leaf_nodes': 16, 'min_samples_leaf': 50}
0.396 (+/- 0.008) for {'l2_regularization': 0, 'learning_rate': 0.1, 'max_depth': 100, 'max_iter': 100, 'max_leaf_nodes': 16, 'min_samples_leaf': 100}
0.395 (+/- 0.01) for {'l2_regularization': 0, 'learning_rate': 0.1, 'max_depth': 100, 'max_iter': 100, 'max_leaf_nodes': 31, 'min_samples_leaf': 20}
0.395 (+/- 0.011) for {'l2_regularization': 0, 'learning_rate': 0.1, 'max_depth': 100, 'max_iter': 100, 'max_leaf_nodes': 31, 'min_samples_leaf': 50}
0.395 (+/- 0.009) for {'l2_regularization': 0, 'learning_rate': 0.1, 'max_depth': 100, 'max_iter': 100, 'max

In [7]:
# best params
gb0 = HistGradientBoostingClassifier(l2_regularization=0, learning_rate=0.1, max_depth=100, max_iter=100, max_leaf_nodes=16, min_samples_leaf=50)
gb0scores = cross_val_score(gb0, tr_features, tr_labels.values.ravel(), cv=5, n_jobs=-1)
print(gb0scores)
gb0.fit(tr_features, tr_labels.values.ravel())

gb1 = HistGradientBoostingClassifier(l2_regularization=0, learning_rate=0.1, max_depth=100, max_iter=100, max_leaf_nodes=16, min_samples_leaf=20)
gb1scores = cross_val_score(gb1, tr_features, tr_labels.values.ravel(), cv=5, n_jobs=-1)
print(gb1scores)
gb1.fit(tr_features, tr_labels.values.ravel())

[0.39303169 0.40091439 0.393347   0.39650008 0.38467602]
[0.39050922 0.39965316 0.38972095 0.39240107 0.39129749]


In [8]:
for mdl in [gb, gb0, gb1]:
    y_pred = mdl.predict(te_features)
    accuracy = round(accuracy_score(te_labels, y_pred), 8)
    precision = round(precision_score(te_labels, y_pred, average='weighted'), 8)
    recall = round(recall_score(te_labels, y_pred, average='weighted'), 8)
    f1 = round(f1_score(te_labels, y_pred, average='weighted'), 8)
    print(f'MAX DEPTH: {mdl.max_depth} / MAX LEAF NODES: {mdl.max_leaf_nodes} / A: {accuracy} / P: {precision} / R: {recall} / F1: {f1}')

MAX DEPTH: None / MAX LEAF NODES: 31 / A: 0.39273553 / P: 0.38125917 / R: 0.39273553 / F1: 0.38058234
MAX DEPTH: 100 / MAX LEAF NODES: 16 / A: 0.39324001 / P: 0.38238662 / R: 0.39324001 / F1: 0.38169339
MAX DEPTH: 100 / MAX LEAF NODES: 16 / A: 0.39689747 / P: 0.38550849 / R: 0.39689747 / F1: 0.38332466
