Tutaj pokazałem przykład z wykorzystaniem API kompatybilnego ze scikit-learn.

In [3]:
# inicjacja bibliotek oraz przestrzeni nazw
import pandas as pd
import numpy as np
import os

# Raw string: keeps the Windows backslashes literal and avoids the
# invalid-escape-sequence warning that "\p" / "\L" trigger in a normal string.
data_path = r"E:\page_ds\LightGBM_step_by_step"
pict_path = os.path.join(data_path, r"foto")

MainData = pd.read_csv(os.path.join(data_path, "train.csv"))

# Split into the feature matrix and the target variable.
y = MainData['target'].copy()
X = MainData.drop(columns=['id','target']).copy()

# Names of the non-numeric (categorical) feature columns.
# select_dtypes understands the abstract np.number hierarchy, whereas comparing
# each dtype to np.number with != does not reliably separate numeric columns.
X_cat_col = X.select_dtypes(exclude=np.number).columns.tolist()

# Cast text columns to the pandas 'category' dtype before they are passed to
# the LightGBM estimator (already-categorical columns are left as categories).
for c in X.columns:
    col_type = X[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        X[c] = X[c].astype('category')

Następnie definiujemy przestrzeń parametrów i przekazujemy ją do modelu. Tym razem nie dzielimy zbioru na część treningową i walidacyjną, tylko korzystamy z całego dostępnego zestawu danych, podzielonego w walidacji krzyżowej na 5 podzbiorów.

In [1]:
from optuna.integration import LightGBMPruningCallback
import optuna
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.metrics import accuracy_score # dane są nisko niezbalansowane przez co powinniśmy rozpatrzyć losowanie warstwowe
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

def objective(trial, X, y, n_splits=5):
    """Optuna objective: mean cross-validated log-loss of an LGBMClassifier.

    For the hyper-parameters suggested by ``trial`` a fresh classifier is
    trained on every fold of a shuffled, stratified K-fold split and scored
    with ``log_loss`` on the held-out fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyper-parameters and to report pruning signals.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : array-like
        Binary target aligned row-by-row with ``X``.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    float
        Mean held-out log-loss over all folds (lower is better).
    """
    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "n_estimators": trial.suggest_categorical("n_estimators", [1000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.9, step=0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.9, step=0.1
        ),
    }

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=128262)

    # The fold indices produced by the splitter are positional, so index the
    # target positionally too; y[train_idx] on a Series would be label-based
    # and only correct for a default RangeIndex.
    y_values = np.asarray(y)

    cv_scores = np.empty(n_splits)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y_values)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y_values[train_idx], y_values[test_idx]

        model = lgb.LGBMClassifier(objective="binary", **param_grid)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="binary_logloss",
            callbacks=[
                # Early stopping is passed as a callback because the
                # `early_stopping_rounds` fit argument was removed in LightGBM 4.0.
                lgb.early_stopping(stopping_rounds=100),
                # Lets Optuna abort unpromising trials based on the eval metric.
                LightGBMPruningCallback(trial, "binary_logloss"),
            ],
        )
        preds = model.predict_proba(X_test)
        cv_scores[idx] = log_loss(y_test, preds)

    return np.mean(cv_scores)

I trenujemy — dla przyśpieszenia ograniczamy się do 100 prób (n_trials=100).

In [None]:
def func(trial):
    """Score a single Optuna trial on the full feature matrix X and target y."""
    return objective(trial, X, y)


# Lower log-loss is better, hence the "minimize" direction.
study = optuna.create_study(study_name="LGBM Classifier", direction="minimize")
study.optimize(func, n_trials=100)









[32m[I 2022-12-01 00:30:07,985][0m Trial 32 finished with value: 0.3524027987298371 and parameters: {'n_estimators': 1000, 'learning_rate': 0.23030037386372618, 'num_leaves': 1940, 'max_depth': 6, 'min_data_in_leaf': 4500, 'lambda_l1': 20, 'lambda_l2': 100, 'min_gain_to_split': 1.414301748884053, 'bagging_fraction': 0.4, 'bagging_freq': 1, 'feature_fraction': 0.9}. Best is trial 1 with value: 0.3507605730290229.[0m




[32m[I 2022-12-01 00:30:10,392][0m Trial 33 pruned. Trial was pruned at iteration 49.[0m
[32m[I 2022-12-01 00:30:11,569][0m Trial 34 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-12-01 00:30:12,728][0m Trial 35 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-12-01 00:30:16,021][0m Trial 36 pruned. Trial was pruned at iteration 91.[0m
[32m[I 2022-12-01 00:30:18,581][0m Trial 37 pruned. Trial was pruned at iteration 47.[0m
[32m[I 2022-12-01 00:30:19,830][0m Trial 38 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-12-01 00:30:21,793][0m Trial 39 pruned. Trial was pruned at iteration 34.[0m
[32m[I 2022-12-01 00:30:24,726][0m Trial 40 pruned. Trial was pruned at iteration 97.[0m
[32m[I 2022-12-01 00:30:25,972][0m Trial 41 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-12-01 00:30:27,186][0m Trial 42 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2022-12-01 00:30:28,464][0m Trial 43 pruned. Trial was pruned at iteration 0













In [13]:
# Header followed by one tab-indented "name: value" line per tuned parameter.
print(
    "Best params:",
    *(f"\t{name}: {setting}" for name, setting in study.best_params.items()),
    sep="\n",
)

Best params:
	n_estimators: 10000
	learning_rate: 0.24098597973299637
	num_leaves: 2960
	max_depth: 5
	min_data_in_leaf: 8400
	lambda_l1: 30
	lambda_l2: 25
	min_gain_to_split: 0.2839263800263714
	bagging_fraction: 0.9
	bagging_freq: 1
	feature_fraction: 0.7


In [None]:
import matplotlib.pyplot as plt
import lightgbm as lgb
# dane są nisko niezbalansowane przez co powinniśmy rozpatrzyć losowanie warstwowe
from sklearn.metrics import accuracy_score
#from sklearn.metrics import plot_roc_curve
from sklearn.metrics import RocCurveDisplay
#from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Split the data into a training and a hold-out part (80/20, shuffled, fixed seed).
# The split ratio can itself affect model quality, so it could be treated as one
# more hyper-parameter to tune.
# NOTE(review): the split is not stratified although the classes are imbalanced —
# consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle=True, random_state=128262)

# Model with hand-picked parameters.
# TODO: use the parameters found by Optuna (study.best_params) instead.
# NOTE: the variable keeps its historical name `xgb_clf`, but this is a LightGBM model.
# 'binary' / 'gbdt' are the LightGBM spellings; the XGBoost-style values
# 'binary:logistic' / booster='gbtree' are not valid for LGBMClassifier.
xgb_clf = lgb.LGBMClassifier(max_depth=3,
                            n_estimators=100,
                            objective='binary',
                            boosting_type='gbdt',
                            n_jobs=-1,
                            random_state=1)

# Fit through the scikit-learn compatible API.
xgb_clf.fit(X_train, y_train)

# Score the hold-out set to evaluate the model with the metrics of interest.
y_hat = xgb_clf.predict(X_test)

auc_val = roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:, 1])
auc_train = roc_auc_score(y_train, xgb_clf.predict_proba(X_train)[:, 1])

print("Accuracy for validation set: {0:.4f}".format(accuracy_score(y_test, y_hat)))
print("Accuracy for train set: {0:.4f}".format(accuracy_score(y_train, xgb_clf.predict(X_train))))

print("Wartość parametru Gini na zbiorze walidacyjnym to: {0:.4f}".format(2*auc_val-1)) # Gini = 2*AUC-1
print("Wartość parametru Gini na zbiorze treningowym to: {0:.4f}".format(2*auc_train-1)) # Gini = 2*AUC-1

print("\nROC Curve")
# The display object draws the figure itself; printing it would only emit its repr.
RocCurveDisplay.from_estimator(xgb_clf, X_test, y_test)
plt.show()

print("\nConfusion Matrix")
cm = confusion_matrix(y_test, y_hat, labels=xgb_clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=xgb_clf.classes_)
disp.plot()
plt.show()