In [1]:
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn import datasets
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
import numpy as np
import optuna
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### XGBoost Optuna with feature selection of corrleation 0.15

In [14]:
fetal = pd.read_csv("fetal_health.csv")
corr = fetal.corr()
X = fetal[corr[abs(corr['fetal_health']) > 0.15]['fetal_health'].index]
X = X.drop(['fetal_health'], axis=1).values
y = fetal['fetal_health'].values.ravel().astype(int) - 1

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.3,
                                                    random_state=123,
                                                    stratify=y)

X_train_sub, X_valid, y_train_sub, y_valid = \
    train_test_split(X_train, y_train, test_size=0.2, random_state=123, stratify=y_train)

In [16]:
print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0], y_test.shape[0])

Train/Valid/Test sizes: 1488 298 638


### Optuna

In [17]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
import numpy as np
import optuna


def optimization_objective(trial, X_train, y_train, cv=5):

    
    params =  {
            "n_estimators": trial.suggest_categorical("n_estimators", [30, 50, 100, 300, 500, 1000]),
            "learning_rate": trial.suggest_categorical("learning_rate", [0.01, 0.03, 0.05, 0.1]),
            "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
            "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
    }
    

    cv_iterator = StratifiedKFold(n_splits=cv, shuffle=True, random_state=123)

    cv_scores = np.zeros(cv)
    for idx, (train_sub_idx, valid_idx) in enumerate(cv_iterator.split(X_train, y_train)):
        
        X_train_sub, X_valid = X_train[train_sub_idx], X_train[valid_idx]
        y_train_sub, y_valid = y_train[train_sub_idx], y_train[valid_idx]
        

        model = XGBClassifier(**params, random_state=123, use_label_encoder=False)
        
        model.fit(
            X_train_sub,
            y_train_sub,
            verbose=False,
            eval_set=[(X_valid, y_valid)],
            eval_metric="auc",
            early_stopping_rounds=100,
        )
        
        preds = model.score(X_valid, y_valid)
        
        cv_scores[idx] = preds

    return np.mean(cv_scores)

In [18]:
study = optuna.create_study(direction="maximize", study_name="XGBoost Classifier")

def func(trial):
    return optimization_objective(trial, X_train, y_train)

study.optimize(func, n_trials=50);





print(f"Best CV accuracy: {study.best_value:.5f}")
print("Best params:")
for key, value in study.best_params.items():
    print(f"\t{key}: {value}")

[32m[I 2021-11-22 17:30:26,171][0m A new study created in memory with name: XGBoost Classifier[0m
[32m[I 2021-11-22 17:30:28,076][0m Trial 0 finished with value: 0.9334756965629449 and parameters: {'n_estimators': 50, 'learning_rate': 0.05, 'lambda': 0.07228467045990493, 'alpha': 0.6366909936260661}. Best is trial 0 with value: 0.9334756965629449.[0m
[32m[I 2021-11-22 17:30:34,640][0m Trial 1 finished with value: 0.9395227442207309 and parameters: {'n_estimators': 500, 'learning_rate': 0.03, 'lambda': 0.0006993359873520448, 'alpha': 1.5246526756953655e-08}. Best is trial 1 with value: 0.9395227442207309.[0m
[32m[I 2021-11-22 17:30:40,701][0m Trial 2 finished with value: 0.9408672858337287 and parameters: {'n_estimators': 300, 'learning_rate': 0.03, 'lambda': 0.0003199567248724319, 'alpha': 2.40623488649738e-07}. Best is trial 2 with value: 0.9408672858337287.[0m
[32m[I 2021-11-22 17:30:43,434][0m Trial 3 finished with value: 0.9422163469143335 and parameters: {'n_estimato

Best CV accuracy: 0.94692
Best params:
	n_estimators: 300
	learning_rate: 0.05
	lambda: 0.9790017216708068
	alpha: 0.13228211190131656


In [19]:
model = XGBClassifier(**study.best_params, random_state=123, use_label_encoder=False)
model.fit(
    X_train,
    y_train,
    verbose=False,
    eval_set=[(X_test, y_test)],
    eval_metric="auc",
    early_stopping_rounds=100,
)



print(f"Training Accuracy: {model.score(X_train, y_train)*100:0.3f}%")
print(f"Test Accuracy: {model.score(X_test, y_test)*100:0.3f}%")

Training Accuracy: 100.000%
Test Accuracy: 94.514%
