In [None]:
import optuna
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from optuna.samplers import TPESampler
from catboost import CatBoostClassifier
from optuna.pruners import BasePruner
import time
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler


from optuna.integration import TFKerasPruningCallback
import keras
from keras.layers import Dense
from keras import Input
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.layers import Dropout
from keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from optuna.trial import TrialState

In [None]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Partition the columns of ``dataframe`` by dtype and cardinality.

    A short summary (row/column counts and the size of every group) is
    printed as a side effect.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 10
        A non-object column with fewer than ``cat_th`` distinct values is
        treated as categorical ("numeric but categorical").
    car_th : int, default 20
        An object column with more than ``car_th`` distinct values is treated
        as high-cardinality ("categorical but cardinal").

    Returns
    -------
    cat_cols : list
        Object columns followed by the numeric-but-categorical columns, with
        the high-cardinality object columns removed.
    num_cols : list
        Non-object columns that are not numeric-but-categorical.
    cat_but_car : list
        High-cardinality object columns.
    """
    # First pass: separate object-dtype columns from everything else,
    # keeping the frame's column order inside each bucket.
    object_cols, other_cols = [], []
    for name in dataframe.columns:
        bucket = object_cols if dataframe[name].dtypes == "O" else other_cols
        bucket.append(name)

    # Distinct-value count per column, looked up by the threshold checks below.
    distinct = {name: dataframe[name].nunique() for name in dataframe.columns}

    num_but_cat = [name for name in other_cols if distinct[name] < cat_th]
    cat_but_car = [name for name in object_cols if distinct[name] > car_th]

    cat_cols = [
        name for name in object_cols + num_but_cat if name not in cat_but_car
    ]
    num_cols = [name for name in other_cols if name not in num_but_cat]

    summary_lines = (
        f"Observations: {dataframe.shape[0]}",
        f"Variables: {dataframe.shape[1]}",
        f"cat_cols: {len(cat_cols)}",
        f"num_cols: {len(num_cols)}",
        f"cat_but_car: {len(cat_but_car)}",
        f"num_but_cat: {len(num_but_cat)}",
    )
    for line in summary_lines:
        print(line)

    return cat_cols, num_cols, cat_but_car

In [None]:
# Define custom transformer
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the configured columns of its input.

    Parameters
    ----------
    columns
        Column key(s) used to index the input in :meth:`transform`.
    """

    def __init__(self, columns):
        # Kept exactly as given; it is only read back in ``transform``.
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by ``self.columns``."""
        selected = X[self.columns]
        return selected

In [None]:
class TimoutPruner(BasePruner):
    """Optuna pruner that stops a trial once it has been running too long.

    The clock for a trial starts the first time the pruner is consulted for
    that trial; the trial is pruned on any later check that happens more than
    ``max_sec_per_trial`` seconds after that first check.

    Start times are tracked per ``(study name, trial number)``. A single
    shared timestamp would be wrong whenever a trial's first reported step is
    not ``0``: the check would then either fail on a missing attribute or
    compare against the previous trial's start time. It would also be wrong
    when one pruner instance is reused across several studies, whose trial
    numbers overlap.

    Parameters
    ----------
    max_sec_per_trial : int or float, default 2
        Allowed number of seconds between the first check of a trial and any
        later check.
    """

    def __init__(self, max_sec_per_trial=2):
        self._max_sec_per_trial = max_sec_per_trial
        # (study name, trial number) -> monotonic timestamp of the first check.
        self._start_times = {}

    def prune(self, study, trial) -> bool:
        """Return ``True`` when ``trial`` exceeded its time budget.

        The first call for a given trial only records the start time and
        always returns ``False``.
        """
        key = (study.study_name, trial.number)
        now = time.monotonic()

        if key not in self._start_times:
            # First check for this trial: start its clock.
            self._start_times[key] = now
            return False

        if now - self._start_times[key] > self._max_sec_per_trial:
            print(f"This trial takes more than {self._max_sec_per_trial} seconds.")
            # The trial is finished from the pruner's point of view.
            self._start_times.pop(key, None)
            return True

        return False


pruner_timeout = TimoutPruner(max_sec_per_trial=900)

In [None]:
def create_model(trial, input_dim=None):
    """Build an (uncompiled) MLP whose architecture is sampled from ``trial``.

    The number of hidden layers, the units of every hidden layer and the
    dropout rate after every hidden layer are suggested by ``trial``. The
    network ends in a single sigmoid unit.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_layers``, ``n_units_l{i}`` and
        ``dropout_l{i}``.
    input_dim : int, optional
        Number of input features. When omitted, the width of the
        notebook-level ``X`` is used, which is what this function did before
        the parameter existed.

    Returns
    -------
    keras.models.Sequential
        The assembled model; compiling it is left to the caller.
    """
    if input_dim is None:
        # Backward-compatible fallback to the notebook-level feature frame.
        input_dim = X.shape[1]

    n_layers = trial.suggest_int("n_layers", 1, 4)

    model = Sequential()
    model.add(Input(shape=(input_dim,)))
    for i in range(n_layers):
        num_hidden = trial.suggest_int("n_units_l{}".format(i), 4, 256, log=True)
        model.add(Dense(num_hidden, activation="relu"))
        dropout = trial.suggest_float("dropout_l{}".format(i), 0.2, 0.5)
        model.add(Dropout(rate=dropout))
    model.add(Dense(1, activation="sigmoid"))

    return model

In [None]:
# Optuna objective shared by all model families (numeric columns are scaled
# with ``scaler_m``, RobustScaler by default).
def _sample_estimator(trial, model_name):
    """Sample hyperparameters for ``model_name``; return ``(step_name, estimator)``."""
    if model_name == "RandomForestClassifier":
        param = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 10000, step=100),
            "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
            "max_depth": trial.suggest_int("max_depth", 1, 24),
            # scikit-learn rejects max_leaf_nodes < 2, so the range starts at 2.
            "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 2, 24),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 12),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 12),
            "n_jobs": -1,
        }
        return "RF", RandomForestClassifier(**param)

    if model_name == "LGBMClassifier":
        param = {
            "objective": "binary",
            "verbosity": -1,
            "boosting_type": "gbdt",
            "n_jobs": -1,
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 10.0, step=0.01),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 10.0, step=0.01),
            "num_leaves": trial.suggest_int("num_leaves", 2, 150),
            "max_depth": trial.suggest_int("max_depth", 1, 48),
            "learning_rate": trial.suggest_float("learning_rate", 0.001, 1, step=0.01),
            "n_estimators": trial.suggest_int("n_estimators", 100, 5000, step=200),
            "min_split_gain": trial.suggest_float("min_split_gain", 0.001, 1, step=0.1),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.01, 0.5, log=True),
        }
        return "LGBM", LGBMClassifier(**param)

    if model_name == "CatBoostClassifier":
        # NOTE(review): CatBoost appears to tie `num_leaves` / `min_child_samples`
        # to specific grow policies and to cap `depth` well below 34 -- confirm
        # this search space is accepted by the installed version.
        param = {
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.5, log=True),
            "depth": trial.suggest_int("depth", 1, 34),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 10.0, step=0.01),
            "num_leaves": trial.suggest_int("num_leaves", 2, 32),
            "n_estimators": trial.suggest_int("n_estimators", 100, 5000, step=400),
            "learning_rate": trial.suggest_float("learning_rate", 0.001, 1, step=0.01),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 55),
            "eval_metric": "AUC",
            # `ratio` is read from the notebook scope; it is not defined in
            # this function -- presumably a class-imbalance ratio, verify.
            "scale_pos_weight": ratio,
            "early_stopping_rounds": 100,
        }
        return "CB", CatBoostClassifier(**param)

    raise ValueError(
        f"Unknown model_name {model_name!r}; expected one of "
        "'RandomForestClassifier', 'LGBMClassifier', 'CatBoostClassifier', 'Keras'."
    )


def _keras_cv_auc(trial, X, y, numerical, categorical, scaler_m, cv):
    """Cross-validate the Keras MLP from ``create_model`` and return its mean validation AUC."""
    # Work on a copy so the caller's frame is never rescaled in place.
    X = X.copy()
    # NOTE(review): the scaler is fitted on all rows before the folds are
    # split (as before), so validation rows influence the scaling.
    if len(numerical) > 0:
        X[numerical] = scaler_m.fit_transform(X[numerical])
    if len(categorical) > 0:
        X[categorical] = scaler_m.fit_transform(X[categorical])

    metric_name = "AUC_Watcher"
    monitor = "val_" + metric_name

    # Sampled before the architecture, matching the original suggestion order.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)

    fold_scores = []
    for train_idx, val_idx in cv.split(X, y):
        X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]
        X_val_fold, y_val_fold = X.iloc[val_idx], y.iloc[val_idx]

        # A fresh model, optimizer, metric and callback per fold: the model
        # has to exist before it can be compiled, and none of these objects
        # should carry state over from the previous fold.
        model = create_model(trial)
        model.compile(
            loss="binary_crossentropy",
            optimizer=Adam(learning_rate=learning_rate),
            metrics=[tf.keras.metrics.AUC(name=metric_name), "accuracy"],
        )

        # The former ModelCheckpoint(filepath='') was dropped: it had no
        # usable target path and the saved model was never read back.
        # The pruning callback checks the pruning condition every epoch.
        history = model.fit(
            X_train_fold,
            y_train_fold,
            batch_size=64,
            callbacks=[TFKerasPruningCallback(trial, monitor)],
            epochs=50,
            validation_data=(X_val_fold, y_val_fold),
            verbose=1,
        )

        # Per-fold score: validation AUC averaged over all epochs.
        fold_scores.append(np.mean(history.history[monitor]))

    return float(np.mean(fold_scores))


def objective_optimize(trial, model_name= "RandomForestClassifier", scoring = "f1_weighted", timeout = False, cat_cols = [], num_cols =[],
                      X=[], y=[], scaler_m = RobustScaler()):
    """Optuna objective: sample hyperparameters and return a cross-validated score.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial that supplies the hyperparameter suggestions.
    model_name : str
        One of ``"RandomForestClassifier"``, ``"LGBMClassifier"``,
        ``"CatBoostClassifier"`` or ``"Keras"``.
    scoring : str
        Scoring name handed to ``cross_val_score`` (not used for Keras, which
        is scored by validation AUC).
    timeout : bool
        When true, the score is computed and reported to the trial in five
        steps so that the study's pruner gets a chance to stop the trial.
    cat_cols, num_cols : list
        Column names passed through unchanged / passed through ``scaler_m``.
        The default lists are never mutated here.
    X, y
        Features and target; indexed with ``.iloc`` in the Keras branch.
    scaler_m
        Scaler applied to ``num_cols`` (and, in the Keras branch, also to
        ``cat_cols``).

    Returns
    -------
    float
        Mean 5-fold score.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    optuna.TrialPruned
        If the pruner asks for the trial to be stopped.
    """
    categorical = cat_cols
    numerical = num_cols

    cv_outer = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

    if model_name == "Keras":
        return _keras_cv_auc(trial, X, y, numerical, categorical, scaler_m, cv_outer)

    step_name, estimator = _sample_estimator(trial, model_name)

    # Categorical columns are only selected; numerical ones are also scaled.
    cat_pipe = Pipeline([
        ('selector', ColumnSelector(categorical)),
    ])
    num_pipe_rb = Pipeline([
        ('selector', ColumnSelector(numerical)),
        ('scaler', scaler_m),
    ])
    preprocessor = FeatureUnion([
        ('cat', cat_pipe),
        ('num', num_pipe_rb),
    ])
    pipe = Pipeline([
        ('feature_union', preprocessor),
        (step_name, estimator),
    ])

    if timeout:
        for step in range(5):
            # NOTE(review): sleeps `trial.number` seconds before every step;
            # this looks like a leftover from a pruner demo -- confirm it is wanted.
            time.sleep(trial.number)
            cvs_best = cross_val_score(pipe, X, y, cv=cv_outer, scoring=scoring).mean()
            trial.report(cvs_best, step)
            if trial.should_prune():
                raise optuna.TrialPruned()
    else:
        cvs_best = cross_val_score(pipe, X, y, cv=cv_outer, scoring=scoring).mean()

    return cvs_best

In [None]:
%%time
# Result accumulators filled by the tuning cells below:
# model labels, their best scores, and the best hyperparameters per label.
models, scores = [], []
hyperparams = dict()

In [None]:
# Nested cross-validation for LightGBM: hyperparameters are tuned on the
# training part of every outer fold and evaluated on its held-out part.
cv_outer_split = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
nested_score = []

# Loop-invariant: keep Optuna's per-trial logging quiet for every study.
optuna.logging.set_verbosity(optuna.logging.WARNING)

for train_index, test_index in cv_outer_split.split(X, y):
    # split data into train and test sets
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Objective for this fold. The search only sees the training rows;
    # passing the full X/y here would leak the held-out fold into tuning.
    def func(trial, X_inner=X_train, y_inner=y_train):
        return objective_optimize(
            trial,
            model_name="LGBMClassifier",
            timeout=True,
            cat_cols=cat_cols,
            num_cols=num_cols,
            X=X_inner,
            y=y_inner,
        )

    study = optuna.create_study(direction='maximize',
                                pruner=pruner_timeout,
                                sampler=TPESampler())
    study.optimize(func, n_trials=1, timeout=20000, show_progress_bar=True)

    # NOTE(review): with n_trials=1, a pruned trial presumably leaves the
    # study without a best trial -- confirm how that case should be handled.
    # initialize LightGBM model with the optimized hyperparameters
    LGBM_model = LGBMClassifier(**study.best_params)

    # fit the tuned model on the outer training fold
    LGBM_model.fit(X_train, y_train)

    # score the tuned model on the outer held-out fold
    nested_score.append(LGBM_model.score(X_test, y_test))

# print mean score and standard deviation
print("Nested CV score: %.4f +/- %.4f" % (np.mean(nested_score), np.std(nested_score)))

# Only the study of the last outer fold is recorded below.
model = 'LGBM'
score = study.best_trial.value
models.append(model)
scores.append(score)
hyperparams[model] = study.best_trial.params