In [None]:
class HashableDict(dict):
    """A ``dict`` that can be hashed, so instances can be used where hashable values are required."""

    def __hash__(self):
        # Sort the (key, value) pairs so that insertion order does not influence the hash.
        ordered_pairs = sorted(self.items())
        return hash(tuple(ordered_pairs))

## search space
# Keys use the `<step>__<param>` form, so this presumably targets a pipeline
# step named `clf` -- TODO confirm against the pipeline definition.
sgd_search_params = {
    'clf__loss': Categorical(['log', 'modified_huber']),
    # The class-weight dicts are wrapped in HashableDict because plain dicts are
    # unhashable (see the issue linked below).
    # NOTE(review): both candidates give class 1 a weight of 0 -- confirm this is intended.
    'clf__class_weight': Categorical([
        HashableDict({0: 0.3, 1: 0}),
        HashableDict({0: 0.8, 1: 0}),
    ]),
}
##https://github.com/scikit-optimize/scikit-optimize/issues/681#issuecomment-781879964

# Balanced Random Forest Classifier

In [None]:
#Balanced Random Forest Classifier
%%time
from skopt import BayesSearchCV
from imblearn.ensemble import BalancedRandomForestClassifier
from skopt.space import Real, Integer, Categorical
 
from collections import OrderedDict

params = [{
    'n_estimators': Integer(10,1500),
    'criterion': Categorical(['gini', 'entropy']),
    'max_depth': Integer(6,7),
    'min_samples_split': Real(0.001, 0.99, 'log-uniform'), 
    'min_samples_leaf': Integer(1,10), 
    'max_features': Categorical(['sqrt', 'log2']),
    'ccp_alpha': Real(0.0, 0.035, 'log-uniform'), 
    'bootstrap': Categorical([True]), 
    'replacement': Categorical([True]),
    'class_weight': Categorical(['balanced', 'balanced_subsample']), 
    'max_samples': Real(0.1, 0.999, 'log-uniform')
}]

bayes_search = BayesSearchCV(estimator = BalancedRandomForestClassifier(), 
                             search_spaces = params, 
                             cv = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True), 
                             scoring = 'f1', 
                             n_jobs = -1) #To use all processors

tuning = 1

while tuning <=5:
     
    bayes_search.fit(X_train_pars_df, y_train_df)
    best_accuracy = bayes_search.best_score_
    best_param = bayes_search.best_params_
    print(f'tuning: {tuning},  
    print(f'Best Accuracy: {best_accuracy*100} %')
    print(f'Best Parameters: {best_param}')
    print()
    tuning += 1

In [None]:
%%time
from sklearn.model_selection import cross_val_score
from imblearn.ensemble import BalancedRandomForestClassifier

from skopt import gp_minimize
from skopt.plots import plot_convergence
from skopt.space import Real, Integer, Categorical #Real = float
from skopt.utils import use_named_args

 

# Dimensions searched by gp_minimize; every `name` is the estimator keyword
# that the sampled value is bound to inside objective().
param_grid = [
    Integer(low=10, high=1500, name="n_estimators"),
    Categorical(categories=['gini', 'entropy'], name="criterion"),
    Integer(low=1, high=7, name="max_depth"),
    Real(low=0.0001, high=0.999, name="min_samples_split"),
    Integer(low=1, high=30, name="min_samples_leaf"),
    Categorical(categories=['sqrt', 'log2'], name="max_features"),
    Real(low=0.0001, high=0.035, name="ccp_alpha"),
    Categorical(categories=[True, False], name="bootstrap"),
    Categorical(categories=[True, False], name="replacement"),
    Categorical(categories=['balanced', 'balanced_subsample'], name="class_weight"),
    Real(low=0.1, high=0.999, name="max_samples"),
]

# One estimator instance that objective() re-parameterises on every call.
model = BalancedRandomForestClassifier()

@use_named_args(param_grid)
def objective(**params):
    """Score one candidate point of ``param_grid`` for ``gp_minimize``.

    Parameters
    ----------
    **params
        Hyper-parameter values keyed by the dimension names of ``param_grid``
        (supplied by ``use_named_args``).

    Returns
    -------
    float
        The negated mean F1 over a 5-fold stratified cross-validation; negated
        because ``gp_minimize`` minimises its objective.
    """
    # scikit-learn forests raise a ValueError when `max_samples` is set while
    # `bootstrap=False`; the failed folds would score NaN and break the
    # optimiser. Without bootstrapping the value is unused anyway, so reset it.
    # NOTE(review): BalancedRandomForestClassifier is assumed to follow the same
    # rule -- confirm for the installed imbalanced-learn version.
    if not params.get("bootstrap", True):
        params["max_samples"] = None

    # model with new parameters
    model.set_params(**params)

    # optimization function (hyperparam response function)
    fold_scores = cross_val_score(
        model, X_train_pars_df, y_train_df,
        cv=StratifiedKFold(n_splits=5, random_state=seed, shuffle=True),
        n_jobs=-1, scoring='f1',
    )

    # negate because we need to minimize
    return -np.mean(fold_scores)

 

gp_ = gp_minimize(
    objective, # the objective function to minimize (negated mean CV F1)
    param_grid, # the hyperparameter space
    n_initial_points=20, # evaluations of f(x) at initial points before the surrogate is fitted
    acq_func='EI', # the acquisition function
    n_calls=100, # the total number of evaluations of f(x), initial points included
    random_state=seed,
    n_jobs = -1
)

# objective() returns the negated F1, so the sign is flipped back for reporting.
print(f'Best F1 score: {-gp_.fun*100} %')

print(f"""Best parameters:
=========================
n_estimators={gp_.x[0]}, 
criterion='{gp_.x[1]}',
max_depth={gp_.x[2]},
min_samples_split={gp_.x[3]:.3f}, 
min_samples_leaf={gp_.x[4]}, 
max_features='{gp_.x[5]}', 
ccp_alpha={gp_.x[6]:.4f}, 
bootstrap={gp_.x[7]}, 
replacement={gp_.x[8]}, 
class_weight='{gp_.x[9]}', 
max_samples={gp_.x[10]:.3f} """) 
print()

plot_convergence(gp_);

# Random Forest Classifier

In [None]:
#Random Forest
%%time
from skopt import BayesSearchCV
 from skopt.space import Real, Integer, Categorical
from collections import OrderedDict
from sklearn.ensemble import RandomForestClassifier


params = [{
    'n_estimators': Integer(10, 1500), 
    'criterion': Categorical(['gini', 'entropy']),
    'max_depth': Integer(1,7),
    'min_samples_split': Real(0.0001, 0.999),
    'min_samples_leaf': Integer(1,30),
    'max_features': Categorical(['auto', 'sqrt', 'log2']),
    'ccp_alpha': Real(0.0001, 0.035),
    'bootstrap': Categorical([True, False]),
    'class_weight': Categorical(['balanced', 'balanced_subsample'])
}]

bayes_search = BayesSearchCV(estimator = RandomForestClassifier(), 
                             search_spaces = params, 
                             cv = StratifiedKFold(n_splits=10, random_state=seed, shuffle=True), 
                             scoring = 'f1', 
                             n_jobs = -1) #To use all processors

tuning = 1

while tuning <=5:
     
    bayes_search.fit(X_train_pars, y_train)
    best_accuracy = bayes_search.best_score_
    best_param = bayes_search.best_params_
    print(f'tuning: {tuning},  
    print(f'Best Accuracy: {best_accuracy*100} %')
    print(f'Best Parameters: {best_param}')
    print()
    tuning += 1

In [None]:
%%time
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from skopt import gp_minimize
from skopt.plots import plot_convergence
from skopt.space import Real, Integer, Categorical #Real = float
from skopt.utils import use_named_args

# Dimensions searched by gp_minimize; every `name` is the estimator keyword
# that the sampled value is bound to inside objective().
param_grid = [
    Integer(low=10, high=1500, name="n_estimators"),
    Categorical(categories=['gini', 'entropy'], name="criterion"),
    Integer(low=1, high=7, name="max_depth"),
    Real(low=0.0001, high=0.999, name="min_samples_split"),
    Integer(low=1, high=30, name="min_samples_leaf"),
    Categorical(categories=['sqrt', 'log2'], name="max_features"),
    Real(low=0.0001, high=0.035, name="ccp_alpha"),
    Categorical(categories=[True, False], name="bootstrap"),
    Categorical(categories=['balanced', 'balanced_subsample'], name="class_weight"),
]

# One estimator instance that objective() re-parameterises on every call.
model = RandomForestClassifier()

@use_named_args(param_grid)
def objective(**params):
    """Score one candidate point of ``param_grid`` for ``gp_minimize``.

    Parameters
    ----------
    **params
        Hyper-parameter values keyed by the dimension names of ``param_grid``
        (supplied by ``use_named_args``).

    Returns
    -------
    float
        The negated mean F1 over a 10-fold stratified cross-validation; negated
        because ``gp_minimize`` minimises its objective.
    """
    # model with new parameters
    model.set_params(**params)

    # optimization function (hyperparam response function)
    # The training data must be passed explicitly: cross_val_score has no
    # default for X, so leaving it commented out raises a TypeError.
    fold_scores = cross_val_score(
        model, X_train_pars, y_train,
        cv=StratifiedKFold(n_splits=10, random_state=seed, shuffle=True),
        n_jobs=-1, scoring='f1',
    )

    # negate because we need to minimize
    return -np.mean(fold_scores)

 

gp_ = gp_minimize(
    objective, # the objective function to minimize (negated mean CV F1)
    param_grid, # the hyperparameter space
    n_initial_points=20, # evaluations of f(x) at initial points before the surrogate is fitted
    acq_func='EI', # the acquisition function
    n_calls=120, # the total number of evaluations of f(x), initial points included
    random_state=seed,
    n_jobs=-1,
)

# objective() returns the negated F1, so the sign is flipped back for reporting.
print(f'Best F1 score: {-gp_.fun*100} %')

print(f"""Best parameters:
=========================
n_estimators={gp_.x[0]}, 
criterion='{gp_.x[1]}',
max_depth={gp_.x[2]},
min_samples_split={gp_.x[3]:.5f}, 
min_samples_leaf={gp_.x[4]}, 
max_features='{gp_.x[5]}', 
ccp_alpha={gp_.x[6]:.5f}, 
bootstrap={gp_.x[7]}, 
class_weight='{gp_.x[8]}' """) 
print()

plot_convergence(gp_);

# ExtraTreesClassifier

In [None]:
%%time
from sklearn.model_selection import cross_val_score
from skopt import gp_minimize
from skopt.plots import plot_convergence
from skopt.space import Real, Integer, Categorical #Real = float
from skopt.utils import use_named_args

param_grid = [
    Integer(10, 2000, name="n_estimators"),
    Categorical(['gini', 'entropy'], name = "criterion"),
    Integer(1,7, name = "max_depth"),
    Real(0.0001, 0.999, name = "min_samples_split"),
    Integer(1,30, name = "min_samples_leaf"),
    Categorical(['sqrt', 'log2'], name = "max_features"),
    Real(0.0001, 0.035, name="ccp_alpha"),
    Categorical([True, False], name = "bootstrap"),
    Categorical(['balanced', 'balanced_subsample'], name = "class_weight")
]

model = ExtraTreesClassifier(n_jobs=-1)

@use_named_args(param_grid)
def objective(**params):
    """Return the negated mean ROC AUC of ``model`` for one candidate point.

    ``params`` holds the hyper-parameter values keyed by the dimension names of
    ``param_grid``. The score is negated because ``gp_minimize`` minimises.
    """
    # Apply the candidate hyper-parameters to the shared estimator.
    model.set_params(**params)

    # 10-fold stratified cross-validation, scored with ROC AUC.
    splitter = StratifiedKFold(n_splits=10, random_state=seed, shuffle=True)
    fold_auc = cross_val_score(
        model,
        X_train_pars,
        y_train,
        cv=splitter,
        n_jobs=-1,
        scoring='roc_auc',
    )

    return -np.mean(fold_auc)

 

gp_ = gp_minimize(
    objective, # the objective function to minimize (negated mean CV ROC AUC)
    param_grid, # the hyperparameter space
    n_initial_points=20, # evaluations of f(x) at initial points before the surrogate is fitted
    acq_func='EI', # the acquisition function
    n_calls=120, # the total number of evaluations of f(x), initial points included
    random_state=seed,
    n_jobs=-1,
)

# objective() returns the negated ROC AUC, so the sign is flipped back for reporting.
print(f'Best ROC AUC: {-gp_.fun*100} %')

print(f"""Best parameters:
=========================
n_estimators={gp_.x[0]}, 
criterion='{gp_.x[1]}',
max_depth={gp_.x[2]},
min_samples_split={gp_.x[3]:.5f}, 
min_samples_leaf={gp_.x[4]}, 
max_features='{gp_.x[5]}', 
ccp_alpha={gp_.x[6]:.5f}, 
bootstrap={gp_.x[7]}, 
class_weight='{gp_.x[8]}' """)
print()

plot_convergence(gp_);

# CatBoost

In [None]:
%%time
from sklearn.model_selection import cross_val_score
from skopt import gp_minimize
from skopt.plots import plot_convergence
from skopt.space import Real, Integer, Categorical #Real = float
from skopt.utils import use_named_args
 
from catboost import CatBoostClassifier

# Dimensions searched by gp_minimize; every `name` is the CatBoost keyword
# that the sampled value is bound to inside objective().
param_grid = [
    Integer(low=1, high=7, name="depth"),
    Integer(low=200, high=1000, name="iterations"),
    Real(low=0.001, high=0.4, name="learning_rate"),
    Real(low=1.0, high=100.0, name="l2_leaf_reg"),
    Integer(low=5, high=200, name="border_count"),
    Categorical(categories=['Ordered', 'Plain'], name="boosting_type"),
]

# One classifier instance that objective() re-parameterises on every call.
cat = CatBoostClassifier(
    verbose=False,
    loss_function='CrossEntropy',
    eval_metric='TotalF1',
)

@use_named_args(param_grid)
def objective(**params):
    """Score one candidate point of ``param_grid`` for ``gp_minimize``.

    Parameters
    ----------
    **params
        Hyper-parameter values keyed by the dimension names of ``param_grid``
        (supplied by ``use_named_args``).

    Returns
    -------
    float
        The negated mean F1 over a 10-fold stratified cross-validation; negated
        because ``gp_minimize`` minimises its objective.
    """
    # model with new parameters
    cat.set_params(**params)

    # optimization function (hyperparam response function)
    # The training data must be passed explicitly: cross_val_score has no
    # default for X, so leaving it commented out raises a TypeError.
    fold_scores = cross_val_score(
        cat, X_train_pars_df, y_train_df,
        cv=StratifiedKFold(n_splits=10, random_state=seed, shuffle=True),
        n_jobs=-1, scoring='f1',
    )

    # negate because we need to minimize
    return -np.mean(fold_scores)

 

gp_ = gp_minimize(
    objective, # the objective function to minimize (negated mean CV F1)
    param_grid, # the hyperparameter space
    n_initial_points=20, # evaluations of f(x) at initial points before the surrogate is fitted
    acq_func='EI', # the acquisition function
    n_calls=100, # the total number of evaluations of f(x), initial points included
    random_state=seed,
    n_jobs=-1,
)

# objective() returns the negated F1, so the sign is flipped back for reporting.
print(f'Best F1 score: {-gp_.fun*100} %')

print(f"""Best parameters:
=========================
depth={gp_.x[0]}, 
iterations={gp_.x[1]},
learning_rate={gp_.x[2]:.5f}, 
l2_leaf_reg={gp_.x[3]:.5f},
border_count={gp_.x[4]}, 
boosting_type='{gp_.x[5]}' """)
print()

plot_convergence(gp_);

# XGBoost

In [None]:
%%time

from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from xgboost import XGBClassifier
from collections import OrderedDict

# Search space explored by BayesSearchCV for XGBClassifier.
params = [{
    'booster': Categorical(['dart', 'gbtree', 'gblinear']),
    'learning_rate': Real(0.001, 0.15, prior = 'log-uniform'),
    'max_depth': Integer(3,7),
    'min_child_weight' : Integer(1,20, prior = 'log-uniform'),
    'gamma' : Real(0.001,5.0, prior = 'log-uniform'),
    'subsample': Real(0.4,0.999),
    'colsample_bytree': Real(0.3,0.999),
    'n_estimators' : Integer(10,1500),
    'base_score': Real(0.3,0.65),
    # A log-uniform prior is undefined at 0 (log(0)), so the range that starts
    # at 0 is sampled uniformly instead.
    'max_delta_step': Integer(0,10),
    'reg_alpha': Real(0.0001,2),
    'reg_lambda': Integer(1, 50)
}]

bayes_search = BayesSearchCV(estimator = XGBClassifier(use_label_encoder=False,
                                                       eval_metric = 'error',
                                                       objective = 'binary:logistic',
                                                       n_jobs = -1),
                             search_spaces = params,
                             cv = StratifiedKFold(n_splits=10, random_state=seed, shuffle = True),
                             scoring = 'roc_auc',
                             optimizer_kwargs = {'base_estimator': 'GP',
                                                 'n_initial_points' : 20,
                                                 'acq_func': 'EI',
                                                 'n_jobs': -1},
                             n_jobs = -1) #To use all processors

# A single search run; widen the range to repeat the search.
for tuning in range(1, 2):
    bayes_search.fit(X_train_pars, y_train)
    best_score = bayes_search.best_score_
    best_param = bayes_search.best_params_
    print(f'tuning: {tuning}')
    # scoring='roc_auc', so the reported value is a ROC AUC, not an accuracy
    print(f'Best ROC AUC: {best_score*100:.4f} %')
    print(f'Best Parameters: {best_param}')
    print()