# CatBoost

In [None]:
%%time
import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Optuna objective: mean cross-validated ROC AUC of a CatBoostClassifier.

    Samples one CatBoost configuration from ``trial``, evaluates it with
    10-fold stratified cross-validation and returns the mean fold score.

    Args:
        trial: Optuna trial used to draw the ``cat_*`` hyper-parameters.

    Returns:
        Mean of the per-fold ``roc_auc`` scores.
    """
    # Sampling order is kept stable; keys are the CatBoost keyword names.
    cat_params = {
        "iterations": trial.suggest_int("cat_iterations", 10, 2500),
        "depth": trial.suggest_int("cat_depth", 1, 7),
        "learning_rate": trial.suggest_float("cat_learning_rate", 0.0001, 0.4),
        "boosting_type": trial.suggest_categorical("cat_boosting_type", ['Ordered', 'Plain']),
        "l2_leaf_reg": trial.suggest_float("cat_l2_leaf_reg", 0.1, 100.0),
        "border_count": trial.suggest_int("cat_border_count", 5, 200),
    }

    model = CatBoostClassifier(verbose=False,
                               loss_function='CrossEntropy',
                               eval_metric='TotalF1',
                               random_seed=seed,
                               **cat_params)

    # The data arguments had been commented out, which left cross_val_score
    # without its required X argument (TypeError on every trial).
    # NOTE(review): X_train / y_train are taken from the commented-out lines;
    # confirm these are the intended training frames for CatBoost.
    cv_score = cross_val_score(model,
                               X_train,
                               y_train,
                               cv=StratifiedKFold(n_splits=10, random_state=seed, shuffle=True),
                               scoring='roc_auc')
    return cv_score.mean()

# Maximise the objective with a CMA-ES sampler (independent-sampling warning off).
# Alternative sampler left for reference: TPESampler(seed = seed, multivariate = True)
study = optuna.create_study(
    sampler=optuna.samplers.CmaEsSampler(warn_independent_sampling=False),
    direction="maximize",
)
study.optimize(objective, n_trials=120, n_jobs=-1, show_progress_bar=True)

# Report the best trial found.
print()
print(f'Best Score: {study.best_value*100:.3f} %')
print()
print('Best Params:')
print(study.best_params)

# Plot the trial values sorted in ascending order against their rank.
results = study.trials_dataframe()
results['value'].sort_values().reset_index(drop=True).plot()
plt.title('Convergence plot')
plt.xlabel('Iteration')
plt.ylabel('Score')

# Drop the "cat_" tag so the keys match the keyword names used to build the model.
chosen_param = {
    name.replace("cat_", ""): value for name, value in study.best_params.items()
}
print()
print(chosen_param)

# Regularized Greedy Forest (RGF)

In [None]:
%%time
import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Optuna objective: mean cross-validated ROC AUC of an RGFClassifier.

    Samples one RGF configuration from ``trial``, evaluates it with 10-fold
    stratified cross-validation and returns the mean fold score.

    Args:
        trial: Optuna trial used to draw the ``rgf_*`` hyper-parameters.

    Returns:
        Mean of the per-fold ``roc_auc`` scores.
    """
    # Sampling order is kept stable; keys are the RGFClassifier keyword names.
    rgf_params = {
        "algorithm": trial.suggest_categorical("rgf_algorithm", ['RGF', 'RGF_Opt', 'RGF_Sib']),
        "l2": trial.suggest_float("rgf_l2", 0.0001, 1),
        "learning_rate": trial.suggest_float("rgf_learning_rate", 0.001, 0.5),
        "loss": trial.suggest_categorical("rgf_loss", ['LS', 'Log', 'Expo']),
        "max_leaf": trial.suggest_int("rgf_max_leaf", 1000, 10000),
        "calc_prob": trial.suggest_categorical("rgf_calc_prob", ["softmax", "sigmoid"]),
        "min_samples_leaf": trial.suggest_int("rgf_min_samples_leaf", 1, 20),
        "reg_depth": trial.suggest_float("rgf_reg_depth", 1.0, 5.0),
        "test_interval": trial.suggest_int("rgf_test_interval", 100, 600),
    }

    model = RGFClassifier(**rgf_params)

    # The data arguments had been commented out, which left cross_val_score
    # without its required X argument (TypeError on every trial).
    # NOTE(review): X_train_pars / y_train are taken from the commented-out
    # lines; confirm these are the intended training frames for RGF.
    cv_score = cross_val_score(model,
                               X_train_pars,
                               y_train,
                               cv=StratifiedKFold(n_splits=10, random_state=seed, shuffle=True),
                               scoring='roc_auc')
    return cv_score.mean()

# Maximise the objective with a CMA-ES sampler (independent-sampling warning off).
# Alternative sampler left for reference: TPESampler(seed = seed, multivariate = True)
study = optuna.create_study(
    sampler=optuna.samplers.CmaEsSampler(warn_independent_sampling=False),
    direction="maximize",
)
study.optimize(objective, n_trials=120, n_jobs=-1, show_progress_bar=True)

# Report the best trial found.
print()
print(f'Best Score: {study.best_value*100:.3f} %')
print()
print('Best Params:')
print(study.best_params)

# Plot the trial values sorted in ascending order against their rank.
results = study.trials_dataframe()
results['value'].sort_values().reset_index(drop=True).plot()
plt.title('Convergence plot')
plt.xlabel('Iteration')
plt.ylabel('Score')

# Drop the "rgf_" tag so the keys match the keyword names used to build the model.
chosen_param = {
    name.replace("rgf_", ""): value for name, value in study.best_params.items()
}
print()
print(chosen_param)

# Balanced RandomForest

In [None]:
%%time
import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Optuna objective: mean cross-validated ROC AUC of a balanced random forest.

    Samples one BalancedRandomForestClassifier configuration from ``trial``,
    evaluates it on ``X_train_pars`` / ``y_train`` with 10-fold stratified
    cross-validation and returns the mean fold score.

    Args:
        trial: Optuna trial used to draw the ``balRF_*`` hyper-parameters.

    Returns:
        Mean of the per-fold ``roc_auc`` scores.
    """
    # Sampling order is kept stable; keys are the estimator keyword names.
    forest_params = {
        "n_estimators": trial.suggest_int("balRF_n_estimators", 10, 1500),
        "criterion": trial.suggest_categorical("balRF_criterion", ['gini', 'entropy']),
        "max_depth": trial.suggest_int("balRF_max_depth", 1, 7),
        "min_samples_split": trial.suggest_float("balRF_min_samples_split", 0.0001, 0.999),
        "min_samples_leaf": trial.suggest_int("balRF_min_samples_leaf", 1, 30),
        "max_features": trial.suggest_categorical("balRF_max_features", ['sqrt', 'log2']),
        "ccp_alpha": trial.suggest_float("balRF_ccp_alpha", 0.0001, 0.035),
        "bootstrap": trial.suggest_categorical("balRF_bootstrap", [True, False]),
        "replacement": trial.suggest_categorical("balRF_replacement", [True, False]),
        "class_weight": trial.suggest_categorical("balRF_class_weight", ['balanced', 'balanced_subsample']),
        # Still sampled on every trial so the set of trial parameters is unchanged.
        "max_samples": trial.suggest_float("balRF_max_samples", 0.1, 0.999),
    }

    # max_samples only has meaning when bootstrapping. Forest implementations
    # that ignore it without bootstrap behave the same with None, and ones
    # that reject the combination no longer fail the trial.
    # NOTE(review): recent scikit-learn / imbalanced-learn releases are
    # believed to raise ValueError for max_samples with bootstrap=False;
    # confirm against the installed versions. study.best_params will still
    # report the sampled max_samples value even when it was not applied.
    if not forest_params["bootstrap"]:
        forest_params["max_samples"] = None

    model = BalancedRandomForestClassifier(random_state=seed,
                                           n_jobs=-1,
                                           **forest_params)

    cv_score = cross_val_score(model,
                               X_train_pars,
                               y_train,
                               cv=StratifiedKFold(n_splits=10, random_state=seed, shuffle=True),
                               scoring='roc_auc')
    return cv_score.mean()

# Maximise the objective with a CMA-ES sampler (independent-sampling warning off).
# Alternative sampler left for reference: TPESampler(seed = seed, multivariate = True)
study = optuna.create_study(
    sampler=optuna.samplers.CmaEsSampler(warn_independent_sampling=False),
    direction="maximize",
)
study.optimize(objective, n_trials=120, n_jobs=-1, show_progress_bar=True)

# Report the best trial found.
print()
print(f'Best Score: {study.best_value*100:.3f} %')
print()
print('Best Params:')
print(study.best_params)

# Plot the trial values sorted in ascending order against their rank.
results = study.trials_dataframe()
results['value'].sort_values().reset_index(drop=True).plot()
plt.title('Convergence plot')
plt.xlabel('Iteration')
plt.ylabel('Score')

# Drop the "balRF_" tag so the keys match the keyword names used to build the model.
chosen_param = {
    name.replace("balRF_", ""): value for name, value in study.best_params.items()
}
print()
print(chosen_param)

# Gradient Boosting Classifier

In [None]:
%%time
import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Optuna objective: mean cross-validated ROC AUC of a GradientBoostingClassifier.

    Samples one gradient-boosting configuration from ``trial``, evaluates it
    on ``X_train_pars`` / ``y_train`` with 10-fold stratified cross-validation
    and returns the mean fold score.

    Args:
        trial: Optuna trial used to draw the ``gbc_*`` hyper-parameters.

    Returns:
        Mean of the per-fold ``roc_auc`` scores.
    """
    # NOTE(review): the 'deviance', 'mse' and 'mae' option names look tied to
    # an older scikit-learn release; confirm they are still accepted by the
    # installed version.
    gbc_params = {
        "loss": trial.suggest_categorical("gbc_loss", ['deviance', 'exponential']),
        "learning_rate": trial.suggest_float("gbc_learning_rate", 0.001, 0.5),
        "n_estimators": trial.suggest_int("gbc_n_estimators", 10, 2000),
        "min_samples_split": trial.suggest_int("gbc_min_samples_split", 2, 100),
        "min_samples_leaf": trial.suggest_int("gbc_min_samples_leaf", 1, 10),
        "max_depth": trial.suggest_int("gbc_max_depth", 1, 7),
        "max_features": trial.suggest_categorical("gbc_max_features", ['sqrt', 'log2']),
        "ccp_alpha": trial.suggest_float("gbc_ccp_alpha", 0.0, 0.035),
        "subsample": trial.suggest_float("gbc_subsample", 0.5, 1.0),
        "criterion": trial.suggest_categorical("gbc_criterion", ['friedman_mse', 'mse', 'mae']),
    }

    model = GradientBoostingClassifier(random_state=seed, **gbc_params)

    # Shuffled stratified folds seeded with the notebook-wide seed.
    folds = StratifiedKFold(n_splits=10, random_state=seed, shuffle=True)
    fold_scores = cross_val_score(model, X_train_pars, y_train, cv=folds, scoring='roc_auc')
    return fold_scores.mean()

# Maximise the objective with a CMA-ES sampler (independent-sampling warning off).
# Alternative sampler left for reference: TPESampler(seed = seed, multivariate = True)
study = optuna.create_study(
    sampler=optuna.samplers.CmaEsSampler(warn_independent_sampling=False),
    direction="maximize",
)
study.optimize(objective, n_trials=120, n_jobs=-1, show_progress_bar=True)

# Report the best trial found.
print()
print(f'Best Score: {study.best_value*100:.3f} %')
print()
print('Best Params:')
print(study.best_params)

# Plot the trial values sorted in ascending order against their rank.
results = study.trials_dataframe()
results['value'].sort_values().reset_index(drop=True).plot()
plt.title('Convergence plot')
plt.xlabel('Iteration')
plt.ylabel('Score')

# Drop the "gbc_" tag so the keys match the keyword names used to build the model.
chosen_param = {
    name.replace("gbc_", ""): value for name, value in study.best_params.items()
}
print()
print(chosen_param)