In [1]:
#Import packages
import os #Allows us to get operating system information in python.
#In artemis video, he did not import os package

#Data Handling
import pandas as pd, numpy as np

#Time
import time

#Plotting
import matplotlib.pyplot as plt, seaborn as sns, scipy.stats, pylab

#Saving data
import pickle

#train and test split
from sklearn.model_selection import train_test_split

#Scalers
from sklearn import preprocessing

#TomekLinks and RandomUnderSampler
from imblearn.under_sampling import TomekLinks, RandomUnderSampler

#Hyperparameter optimization
import optuna

#Metrics
from sklearn.metrics import f1_score, balanced_accuracy_score, recall_score, roc_auc_score

#General Management
import gc as gc
gc.enable()
from joblib import dump, load
from warnings import filterwarnings

#Notebook configurations
filterwarnings('ignore')

In [2]:
#MODELS
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB 

In [4]:
f = open('CCF_ProcessedData.pckl','rb')
pickle_list = pickle.load(f)
f.close()

#pickle_list = [tomek_modeling_data, y, rus_tomek_modeling_data, y2]

tomek_modeling_data = pickle_list[0]
y = pickle_list[1]
rus_tomek_modeling_data = pickle_list[2]
y2 = pickle_list[3]
test = pickle_list[4]

In [5]:
#TEST
#print(tomek_modeling_data.head())
#print(rus_tomek_modeling_data.head())

#Double check column values before moving forward

#tomek_modeling_data - GOOD!
# for col in tomek_modeling_data.columns:
#     print(col, tomek_modeling_data[col].dtype) 
    
#rus_tomek_modeling_data - GOOD!
# for col in rus_tomek_modeling_data.columns:
#     print(col, rus_tomek_modeling_data[col].dtype)

# modeling_cat_cols = [col for col in modeling_data.columns if col not in ['Age', 'Flight Distance', 'Departure Delay in Minutes','Arrival Delay in Minutes', 'satisfaction']]
# t_mcol = [col for col in tomek_modeling_data.columns if col not in ['amount_4root','oldbalanceOrig_4root','newbalanceOrig_4root','oldbalanceDest_4root','newbalanceDest_4root']]
# t_rus_mcol = [col for col in rus_tomek_modeling_data.columns if col not in ['amount_4root','oldbalanceOrig_4root','newbalanceOrig_4root','oldbalanceDest_4root','newbalanceDest_4root']]

# # for col in modeling_cat_cols:
# #     modeling_data[col] = modeling_data[col].astype('category')

# #Tomek
# for col in t_mcol:
#     tomek_modeling_data[col] = tomek_modeling_data[col].astype('category')
    
# #Tomek + RUS
# for col in t_rus_mcol:
#     rus_tomek_modeling_data[col] = rus_tomek_modeling_data[col].astype('category')

Unnamed: 0,isFraud
0,0
1,0
2,0
3,0
4,0


# MODELING

In [5]:
#tomek_modeling_data & y
Xtrain_tomek, Xdev_tomek, ytrain_tomek, ydev_tomek = train_test_split(tomek_modeling_data, y, stratify=y, test_size=0.1, random_state=5)

#rus_tomek_modeling_data & y2
Xtrain_rus, Xdev_rus, ytrain_rus, ydev_rus = train_test_split(rus_tomek_modeling_data, y2, stratify=y2, test_size=0.3, random_state=5)

In [6]:
#SAVE THE VARIABLES ABOVE SO WE DO NOT NEED TO RE-RUN AND CAN CALL
#FROM THE APPROPRIATE GET_DATA FUNCTION
pickle_list1 = [Xtrain_tomek, Xdev_tomek, ytrain_tomek, ydev_tomek]
pickle_list2 = [Xtrain_rus, Xdev_rus, ytrain_rus, ydev_rus]
f1 = open('tomek_data.pckl','wb')
f2 = open('rus_data.pckl','wb')
pickle.dump(pickle_list1,f1)
pickle.dump(pickle_list2,f2)
f1.close()
f2.close()

In [4]:
#List of the 13 classifiers we will test
classifiers = [XGBClassifier(random_state=1,categorical_features=True),
               LGBMClassifier(random_state=1,is_unbalance=True),
               RandomForestClassifier(random_state=1),
               ExtraTreesClassifier(random_state=1),
               GradientBoostingClassifier(random_state=1),
               DecisionTreeClassifier(random_state=1),
               ExtraTreeClassifier(random_state=1),
               LogisticRegression(random_state=1),
               RidgeClassifier(random_state=1),
               SGDClassifier(random_state=1),
               KNeighborsClassifier(n_neighbors=10),
               GaussianNB(),
               MultinomialNB()]

In [12]:
#Function that will test the classifiers
def test_classifiers(train_data, train_classes, test_data, test_classes, classifiers):
    results = {} #Save the results in a dictionary
    
    #Loop through the different classifiers in the list
    for clf in classifiers:
        name = clf.__class__.__name__ #Grab the name of the classifier
        
        print("Now training {}...".format(name)) #Lets us know what classifier we are on
        
        start_time = time.time() #Start keeping track of time
        clf.fit(train_data, train_classes) #Fit the training data to the classifier
        predict = clf.predict(test_data) #Make predictions on test data
        
        #METRICS - Compare test data predictions with actual values
        f1 = round(f1_score(y_true=test_classes, y_pred=predict, pos_label=1), 3)
        bal_acc = round(balanced_accuracy_score(test_classes, predict), 3)
        recall = round(recall_score(test_classes, predict, pos_label=1), 3)
        #roc_auc = round(roc_auc_score(test_classes,clf.predict_proba(test_data)), 3)
        
        stop_time = time.time() #Stop keeping track of time
        runtime = round(stop_time - start_time, 3) #Calculate run time
        
        print("{} trained in {} with \n  F1: {} \n  Balanced Accuracy: {} \n  Recall: {}".format(name,runtime,f1,bal_acc,recall))
        
        results[name] = (f1, bal_acc, recall, runtime)
        
    return results

### Tomek Modeling Data

In [13]:
#classifier_results_tomek = test_classifiers(Xtrain_tomek, ytrain_tomek, Xdev_tomek, ydev_tomek, classifiers)

Now training XGBClassifier...
XGBClassifier trained in 941.981 with 
  F1: 0.888 
  Balanced Accuracy: 0.913 
  Recall: 0.825
Now training LGBMClassifier...
LGBMClassifier trained in 45.119 with 
  F1: 0.121 
  Balanced Accuracy: 0.963 
  Recall: 0.943
Now training RandomForestClassifier...
RandomForestClassifier trained in 1551.686 with 
  F1: 0.864 
  Balanced Accuracy: 0.89 
  Recall: 0.78
Now training ExtraTreesClassifier...
ExtraTreesClassifier trained in 386.59 with 
  F1: 0.876 
  Balanced Accuracy: 0.897 
  Recall: 0.793
Now training GradientBoostingClassifier...
GradientBoostingClassifier trained in 1663.791 with 
  F1: 0.528 
  Balanced Accuracy: 0.71 
  Recall: 0.421
Now training DecisionTreeClassifier...
DecisionTreeClassifier trained in 56.936 with 
  F1: 0.904 
  Balanced Accuracy: 0.95 
  Recall: 0.9
Now training ExtraTreeClassifier...
ExtraTreeClassifier trained in 9.967 with 
  F1: 0.797 
  Balanced Accuracy: 0.895 
  Recall: 0.791
Now training LogisticRegression...
Lo

In [14]:
#5/18/23
#Save the dictionary so we do not have to re-run above code again
pickle_list = [classifier_results_tomek]
f = open('classifier_results_tomek.pckl','wb')
pickle.dump(pickle_list,f)
f.close()

### Tomek + RUS Modeling Data

In [16]:
#classifier_results_rus = test_classifiers(Xtrain_rus, ytrain_rus, Xdev_rus, ydev_rus, classifiers)

Now training XGBClassifier...
XGBClassifier trained in 0.939 with 
  F1: 0.994 
  Balanced Accuracy: 0.994 
  Recall: 0.997
Now training LGBMClassifier...
LGBMClassifier trained in 0.425 with 
  F1: 0.994 
  Balanced Accuracy: 0.994 
  Recall: 0.998
Now training RandomForestClassifier...
RandomForestClassifier trained in 0.881 with 
  F1: 0.992 
  Balanced Accuracy: 0.992 
  Recall: 0.997
Now training ExtraTreesClassifier...
ExtraTreesClassifier trained in 0.599 with 
  F1: 0.99 
  Balanced Accuracy: 0.99 
  Recall: 0.993
Now training GradientBoostingClassifier...
GradientBoostingClassifier trained in 1.657 with 
  F1: 0.988 
  Balanced Accuracy: 0.988 
  Recall: 0.995
Now training DecisionTreeClassifier...
DecisionTreeClassifier trained in 0.058 with 
  F1: 0.991 
  Balanced Accuracy: 0.991 
  Recall: 0.993
Now training ExtraTreeClassifier...
ExtraTreeClassifier trained in 0.026 with 
  F1: 0.971 
  Balanced Accuracy: 0.971 
  Recall: 0.97
Now training LogisticRegression...
LogisticRe

In [17]:
#5/18/23
#Save the dictionary so we do not have to re-run above code again
pickle_list = [classifier_results_rus]
f = open('classifier_results_rus.pckl','wb')
pickle.dump(pickle_list,f)
f.close()

## HYPERPARAMETER TUNING

In [4]:
def get_tomek_data():
    f = open('tomek_data.pckl','rb')
    data = pickle.load(f)
    f.close()
    return data

def get_rus_data():
    f = open('rus_data.pckl','rb')
    data = pickle.load(f)
    f.close()
    return data

In [36]:
def optimize_objective(trial,datatype):
    
    if datatype == 'tomek':
        #Read in the data needed for each experiment
        data = get_tomek_data() #[Xtrain_tomek, Xdev_tomek, ytrain_tomek, ydev_tomek]
        X_train = data[0]
        X_dev = data[1]
        y_train = data[2]
        y_dev = data[3]
    
        #Use Optuna to find the best algorithm to use
        algorithm = trial.suggest_categorical('algorithm', ['XGB', 'LGBM', 'DT', 'GNB'])
    
    else:
        #Read in the data needed for each experiment
        data = get_rus_data() #[Xtrain_rus, Xdev_rus, ytrain_rus, ydev_rus]
        X_train = data[0]
        X_dev = data[1]
        y_train = data[2]
        y_dev = data[3]
    
        #Use Optuna to find the best algorithm to use
        algorithm = trial.suggest_categorical('algorithm', ['XGB', 'LGBM', 'GNB', 'RFC', 'GBC'])
    
    
    
    # XGBoost Classifier
    if algorithm == 'XGB':
        #Have Optuna suggest values for the following hyperparameters
        xgb_booster = trial.suggest_categorical('xgb_booster', ['gbtree','dart'])
        xgb_eta_exp = trial.suggest_int('xgb_eta_exp',-3,-1) #!!!
        xgb_eta_base = trial.suggest_float('xgb_eta_base',1.0,9.99) #!!!
        xgb_eta = xgb_eta_base*(10**xgb_eta_exp) 
        mcw_exp = trial.suggest_int('mcw_exp',-2,2) #!!!
        mcw_base = trial.suggest_float('mcw_base',1.0,9.99) #!!!
        xgb_mcw = mcw_base*(10**mcw_exp) 
        xgb_maxdepth = trial.suggest_int('xgb_maxdepth', 3, 30)
        xgb_gamma = trial.suggest_float('xgb_gamma', 0.0, 10.0)
        xgb_subsample = trial.suggest_float('xgb_subsample', 0.1, 1.0)
        xgb_lambda = trial.suggest_float('xgb_lambda', 1.0, 20.0)
        xgb_alpha = trial.suggest_float('xgb_alpha', 0.0, 10.0)
        
        #Create the model
        model = XGBClassifier(booster=xgb_booster, eta=xgb_eta, min_child_weight=xgb_mcw,
                             max_depth=xgb_maxdepth, gamma=xgb_gamma, subsample=xgb_subsample,
                             reg_lambda=xgb_lambda, reg_alpha=xgb_alpha, random_state=10)
    
    # LGBM Classifier
    elif algorithm == 'LGBM':
        #Have Optuna suggest values for the following hyperparameters
        lgbm_boosting = trial.suggest_categorical('lgbm_booster', ['gbdt','dart','goss'])
        lgbm_num_exp = trial.suggest_int('lgbm_num_exp',1,2) #!!!
        lgbm_num_base = trial.suggest_float('lgbm_num_base',1.0,9.99) #!!!
        lgbm_num = round(lgbm_num_base*(10**lgbm_num_exp))
        lgbm_lr_exp = trial.suggest_int('lgbm_lr_exp',-3,-1) #!!!
        lgbm_lr_base = trial.suggest_float('lgbm_lr_base',1.0,9.99) #!!!
        lgbm_learningrate = lgbm_lr_base*(10**lgbm_lr_exp)
        lgbm_maxdepth = trial.suggest_int('lgbm_maxdepth', 3, 30)
        lgbm_nl_exp = trial.suggest_int('lgbm_nl_exp',1,2) #!!!
        lgbm_nl_base = trial.suggest_float('lgbm_nl_base',1.0,9.99) #!!!
        lgbm_numleaves = round(lgbm_nl_base*(10**lgbm_nl_exp))
        lgbm_mc_exp = trial.suggest_int('lgbm_mc_exp',1,3) #!!!
        lgbm_mc_base = trial.suggest_float('lgbm_mc_base',1.0,9.99) #!!!
        lgbm_minchild = round(lgbm_mc_base*(10**lgbm_mc_exp))
        lgbm_subsample = trial.suggest_float('lgbm_subsample', 0.1, 1.0)
        lgbm_lambda = trial.suggest_float('lgbm_lambda', 1.0, 20.0)
        lgbm_alpha_exp = trial.suggest_int('lgbm_alpha_exp',-1,1) #!!!
        lgbm_alpha_base = trial.suggest_float('lgbm_alpha_base',1.0,9.99) #!!!
        lgbm_alpha = lgbm_alpha_base*(10**lgbm_alpha_exp)
        
        #Create the model
        model = LGBMClassifier(boosting_type=lgbm_boosting, num_iterations=lgbm_num,
                              learning_rate=lgbm_learningrate, max_depth=lgbm_maxdepth,
                              num_leaves=lgbm_numleaves, min_child_samples=lgbm_minchild, 
                              subsample=lgbm_subsample,reg_lambda=lgbm_lambda, reg_alpha=lgbm_alpha, 
                              random_state=10)
        
    # Decision Tree Classifier
    elif algorithm == 'DT':
        #Have Optuna suggest values for the following hyperparameters
        dt_splitter = trial.suggest_categorical('dt_splitter', ['best','random'])
        dt_maxdepth = trial.suggest_int('dt_maxdepth', 3, 30)
        dt_ml_exp = trial.suggest_int('dt_ml_exp',1,3) #!!!
        dt_ml_base = trial.suggest_int('dt_ml_base',1,9) #!!!
        dt_minleaf = round(dt_ml_base*(10**dt_ml_exp))
        
        #Create the model
        model = DecisionTreeClassifier(splitter=dt_splitter, max_depth=dt_maxdepth,
                                      min_samples_leaf=dt_minleaf, random_state=10)
    
    # Gaussian Naive Bayes
    elif algorithm == 'GNB':
        #Have Optuna suggest a value for var_smoothing
        gnb_exp = trial.suggest_int('gnb_exp',6,10) #!!!
        gnb_base = trial.suggest_float('gnb_base',1.0, 9.99) #!!!
        gnb_varsmoothing = gnb_base * (10**(-gnb_exp))
        
        #Create the model
        model = GaussianNB(var_smoothing=gnb_varsmoothing)
        
    
    # Random Forest Classifier
    elif algorithm == 'RFC':
        #Have Optuna suggest values for the following hyperparameters
        rfc_num_exp = trial.suggest_int('rfc_num_exp',1,3) #!!!
        rfc_num_base = trial.suggest_float('rfc_num_base',1.0,9.99) #!!!
        rfc_num = round(rfc_num_base*(10**rfc_num_exp))
        rfc_maxdepth = trial.suggest_int('rfc_maxdepth', 3, 30)
        rfc_ml_exp = trial.suggest_int('rfc_ml_exp',1,3) #!!!
        rfc_ml_base = trial.suggest_float('rfc_ml_base',1.0,9.99) #!!!
        rfc_minleaf = round(rfc_ml_base*(10**rfc_ml_exp))
        rfc_bootstrap = trial.suggest_categorical('rfc_bootstrap',[True, False])
        
        #Create the model
        if rfc_bootstrap == False:
            model = RandomForestClassifier(n_estimators = rfc_num, max_depth=rfc_maxdepth,
                                          min_samples_leaf=rfc_minleaf, bootstrap=rfc_bootstrap,
                                          random_state=10)
        else:
            rfc_maxsamples = trial.suggest_float('rfc_maxsamples',0.1,0.99)
            model = RandomForestClassifier(n_estimators = rfc_num, max_depth=rfc_maxdepth,
                                          min_samples_leaf=rfc_minleaf, bootstrap=rfc_bootstrap,
                                          max_samples=rfc_maxsamples,random_state=10)
    
    # Gradient Boosting Classifier
    else:
        #Have Optuna suggest values for the following hyperparameters
        gbc_learn_exp = trial.suggest_int('gbc_learn_exp',-3,-1) #!!!
        gbc_learn_base = trial.suggest_float('gbc_learn_base',1.0,9.99) #!!!
        gbc_learning = gbc_learn_base*(10**gbc_learn_exp)
        gbc_num_exp = trial.suggest_int('gbc_num_exp',1,3) #!!!
        gbc_num_base = trial.suggest_float('gbc_num_base',1.0,9.99) #!!!
        gbc_num = round(gbc_num_base*(10**gbc_num_exp))
        gbc_subsample = trial.suggest_float('gbc_subsample', 0.1, 1.0)
        gbc_ml_exp = trial.suggest_int('gbc_ml_exp',1,3) #!!!
        gbc_ml_base = trial.suggest_float('gbc_ml_base',1.0,9.99) #!!!
        gbc_minleaf = round(gbc_ml_base*(10**gbc_ml_exp))
        gbc_maxdepth = trial.suggest_int('gbc_maxdepth', 3, 30)
        
        #Create the model
        model = GradientBoostingClassifier(learning_rate=gbc_learning, n_estimators=gbc_num,
                                          subsample=gbc_subsample,min_samples_leaf=gbc_minleaf, 
                                          max_depth=gbc_maxdepth,random_state=10)
    
    
    
    #Fit and score model
    model.fit(X_train, y_train)
    score_recall = recall_score(y_true=y_dev, y_pred=model.predict(X_dev), pos_label=1)
    
    #Return Score
    return score_recall


### TOMEK DATA

In the artemis video, we see that he creates a function "get_data". We do not need to create a function like this here because the data is already ready to go (Xtrain_tomek, Xdev_tomek, ytrain_tomek, ydev_tomek). Ideally we would want to create the function and call it in our objective function (optimize_tomek) because we don't want to rely on the fact that we have the correct variables in the global environment everytime we call the function. So we will go ahead and create the get_tomek_data() function.

<div class="alert alert-block alert-success">
GOOD TO RUN!!!!

In [23]:
# We want to maximize the score, i.e., recall
study1 = optuna.create_study(direction="maximize")
# Run Optuna a maximum of 100 times to find the best set of params for the best algo
study1.optimize(lambda trial: optimize_objective(trial, datatype='tomek'), n_trials=100)

[32m[I 2023-06-05 13:50:26,643][0m A new study created in memory with name: no-name-2cd8e488-fc72-4ac3-b1d8-d2f9ff03cfcc[0m
[32m[I 2023-06-05 13:50:42,017][0m Trial 0 finished with value: 0.6371463714637147 and parameters: {'algorithm': 'DT', 'dt_splitter': 'random', 'dt_maxdepth': 29, 'dt_ml_exp': 2, 'dt_ml_base': 4}. Best is trial 0 with value: 0.6371463714637147.[0m
[32m[I 2023-06-05 14:39:13,770][0m Trial 1 finished with value: 0.7183271832718328 and parameters: {'algorithm': 'LGBM', 'lgbm_booster': 'dart', 'lgbm_num_exp': 2, 'lgbm_num_base': 7.542094049019105, 'lgbm_lr_exp': -2, 'lgbm_lr_base': 4.004821571457194, 'lgbm_maxdepth': 8, 'lgbm_nl_exp': 1, 'lgbm_nl_base': 7.094678004749073, 'lgbm_mc_exp': 3, 'lgbm_mc_base': 4.836711734961391, 'lgbm_subsample': 0.7217060240401513, 'lgbm_lambda': 10.641240484437608, 'lgbm_alpha_exp': 0, 'lgbm_alpha_base': 6.9892167179595}. Best is trial 1 with value: 0.7183271832718328.[0m
[32m[I 2023-06-05 14:39:25,044][0m Trial 2 finished wit



[32m[I 2023-06-05 14:54:54,847][0m Trial 10 finished with value: 0.6531365313653137 and parameters: {'algorithm': 'XGB', 'xgb_booster': 'gbtree', 'xgb_eta_exp': -2, 'xgb_eta_base': 2.6821035303480447, 'mcw_exp': 0, 'mcw_base': 9.532541596432102, 'xgb_maxdepth': 8, 'xgb_gamma': 7.4556919884885975, 'xgb_subsample': 0.7208122725280365, 'xgb_lambda': 16.573744894094283, 'xgb_alpha': 6.638199806922174}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 14:55:04,411][0m Trial 11 finished with value: 0.997539975399754 and parameters: {'algorithm': 'GNB', 'gnb_exp': 10, 'gnb_base': 9.211997476671375}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 14:55:13,086][0m Trial 12 finished with value: 0.997539975399754 and parameters: {'algorithm': 'GNB', 'gnb_exp': 10, 'gnb_base': 1.2457073663305094}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 14:55:21,116][0m Trial 13 finished with value: 0.997539975399754 and parameters: {'a



[32m[I 2023-06-05 15:06:23,476][0m Trial 15 finished with value: 0.3899138991389914 and parameters: {'algorithm': 'XGB', 'xgb_booster': 'dart', 'xgb_eta_exp': -3, 'xgb_eta_base': 9.878927935837867, 'mcw_exp': -2, 'mcw_base': 1.3144780211368117, 'xgb_maxdepth': 29, 'xgb_gamma': 0.4606855689040259, 'xgb_subsample': 0.1088405680655502, 'xgb_lambda': 2.81342672414846, 'xgb_alpha': 0.6234514187387852}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 15:06:33,457][0m Trial 16 finished with value: 0.996309963099631 and parameters: {'algorithm': 'GNB', 'gnb_exp': 8, 'gnb_base': 5.9716820536874}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 15:06:42,325][0m Trial 17 finished with value: 0.997539975399754 and parameters: {'algorithm': 'GNB', 'gnb_exp': 9, 'gnb_base': 3.5784905414246295}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 15:06:51,206][0m Trial 18 finished with value: 0.997539975399754 and parameters: {'algori



[32m[I 2023-06-05 15:15:21,986][0m Trial 19 finished with value: 0.6568265682656826 and parameters: {'algorithm': 'XGB', 'xgb_booster': 'gbtree', 'xgb_eta_exp': -1, 'xgb_eta_base': 1.9277918597685022, 'mcw_exp': 2, 'mcw_base': 5.633634906357093, 'xgb_maxdepth': 24, 'xgb_gamma': 9.510700396781747, 'xgb_subsample': 0.9622403712608569, 'xgb_lambda': 19.476224141736473, 'xgb_alpha': 9.927681824615364}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 15:15:32,547][0m Trial 20 finished with value: 0.997539975399754 and parameters: {'algorithm': 'GNB', 'gnb_exp': 10, 'gnb_base': 7.798021647400549}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 15:15:41,962][0m Trial 21 finished with value: 0.997539975399754 and parameters: {'algorithm': 'GNB', 'gnb_exp': 10, 'gnb_base': 9.845575247382122}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 15:15:52,514][0m Trial 22 finished with value: 0.997539975399754 and parameters: {'al



[32m[I 2023-06-05 15:24:41,590][0m Trial 27 finished with value: 0.32718327183271834 and parameters: {'algorithm': 'XGB', 'xgb_booster': 'dart', 'xgb_eta_exp': -3, 'xgb_eta_base': 6.8287619747004165, 'mcw_exp': 2, 'mcw_base': 1.1784670290323866, 'xgb_maxdepth': 3, 'xgb_gamma': 3.3561818553964518, 'xgb_subsample': 0.3627539777283519, 'xgb_lambda': 7.7459196803626496, 'xgb_alpha': 0.509886763736846}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 15:26:21,970][0m Trial 28 finished with value: 0.8314883148831488 and parameters: {'algorithm': 'LGBM', 'lgbm_booster': 'goss', 'lgbm_num_exp': 2, 'lgbm_num_base': 1.8067186006975513, 'lgbm_lr_exp': -1, 'lgbm_lr_base': 9.577837441957456, 'lgbm_maxdepth': 30, 'lgbm_nl_exp': 2, 'lgbm_nl_base': 1.034396140941686, 'lgbm_mc_exp': 1, 'lgbm_mc_base': 9.919736401592171, 'lgbm_subsample': 0.3077266692509484, 'lgbm_lambda': 19.892511620465694, 'lgbm_alpha_exp': -1, 'lgbm_alpha_base': 9.927645510947794}. Best is trial 2 with value



[32m[I 2023-06-05 15:51:25,412][0m Trial 45 finished with value: 0.7724477244772447 and parameters: {'algorithm': 'XGB', 'xgb_booster': 'gbtree', 'xgb_eta_exp': -1, 'xgb_eta_base': 5.328503171265846, 'mcw_exp': -2, 'mcw_base': 9.559735002064023, 'xgb_maxdepth': 16, 'xgb_gamma': 5.0532742524236545, 'xgb_subsample': 0.5264298876965094, 'xgb_lambda': 12.408720886599083, 'xgb_alpha': 4.161596051254687}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 15:51:35,868][0m Trial 46 finished with value: 0.997539975399754 and parameters: {'algorithm': 'GNB', 'gnb_exp': 10, 'gnb_base': 8.89887086750308}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 15:51:44,873][0m Trial 47 finished with value: 0.997539975399754 and parameters: {'algorithm': 'GNB', 'gnb_exp': 10, 'gnb_base': 9.932446377801945}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 15:52:27,443][0m Trial 48 finished with value: 0.6088560885608856 and parameters: {'a



[32m[I 2023-06-05 16:07:22,441][0m Trial 50 finished with value: 0.6777367773677737 and parameters: {'algorithm': 'XGB', 'xgb_booster': 'dart', 'xgb_eta_exp': -2, 'xgb_eta_base': 1.4063710956166764, 'mcw_exp': 0, 'mcw_base': 5.049664136139949, 'xgb_maxdepth': 16, 'xgb_gamma': 0.10818942197209402, 'xgb_subsample': 0.9855119597627744, 'xgb_lambda': 1.4060536198661424, 'xgb_alpha': 9.881565854753141}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 16:07:33,068][0m Trial 51 finished with value: 0.997539975399754 and parameters: {'algorithm': 'GNB', 'gnb_exp': 9, 'gnb_base': 3.596609961288585}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 16:07:41,589][0m Trial 52 finished with value: 0.8142681426814268 and parameters: {'algorithm': 'GNB', 'gnb_exp': 6, 'gnb_base': 3.5061375450364025}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 16:07:49,628][0m Trial 53 finished with value: 0.997539975399754 and parameters: {'al



[32m[I 2023-06-05 16:15:43,822][0m Trial 65 finished with value: 0.6851168511685117 and parameters: {'algorithm': 'XGB', 'xgb_booster': 'gbtree', 'xgb_eta_exp': -1, 'xgb_eta_base': 9.863565858783126, 'mcw_exp': 1, 'mcw_base': 6.7342940208041755, 'xgb_maxdepth': 23, 'xgb_gamma': 9.63719476028373, 'xgb_subsample': 0.13673371523407413, 'xgb_lambda': 9.521541934295202, 'xgb_alpha': 3.892628318489626}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 16:15:53,041][0m Trial 66 finished with value: 0.997539975399754 and parameters: {'algorithm': 'GNB', 'gnb_exp': 10, 'gnb_base': 9.960584027175399}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 16:18:51,912][0m Trial 67 finished with value: 0.7626076260762608 and parameters: {'algorithm': 'LGBM', 'lgbm_booster': 'goss', 'lgbm_num_exp': 2, 'lgbm_num_base': 6.64530474549863, 'lgbm_lr_exp': -1, 'lgbm_lr_base': 1.0500509335816997, 'lgbm_maxdepth': 3, 'lgbm_nl_exp': 2, 'lgbm_nl_base': 4.0752588220602



[32m[I 2023-06-05 16:32:02,357][0m Trial 79 finished with value: 0.2890528905289053 and parameters: {'algorithm': 'XGB', 'xgb_booster': 'dart', 'xgb_eta_exp': -3, 'xgb_eta_base': 4.705179425271604, 'mcw_exp': -1, 'mcw_base': 3.3573649433558272, 'xgb_maxdepth': 10, 'xgb_gamma': 4.4796429235296, 'xgb_subsample': 0.7242184814695571, 'xgb_lambda': 14.142621268260292, 'xgb_alpha': 6.947420408242256}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 16:32:12,638][0m Trial 80 finished with value: 0.996309963099631 and parameters: {'algorithm': 'GNB', 'gnb_exp': 8, 'gnb_base': 8.898654838588516}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 16:32:22,537][0m Trial 81 finished with value: 0.997539975399754 and parameters: {'algorithm': 'GNB', 'gnb_exp': 10, 'gnb_base': 9.549147484859034}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 16:32:31,983][0m Trial 82 finished with value: 0.997539975399754 and parameters: {'algori



[32m[I 2023-06-05 16:47:19,643][0m Trial 97 finished with value: 0.6728167281672817 and parameters: {'algorithm': 'XGB', 'xgb_booster': 'dart', 'xgb_eta_exp': -2, 'xgb_eta_base': 7.628990123268456, 'mcw_exp': 1, 'mcw_base': 8.600865811984923, 'xgb_maxdepth': 30, 'xgb_gamma': 7.09328508261839, 'xgb_subsample': 0.326872365377617, 'xgb_lambda': 6.150834492882952, 'xgb_alpha': 2.7310207131912816}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 16:47:30,977][0m Trial 98 finished with value: 0.997539975399754 and parameters: {'algorithm': 'GNB', 'gnb_exp': 10, 'gnb_base': 2.921897036574144}. Best is trial 2 with value: 0.997539975399754.[0m
[32m[I 2023-06-05 16:47:40,623][0m Trial 99 finished with value: 0.997539975399754 and parameters: {'algorithm': 'GNB', 'gnb_exp': 10, 'gnb_base': 9.514885322573242}. Best is trial 2 with value: 0.997539975399754.[0m


In [24]:
def get_optuna_trials(study, top_n):
    all_trials = study.get_trials() #Grab all trials from study
    res_trials = {} #instantiate dictionary for trials that will contain information we want to look at
    for _t in all_trials:
        res_trials[_t.number] = (_t.value, _t.params) #Will look at recall, and parameters
        
    #Sort the trials based on the items listed in the dictionary
    f_trials = sorted(res_trials.items(), key=lambda x:x[1][0], reverse=True)[:top_n]
    #.items() shows the key-value pairs in a dictionary. This is what we are sorting.
    #key defines how we sort the list, and in this case we are sorting by the recall value
    #We only want to grab the top_n trials 
    
    #return the top_n sorted trials
    return f_trials

In [29]:
f_trials_tomek = get_optuna_trials(study1, 100)
f_trials_tomek

[(2,
  (0.997539975399754,
   {'algorithm': 'GNB', 'gnb_exp': 9, 'gnb_base': 9.125860889745052})),
 (7,
  (0.997539975399754,
   {'algorithm': 'GNB', 'gnb_exp': 10, 'gnb_base': 3.510608280551681})),
 (11,
  (0.997539975399754,
   {'algorithm': 'GNB', 'gnb_exp': 10, 'gnb_base': 9.211997476671375})),
 (12,
  (0.997539975399754,
   {'algorithm': 'GNB', 'gnb_exp': 10, 'gnb_base': 1.2457073663305094})),
 (13,
  (0.997539975399754,
   {'algorithm': 'GNB', 'gnb_exp': 9, 'gnb_base': 9.976370205210701})),
 (17,
  (0.997539975399754,
   {'algorithm': 'GNB', 'gnb_exp': 9, 'gnb_base': 3.5784905414246295})),
 (18,
  (0.997539975399754,
   {'algorithm': 'GNB', 'gnb_exp': 9, 'gnb_base': 7.938155000531474})),
 (20,
  (0.997539975399754,
   {'algorithm': 'GNB', 'gnb_exp': 10, 'gnb_base': 7.798021647400549})),
 (21,
  (0.997539975399754,
   {'algorithm': 'GNB', 'gnb_exp': 10, 'gnb_base': 9.845575247382122})),
 (22,
  (0.997539975399754,
   {'algorithm': 'GNB', 'gnb_exp': 10, 'gnb_base': 8.49037919526663

In [30]:
#SAVE THE TOMEK TRIALS
pickle_list = [f_trials_tomek]
f = open('optuna_trials_tomek.pckl','wb')
pickle.dump(pickle_list,f)
f.close()

### TOMEK + RUS DATA

In the artemis video, we see that he creates a function "get_data". We do not need to create a function like this here because the data is already ready to go (Xtrain_rus, Xdev_rus, ytrain_rus, ydev_rus). Ideally we would want to create the function and call it in our objective function (optimize_tomek) because we don't want to rely on the fact that we have the correct variables in the global environment everytime we call the function. So we will go ahead and create the get_tomek_data() function.

<div class="alert alert-block alert-success">
GOOD TO RUN!!!!

In [37]:
# We want to maximize the score, i.e., recall
study2 = optuna.create_study(direction="maximize")
# Run Optuna a maximum of 100 times to find the best set of params for the best algo
study2.optimize(lambda trial: optimize_objective(trial, datatype='rus'), n_trials=100)

[32m[I 2023-06-05 17:37:27,129][0m A new study created in memory with name: no-name-84e54d5f-9f26-4ee3-8d62-313ed10b1111[0m
[32m[I 2023-06-05 17:37:36,535][0m Trial 0 finished with value: 0.9950779327317474 and parameters: {'algorithm': 'LGBM', 'lgbm_booster': 'dart', 'lgbm_num_exp': 2, 'lgbm_num_base': 9.222876306126139, 'lgbm_lr_exp': -1, 'lgbm_lr_base': 6.31245941173835, 'lgbm_maxdepth': 22, 'lgbm_nl_exp': 1, 'lgbm_nl_base': 9.938072043465244, 'lgbm_mc_exp': 1, 'lgbm_mc_base': 7.117969420544642, 'lgbm_subsample': 0.9115048669842948, 'lgbm_lambda': 14.104046492032666, 'lgbm_alpha_exp': 0, 'lgbm_alpha_base': 9.595196727728482}. Best is trial 0 with value: 0.9950779327317474.[0m
[32m[I 2023-06-05 17:37:36,800][0m Trial 1 finished with value: 0.8634126333059885 and parameters: {'algorithm': 'RFC', 'rfc_num_exp': 1, 'rfc_num_base': 5.92402539190864, 'rfc_maxdepth': 15, 'rfc_ml_exp': 3, 'rfc_ml_base': 2.6294668846878064, 'rfc_bootstrap': False}. Best is trial 0 with value: 0.99507

[32m[I 2023-06-05 17:38:14,457][0m Trial 24 finished with value: 0.9938474159146842 and parameters: {'algorithm': 'GNB', 'gnb_exp': 6, 'gnb_base': 8.746155743833311}. Best is trial 13 with value: 0.9979491386382281.[0m
[32m[I 2023-06-05 17:38:14,494][0m Trial 25 finished with value: 0.9975389663658737 and parameters: {'algorithm': 'GNB', 'gnb_exp': 9, 'gnb_base': 6.527882068855613}. Best is trial 13 with value: 0.9979491386382281.[0m
[32m[I 2023-06-05 17:38:39,670][0m Trial 26 finished with value: 1.0 and parameters: {'algorithm': 'RFC', 'rfc_num_exp': 3, 'rfc_num_base': 9.8410100557842, 'rfc_maxdepth': 5, 'rfc_ml_exp': 3, 'rfc_ml_base': 9.56629896082272, 'rfc_bootstrap': True, 'rfc_maxsamples': 0.9767807421240824}. Best is trial 26 with value: 1.0.[0m
[32m[I 2023-06-05 17:39:00,753][0m Trial 27 finished with value: 1.0 and parameters: {'algorithm': 'RFC', 'rfc_num_exp': 3, 'rfc_num_base': 9.51340671485012, 'rfc_maxdepth': 3, 'rfc_ml_exp': 3, 'rfc_ml_base': 9.752906804396432

[32m[I 2023-06-05 17:44:46,633][0m Trial 49 finished with value: 0.0 and parameters: {'algorithm': 'GBC', 'gbc_learn_exp': -1, 'gbc_learn_base': 1.515929800913249, 'gbc_num_exp': 3, 'gbc_num_base': 9.937349281209528, 'gbc_subsample': 0.44703104961093076, 'gbc_ml_exp': 3, 'gbc_ml_base': 9.850552605425438, 'gbc_maxdepth': 26}. Best is trial 26 with value: 1.0.[0m
[32m[I 2023-06-05 17:44:46,801][0m Trial 50 finished with value: 0.5607054963084496 and parameters: {'algorithm': 'LGBM', 'lgbm_booster': 'dart', 'lgbm_num_exp': 1, 'lgbm_num_base': 5.643982663388502, 'lgbm_lr_exp': -3, 'lgbm_lr_base': 1.0903408043189913, 'lgbm_maxdepth': 3, 'lgbm_nl_exp': 2, 'lgbm_nl_base': 1.0084181516665618, 'lgbm_mc_exp': 3, 'lgbm_mc_base': 3.3491006259002694, 'lgbm_subsample': 0.12057697511564447, 'lgbm_lambda': 9.167425240457494, 'lgbm_alpha_exp': 1, 'lgbm_alpha_base': 7.576265587649889}. Best is trial 26 with value: 1.0.[0m
[32m[I 2023-06-05 17:45:08,468][0m Trial 51 finished with value: 1.0 and p

[32m[I 2023-06-05 17:50:10,819][0m Trial 72 finished with value: 1.0 and parameters: {'algorithm': 'RFC', 'rfc_num_exp': 3, 'rfc_num_base': 9.949152643825453, 'rfc_maxdepth': 9, 'rfc_ml_exp': 3, 'rfc_ml_base': 9.700481565326912, 'rfc_bootstrap': True, 'rfc_maxsamples': 0.9327127442665428}. Best is trial 26 with value: 1.0.[0m
[32m[I 2023-06-05 17:50:29,796][0m Trial 73 finished with value: 1.0 and parameters: {'algorithm': 'RFC', 'rfc_num_exp': 3, 'rfc_num_base': 8.651908150496709, 'rfc_maxdepth': 12, 'rfc_ml_exp': 3, 'rfc_ml_base': 8.498004308649517, 'rfc_bootstrap': True, 'rfc_maxsamples': 0.9345122106408603}. Best is trial 26 with value: 1.0.[0m
[32m[I 2023-06-05 17:51:04,055][0m Trial 74 finished with value: 0.9339622641509434 and parameters: {'algorithm': 'RFC', 'rfc_num_exp': 3, 'rfc_num_base': 9.502014993226846, 'rfc_maxdepth': 6, 'rfc_ml_exp': 2, 'rfc_ml_base': 9.52635937317344, 'rfc_bootstrap': True, 'rfc_maxsamples': 0.9378714512408584}. Best is trial 26 with value: 1

[32m[I 2023-06-05 17:57:02,208][0m Trial 96 finished with value: 1.0 and parameters: {'algorithm': 'RFC', 'rfc_num_exp': 3, 'rfc_num_base': 9.766344284067836, 'rfc_maxdepth': 21, 'rfc_ml_exp': 3, 'rfc_ml_base': 9.680436893453155, 'rfc_bootstrap': True, 'rfc_maxsamples': 0.9008772595774133}. Best is trial 26 with value: 1.0.[0m
[32m[I 2023-06-05 17:57:02,302][0m Trial 97 finished with value: 0.9741591468416735 and parameters: {'algorithm': 'LGBM', 'lgbm_booster': 'gbdt', 'lgbm_num_exp': 1, 'lgbm_num_base': 1.1740381158638389, 'lgbm_lr_exp': -2, 'lgbm_lr_base': 3.4603521926648617, 'lgbm_maxdepth': 9, 'lgbm_nl_exp': 1, 'lgbm_nl_base': 7.260906843515781, 'lgbm_mc_exp': 2, 'lgbm_mc_base': 4.454215434247473, 'lgbm_subsample': 0.7380064857009577, 'lgbm_lambda': 10.410505703623267, 'lgbm_alpha_exp': 0, 'lgbm_alpha_base': 6.596296381522215}. Best is trial 26 with value: 1.0.[0m
[32m[I 2023-06-05 17:57:04,259][0m Trial 98 finished with value: 1.0 and parameters: {'algorithm': 'RFC', 'rfc

In [38]:
f_trials_rus = get_optuna_trials(study2, 100)
f_trials_rus

[(26,
  (1.0,
   {'algorithm': 'RFC',
    'rfc_num_exp': 3,
    'rfc_num_base': 9.8410100557842,
    'rfc_maxdepth': 5,
    'rfc_ml_exp': 3,
    'rfc_ml_base': 9.56629896082272,
    'rfc_bootstrap': True,
    'rfc_maxsamples': 0.9767807421240824})),
 (27,
  (1.0,
   {'algorithm': 'RFC',
    'rfc_num_exp': 3,
    'rfc_num_base': 9.51340671485012,
    'rfc_maxdepth': 3,
    'rfc_ml_exp': 3,
    'rfc_ml_base': 9.752906804396432,
    'rfc_bootstrap': True,
    'rfc_maxsamples': 0.8924991091615007})),
 (28,
  (1.0,
   {'algorithm': 'RFC',
    'rfc_num_exp': 3,
    'rfc_num_base': 9.98704820722556,
    'rfc_maxdepth': 3,
    'rfc_ml_exp': 3,
    'rfc_ml_base': 9.92358683139348,
    'rfc_bootstrap': True,
    'rfc_maxsamples': 0.9732463247600415})),
 (29,
  (1.0,
   {'algorithm': 'RFC',
    'rfc_num_exp': 3,
    'rfc_num_base': 9.94937577503255,
    'rfc_maxdepth': 3,
    'rfc_ml_exp': 3,
    'rfc_ml_base': 9.980813233316885,
    'rfc_bootstrap': True,
    'rfc_maxsamples': 0.9611879579013174

In [39]:
#SAVE THE RUS TRIALS
pickle_list = [f_trials_rus]
f = open('optuna_trials_rus.pckl','wb')
pickle.dump(pickle_list,f)
f.close()

<div class="alert alert-block alert-danger">
EXTRA CODE NOT BEING USED

In [4]:
# def optimize_rus(trial):
    
#     #Read in the data needed for each experiment
#     data = get_rus_data() #[Xtrain_rus, Xdev_rus, ytrain_rus, ydev_rus]
#     X_train = data[0]
#     X_dev = data[1]
#     y_train = data[2]
#     y_dev = data[3]
    
#     #Use Optuna to find the best algorithm to use
#     algorithm = trial.suggest_categorical('algorithm', ['XGB', 'LGBM', 'GNB', 'RFC', 'GBC'])
    
#     # XGBoost Classifier
#     if algorithm == 'XGB':
#         #Have Optuna suggest values for the following hyperparameters
#         xgb_booster = trial.suggest_categorical('xgb_booster', ['gbtree','dart'])
#         xgb_eta = trial.suggest_float('xgb_eta', 0.01, 0.99)
#         xgb_mcw = trial.suggest_float('xgb_mcw', 0.01, 100.0)
#         xgb_maxdepth = trial.suggest_int('xgb_maxdepth', 3, 30)
#         xgb_gamma = trial.suggest_float('xgb_gamma', 0.0, 10.0)
#         xgb_subsample = trial.suggest_float('xgb_subsample', 0.1, 1.0)
#         xgb_lambda = trial.suggest_float('xgb_lambda', 1.0, 20.0)
#         xgb_alpha = trial.suggest_float('xgb_alpha', 0.0, 10.0)
        
#         #Create the model
#         model = XGBClassifier(booster=xgb_booster, eta=xgb_eta, min_child_weight=xgb_mcw,
#                              max_depth=xgb_maxdepth, gamma=xgb_gamma, subsample=xgb_subsample,
#                              reg_lambda=xgb_lambda, reg_alpha=xgb_alpha, random_state=10)
    
#     # LGBM Classifier
#     elif algorithm == 'LGBM':
#         #Have Optuna suggest values for the following hyperparameters
#         lgbm_boosting = trial.suggest_categorical('lgbm_booster', ['gbdt','dart','goss'])
#         lgbm_num = trial.suggest_int('lgbm_num', 10, 500)
#         lgbm_learningrate = trial.suggest_float('lgbm_learningrate',0.01,0.99)
#         lgbm_maxdepth = trial.suggest_int('lgbm_maxdepth', 3, 30)
#         lgbm_numleaves = trial.suggest_int('lgbm_numleaves',20, 500)
#         lgbm_minsamples = trial.suggest_float('lgbm_minsamples', 50, 2500)
#         lgbm_subsample = trial.suggest_float('lgbm_subsample', 0.1, 1.0)
#         lgbm_lambda = trial.suggest_float('lgbm_lambda', 1.0, 20.0)
#         lgbm_alpha = trial.suggest_float('lgbm_alpha', 0.0, 10.0)
        
#         #Create the model
#         model = LGBMClassifier(boosting_type=lgbm_boosting, num_iterations=lgbm_num,
#                               learning_rate=lgbm_learningrate, max_depth=lgbm_maxdepth,
#                               num_leaves=lgbm_numleaves,min_child_samples=lgbm_minsamples, 
#                               subsample=lgbm_subsample,reg_lambda=lgbm_lambda, reg_alpha=lgbm_alpha, 
#                               random_state=10)
        
        
#     # Gaussian Naive Bayes
#     elif algorithm == 'GNB':
#         #Have Optuna suggest a value for var_smoothing
#         gnb_exp = trial.suggest_int('gnb_exp',6,9)
#         gnb_base = trial.suggest_float(0.1, 1.0)
#         gnb_varsmoothing = gnb_base * (10**(-gnb_exp))
        
#         #Create the model
#         model = GaussianNB(var_smoothing=gnb_varsmoothing)
        
#     # Random Forest Classifier
#     elif algorithm == 'RFC':
#         #Have Optuna suggest values for the following hyperparameters
#         rfc_num = trial.suggest_int('rfc_num', 10, 1000)
#         rfc_maxdepth = trial.suggest_int('rfc_maxdepth', 3, 30)
#         rfc_minleaf = trial.suggest_int('rfc_minleaf', 50, 2500)
#         rfc_bootstrap = trial.suggest_categorical('rfc_bootstrap',True, False)
        
#         #Create the model
#         if rfc_bootstrap == False:
#             model = RandomForestClassifier(n_estimators = rfc_num, max_depth=rfc_maxdepth,
#                                           min_samples_leaf=rfc_minleaf, bootstrap=rfc_bootstrap,
#                                           random_state=10)
#         else:
#             rfc_maxsamples = trial.suggest_float('rfc_maxsamples',0.1,0.99)
#             model = RandomForestClassifier(n_estimators = rfc_num, max_depth=rfc_maxdepth,
#                                           min_samples_leaf=rfc_minleaf, bootstrap=rfc_bootstrap,
#                                           max_samples=rfc_maxsamples,random_state=10)
    
#     else:
#         #Have Optuna suggest values for the following hyperparameters
#         gbc_learning = trial.suggest_float('gbc_learning',0.01,0.99)
#         gbc_num = trial.suggest_int('gbc_num', 10, 1000)
#         gbc_subsample = trial.suggest_float('gbc_subsample', 0.1, 1.0)
#         gbc_minleaf = trial.suggest_int('gbc_minleaf', 50, 2500)
#         gbc_maxdepth = trial.suggest_int('rfc_maxdepth', 3, 30)
        
#         #Create the model
#         model = GradientBoostingClassifier(learning_rate=gbc_learning, n_estimators=gbc_num,
#                                           subsample=gbc_subsample,min_samples_leaf=gbc_minleaf, 
#                                           max_depth=gbc_maxdepth,random_state=10)
    
#     #Fit and score model
#     model.fit(X_train, y_train)
#     score_recall = recall_score(y_true=y_dev, y_pred=model.predict(X_dev), pos_label=1)
    
#     #Return Score
#     return score_recall