In [2]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
#import plotly.express as px
import seaborn as sns

from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import StratifiedKFold
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, \
                            accuracy_score, f1_score, average_precision_score, \
                            fbeta_score, precision_recall_curve, auc
from sklearn.metrics import matthews_corrcoef

from sklearn.model_selection import train_test_split

import optuna
from optuna.samplers import TPESampler
sns.set(style='white', context='notebook', palette='deep')
%config Completer.use_jedi = False

In [3]:
csl_df = pd.read_csv('../../data/csl/Sites/CSL_he_PI_s41.csv', index_col=0)
X = csl_df.drop(['high_EBL','Sitenum'], axis=1, inplace=False)
y = csl_df['high_EBL'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=7)
X_train                                                    

Unnamed: 0_level_0,transfus_yes,Accrete,AdmBishop,Admcervpos,Admconsistency,Admcontract,AdmDBP,Admefface,Admpresent,AdmSBP,...,TD_nos,ThreatenedPB,threatpb9,UnspecHBP,uscar,version9,new_age,new_BMI,new_high_Age,new_high_BMI
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
41-24887,0,0,6,2,2,5.0,77,70.0,1,132.0,...,0,0,0,0,0,0,27,22,1,1
41-17428,0,0,12,1,3,4.0,67,90.0,1,121.0,...,0,0,0,0,0,0,26,23,1,1
41-06946,0,0,7,1,2,5.0,84,60.0,77,126.0,...,0,0,0,0,0,0,34,30,1,1
41-17653,0,0,9,2,3,3.3,71,70.0,1,134.0,...,0,0,0,0,0,0,25,31,1,1
41-17118,0,0,8,8,8,3.3,82,80.0,77,120.0,...,0,0,0,0,0,0,29,24,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41-36132,0,0,5,3,2,6.7,87,100.0,77,151.0,...,0,0,0,0,0,0,31,33,1,1
41-42972,1,0,9,2,3,2.9,66,100.0,77,126.0,...,0,0,0,0,0,0,21,24,1,1
41-39190,0,0,11,2,3,4.4,83,90.0,77,130.0,...,0,0,0,0,0,0,26,23,1,1
41-16875,0,0,9,2,3,5.7,78,80.0,1,133.0,...,0,0,0,0,0,0,16,22,0,1


In [5]:
X.shape

(17785, 195)

<h1 align='center'>Exploratory Data Analysis</h1>

<h1 align='center'>Modelling</h1>

In [10]:
print(X_train.shape)
print(X.shape)
print(y.shape)

(12449, 195)
(17785, 195)
(17785,)


Here we define our validation set

<h1 align='center'>Modelling Using Undersampling</h1>

We will use `imblearn`'s RandomUnderSampler to undersample from the majority class so that they match

In [11]:
sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
X_rus, y_rus = sampler.fit_resample(X_train, y_train)

# Try with no undersampling
#X_rus, y_rus = X_train, y_train

In [12]:
print(X_rus.shape)
print(y_rus.shape)
print(np.bincount(y_rus))

(624, 195)
(624,)
[312 312]


<h1 align='center'>Basic Gradient Boosting</h1>

In [13]:
%%time
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_rus = GradientBoostingClassifier()   # If not undersampling
gb_rus.fit(X_rus, y_rus)
print(gb_rus.get_params())
y_pred = gb_rus.predict(X_test)
probs = gb_rus.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[4024 1178]
 [  19  115]]
              precision    recall  f1-score   support

           0       1.00      0.77      0.87      5202
           1       0.09      0.86      0.16       134

    accuracy                           0.78      5336
   macro avg       0.54      0.82      0.52      5336
weighted avg       0.97      0.78      0.85      5336

ROC AUC prob1: 0.8788783877613089
MCC: 0.2306943236069416
F2: 0.3143794423182067
PR_AUC: 0.15703770535178238
CPU times: user 344 ms, sys: 2.39 ms, total

In [14]:
%%time
# Not undersampled
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_weight = GradientBoostingClassifier()  
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
print(gb_weight.get_params())
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[4173 1029]
 [  27  107]]
              precision    recall  f1-score   support

           0       0.99      0.80      0.89      5202
           1       0.09      0.80      0.17       134

    accuracy                           0.80      5336
   macro avg       0.54      0.80      0.53      5336
weighted avg       0.97      0.80      0.87      5336

ROC AUC prob1: 0.8707816167145817
MCC: 0.2296046180391226
F2: 0.3199760765550239
PR_AUC: 0.16827930566797322
CPU times: user 4.29 s, sys: 0 ns, total: 4

<h1 align='center'>Hyperparameter Tuning with Cross Validation</h1>

In [16]:
%%time
def create_model(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    subsample = trial.suggest_float('subsample', 0.8, 1.0)
    max_depth = trial.suggest_int('max_depth',3,20)
    max_features = trial.suggest_int('max_features',10,20)
    learning_rate = trial.suggest_uniform('learning_rate',0.08,0.2)
    min_samples_leaf = trial.suggest_int('min_samples_leaf',1,50)
    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
                           max_depth=max_depth, learning_rate=learning_rate, 
                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
#                           learning_rate=learning_rate, 
#                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate) 
    return model

def objective(trial):
    model = create_model(trial)
    
    mcc_list = []
    roc_list = []
    pr_list = []
    f2_list = []
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=11)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train_c, X_test_c = X.iloc[train_idx], X.iloc[test_idx]
        y_train_c, y_test_c = y[train_idx], y[test_idx]
        sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
        X_res, y_res = sampler.fit_resample(X_train_c, y_train_c)

#        weights = class_weight.compute_sample_weight('balanced', y=y_train)
#        model.fit(X_train, y_train, sample_weight=weights)
        model.fit(X_res, y_res)
        y_pred = model.predict(X_test_c)
        probs = model.predict_proba(X_test_c)
        prob1 = probs[:, 1]  # Only positives
        precision, recall, pr_thresh = precision_recall_curve(y_test_c, prob1)
        roc = roc_auc_score(y_test_c, prob1)
        roc_list.append(roc)
        mcc = matthews_corrcoef(y_test_c, y_pred)  # MWB
        mcc_list.append(mcc)
        f2 = fbeta_score(y_test_c, y_pred, beta=2.0, average=None)[1]
        pr_auc = auc(recall, precision)
        pr_list.append(pr_auc)
        avg_precision = average_precision_score(y_test_c, prob1)
#        print(f'ROC: {roc}')
#        print(f'PR_AUC: {pr_auc}')
#        print(f'avg_prec: {avg_precision}')
        
    print(f'MCC: {mcc_list}')
    print(f'ROC: {roc_list}')
    print(f'PR_AUC: {pr_list}')
    return np.mean(mcc_list)

sampler = TPESampler(seed=7)
study = optuna.create_study(sampler=sampler,direction='maximize')
#study.optimize(objective,n_trials=60)
study.optimize(objective,n_trials=25)
#study.optimize(objective,n_trials=5)

[32m[I 2021-12-10 12:50:17,461][0m A new study created in memory with name: no-name-dcfd21cc-5846-4455-80f4-e36e0d973af3[0m
[32m[I 2021-12-10 12:50:18,272][0m Trial 0 finished with value: 0.21643418033075995 and parameters: {'n_estimators': 24, 'subsample': 0.955983758448023, 'max_depth': 10, 'max_features': 17, 'learning_rate': 0.1973587414395923, 'min_samples_leaf': 27}. Best is trial 0 with value: 0.21643418033075995.[0m


MCC: [0.21385669452465605, 0.22050993055177648, 0.21282253046663716, 0.2185475657799702]
ROC: [0.86422804415884, 0.8683915749659695, 0.8578176792709664, 0.85851077856154]
PR_AUC: [0.10302145474890229, 0.10490676750633003, 0.11784707515351844, 0.12080322240851146]


[32m[I 2021-12-10 12:50:20,041][0m Trial 1 finished with value: 0.20935055216084775 and parameters: {'n_estimators': 105, 'subsample': 0.8144102266719524, 'max_depth': 7, 'max_features': 15, 'learning_rate': 0.16150759953451288, 'min_samples_leaf': 41}. Best is trial 0 with value: 0.21643418033075995.[0m


MCC: [0.21576177522786175, 0.20995989746238292, 0.2041418226149563, 0.20753871333819007]
ROC: [0.8552315043664525, 0.8578696343402226, 0.8544218959443872, 0.8627278495616059]
PR_AUC: [0.1202600414617948, 0.11168743597214881, 0.12160222738834559, 0.13360998261289672]


[32m[I 2021-12-10 12:50:21,741][0m Trial 2 finished with value: 0.21976603740907424 and parameters: {'n_estimators': 82, 'subsample': 0.813187269381181, 'max_depth': 8, 'max_features': 20, 'learning_rate': 0.10560624242958987, 'min_samples_leaf': 23}. Best is trial 2 with value: 0.21976603740907424.[0m


MCC: [0.21643419086015178, 0.21686187779069568, 0.21730192493720965, 0.2284661560482399]
ROC: [0.8647738507167573, 0.8660556750522148, 0.8577449421740079, 0.866238298503527]
PR_AUC: [0.1294984454485884, 0.10976120759687058, 0.12245575502388907, 0.11843917977353861]


[32m[I 2021-12-10 12:50:25,212][0m Trial 3 finished with value: 0.21786920814171334 and parameters: {'n_estimators': 187, 'subsample': 0.8049798455100696, 'max_depth': 13, 'max_features': 20, 'learning_rate': 0.10763634548251577, 'min_samples_leaf': 28}. Best is trial 2 with value: 0.21976603740907424.[0m


MCC: [0.21399281151016888, 0.23238621850394914, 0.20474602059524163, 0.22035178195749372]
ROC: [0.8591201186356895, 0.8692436381017695, 0.8532975882456851, 0.859503757663656]
PR_AUC: [0.1274084265939988, 0.11029168956519379, 0.1143414351215495, 0.11764738136363932]


[32m[I 2021-12-10 12:50:29,099][0m Trial 4 finished with value: 0.2174500016864801 and parameters: {'n_estimators': 183, 'subsample': 0.8266338891518501, 'max_depth': 12, 'max_features': 18, 'learning_rate': 0.16028158890606967, 'min_samples_leaf': 24}. Best is trial 2 with value: 0.21976603740907424.[0m


MCC: [0.20348440965235282, 0.23814625113382348, 0.21298009541230667, 0.21518925054743737]
ROC: [0.8499649859943978, 0.8686555067177903, 0.8505211093446388, 0.8498747445447952]
PR_AUC: [0.11979598809995501, 0.10913935049082288, 0.11727914434787631, 0.09710190491009532]


[32m[I 2021-12-10 12:50:30,241][0m Trial 5 finished with value: 0.2221289997666841 and parameters: {'n_estimators': 49, 'subsample': 0.8981531778182141, 'max_depth': 9, 'max_features': 15, 'learning_rate': 0.12390684629367116, 'min_samples_leaf': 42}. Best is trial 5 with value: 0.2221289997666841.[0m


MCC: [0.20939493056616693, 0.23075201969175485, 0.2263780984466844, 0.22199095036213018]
ROC: [0.8633176800131817, 0.8699107411910181, 0.8626017020480689, 0.8696251400883379]
PR_AUC: [0.11836647690070819, 0.11313831683240819, 0.11865341208544541, 0.1449328465472369]


[32m[I 2021-12-10 12:50:34,223][0m Trial 6 finished with value: 0.21625963803222287 and parameters: {'n_estimators': 156, 'subsample': 0.8627989354425325, 'max_depth': 13, 'max_features': 13, 'learning_rate': 0.13434115190556806, 'min_samples_leaf': 18}. Best is trial 5 with value: 0.2221289997666841.[0m


MCC: [0.21112396613969622, 0.21560738601932036, 0.22126413113337118, 0.21704306883650376]
ROC: [0.8552088482451804, 0.8705362802248616, 0.8620011014474682, 0.8600414496670842]
PR_AUC: [0.1205198574741953, 0.10551288947021253, 0.11979465119891035, 0.11228752301783745]


[32m[I 2021-12-10 12:50:36,487][0m Trial 7 finished with value: 0.20860545398480398 and parameters: {'n_estimators': 135, 'subsample': 0.8740702165976071, 'max_depth': 11, 'max_features': 17, 'learning_rate': 0.12955901949366017, 'min_samples_leaf': 46}. Best is trial 5 with value: 0.2221289997666841.[0m


MCC: [0.1966769696146831, 0.22639332168383922, 0.19471135788731553, 0.21664016675337808]
ROC: [0.8492811830614598, 0.8690815382856905, 0.8588422332366968, 0.860379309776518]
PR_AUC: [0.10305272966514972, 0.12331773640255075, 0.13223670048792185, 0.1404303446265231]


[32m[I 2021-12-10 12:50:37,673][0m Trial 8 finished with value: 0.2207994128318419 and parameters: {'n_estimators': 44, 'subsample': 0.9482237745826528, 'max_depth': 10, 'max_features': 14, 'learning_rate': 0.15612558423606065, 'min_samples_leaf': 27}. Best is trial 5 with value: 0.2221289997666841.[0m


MCC: [0.21707943544317412, 0.22003985502148912, 0.2279930797101911, 0.21808528115251324]
ROC: [0.8618676882517713, 0.8625684508037449, 0.8618161414009166, 0.8680223646911464]
PR_AUC: [0.11879577262916657, 0.10752447077059525, 0.11703240107106355, 0.12998663617647938]


[32m[I 2021-12-10 12:50:39,012][0m Trial 9 finished with value: 0.21804596628083917 and parameters: {'n_estimators': 89, 'subsample': 0.8002853761125517, 'max_depth': 4, 'max_features': 17, 'learning_rate': 0.14292147161182367, 'min_samples_leaf': 35}. Best is trial 5 with value: 0.2221289997666841.[0m


MCC: [0.21641494860503138, 0.21080295338247187, 0.21763505277030115, 0.22733091036555228]
ROC: [0.8559956335475367, 0.8603115225952596, 0.8600725292766815, 0.86735900520799]
PR_AUC: [0.11887814191545551, 0.12068125696580051, 0.11665203461807129, 0.13933250618637302]


[32m[I 2021-12-10 12:50:41,888][0m Trial 10 finished with value: 0.21359188644585314 and parameters: {'n_estimators': 55, 'subsample': 0.924763336987846, 'max_depth': 18, 'max_features': 10, 'learning_rate': 0.09172353354166263, 'min_samples_leaf': 4}. Best is trial 5 with value: 0.2221289997666841.[0m


MCC: [0.22852290110163548, 0.21278588578484386, 0.1990780033370495, 0.21398075555988366]
ROC: [0.8593919920909541, 0.8578072882571152, 0.8656026268483016, 0.8532801272331729]
PR_AUC: [0.09230923940936248, 0.10219636881022476, 0.12415205774694404, 0.10367909766423272]


[32m[I 2021-12-10 12:50:42,539][0m Trial 11 finished with value: 0.22327951601326101 and parameters: {'n_estimators': 15, 'subsample': 0.9947616307703723, 'max_depth': 16, 'max_features': 13, 'learning_rate': 0.18341794836852388, 'min_samples_leaf': 49}. Best is trial 11 with value: 0.22327951601326101.[0m


MCC: [0.21292046815748833, 0.22835902988732365, 0.2303978712458933, 0.2214406947623388]
ROC: [0.8572787938704893, 0.8743113355570103, 0.8611687812379855, 0.8664134089260993]
PR_AUC: [0.09711833760570868, 0.12358215939417931, 0.11995706952624656, 0.17900532202775044]


[32m[I 2021-12-10 12:50:43,066][0m Trial 12 finished with value: 0.22384383737786545 and parameters: {'n_estimators': 10, 'subsample': 0.9983488269913793, 'max_depth': 18, 'max_features': 12, 'learning_rate': 0.19834120666704083, 'min_samples_leaf': 48}. Best is trial 12 with value: 0.22384383737786545.[0m


MCC: [0.20277447771079213, 0.24594635586268127, 0.22354884203122555, 0.22310567390676295]
ROC: [0.8513078760916131, 0.867570684871723, 0.8551783617527562, 0.8617647422374579]
PR_AUC: [0.10554723423194967, 0.1144697095142222, 0.11066174994983223, 0.1545754185709808]


[32m[I 2021-12-10 12:50:43,698][0m Trial 13 finished with value: 0.225731740091721 and parameters: {'n_estimators': 16, 'subsample': 0.9948280636032704, 'max_depth': 20, 'max_features': 11, 'learning_rate': 0.19897381889877022, 'min_samples_leaf': 49}. Best is trial 13 with value: 0.225731740091721.[0m


MCC: [0.21611137973253378, 0.23622575319930633, 0.23250454628253067, 0.21808528115251324]
ROC: [0.8635298236941835, 0.876886228789343, 0.8667248563442335, 0.8709044762344255]
PR_AUC: [0.10752241342016845, 0.1299348575831288, 0.11793561443272887, 0.1717967140418523]


[32m[I 2021-12-10 12:50:44,251][0m Trial 14 finished with value: 0.22532486620655878 and parameters: {'n_estimators': 10, 'subsample': 0.9909383110845321, 'max_depth': 20, 'max_features': 10, 'learning_rate': 0.1955419741845684, 'min_samples_leaf': 36}. Best is trial 13 with value: 0.225731740091721.[0m


MCC: [0.21490406917287028, 0.2318494608762947, 0.22849539122247367, 0.22605054355459653]
ROC: [0.8571891992090954, 0.8710984340742127, 0.8711732493739415, 0.8733786834992419]
PR_AUC: [0.10871267923168554, 0.11630932870440017, 0.138209416699346, 0.1397052287995463]


[32m[I 2021-12-10 12:50:45,761][0m Trial 15 finished with value: 0.21780806150675702 and parameters: {'n_estimators': 61, 'subsample': 0.9698387144179637, 'max_depth': 20, 'max_features': 10, 'learning_rate': 0.1808280525874309, 'min_samples_leaf': 36}. Best is trial 13 with value: 0.225731740091721.[0m


MCC: [0.22068304219785695, 0.21390670632310294, 0.21369665012943206, 0.22294584737663609]
ROC: [0.8621601581809194, 0.8632210064736017, 0.8612654176668018, 0.8638052936910805]
PR_AUC: [0.13039351797516613, 0.1076762015127566, 0.12846536402531686, 0.12504693650962445]


[32m[I 2021-12-10 12:50:46,686][0m Trial 16 finished with value: 0.22068304266487518 and parameters: {'n_estimators': 32, 'subsample': 0.9749728671023473, 'max_depth': 20, 'max_features': 11, 'learning_rate': 0.18294250145885432, 'min_samples_leaf': 37}. Best is trial 13 with value: 0.225731740091721.[0m


MCC: [0.2087808264187041, 0.22912405630477975, 0.22158266690471864, 0.2232446210312983]
ROC: [0.8558885318833416, 0.8654800128848571, 0.8652015337136445, 0.8570460313797877]
PR_AUC: [0.12150213196976363, 0.11996348901643925, 0.13752552868295184, 0.13181491361404696]


[32m[I 2021-12-10 12:50:48,958][0m Trial 17 finished with value: 0.21064417254698573 and parameters: {'n_estimators': 66, 'subsample': 0.9313746319603654, 'max_depth': 15, 'max_features': 11, 'learning_rate': 0.17351870331919775, 'min_samples_leaf': 13}. Best is trial 13 with value: 0.225731740091721.[0m


MCC: [0.2058880767946249, 0.2121184686216313, 0.2142475744367116, 0.21032257033497506]
ROC: [0.8555466304168726, 0.8589773164167629, 0.8587632615314276, 0.8582264816401872]
PR_AUC: [0.1081120732368015, 0.10207708195996129, 0.12063190775352528, 0.11048459782838878]


[32m[I 2021-12-10 12:50:51,796][0m Trial 18 finished with value: 0.20722823957006892 and parameters: {'n_estimators': 128, 'subsample': 0.9794720361077872, 'max_depth': 17, 'max_features': 12, 'learning_rate': 0.19856569620442835, 'min_samples_leaf': 32}. Best is trial 13 with value: 0.225731740091721.[0m


MCC: [0.2006484447446409, 0.2116017733664335, 0.2089709265562988, 0.2076918136129025]
ROC: [0.8546300873290492, 0.859557134989661, 0.8533349958955495, 0.8514713395741315]
PR_AUC: [0.1123663994815816, 0.11345341670014009, 0.1259148828777908, 0.11086572238153071]


[32m[I 2021-12-10 12:50:52,591][0m Trial 19 finished with value: 0.22632206881548675 and parameters: {'n_estimators': 25, 'subsample': 0.9168014347216491, 'max_depth': 15, 'max_features': 10, 'learning_rate': 0.17100147187916845, 'min_samples_leaf': 43}. Best is trial 19 with value: 0.22632206881548675.[0m


MCC: [0.21368349118922877, 0.23097831904759622, 0.22578221422088737, 0.23484425080423468]
ROC: [0.8662815126050419, 0.8705071853860781, 0.8698068310525059, 0.866358815676709]
PR_AUC: [0.1116052642265223, 0.1348589910028646, 0.11680522055751459, 0.1583395535904552]


[32m[I 2021-12-10 12:50:53,532][0m Trial 20 finished with value: 0.22208081631477258 and parameters: {'n_estimators': 36, 'subsample': 0.9040402067499202, 'max_depth': 15, 'max_features': 12, 'learning_rate': 0.1688983124163948, 'min_samples_leaf': 43}. Best is trial 19 with value: 0.22632206881548675.[0m


MCC: [0.209699891874652, 0.21771289380415162, 0.22807151316651253, 0.23283896641377422]
ROC: [0.8648356401384083, 0.8616623543959183, 0.8657751176782319, 0.8730161019183862]
PR_AUC: [0.12797868301370036, 0.11924122058274998, 0.11606839804362268, 0.14769022550446512]


[32m[I 2021-12-10 12:50:54,074][0m Trial 21 finished with value: 0.2208138261197718 and parameters: {'n_estimators': 10, 'subsample': 0.9484030905393109, 'max_depth': 20, 'max_features': 10, 'learning_rate': 0.18692986066441591, 'min_samples_leaf': 50}. Best is trial 19 with value: 0.22632206881548675.[0m


MCC: [0.20277447771079213, 0.23333318284079657, 0.2230789057869542, 0.22406873814054423]
ROC: [0.8343837535014006, 0.8698120265594314, 0.852879869488866, 0.8571593381238052]
PR_AUC: [0.08300119765528124, 0.13977592843617206, 0.14753246495699252, 0.15058654366073906]


[32m[I 2021-12-10 12:50:54,894][0m Trial 22 finished with value: 0.221827706044224 and parameters: {'n_estimators': 28, 'subsample': 0.8537401607514483, 'max_depth': 19, 'max_features': 11, 'learning_rate': 0.1486057373358557, 'min_samples_leaf': 40}. Best is trial 19 with value: 0.22632206881548675.[0m


MCC: [0.22184421168841112, 0.22319956497422325, 0.21863835926497585, 0.2236286882492857]
ROC: [0.8671877574559236, 0.8688425449671124, 0.8647505637125015, 0.8657768310369833]
PR_AUC: [0.11476464618042123, 0.12240955499814002, 0.12827201136204006, 0.1507440113341566]


[32m[I 2021-12-10 12:50:56,526][0m Trial 23 finished with value: 0.21175659392136953 and parameters: {'n_estimators': 76, 'subsample': 0.9258785527064558, 'max_depth': 17, 'max_features': 10, 'learning_rate': 0.18686415613602847, 'min_samples_leaf': 44}. Best is trial 19 with value: 0.22632206881548675.[0m


MCC: [0.20662854381538343, 0.2240809608467469, 0.19691899889052125, 0.21939787213282658]
ROC: [0.8546383259186027, 0.8642725770753452, 0.8493697850099236, 0.8652535598918848]
PR_AUC: [0.11870745940563113, 0.12340919368030653, 0.11725739388507075, 0.144128396296758]


[32m[I 2021-12-10 12:50:57,610][0m Trial 24 finished with value: 0.217316634191831 and parameters: {'n_estimators': 36, 'subsample': 0.9820726610349131, 'max_depth': 14, 'max_features': 11, 'learning_rate': 0.17154941448458988, 'min_samples_leaf': 32}. Best is trial 19 with value: 0.22632206881548675.[0m


MCC: [0.21674423492223502, 0.222289041869779, 0.2106768847551386, 0.21955637522017143]
ROC: [0.8607184050090625, 0.8622026871161819, 0.8596839053586459, 0.8635477783637682]
PR_AUC: [0.12364355478229892, 0.1144142606527407, 0.12694710534213796, 0.12854678719091228]
CPU times: user 39.7 s, sys: 617 ms, total: 40.3 s
Wall time: 40.2 s


In [21]:
%%time
def create_model(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    subsample = trial.suggest_float('subsample', 0.8, 1.0)
    max_depth = trial.suggest_int('max_depth',3,20)
    max_features = trial.suggest_int('max_features',10,20)
    learning_rate = trial.suggest_uniform('learning_rate',0.08,0.2)
    min_samples_leaf = trial.suggest_int('min_samples_leaf',1,50)
    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
                           max_depth=max_depth, learning_rate=learning_rate, 
                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
#                           learning_rate=learning_rate, 
#                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate) 
    return model

def objective(trial):
    model = create_model(trial)
    
    mcc_list = []
    roc_list = []
    pr_list = []
    f2_list = []
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=11)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train_c, X_test_c = X.iloc[train_idx], X.iloc[test_idx]
        y_train_c, y_test_c = y[train_idx], y[test_idx]
#        sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
#        X_res, y_res = sampler.fit_resample(X_train_c, y_train_c)
#        model.fit(X_res, y_res)

        weights = class_weight.compute_sample_weight('balanced', y=y_train_c)
        model.fit(X_train_c, y_train_c, sample_weight=weights)
        y_pred = model.predict(X_test_c)
        probs = model.predict_proba(X_test_c)
        prob1 = probs[:, 1]  # Only positives
        precision, recall, pr_thresh = precision_recall_curve(y_test_c, prob1)
        roc = roc_auc_score(y_test_c, prob1)
        roc_list.append(roc)
        mcc = matthews_corrcoef(y_test_c, y_pred)  # MWB
        mcc_list.append(mcc)
        f2 = fbeta_score(y_test_c, y_pred, beta=2.0, average=None)[1]
        pr_auc = auc(recall, precision)
        pr_list.append(pr_auc)
        avg_precision = average_precision_score(y_test_c, prob1)
#        print(f'ROC: {roc}')
#        print(f'PR_AUC: {pr_auc}')
#        print(f'avg_prec: {avg_precision}')
        
    print(f'MCC: {mcc_list}')
    print(f'ROC: {roc_list}')
    print(f'PR_AUC: {pr_list}')
    return np.mean(mcc_list)

sampler = TPESampler(seed=7)
study = optuna.create_study(sampler=sampler,direction='maximize')
#study.optimize(objective,n_trials=60)
study.optimize(objective,n_trials=25)
#study.optimize(objective,n_trials=5)

[32m[I 2021-12-10 14:05:55,147][0m A new study created in memory with name: no-name-7baee9f3-41f1-4913-b5d8-ff14fa2528e0[0m
[32m[I 2021-12-10 14:06:07,610][0m Trial 0 finished with value: 0.1977736653386975 and parameters: {'n_estimators': 24, 'subsample': 0.955983758448023, 'max_depth': 10, 'max_features': 17, 'learning_rate': 0.1973587414395923, 'min_samples_leaf': 27}. Best is trial 0 with value: 0.1977736653386975.[0m


MCC: [0.19323489136313596, 0.18292581180981932, 0.20243641044702243, 0.2124975477348123]
ROC: [0.8523685944966222, 0.8658707150056631, 0.8618078285898355, 0.86058223185444]
PR_AUC: [0.1654322753496176, 0.16237035467999034, 0.15750053696079433, 0.17279273479213383]


[32m[I 2021-12-10 14:06:42,680][0m Trial 1 finished with value: 0.18716216826304696 and parameters: {'n_estimators': 105, 'subsample': 0.8144102266719524, 'max_depth': 7, 'max_features': 15, 'learning_rate': 0.16150759953451288, 'min_samples_leaf': 41}. Best is trial 0 with value: 0.1977736653386975.[0m


MCC: [0.17978494040785276, 0.17387030014318158, 0.19111898086935966, 0.20387445163179382]
ROC: [0.8465294941506014, 0.8258590770701497, 0.8535843802279789, 0.8568070571560419]
PR_AUC: [0.11961352370497694, 0.1354891661743422, 0.13546161461902112, 0.1437961222152409]


[32m[I 2021-12-10 14:07:13,537][0m Trial 2 finished with value: 0.18843169829788836 and parameters: {'n_estimators': 82, 'subsample': 0.813187269381181, 'max_depth': 8, 'max_features': 20, 'learning_rate': 0.10560624242958987, 'min_samples_leaf': 23}. Best is trial 0 with value: 0.1977736653386975.[0m


MCC: [0.16684847224284488, 0.2063969527370979, 0.16187763788316556, 0.21860373032844516]
ROC: [0.8611880046136102, 0.8420025561894073, 0.8625539033843531, 0.8647982727931967]
PR_AUC: [0.13021931256052305, 0.16477642207417403, 0.1234662435560254, 0.13757318605886434]


[32m[I 2021-12-10 14:08:56,104][0m Trial 3 finished with value: 0.14168666802394975 and parameters: {'n_estimators': 187, 'subsample': 0.8049798455100696, 'max_depth': 13, 'max_features': 20, 'learning_rate': 0.10763634548251577, 'min_samples_leaf': 28}. Best is trial 0 with value: 0.1977736653386975.[0m


MCC: [0.13417713210477838, 0.18453372666982998, 0.12815990297509883, 0.1198759103460918]
ROC: [0.8425152413906739, 0.848974926483577, 0.8634163575340046, 0.8673651855758455]
PR_AUC: [0.1407834163413837, 0.13402826554418304, 0.14693895168767449, 0.13916310241015442]


[32m[I 2021-12-10 14:10:37,258][0m Trial 4 finished with value: 0.10485167991258405 and parameters: {'n_estimators': 183, 'subsample': 0.8266338891518501, 'max_depth': 12, 'max_features': 18, 'learning_rate': 0.16028158890606967, 'min_samples_leaf': 24}. Best is trial 0 with value: 0.1977736653386975.[0m


MCC: [0.07980928003163139, 0.19416030112368277, 0.06527245397447189, 0.08016468452055014]
ROC: [0.8334589718240237, 0.8258590770701499, 0.8262227625549426, 0.8382947953062166]
PR_AUC: [0.12821803630123063, 0.15539732965765327, 0.1377353779719155, 0.14153259405692584]


[32m[I 2021-12-10 14:10:58,681][0m Trial 5 finished with value: 0.20015290830966848 and parameters: {'n_estimators': 49, 'subsample': 0.8981531778182141, 'max_depth': 9, 'max_features': 15, 'learning_rate': 0.12390684629367116, 'min_samples_leaf': 42}. Best is trial 5 with value: 0.20015290830966848.[0m


MCC: [0.18492746845787258, 0.21716917541940484, 0.18357115768435928, 0.21494383167703726]
ROC: [0.8651590047783819, 0.8724606959901077, 0.8671155584650394, 0.8725196123673281]
PR_AUC: [0.14591804632955754, 0.15296491112127558, 0.17452871161800954, 0.19329147339923747]


[32m[I 2021-12-10 14:12:32,170][0m Trial 6 finished with value: 0.12115950372920217 and parameters: {'n_estimators': 156, 'subsample': 0.8627989354425325, 'max_depth': 13, 'max_features': 13, 'learning_rate': 0.13434115190556806, 'min_samples_leaf': 18}. Best is trial 5 with value: 0.20015290830966848.[0m


MCC: [0.11453103740919258, 0.1535924107539169, 0.12291950470221816, 0.09359506205148108]
ROC: [0.8463461855330366, 0.8517638746012448, 0.8541039309205399, 0.8422213890170744]
PR_AUC: [0.13797839369596124, 0.15641865786929002, 0.14340884941022947, 0.15191841172960388]


[32m[I 2021-12-10 14:13:42,819][0m Trial 7 finished with value: 0.17657597662746582 and parameters: {'n_estimators': 135, 'subsample': 0.8740702165976071, 'max_depth': 11, 'max_features': 17, 'learning_rate': 0.12955901949366017, 'min_samples_leaf': 46}. Best is trial 5 with value: 0.20015290830966848.[0m


MCC: [0.1787947796351122, 0.17074251631568615, 0.16964363377872077, 0.18712297678034412]
ROC: [0.8434832756632066, 0.84236416347143, 0.8536695865415588, 0.8689988628123146]
PR_AUC: [0.1461707998948178, 0.14798967168381405, 0.1409986511573209, 0.14861271740219653]


[32m[I 2021-12-10 14:14:05,703][0m Trial 8 finished with value: 0.17714112904732746 and parameters: {'n_estimators': 44, 'subsample': 0.9482237745826528, 'max_depth': 10, 'max_features': 14, 'learning_rate': 0.15612558423606065, 'min_samples_leaf': 27}. Best is trial 5 with value: 0.20015290830966848.[0m


MCC: [0.16902383360413745, 0.1559625734730109, 0.21193543955088626, 0.17164266956127527]
ROC: [0.8478208930631076, 0.8516849028959756, 0.8706609723910762, 0.8511499604456457]
PR_AUC: [0.14129766551670517, 0.12740263585517517, 0.1493372647792779, 0.1524182428879656]


[32m[I 2021-12-10 14:14:23,286][0m Trial 9 finished with value: 0.21705294737727426 and parameters: {'n_estimators': 89, 'subsample': 0.8002853761125517, 'max_depth': 4, 'max_features': 17, 'learning_rate': 0.14292147161182367, 'min_samples_leaf': 35}. Best is trial 9 with value: 0.21705294737727426.[0m


MCC: [0.20465937726602002, 0.20201735058835246, 0.23322225113341366, 0.22831281052131092]
ROC: [0.8659210743120778, 0.8623793343516526, 0.8678990409094215, 0.8745302920429824]
PR_AUC: [0.15951450309825183, 0.1480575245293238, 0.1346353696619883, 0.15518761182974605]


[32m[I 2021-12-10 14:14:38,187][0m Trial 10 finished with value: 0.2354243732956553 and parameters: {'n_estimators': 82, 'subsample': 0.9869602664638994, 'max_depth': 3, 'max_features': 10, 'learning_rate': 0.1867212049748521, 'min_samples_leaf': 4}. Best is trial 10 with value: 0.2354243732956553.[0m


MCC: [0.21854050420827267, 0.24510405811815061, 0.24562686279939505, 0.23242606805680288]
ROC: [0.8732122260668973, 0.8779118218564583, 0.8692955931710257, 0.8725989270881402]
PR_AUC: [0.14430235109434317, 0.17117955931527917, 0.15731372246798964, 0.1716694606014502]


[32m[I 2021-12-10 14:14:53,307][0m Trial 11 finished with value: 0.23297222350847158 and parameters: {'n_estimators': 83, 'subsample': 0.9963673775870002, 'max_depth': 3, 'max_features': 10, 'learning_rate': 0.19868240678542037, 'min_samples_leaf': 5}. Best is trial 10 with value: 0.2354243732956553.[0m


MCC: [0.221193667671841, 0.23528909281537633, 0.23094414785419612, 0.24446198569247277]
ROC: [0.8686913000494316, 0.8716720180388, 0.8742593804877541, 0.8790223894126179]
PR_AUC: [0.1664702213101995, 0.1618534153076196, 0.14970690803827466, 0.18313820724758614]


[32m[I 2021-12-10 14:16:11,787][0m Trial 12 finished with value: 0.06521555828648873 and parameters: {'n_estimators': 70, 'subsample': 0.9983488269913793, 'max_depth': 18, 'max_features': 10, 'learning_rate': 0.19834120666704083, 'min_samples_leaf': 1}. Best is trial 10 with value: 0.2354243732956553.[0m


MCC: [0.019303709719963494, 0.10001426767264698, 0.07503326267475662, 0.06651099307858784]
ROC: [0.7826454111056187, 0.8115319471720857, 0.8211228529567629, 0.8233218241149713]
PR_AUC: [0.08574209367294149, 0.10310750007325849, 0.12865143546959423, 0.09637115178063828]


[32m[I 2021-12-10 14:16:34,684][0m Trial 13 finished with value: 0.2222719933503033 and parameters: {'n_estimators': 127, 'subsample': 0.9948046646589249, 'max_depth': 3, 'max_features': 10, 'learning_rate': 0.18087203032951035, 'min_samples_leaf': 2}. Best is trial 10 with value: 0.2354243732956553.[0m


MCC: [0.19771717249760565, 0.24626412049187946, 0.20844079787950728, 0.2366658825322207]
ROC: [0.8659622672598452, 0.8656899113646519, 0.8605172646695136, 0.8727338651196519]
PR_AUC: [0.15431277530554197, 0.17103733378230568, 0.14544104677901498, 0.18085807274280719]


[32m[I 2021-12-10 14:16:38,161][0m Trial 14 finished with value: 0.22287037112814248 and parameters: {'n_estimators': 12, 'subsample': 0.9605815973767309, 'max_depth': 5, 'max_features': 12, 'learning_rate': 0.18272801791765883, 'min_samples_leaf': 10}. Best is trial 10 with value: 0.2354243732956553.[0m


MCC: [0.19447240731381768, 0.2277474619038457, 0.24491613465999462, 0.22434548063491191]
ROC: [0.8453143021914648, 0.8678075999875308, 0.8710423225994159, 0.8606965686597666]
PR_AUC: [0.11092359592904982, 0.16206811950293473, 0.1603094947944962, 0.15846233669784016]


[32m[I 2021-12-10 14:16:58,273][0m Trial 15 finished with value: 0.20505119493486293 and parameters: {'n_estimators': 61, 'subsample': 0.9307322809405643, 'max_depth': 6, 'max_features': 11, 'learning_rate': 0.08133799186397042, 'min_samples_leaf': 10}. Best is trial 10 with value: 0.2354243732956553.[0m


MCC: [0.1753273882901828, 0.21449840639000725, 0.22132439024337355, 0.2090545948158881]
ROC: [0.8504304663041686, 0.8771283394120762, 0.8633145255982627, 0.8720931669852989]
PR_AUC: [0.12338438417199732, 0.18640273814498654, 0.13894841887467585, 0.14302995400663207]


[32m[I 2021-12-10 14:18:30,777][0m Trial 16 finished with value: 0.10114347160369416 and parameters: {'n_estimators': 109, 'subsample': 0.9860295170328774, 'max_depth': 16, 'max_features': 12, 'learning_rate': 0.17457821288481112, 'min_samples_leaf': 10}. Best is trial 10 with value: 0.2354243732956553.[0m


MCC: [0.17029530538208276, 0.09525212400519409, 0.06636527393169263, 0.07266118309580723]
ROC: [0.8034045971329707, 0.823741388447271, 0.8183297484335547, 0.8282331564374712]
PR_AUC: [0.14639084318702283, 0.14366713415055815, 0.1450685126371553, 0.13372695817972235]


[32m[I 2021-12-10 14:18:47,130][0m Trial 17 finished with value: 0.2196339815832044 and parameters: {'n_estimators': 100, 'subsample': 0.924580558278454, 'max_depth': 3, 'max_features': 11, 'learning_rate': 0.18645722559130903, 'min_samples_leaf': 16}. Best is trial 10 with value: 0.2354243732956553.[0m


MCC: [0.19671404272664195, 0.2404658600719602, 0.20148912257752766, 0.2398669009566878]
ROC: [0.862908840006591, 0.8701310306846639, 0.871772810873157, 0.8824020205682642]
PR_AUC: [0.1433683034412762, 0.17333926455033258, 0.1448188081797124, 0.16346230964453712]


[32m[I 2021-12-10 14:19:30,478][0m Trial 18 finished with value: 0.1286810235302696 and parameters: {'n_estimators': 129, 'subsample': 0.9726407465877877, 'max_depth': 6, 'max_features': 10, 'learning_rate': 0.16701608308923357, 'min_samples_leaf': 5}. Best is trial 10 with value: 0.2354243732956553.[0m


MCC: [0.14153716673656505, 0.1470971690735733, 0.13698759691055548, 0.0891021614003846]
ROC: [0.7948920744768495, 0.8436755094194541, 0.7984039402724524, 0.8223700474652251]
PR_AUC: [0.12682723773320423, 0.13881732422508222, 0.11675642539351576, 0.11021147328333608]


[32m[I 2021-12-10 14:21:26,506][0m Trial 19 finished with value: 0.09737630388626768 and parameters: {'n_estimators': 156, 'subsample': 0.9231395341936665, 'max_depth': 15, 'max_features': 13, 'learning_rate': 0.14905710641940623, 'min_samples_leaf': 16}. Best is trial 10 with value: 0.2354243732956553.[0m


MCC: [0.05662892362256908, 0.13712169912228192, 0.09813005627254057, 0.09762453652767918]
ROC: [0.8300770308123249, 0.8396167794091669, 0.8284651433440361, 0.8121312380512888]
PR_AUC: [0.12148046463723382, 0.15184323551450665, 0.16071358661561172, 0.14272947130078978]


[32m[I 2021-12-10 14:21:37,716][0m Trial 20 finished with value: 0.205995382574193 and parameters: {'n_estimators': 40, 'subsample': 0.9741032583511194, 'max_depth': 5, 'max_features': 12, 'learning_rate': 0.19203351736058122, 'min_samples_leaf': 6}. Best is trial 10 with value: 0.2354243732956553.[0m


MCC: [0.1829669053544159, 0.2260784993880051, 0.22111155004762773, 0.1938245755067233]
ROC: [0.8595598533531059, 0.8611781331504514, 0.8697029209139936, 0.8624167710462127]
PR_AUC: [0.12365147898036383, 0.13514473727771387, 0.14668483940692859, 0.13935009652523433]


[32m[I 2021-12-10 14:21:39,659][0m Trial 21 finished with value: 0.22588114948352556 and parameters: {'n_estimators': 10, 'subsample': 0.9610881697194754, 'max_depth': 3, 'max_features': 11, 'learning_rate': 0.18035794700248242, 'min_samples_leaf': 10}. Best is trial 10 with value: 0.2354243732956553.[0m


MCC: [0.20878714385094538, 0.23658747371819971, 0.22616760117212617, 0.23198237919283096]
ROC: [0.8591530729939034, 0.8737533381131998, 0.8764311023826594, 0.8666616537016282]
PR_AUC: [0.1260894280983796, 0.16266265590878382, 0.17173051510572632, 0.17996625119599902]


[32m[I 2021-12-10 14:21:50,758][0m Trial 22 finished with value: 0.2296646699404017 and parameters: {'n_estimators': 66, 'subsample': 0.9776866512018546, 'max_depth': 3, 'max_features': 11, 'learning_rate': 0.17170606732696952, 'min_samples_leaf': 7}. Best is trial 10 with value: 0.2354243732956553.[0m


MCC: [0.20784069004336825, 0.24529692009604734, 0.22821313457567405, 0.2373079350465172]
ROC: [0.8689034437304333, 0.8818042956451261, 0.8693693693693694, 0.8796363059529304]
PR_AUC: [0.1499140677807782, 0.16275702795109595, 0.14697852255266017, 0.16607390427106491]


[32m[I 2021-12-10 14:23:00,873][0m Trial 23 finished with value: 0.1047327349929128 and parameters: {'n_estimators': 69, 'subsample': 0.9820369123337096, 'max_depth': 20, 'max_features': 10, 'learning_rate': 0.1712498753449273, 'min_samples_leaf': 6}. Best is trial 10 with value: 0.2354243732956553.[0m


MCC: [0.042203694300907896, 0.19923187646684823, 0.10001426767264698, 0.07748110153124811]
ROC: [0.834223101005108, 0.8464457537121897, 0.856256948990513, 0.8421863669325599]
PR_AUC: [0.12427161763409422, 0.1453184577693277, 0.13406121448469097, 0.11755830202451656]


[32m[I 2021-12-10 14:23:25,909][0m Trial 24 finished with value: 0.1855152098129284 and parameters: {'n_estimators': 85, 'subsample': 0.9423994435571657, 'max_depth': 5, 'max_features': 11, 'learning_rate': 0.19005020005150144, 'min_samples_leaf': 13}. Best is trial 10 with value: 0.2354243732956553.[0m


MCC: [0.18258113062463469, 0.19145526118489997, 0.17623712071390477, 0.1917873267282743]
ROC: [0.8397367770637666, 0.8518948013757702, 0.8414268940220498, 0.8414406025446635]
PR_AUC: [0.12442800221137888, 0.1223127834575828, 0.12124286029086931, 0.11367359262472913]
CPU times: user 17min 30s, sys: 927 ms, total: 17min 30s
Wall time: 17min 30s


In [23]:
best = study.best_trial
best

FrozenTrial(number=10, values=[0.2354243732956553], datetime_start=datetime.datetime(2021, 12, 10, 14, 14, 23, 289085), datetime_complete=datetime.datetime(2021, 12, 10, 14, 14, 38, 186807), params={'n_estimators': 82, 'subsample': 0.9869602664638994, 'max_depth': 3, 'max_features': 10, 'learning_rate': 0.1867212049748521, 'min_samples_leaf': 4}, distributions={'n_estimators': IntUniformDistribution(high=200, low=10, step=1), 'subsample': UniformDistribution(high=1.0, low=0.8), 'max_depth': IntUniformDistribution(high=20, low=3, step=1), 'max_features': IntUniformDistribution(high=20, low=10, step=1), 'learning_rate': UniformDistribution(high=0.2, low=0.08), 'min_samples_leaf': IntUniformDistribution(high=50, low=1, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=10, state=TrialState.COMPLETE, value=None)

In [24]:
%%time
gb_params = study.best_params
#gb_params['random_state'] = 42
gb = GradientBoostingClassifier(**gb_params)
gb.fit(X_rus, y_rus)
y_pred = gb.predict(X_test)
probs = gb.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[4073 1129]
 [  23  111]]
              precision    recall  f1-score   support

           0       0.99      0.78      0.88      5202
           1       0.09      0.83      0.16       134

    accuracy                           0.78      5336
   macro avg       0.54      0.81      0.52      5336
weighted avg       0.97      0.78      0.86      5336

ROC AUC prob1: 0.8701790930009698
MCC: 0.22647477418563908
F2: 0.31250000000000006
PR_AUC: 0.1603509845415154
CPU times: user 136 ms, sys: 5.02 ms, total: 141 ms
Wall time: 138 ms


In [25]:
%%time
gb_params = study.best_params
gb_params['random_state'] = 42
gb_weight = GradientBoostingClassifier(**gb_params)
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[4192 1010]
 [  22  112]]
              precision    recall  f1-score   support

           0       0.99      0.81      0.89      5202
           1       0.10      0.84      0.18       134

    accuracy                           0.81      5336
   macro avg       0.55      0.82      0.53      5336
weighted avg       0.97      0.81      0.87      5336

ROC AUC prob1: 0.8770134334096529
MCC: 0.2463780966848265
F2: 0.3377563329312425
PR_AUC: 0.143792394503228
CPU times: user 474 ms, sys: 5.03 ms, total: 479 ms
Wall time: 476 ms


In [26]:
# No Sitenum - 5
coeffs_wgt = pd.Series(data=gb_weight.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs_wgt[0:20]

Delfetalpos     0.208864
Presentdel      0.081449
Delmode         0.060527
Anteanemia      0.058732
MthInd_Miso     0.048175
CS_FTP          0.043437
transfus_yes    0.043320
Lac_Min         0.031142
Admcontract     0.026786
Intrachorio     0.020832
Admcervpos      0.019700
Lac_None        0.019461
AdmDBP          0.018383
Augment         0.017812
Education       0.016026
BESTGA          0.015832
Dilat_lst       0.015541
Analgesia       0.014529
AdmSBP          0.014098
AdmBishop       0.013313
dtype: float64

In [1]:
plt.rcParams["figure.figsize"] = (14, 8)
x_labels = coeffs_wgt.index.values[0:25]
print(x_labels)
fig, ax = plt.subplots(1,1)  # Create a figure and an axes.
ax.set_title('Top Predictors: All Sites combined; target: High_EBLoss, vars: PI')
ax.bar(x_labels, coeffs_wgt.values[0:25])
ax.set_ylabel('Coeff')
ax.set_xlabel('Variable')
plt.draw()
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=16)
#plt.xlim(1,20)
plt.show()

NameError: name 'plt' is not defined

In [28]:
# No Sitenum - 5
coeffs = pd.Series(data=gb.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs[0:20]

Delmode         0.130822
Delfetalpos     0.059513
transfus_yes    0.052151
Anteanemia      0.048770
Presentdel      0.046679
Lac_None        0.039831
new_BMI         0.036245
Admefface       0.035901
BESTGA          0.033457
Analgesia       0.030581
Hxcsection      0.024901
new_age         0.024420
Dilat_lst       0.022652
AdmSBP          0.022127
MthInd_Mec      0.021862
CS_FTP          0.021307
chorio          0.018980
MthInd_Miso     0.015475
HosEpitype      0.015274
AdmDBP          0.014916
dtype: float64

In [20]:
gb_params # 5 runs

{'n_estimators': 82,
 'subsample': 0.813187269381181,
 'max_depth': 8,
 'max_features': 20,
 'learning_rate': 0.10560624242958987,
 'min_samples_leaf': 23,
 'random_state': 42}

In [2]:
from optuna.visualization import plot_optimization_history

plotly_config = {"staticPlot": True}

fig = plot_optimization_history(study)
fig.show(config=plotly_config)

NameError: name 'study' is not defined

In [None]:
from optuna.visualization import plot_param_importances

fig = plot_param_importances(study)
fig.show(config=plotly_config)

In [23]:
np.mean([0.3149689279176914, 0.31449457458602254, 0.31066798538596907, 0.3219519143488398])

0.3155208505596307

<h1 align='center'>Thanks and make sure to learn!</h1>