In [1]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
#import plotly.express as px
import seaborn as sns

from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import StratifiedKFold
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, \
                            accuracy_score, f1_score, average_precision_score, \
                            fbeta_score, precision_recall_curve, auc
from sklearn.metrics import matthews_corrcoef

from sklearn.model_selection import train_test_split

import optuna
from optuna.samplers import TPESampler
sns.set(style='white', context='notebook', palette='deep')
%config Completer.use_jedi = False

In [2]:
csl_df = pd.read_csv('../../data/csl/Sites/CSL_he_PI_s44.csv', index_col=0)
X = csl_df.drop(['high_EBL','Sitenum'], axis=1, inplace=False)
y = csl_df['high_EBL'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=7)
X_train                                                    

Unnamed: 0_level_0,transfus_yes,Accrete,AdmBishop,Admcervpos,Admconsistency,Admcontract,AdmDBP,Admefface,Admpresent,AdmSBP,...,TD_nos,ThreatenedPB,threatpb9,UnspecHBP,uscar,version9,new_age,new_BMI,new_high_Age,new_high_BMI
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
44-10938,0,0,8,8,8,99.0,73,40.0,77,117.0,...,0,0,0,0,0,0,25,20,1,0
44-01887,0,0,8,8,8,99.0,78,0.0,77,154.0,...,0,0,0,0,0,0,34,24,1,1
44-13761,0,0,8,8,8,99.0,87,0.0,77,148.0,...,0,0,0,0,0,0,26,31,1,1
44-02977,0,0,8,8,8,0.0,74,50.0,77,124.4,...,0,0,0,0,0,0,27,40,1,1
44-10546,0,0,8,8,8,1.8,71,90.0,77,117.0,...,0,0,0,0,1,0,34,24,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44-19833,0,0,8,8,8,1.0,74,70.0,77,124.4,...,0,0,0,0,0,0,19,31,0,1
44-07454,0,0,8,8,8,99.0,98,20.0,77,158.0,...,0,0,0,0,0,0,36,24,1,1
44-04903,0,0,8,8,8,2.9,69,60.0,77,145.0,...,0,1,0,0,0,0,27,24,1,1
44-17473,0,0,8,8,8,99.0,54,80.0,77,138.0,...,0,0,0,0,0,0,20,24,1,1


In [3]:
X.shape

(16014, 195)

<h1 align='center'>Exploratory Data Analysis</h1>

<h1 align='center'>Modelling</h1>

In [8]:
print(X_train.shape)
print(X.shape)
print(y.shape)

(11209, 195)
(16014, 195)
(16014,)


Here we define our validation set

<h1 align='center'>Modelling Using Undersampling</h1>

We will use `imblearn`'s RandomUnderSampler to undersample from the majority class so that they match

In [9]:
sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
X_rus, y_rus = sampler.fit_resample(X_train, y_train)

# Try with no undersampling
#X_rus, y_rus = X_train, y_train

In [10]:
print(X_rus.shape)
print(y_rus.shape)
print(np.bincount(y_rus))

(1320, 195)
(1320,)
[660 660]


<h1 align='center'>Basic Gradient Boosting</h1>

In [11]:
%%time
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_rus = GradientBoostingClassifier()   # If not undersampling
gb_rus.fit(X_rus, y_rus)
print(gb_rus.get_params())
y_pred = gb_rus.predict(X_test)
probs = gb_rus.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[3069 1453]
 [  27  256]]
              precision    recall  f1-score   support

           0       0.99      0.68      0.81      4522
           1       0.15      0.90      0.26       283

    accuracy                           0.69      4805
   macro avg       0.57      0.79      0.53      4805
weighted avg       0.94      0.69      0.77      4805

ROC AUC prob1: 0.8360922572488172
MCC: 0.28685394522940977
F2: 0.4505455825413586
PR_AUC: 0.2806320711284951
CPU times: user 522 ms, sys: 6.69 ms, total

In [12]:
%%time
# Not undersampled
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_weight = GradientBoostingClassifier()  
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
print(gb_weight.get_params())
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[3141 1381]
 [  36  247]]
              precision    recall  f1-score   support

           0       0.99      0.69      0.82      4522
           1       0.15      0.87      0.26       283

    accuracy                           0.71      4805
   macro avg       0.57      0.78      0.54      4805
weighted avg       0.94      0.71      0.78      4805

ROC AUC prob1: 0.8379700029537573
MCC: 0.2822335975116122
F2: 0.447463768115942
PR_AUC: 0.25476426452093576
CPU times: user 3.68 s, sys: 5.22 ms, total:

<h1 align='center'>Hyperparameter Tuning with Cross Validation</h1>

In [13]:
%%time
def create_model(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    subsample = trial.suggest_float('subsample', 0.8, 1.0)
    max_depth = trial.suggest_int('max_depth',3,20)
    max_features = trial.suggest_int('max_features',10,20)
    learning_rate = trial.suggest_uniform('learning_rate',0.08,0.2)
    min_samples_leaf = trial.suggest_int('min_samples_leaf',1,50)
    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
                           max_depth=max_depth, learning_rate=learning_rate, 
                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
#                           learning_rate=learning_rate, 
#                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate) 
    return model

def objective(trial):
    model = create_model(trial)
    
    mcc_list = []
    roc_list = []
    pr_list = []
    f2_list = []
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=11)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train_c, X_test_c = X.iloc[train_idx], X.iloc[test_idx]
        y_train_c, y_test_c = y[train_idx], y[test_idx]
        sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
        X_res, y_res = sampler.fit_resample(X_train_c, y_train_c)

#        weights = class_weight.compute_sample_weight('balanced', y=y_train)
#        model.fit(X_train, y_train, sample_weight=weights)
        model.fit(X_res, y_res)
        y_pred = model.predict(X_test_c)
        probs = model.predict_proba(X_test_c)
        prob1 = probs[:, 1]  # Only positives
        precision, recall, pr_thresh = precision_recall_curve(y_test_c, prob1)
        roc = roc_auc_score(y_test_c, prob1)
        roc_list.append(roc)
        mcc = matthews_corrcoef(y_test_c, y_pred)  # MWB
        mcc_list.append(mcc)
        f2 = fbeta_score(y_test_c, y_pred, beta=2.0, average=None)[1]
        pr_auc = auc(recall, precision)
        pr_list.append(pr_auc)
        avg_precision = average_precision_score(y_test_c, prob1)
#        print(f'ROC: {roc}')
#        print(f'PR_AUC: {pr_auc}')
#        print(f'avg_prec: {avg_precision}')
        
    print(f'MCC: {mcc_list}')
    print(f'ROC: {roc_list}')
    print(f'PR_AUC: {pr_list}')
    return np.mean(mcc_list)

sampler = TPESampler(seed=7)
study = optuna.create_study(sampler=sampler,direction='maximize')
#study.optimize(objective,n_trials=60)
study.optimize(objective,n_trials=25)
#study.optimize(objective,n_trials=5)

[32m[I 2021-12-10 14:34:05,490][0m A new study created in memory with name: no-name-a8a26de2-2394-49fe-99e6-0f9cf989248e[0m
[32m[I 2021-12-10 14:34:06,908][0m Trial 0 finished with value: 0.2829188039431525 and parameters: {'n_estimators': 24, 'subsample': 0.955983758448023, 'max_depth': 10, 'max_features': 17, 'learning_rate': 0.1973587414395923, 'min_samples_leaf': 27}. Best is trial 0 with value: 0.2829188039431525.[0m


MCC: [0.283405297709138, 0.2954640751594321, 0.2644634897779625, 0.28834235312607726]
ROC: [0.8456032512864803, 0.8416133632012667, 0.8369999505068548, 0.8322796675249583]
PR_AUC: [0.25538554986480816, 0.23259926735107828, 0.2706580759149826, 0.23000282333646943]


[32m[I 2021-12-10 14:34:10,421][0m Trial 1 finished with value: 0.2802069120851233 and parameters: {'n_estimators': 105, 'subsample': 0.8144102266719524, 'max_depth': 7, 'max_features': 15, 'learning_rate': 0.16150759953451288, 'min_samples_leaf': 41}. Best is trial 0 with value: 0.2829188039431525.[0m


MCC: [0.27581341604885756, 0.2862248341445119, 0.269153359269899, 0.28963603887722494]
ROC: [0.8403842347691534, 0.8371792795710534, 0.8229776425965004, 0.8351741428377828]
PR_AUC: [0.245031251816038, 0.2297979488852732, 0.2166896578982507, 0.2337646712027493]


[32m[I 2021-12-10 14:34:13,641][0m Trial 2 finished with value: 0.28311784656096983 and parameters: {'n_estimators': 82, 'subsample': 0.813187269381181, 'max_depth': 8, 'max_features': 20, 'learning_rate': 0.10560624242958987, 'min_samples_leaf': 23}. Best is trial 2 with value: 0.28311784656096983.[0m


MCC: [0.2803716785726608, 0.2833708864290812, 0.2735319291876853, 0.2951968920544521]
ROC: [0.8448902893231135, 0.8412478858541149, 0.8327255425123621, 0.8415175949767356]
PR_AUC: [0.2795448755636807, 0.23908935885885116, 0.24920042648033527, 0.24895139591701682]


[32m[I 2021-12-10 14:34:21,511][0m Trial 3 finished with value: 0.2878301042453931 and parameters: {'n_estimators': 187, 'subsample': 0.8049798455100696, 'max_depth': 13, 'max_features': 20, 'learning_rate': 0.10763634548251577, 'min_samples_leaf': 28}. Best is trial 3 with value: 0.2878301042453931.[0m


MCC: [0.288603936965472, 0.2966813271474825, 0.27281225808865533, 0.29322289477996266]
ROC: [0.8396071737018246, 0.837939472453129, 0.8287154729070023, 0.8392273117405249]
PR_AUC: [0.22605454711101877, 0.23109538723933404, 0.22887059434334744, 0.23796823695724095]


[32m[I 2021-12-10 14:34:29,619][0m Trial 4 finished with value: 0.2750948127243314 and parameters: {'n_estimators': 183, 'subsample': 0.8266338891518501, 'max_depth': 12, 'max_features': 18, 'learning_rate': 0.16028158890606967, 'min_samples_leaf': 24}. Best is trial 3 with value: 0.2878301042453931.[0m


MCC: [0.2843607071890491, 0.27944622137714487, 0.2564573694324353, 0.28011495289869637]
ROC: [0.8354598492209148, 0.8369386267947748, 0.8233972094864862, 0.8338652482269503]
PR_AUC: [0.21919798001336133, 0.21712381351295548, 0.21347330542025103, 0.21476563733206389]


[32m[I 2021-12-10 14:34:31,710][0m Trial 5 finished with value: 0.28524226301745004 and parameters: {'n_estimators': 49, 'subsample': 0.8981531778182141, 'max_depth': 9, 'max_features': 15, 'learning_rate': 0.12390684629367116, 'min_samples_leaf': 42}. Best is trial 3 with value: 0.2878301042453931.[0m


MCC: [0.27878658046820753, 0.282699036114478, 0.28465930771811987, 0.2948241277689947]
ROC: [0.8417618032314945, 0.8400266292417863, 0.8353441798310934, 0.8488559877128788]
PR_AUC: [0.25760284535070943, 0.24277352123332002, 0.25358535757769707, 0.25909822118175135]


[32m[I 2021-12-10 14:34:39,896][0m Trial 6 finished with value: 0.2822454574576821 and parameters: {'n_estimators': 156, 'subsample': 0.8627989354425325, 'max_depth': 13, 'max_features': 13, 'learning_rate': 0.13434115190556806, 'min_samples_leaf': 18}. Best is trial 3 with value: 0.2878301042453931.[0m


MCC: [0.2845254029049371, 0.28774063178670667, 0.2678548703738489, 0.2888609247652358]
ROC: [0.8434508708481774, 0.8394598582172801, 0.8268932252882975, 0.8389969282197227]
PR_AUC: [0.23962865775034362, 0.2160670551539822, 0.21163420084227022, 0.21788445015813723]


[32m[I 2021-12-10 14:34:44,979][0m Trial 7 finished with value: 0.2846883992106069 and parameters: {'n_estimators': 135, 'subsample': 0.8740702165976071, 'max_depth': 11, 'max_features': 17, 'learning_rate': 0.12955901949366017, 'min_samples_leaf': 46}. Best is trial 3 with value: 0.2878301042453931.[0m


MCC: [0.29389661292772085, 0.2848173658037059, 0.2706657097712829, 0.2893739083397178]
ROC: [0.8464511587318724, 0.8337291734139407, 0.8282464128718173, 0.8405000677598591]
PR_AUC: [0.2605380851980186, 0.2155360736193632, 0.22781992403957543, 0.23825981123630502]


[32m[I 2021-12-10 14:34:47,479][0m Trial 8 finished with value: 0.27854467485466483 and parameters: {'n_estimators': 44, 'subsample': 0.9482237745826528, 'max_depth': 10, 'max_features': 14, 'learning_rate': 0.15612558423606065, 'min_samples_leaf': 27}. Best is trial 3 with value: 0.2878301042453931.[0m


MCC: [0.27716030503513295, 0.29414754828462686, 0.25349756459904166, 0.289373281499858]
ROC: [0.8448070729425312, 0.8482661754651121, 0.8292047801379512, 0.8368195780819443]
PR_AUC: [0.25613948503059336, 0.26251627361063595, 0.2574001522403092, 0.2459001986079441]


[32m[I 2021-12-10 14:34:49,587][0m Trial 9 finished with value: 0.29408360597713357 and parameters: {'n_estimators': 89, 'subsample': 0.8002853761125517, 'max_depth': 4, 'max_features': 17, 'learning_rate': 0.14292147161182367, 'min_samples_leaf': 35}. Best is trial 9 with value: 0.29408360597713357.[0m


MCC: [0.2930133667606989, 0.29459917428364724, 0.2905173841814652, 0.2982044986827229]
ROC: [0.8476532980675808, 0.8472664543524416, 0.840069650353426, 0.8485047657767539]
PR_AUC: [0.26908767741248746, 0.24742082466972984, 0.25836361452476836, 0.26199746675325886]


[32m[I 2021-12-10 14:34:51,481][0m Trial 10 finished with value: 0.28686043628172064 and parameters: {'n_estimators': 82, 'subsample': 0.9869602664638994, 'max_depth': 3, 'max_features': 10, 'learning_rate': 0.1867212049748521, 'min_samples_leaf': 4}. Best is trial 9 with value: 0.29408360597713357.[0m


MCC: [0.27736510857212077, 0.2955549456468948, 0.2760815229568722, 0.2984401679509946]
ROC: [0.848774470114074, 0.8565467676058873, 0.8330376867803809, 0.8556302796223517]
PR_AUC: [0.2666741190534415, 0.25376838760425713, 0.26126863954393753, 0.27230847215617965]


[32m[I 2021-12-10 14:34:58,550][0m Trial 11 finished with value: 0.2855365869660532 and parameters: {'n_estimators': 193, 'subsample': 0.800508843859655, 'max_depth': 18, 'max_features': 20, 'learning_rate': 0.0820677705499854, 'min_samples_leaf': 38}. Best is trial 9 with value: 0.29408360597713357.[0m


MCC: [0.2926333329080998, 0.2934933500268704, 0.2632271410009381, 0.2927925239283048]
ROC: [0.8443707492173161, 0.8357050001799273, 0.8295534818427648, 0.8415322762795319]
PR_AUC: [0.24088126023098236, 0.21731569339445828, 0.24134960174327974, 0.24209744914171633]


[32m[I 2021-12-10 14:35:04,641][0m Trial 12 finished with value: 0.28344723725693355 and parameters: {'n_estimators': 139, 'subsample': 0.8516235870960643, 'max_depth': 16, 'max_features': 18, 'learning_rate': 0.1054781979556547, 'min_samples_leaf': 34}. Best is trial 9 with value: 0.29408360597713357.[0m


MCC: [0.2994066970547032, 0.2809141485522983, 0.26560017226090105, 0.2878679311598316]
ROC: [0.8460631904710496, 0.834966173665839, 0.8246446617143526, 0.8465318245471384]
PR_AUC: [0.244151092226955, 0.22678969031628968, 0.2176291464751723, 0.2421317426200019]


[32m[I 2021-12-10 14:35:06,852][0m Trial 13 finished with value: 0.2928327094054616 and parameters: {'n_estimators': 100, 'subsample': 0.8418193398113022, 'max_depth': 3, 'max_features': 19, 'learning_rate': 0.10684061959682244, 'min_samples_leaf': 14}. Best is trial 9 with value: 0.29408360597713357.[0m


MCC: [0.27752236427793997, 0.3005110419017383, 0.29333791666938447, 0.2999595147727835]
ROC: [0.8531140919068695, 0.8563617798409444, 0.8423592707410024, 0.8530582283055518]
PR_AUC: [0.2851777965032844, 0.2514277655477188, 0.2706496994101378, 0.2655847282330399]


[32m[I 2021-12-10 14:35:08,752][0m Trial 14 finished with value: 0.2909150878351678 and parameters: {'n_estimators': 91, 'subsample': 0.8403871437580014, 'max_depth': 3, 'max_features': 18, 'learning_rate': 0.09122622785465852, 'min_samples_leaf': 11}. Best is trial 9 with value: 0.29408360597713357.[0m


MCC: [0.2794470066055071, 0.2974388832800012, 0.2863532090791228, 0.30042125237604006]
ROC: [0.8489054796862067, 0.8509077332757565, 0.8439289908347694, 0.8518955820571894]
PR_AUC: [0.2789479211691327, 0.2493491164552717, 0.2744417157017649, 0.2636855714813193]


[32m[I 2021-12-10 14:35:13,185][0m Trial 15 finished with value: 0.2813211606188769 and parameters: {'n_estimators': 127, 'subsample': 0.879256742403203, 'max_depth': 6, 'max_features': 17, 'learning_rate': 0.14640266358227755, 'min_samples_leaf': 13}. Best is trial 9 with value: 0.29408360597713357.[0m


MCC: [0.2819343035877987, 0.2914616414104702, 0.2509232816593393, 0.3009654158178996]
ROC: [0.8450972057288855, 0.8472495861671885, 0.8196627267123504, 0.8488966436283146]
PR_AUC: [0.25675424266099445, 0.24475784363318936, 0.22055936924220543, 0.2464363459374232]


[32m[I 2021-12-10 14:35:14,810][0m Trial 16 finished with value: 0.2895449831262538 and parameters: {'n_estimators': 58, 'subsample': 0.8395046051428205, 'max_depth': 4, 'max_features': 12, 'learning_rate': 0.11697284397069731, 'min_samples_leaf': 3}. Best is trial 9 with value: 0.29408360597713357.[0m


MCC: [0.27733693792040354, 0.30250451926657673, 0.2843765577117112, 0.29396191760632373]
ROC: [0.8431078844146965, 0.8491168942747132, 0.841382343545419, 0.8484268419388353]
PR_AUC: [0.26561107756028923, 0.27376760227668406, 0.2633850625397905, 0.26149757576111676]


[32m[I 2021-12-10 14:35:18,096][0m Trial 17 finished with value: 0.282648350188032 and parameters: {'n_estimators': 112, 'subsample': 0.924580558278454, 'max_depth': 5, 'max_features': 19, 'learning_rate': 0.14302287802722008, 'min_samples_leaf': 32}. Best is trial 9 with value: 0.29408360597713357.[0m


MCC: [0.2886824254397137, 0.28884478724746715, 0.2672859236286785, 0.2857802644362688]
ROC: [0.8434261308431394, 0.8427660225268992, 0.8326479282619357, 0.8440405203957176]
PR_AUC: [0.26644912719350705, 0.23456789104780776, 0.24686116425613566, 0.239013493078897]


[32m[I 2021-12-10 14:35:18,722][0m Trial 18 finished with value: 0.2869263049706109 and parameters: {'n_estimators': 10, 'subsample': 0.8911597547874567, 'max_depth': 6, 'max_features': 16, 'learning_rate': 0.17521314425168996, 'min_samples_leaf': 50}. Best is trial 9 with value: 0.29408360597713357.[0m


MCC: [0.2774428799455955, 0.28978015530383205, 0.2893799555728828, 0.2911022290601331]
ROC: [0.8195610223469718, 0.8331067373780993, 0.8379403202656432, 0.8368975019198627]
PR_AUC: [0.2386540345143549, 0.23713446389111212, 0.24785992402070342, 0.2512912461621813]


[32m[I 2021-12-10 14:35:22,557][0m Trial 19 finished with value: 0.2871521220500431 and parameters: {'n_estimators': 69, 'subsample': 0.8362004263898423, 'max_depth': 15, 'max_features': 19, 'learning_rate': 0.11575454849369761, 'min_samples_leaf': 17}. Best is trial 9 with value: 0.29408360597713357.[0m


MCC: [0.2823437548148377, 0.3005426329833771, 0.27107403829774107, 0.29464806210421646]
ROC: [0.8435599517794811, 0.8436735308935189, 0.8318256671451004, 0.8410173013506799]
PR_AUC: [0.2465585187220715, 0.24008554498183562, 0.22336779519812472, 0.22828877441931159]


[32m[I 2021-12-10 14:35:27,585][0m Trial 20 finished with value: 0.2771644086090082 and parameters: {'n_estimators': 158, 'subsample': 0.912242179344593, 'max_depth': 5, 'max_features': 16, 'learning_rate': 0.17086645704688908, 'min_samples_leaf': 9}. Best is trial 9 with value: 0.29408360597713357.[0m


MCC: [0.27825104251013233, 0.27318737758578054, 0.2676696134565732, 0.28954960088354675]
ROC: [0.8400378746986217, 0.8382464734247365, 0.8274792691212267, 0.8456498170483806]
PR_AUC: [0.26237147303410613, 0.23688583876150057, 0.2272708062883379, 0.23506195216353323]


[32m[I 2021-12-10 14:35:29,911][0m Trial 21 finished with value: 0.2925663381269687 and parameters: {'n_estimators': 96, 'subsample': 0.8564994809384565, 'max_depth': 3, 'max_features': 18, 'learning_rate': 0.0857171190366385, 'min_samples_leaf': 10}. Best is trial 9 with value: 0.29408360597713357.[0m


MCC: [0.2823220531446776, 0.29970838523032484, 0.28868469388793794, 0.29955022024493444]
ROC: [0.8549482259167296, 0.8533727374140847, 0.8485880955487666, 0.8524833988345305]
PR_AUC: [0.30378129990348257, 0.2614108584735758, 0.2847033647349761, 0.28173126844387963]


[32m[I 2021-12-10 14:35:32,305][0m Trial 22 finished with value: 0.288693784879823 and parameters: {'n_estimators': 110, 'subsample': 0.8599557965694357, 'max_depth': 3, 'max_features': 19, 'learning_rate': 0.09245685393644121, 'min_samples_leaf': 19}. Best is trial 9 with value: 0.29408360597713357.[0m


MCC: [0.28261121775873205, 0.29232405764940406, 0.27959309680988476, 0.3002467673012711]
ROC: [0.848924596962827, 0.851574588866098, 0.8437591393591988, 0.8496764466729909]
PR_AUC: [0.30430704248314816, 0.253521147015179, 0.28215343091883727, 0.26965358743123874]


[32m[I 2021-12-10 14:35:38,445][0m Trial 23 finished with value: 0.2862917039477315 and parameters: {'n_estimators': 85, 'subsample': 0.8302098329306508, 'max_depth': 20, 'max_features': 16, 'learning_rate': 0.08216653449054163, 'min_samples_leaf': 9}. Best is trial 9 with value: 0.29408360597713357.[0m


MCC: [0.289383340702955, 0.30126403698310233, 0.2585623466721122, 0.29595709143275645]
ROC: [0.8464230450897836, 0.8488430674007702, 0.8280878098383374, 0.8402346749785428]
PR_AUC: [0.23779705274223092, 0.24800059805286528, 0.21146035211602657, 0.21596214632190155]


[32m[I 2021-12-10 14:35:41,556][0m Trial 24 finished with value: 0.2898508668988375 and parameters: {'n_estimators': 99, 'subsample': 0.8196009430772153, 'max_depth': 5, 'max_features': 18, 'learning_rate': 0.08943759311949451, 'min_samples_leaf': 13}. Best is trial 9 with value: 0.29408360597713357.[0m


MCC: [0.2790643825474787, 0.2986425600552829, 0.2845868717265862, 0.29710965326600225]
ROC: [0.8498911439778329, 0.853379484688186, 0.8347761335055094, 0.8488819623255184]
PR_AUC: [0.2762514426655979, 0.2682334977424441, 0.2547028211952381, 0.25552116547488896]
CPU times: user 1min 35s, sys: 595 ms, total: 1min 36s
Wall time: 1min 36s


In [14]:
best = study.best_trial
best

FrozenTrial(number=9, values=[0.29408360597713357], datetime_start=datetime.datetime(2021, 12, 10, 14, 34, 47, 480465), datetime_complete=datetime.datetime(2021, 12, 10, 14, 34, 49, 586874), params={'n_estimators': 89, 'subsample': 0.8002853761125517, 'max_depth': 4, 'max_features': 17, 'learning_rate': 0.14292147161182367, 'min_samples_leaf': 35}, distributions={'n_estimators': IntUniformDistribution(high=200, low=10, step=1), 'subsample': UniformDistribution(high=1.0, low=0.8), 'max_depth': IntUniformDistribution(high=20, low=3, step=1), 'max_features': IntUniformDistribution(high=20, low=10, step=1), 'learning_rate': UniformDistribution(high=0.2, low=0.08), 'min_samples_leaf': IntUniformDistribution(high=50, low=1, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=9, state=TrialState.COMPLETE, value=None)

In [15]:
%%time
gb_params = study.best_params
#gb_params['random_state'] = 42
gb = GradientBoostingClassifier(**gb_params)
gb.fit(X_rus, y_rus)
y_pred = gb.predict(X_test)
probs = gb.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[3150 1372]
 [  38  245]]
              precision    recall  f1-score   support

           0       0.99      0.70      0.82      4522
           1       0.15      0.87      0.26       283

    accuracy                           0.71      4805
   macro avg       0.57      0.78      0.54      4805
weighted avg       0.94      0.71      0.78      4805

ROC AUC prob1: 0.8442776031744295
MCC: 0.28017342603227563
F2: 0.44561658785012737
PR_AUC: 0.26059185559157483
CPU times: user 197 ms, sys: 3.01 ms, total: 200 ms
Wall time: 197 ms


In [16]:
%%time
gb_params = study.best_params
gb_params['random_state'] = 42
gb_weight = GradientBoostingClassifier(**gb_params)
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[3259 1263]
 [  44  239]]
              precision    recall  f1-score   support

           0       0.99      0.72      0.83      4522
           1       0.16      0.84      0.27       283

    accuracy                           0.73      4805
   macro avg       0.57      0.78      0.55      4805
weighted avg       0.94      0.73      0.80      4805

ROC AUC prob1: 0.8441115520041009
MCC: 0.2870702102708204
F2: 0.45368261199696286
PR_AUC: 0.2692526257562669
CPU times: user 868 ms, sys: 6.01 ms, total: 874 ms
Wall time: 871 ms


In [17]:
# No Sitenum - 5
coeffs_wgt = pd.Series(data=gb_weight.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs_wgt[0:20]

Delmode            0.261279
TrialLabor         0.086231
CS_FTP             0.060727
transfus_yes       0.058034
Episiotomy         0.044051
Lac_Min            0.043800
Admefface          0.040804
Dilat_lst          0.030375
new_age            0.021443
BESTGA             0.020973
new_BMI            0.020449
Antefetdistress    0.018872
AdmSBP             0.016804
AdmDBP             0.014495
Admreason          0.014314
chorio9            0.013997
Admcontract        0.010723
prelaborCD         0.010661
uscar              0.010136
HxnumCS            0.010124
dtype: float64

In [1]:
plt.rcParams["figure.figsize"] = (14, 8)
x_labels = coeffs_wgt.index.values[0:25]
print(x_labels)
fig, ax = plt.subplots(1,1)  # Create a figure and an axes.
ax.set_title('Top Predictors: All Sites combined; target: High_EBLoss, vars: PI')
ax.bar(x_labels, coeffs_wgt.values[0:25])
ax.set_ylabel('Coeff')
ax.set_xlabel('Variable')
plt.draw()
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=16)
#plt.xlim(1,20)
plt.show()

NameError: name 'plt' is not defined

In [19]:
# No Sitenum - 5
coeffs = pd.Series(data=gb.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs[0:20]

Delmode             0.135576
Admefface           0.100974
transfus_yes        0.080779
CS_FTP              0.077234
Episiotomy          0.049818
spontlabor          0.043917
Lac_Min             0.039844
Dilat_lst           0.028894
Hxcsection          0.025010
new_age             0.024443
prelaborCD          0.021669
AdmSBP              0.020131
BESTGA              0.019069
TrialLabor          0.018761
vertex              0.018372
Intrafetdistress    0.017526
new_BMI             0.016969
AdmDBP              0.015993
uscar               0.015188
momrace_new         0.014002
dtype: float64

In [21]:
gb_params # 5 runs

{'n_estimators': 89,
 'subsample': 0.8002853761125517,
 'max_depth': 4,
 'max_features': 17,
 'learning_rate': 0.14292147161182367,
 'min_samples_leaf': 35,
 'random_state': 42}

In [2]:
from optuna.visualization import plot_optimization_history

plotly_config = {"staticPlot": True}

fig = plot_optimization_history(study)
fig.show(config=plotly_config)

NameError: name 'study' is not defined

In [3]:
from optuna.visualization import plot_param_importances

fig = plot_param_importances(study)
fig.show(config=plotly_config)

NameError: name 'study' is not defined

In [24]:
np.mean([0.3149689279176914, 0.31449457458602254, 0.31066798538596907, 0.3219519143488398])

0.3155208505596307

<h1 align='center'>Thanks and make sure to learn!</h1>