In [1]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
#import plotly.express as px
import seaborn as sns

from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import StratifiedKFold
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, \
                            accuracy_score, f1_score, average_precision_score, \
                            fbeta_score, precision_recall_curve, auc
from sklearn.metrics import matthews_corrcoef

from sklearn.model_selection import train_test_split

import optuna
from optuna.samplers import TPESampler
sns.set(style='white', context='notebook', palette='deep')
%config Completer.use_jedi = False

In [2]:
csl_df = pd.read_csv('../../data/csl/Sites/CSL_he_PI_s48.csv', index_col=0)
X = csl_df.drop(['high_EBL','Sitenum'], axis=1, inplace=False)
y = csl_df['high_EBL'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=7)
X_train                                                    

Unnamed: 0_level_0,transfus_yes,Accrete,AdmBishop,Admcervpos,Admconsistency,Admcontract,AdmDBP,Admefface,Admpresent,AdmSBP,...,TD_nos,ThreatenedPB,threatpb9,UnspecHBP,uscar,version9,new_age,new_BMI,new_high_Age,new_high_BMI
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
48-07666,0,0,8,8,8,66.0,61,70.0,1,122.0,...,0,0,0,0,1,0,30,24,1,1
48-04148,0,0,8,8,8,66.0,74,60.0,1,126.0,...,0,0,0,0,0,0,25,28,1,1
48-11172,0,0,8,8,8,66.0,74,100.0,1,124.4,...,0,0,0,0,0,0,18,25,0,1
48-08241,0,0,8,8,8,77.0,59,0.0,77,115.0,...,0,0,0,0,1,0,40,32,2,1
48-04258,0,0,8,8,8,66.0,70,90.0,1,109.0,...,0,0,0,0,0,0,26,20,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48-07263,0,0,8,8,8,66.0,48,0.0,77,115.0,...,0,0,0,0,0,0,25,25,1,1
48-14530,0,0,8,8,8,66.0,83,80.0,1,129.0,...,0,0,0,0,0,0,20,18,1,0
48-14340,0,0,8,8,8,66.0,68,75.0,1,125.0,...,0,0,0,0,0,0,17,21,0,1
48-00907,0,0,8,8,8,77.0,70,0.0,77,115.0,...,0,0,0,0,0,0,18,21,0,1


In [3]:
X.shape

(16086, 195)

<h1 align='center'>Exploratory Data Analysis</h1>

<h1 align='center'>Modelling</h1>

In [8]:
print(X_train.shape)
print(X.shape)
print(y.shape)

(11260, 195)
(16086, 195)
(16086,)


Here we define our validation set

<h1 align='center'>Modelling Using Undersampling</h1>

We will use `imblearn`'s RandomUnderSampler to undersample from the majority class so that they match

In [9]:
sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
X_rus, y_rus = sampler.fit_resample(X_train, y_train)

# Try with no undersampling
#X_rus, y_rus = X_train, y_train

In [10]:
print(X_rus.shape)
print(y_rus.shape)
print(np.bincount(y_rus))

(386, 195)
(386,)
[193 193]


<h1 align='center'>Basic Gradient Boosting</h1>

In [11]:
%%time
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_rus = GradientBoostingClassifier()   # If not undersampling
gb_rus.fit(X_rus, y_rus)
print(gb_rus.get_params())
y_pred = gb_rus.predict(X_test)
probs = gb_rus.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[3110 1633]
 [  15   68]]
              precision    recall  f1-score   support

           0       1.00      0.66      0.79      4743
           1       0.04      0.82      0.08        83

    accuracy                           0.66      4826
   macro avg       0.52      0.74      0.43      4826
weighted avg       0.98      0.66      0.78      4826

ROC AUC prob1: 0.7945596427455552
MCC: 0.1292599526576016
F2: 0.16724053123462862
PR_AUC: 0.0632557784675971
CPU times: user 235 ms, sys: 3.01 ms, total

In [12]:
%%time
# Not undersampled
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_weight = GradientBoostingClassifier()  
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
print(gb_weight.get_params())
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[3750  993]
 [  36   47]]
              precision    recall  f1-score   support

           0       0.99      0.79      0.88      4743
           1       0.05      0.57      0.08        83

    accuracy                           0.79      4826
   macro avg       0.52      0.68      0.48      4826
weighted avg       0.97      0.79      0.87      4826

ROC AUC prob1: 0.7975380332208022
MCC: 0.11285217232238168
F2: 0.17128279883381925
PR_AUC: 0.05756923729401617
CPU times: user 3.31 s, sys: 3.19 ms, tot

<h1 align='center'>Hyperparameter Tuning with Cross Validation</h1>

In [13]:
%%time
def create_model(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    subsample = trial.suggest_float('subsample', 0.8, 1.0)
    max_depth = trial.suggest_int('max_depth',3,20)
    max_features = trial.suggest_int('max_features',10,20)
    learning_rate = trial.suggest_uniform('learning_rate',0.08,0.2)
    min_samples_leaf = trial.suggest_int('min_samples_leaf',1,50)
    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
                           max_depth=max_depth, learning_rate=learning_rate, 
                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
#                           learning_rate=learning_rate, 
#                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate) 
    return model

def objective(trial):
    model = create_model(trial)
    
    mcc_list = []
    roc_list = []
    pr_list = []
    f2_list = []
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=11)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train_c, X_test_c = X.iloc[train_idx], X.iloc[test_idx]
        y_train_c, y_test_c = y[train_idx], y[test_idx]
        sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
        X_res, y_res = sampler.fit_resample(X_train_c, y_train_c)

#        weights = class_weight.compute_sample_weight('balanced', y=y_train)
#        model.fit(X_train, y_train, sample_weight=weights)
        model.fit(X_res, y_res)
        y_pred = model.predict(X_test_c)
        probs = model.predict_proba(X_test_c)
        prob1 = probs[:, 1]  # Only positives
        precision, recall, pr_thresh = precision_recall_curve(y_test_c, prob1)
        roc = roc_auc_score(y_test_c, prob1)
        roc_list.append(roc)
        mcc = matthews_corrcoef(y_test_c, y_pred)  # MWB
        mcc_list.append(mcc)
        f2 = fbeta_score(y_test_c, y_pred, beta=2.0, average=None)[1]
        pr_auc = auc(recall, precision)
        pr_list.append(pr_auc)
        avg_precision = average_precision_score(y_test_c, prob1)
#        print(f'ROC: {roc}')
#        print(f'PR_AUC: {pr_auc}')
#        print(f'avg_prec: {avg_precision}')
        
    print(f'MCC: {mcc_list}')
    print(f'ROC: {roc_list}')
    print(f'PR_AUC: {pr_list}')
    return np.mean(mcc_list)

sampler = TPESampler(seed=7)
study = optuna.create_study(sampler=sampler,direction='maximize')
#study.optimize(objective,n_trials=60)
study.optimize(objective,n_trials=25)
#study.optimize(objective,n_trials=5)

[32m[I 2021-12-10 16:42:11,545][0m A new study created in memory with name: no-name-10c36113-0097-43df-a307-b953173414cd[0m
[32m[I 2021-12-10 16:42:12,130][0m Trial 0 finished with value: 0.12536268298568753 and parameters: {'n_estimators': 24, 'subsample': 0.955983758448023, 'max_depth': 10, 'max_features': 17, 'learning_rate': 0.1973587414395923, 'min_samples_leaf': 27}. Best is trial 0 with value: 0.12536268298568753.[0m


MCC: [0.1241557316554931, 0.11136301833846281, 0.14183777570860331, 0.124094206240191]
ROC: [0.7909311218410527, 0.7998951447625542, 0.8160773484715131, 0.7965477028692131]
PR_AUC: [0.04942052759246846, 0.07248624289242038, 0.05868622190750352, 0.05721185403032985]


[32m[I 2021-12-10 16:42:13,207][0m Trial 1 finished with value: 0.1288749086289513 and parameters: {'n_estimators': 105, 'subsample': 0.8144102266719524, 'max_depth': 7, 'max_features': 15, 'learning_rate': 0.16150759953451288, 'min_samples_leaf': 41}. Best is trial 1 with value: 0.1288749086289513.[0m


MCC: [0.11813849324853386, 0.12339230785261622, 0.1407174898627038, 0.13325134355195134]
ROC: [0.7675330055690597, 0.7859596637299869, 0.8117518922724872, 0.7887219092882709]
PR_AUC: [0.04809110850742917, 0.054040313029560104, 0.05385905487199185, 0.07149176314933764]


[32m[I 2021-12-10 16:42:14,272][0m Trial 2 finished with value: 0.12389641988525082 and parameters: {'n_estimators': 82, 'subsample': 0.813187269381181, 'max_depth': 8, 'max_features': 20, 'learning_rate': 0.10560624242958987, 'min_samples_leaf': 23}. Best is trial 1 with value: 0.1288749086289513.[0m


MCC: [0.12339230785261622, 0.10276217581439555, 0.13853030110098502, 0.13090089477300648]
ROC: [0.7838075649754177, 0.786773575013657, 0.8075199495394003, 0.789033620841401]
PR_AUC: [0.04999559886927741, 0.08705699169910594, 0.056077538234194674, 0.08054448443014921]


[32m[I 2021-12-10 16:42:16,392][0m Trial 3 finished with value: 0.11841994985629223 and parameters: {'n_estimators': 187, 'subsample': 0.8049798455100696, 'max_depth': 13, 'max_features': 20, 'learning_rate': 0.10763634548251577, 'min_samples_leaf': 28}. Best is trial 1 with value: 0.1288749086289513.[0m


MCC: [0.11564585187083458, 0.10649014567775025, 0.1362022936554321, 0.11534150822115197]
ROC: [0.7621252616798102, 0.769017843721701, 0.8032550020536291, 0.7793558939153906]
PR_AUC: [0.046579283208372846, 0.06059503403314244, 0.05619415266016267, 0.07296907869321126]


[32m[I 2021-12-10 16:42:18,490][0m Trial 4 finished with value: 0.11457652354416027 and parameters: {'n_estimators': 183, 'subsample': 0.8266338891518501, 'max_depth': 12, 'max_features': 18, 'learning_rate': 0.16028158890606967, 'min_samples_leaf': 24}. Best is trial 1 with value: 0.1288749086289513.[0m


MCC: [0.10247612201327762, 0.10751073289893508, 0.13650017574165085, 0.11181906352277755]
ROC: [0.7552253471038323, 0.7708326459082627, 0.8031853253535175, 0.7757473742885642]
PR_AUC: [0.047473900190972286, 0.06442300833791792, 0.06114557747030202, 0.06266642058443522]


[32m[I 2021-12-10 16:42:19,204][0m Trial 5 finished with value: 0.13404334126419687 and parameters: {'n_estimators': 49, 'subsample': 0.8981531778182141, 'max_depth': 9, 'max_features': 15, 'learning_rate': 0.12390684629367116, 'min_samples_leaf': 42}. Best is trial 5 with value: 0.13404334126419687.[0m


MCC: [0.13746650873002125, 0.1372138105693885, 0.14027074552924235, 0.1212223002281354]
ROC: [0.7815748083458903, 0.8185491114801821, 0.8167539459015432, 0.7920700580883647]
PR_AUC: [0.04215700122429387, 0.06374186569390188, 0.06301240557722118, 0.05738571981788554]


[32m[I 2021-12-10 16:42:21,255][0m Trial 6 finished with value: 0.11950314739242815 and parameters: {'n_estimators': 156, 'subsample': 0.8627989354425325, 'max_depth': 13, 'max_features': 13, 'learning_rate': 0.13434115190556806, 'min_samples_leaf': 18}. Best is trial 5 with value: 0.13404334126419687.[0m


MCC: [0.11432565914074268, 0.10863440483154806, 0.13013758598897124, 0.12491493960845061]
ROC: [0.765600882837104, 0.7818021169025909, 0.8030973126796925, 0.7782263979346359]
PR_AUC: [0.046936152076218096, 0.06782000020533806, 0.05899998464960049, 0.05642640050808938]


[32m[I 2021-12-10 16:42:22,561][0m Trial 7 finished with value: 0.1263971442413991 and parameters: {'n_estimators': 135, 'subsample': 0.8740702165976071, 'max_depth': 11, 'max_features': 17, 'learning_rate': 0.12955901949366017, 'min_samples_leaf': 46}. Best is trial 5 with value: 0.13404334126419687.[0m


MCC: [0.11149247417059895, 0.12397072729257491, 0.1546218255427507, 0.11550354995967195]
ROC: [0.76743035009184, 0.7901208768244262, 0.8204541453969372, 0.7832834594848325]
PR_AUC: [0.04379278618620369, 0.04895854179198176, 0.0623532432893214, 0.05857870418454327]


[32m[I 2021-12-10 16:42:23,356][0m Trial 8 finished with value: 0.12916136422174807 and parameters: {'n_estimators': 44, 'subsample': 0.9482237745826528, 'max_depth': 10, 'max_features': 14, 'learning_rate': 0.15612558423606065, 'min_samples_leaf': 27}. Best is trial 5 with value: 0.13404334126419687.[0m


MCC: [0.11172929053805684, 0.11920965626906006, 0.15143510206616473, 0.13427140801371062]
ROC: [0.7803282775510803, 0.7939484596179016, 0.8243157014610105, 0.7809217860705274]
PR_AUC: [0.045167220378676265, 0.07425273027145465, 0.06116720628683028, 0.05224619449803959]


[32m[I 2021-12-10 16:42:24,358][0m Trial 9 finished with value: 0.13079916444417383 and parameters: {'n_estimators': 89, 'subsample': 0.8002853761125517, 'max_depth': 4, 'max_features': 17, 'learning_rate': 0.14292147161182367, 'min_samples_leaf': 35}. Best is trial 5 with value: 0.13404334126419687.[0m


MCC: [0.12770375549639648, 0.13249623585299203, 0.13590518975428784, 0.12709147667301898]
ROC: [0.7824657112374751, 0.7954222989694124, 0.8045275186293492, 0.7929795223845567]
PR_AUC: [0.04673886219289385, 0.05819957799380449, 0.06087586192032751, 0.06441026580562279]


[32m[I 2021-12-10 16:42:26,117][0m Trial 10 finished with value: 0.12070526529989714 and parameters: {'n_estimators': 55, 'subsample': 0.924763336987846, 'max_depth': 18, 'max_features': 10, 'learning_rate': 0.09172353354166263, 'min_samples_leaf': 4}. Best is trial 5 with value: 0.13404334126419687.[0m


MCC: [0.11153890122306308, 0.12906502977983825, 0.1315974615314701, 0.11061966866521714]
ROC: [0.7762733862009041, 0.8161073776291717, 0.7965550372586986, 0.7929721879950713]
PR_AUC: [0.0440700469918097, 0.06998259740842916, 0.07138055588578912, 0.05767617524123385]


[32m[I 2021-12-10 16:42:27,064][0m Trial 11 finished with value: 0.13242517209341054 and parameters: {'n_estimators': 79, 'subsample': 0.9924233767166261, 'max_depth': 3, 'max_features': 13, 'learning_rate': 0.11796758732296449, 'min_samples_leaf': 38}. Best is trial 5 with value: 0.13404334126419687.[0m


MCC: [0.12205386421968954, 0.13505571697191893, 0.13968859076626278, 0.13290251641577083]
ROC: [0.7869898847692269, 0.8036054070106358, 0.8248162735433902, 0.8022905298362965]
PR_AUC: [0.04911953155937698, 0.0778100238053075, 0.06649572502856668, 0.06084533001662588]


[32m[I 2021-12-10 16:42:27,483][0m Trial 12 finished with value: 0.137210235208226 and parameters: {'n_estimators': 10, 'subsample': 0.9983488269913793, 'max_depth': 3, 'max_features': 12, 'learning_rate': 0.11731011779041688, 'min_samples_leaf': 48}. Best is trial 12 with value: 0.137210235208226.[0m


MCC: [0.1449337257244164, 0.135018484237907, 0.13493986743489989, 0.1339488634356808]
ROC: [0.7849202770231378, 0.8321362971436114, 0.8182593293434255, 0.8021145044886464]
PR_AUC: [0.047644859067990875, 0.06987255380144983, 0.10338892313022442, 0.07743086317100856]


[32m[I 2021-12-10 16:42:27,981][0m Trial 13 finished with value: 0.13475493162259614 and parameters: {'n_estimators': 16, 'subsample': 0.8964335096564849, 'max_depth': 6, 'max_features': 10, 'learning_rate': 0.08226340098967314, 'min_samples_leaf': 49}. Best is trial 12 with value: 0.137210235208226.[0m


MCC: [0.1355738690752395, 0.1394226244319825, 0.1296312098774949, 0.1343920231056677]
ROC: [0.7830193175610525, 0.8261364511268272, 0.816629261280291, 0.8118784104911106]
PR_AUC: [0.04869067831703586, 0.08333796789299307, 0.06863833873935993, 0.06467979674552006]


[32m[I 2021-12-10 16:42:28,436][0m Trial 14 finished with value: 0.1388873569067317 and parameters: {'n_estimators': 10, 'subsample': 0.9856209318135866, 'max_depth': 5, 'max_features': 10, 'learning_rate': 0.0832071697919903, 'min_samples_leaf': 48}. Best is trial 14 with value: 0.1388873569067317.[0m


MCC: [0.1431571181923557, 0.1434213661245574, 0.13979934192285878, 0.12917160138715506]
ROC: [0.782242068947818, 0.8329832048306735, 0.8143134278002699, 0.8060585724344305]
PR_AUC: [0.04494237127158901, 0.06768187632501105, 0.06203547739321611, 0.07303614653244207]


[32m[I 2021-12-10 16:42:28,989][0m Trial 15 finished with value: 0.14224012577845194 and parameters: {'n_estimators': 18, 'subsample': 0.9997416814239573, 'max_depth': 5, 'max_features': 11, 'learning_rate': 0.09501972433788973, 'min_samples_leaf': 50}. Best is trial 15 with value: 0.14224012577845194.[0m


MCC: [0.14420873030320783, 0.1399832572011063, 0.13722479272616384, 0.14754372288332976]
ROC: [0.7861924716872528, 0.8285818512448809, 0.8210555653347417, 0.8084734201725048]
PR_AUC: [0.044886701832684756, 0.06799696482152598, 0.07462875925140701, 0.059427080127287785]


[32m[I 2021-12-10 16:42:29,745][0m Trial 16 finished with value: 0.1312922100042916 and parameters: {'n_estimators': 37, 'subsample': 0.9703612071861728, 'max_depth': 5, 'max_features': 11, 'learning_rate': 0.08047308167198712, 'min_samples_leaf': 34}. Best is trial 15 with value: 0.14224012577845194.[0m


MCC: [0.13009861818194784, 0.13459140661344365, 0.128438453591208, 0.13204036163056682]
ROC: [0.7961097240400796, 0.8229119692620172, 0.8129859033034091, 0.8020778325412192]
PR_AUC: [0.048472146973712486, 0.06503454511796491, 0.07850420716778837, 0.05697281412165038]


[32m[I 2021-12-10 16:42:31,117][0m Trial 17 finished with value: 0.11886534679971292 and parameters: {'n_estimators': 63, 'subsample': 0.9731405376125478, 'max_depth': 17, 'max_features': 11, 'learning_rate': 0.09715405938781194, 'min_samples_leaf': 13}. Best is trial 15 with value: 0.14224012577845194.[0m


MCC: [0.11357794391562351, 0.12120155064245172, 0.13249487480015942, 0.10818701784061702]
ROC: [0.7830853103678366, 0.8040563578569936, 0.8045348530188348, 0.771772135187467]
PR_AUC: [0.045599363509919305, 0.0677033704104575, 0.059630949664580865, 0.04748619919013608]


[32m[I 2021-12-10 16:42:32,482][0m Trial 18 finished with value: 0.1303942883482057 and parameters: {'n_estimators': 128, 'subsample': 0.931514617818332, 'max_depth': 6, 'max_features': 12, 'learning_rate': 0.09396269755651515, 'min_samples_leaf': 50}. Best is trial 15 with value: 0.14224012577845194.[0m


MCC: [0.1269075901973383, 0.12624783028042982, 0.14523638548819265, 0.12318534742686203]
ROC: [0.7759544209681145, 0.8030701320222762, 0.8214772927301531, 0.796221322537112]
PR_AUC: [0.043816579597003434, 0.05797197499840783, 0.06601889503864874, 0.058209499800568554]


[32m[I 2021-12-10 16:42:33,082][0m Trial 19 finished with value: 0.1334927944794795 and parameters: {'n_estimators': 31, 'subsample': 0.9794219005138716, 'max_depth': 15, 'max_features': 10, 'learning_rate': 0.107996752326627, 'min_samples_leaf': 44}. Best is trial 15 with value: 0.14224012577845194.[0m


MCC: [0.13580183211460475, 0.13414149832596858, 0.13653824847978016, 0.12748959899756454]
ROC: [0.7896021000377627, 0.8279182569099968, 0.8188845860470575, 0.8046870416006571]
PR_AUC: [0.04695090085254256, 0.06432356467181727, 0.06963510738166724, 0.058291236926617154]


[32m[I 2021-12-10 16:42:34,159][0m Trial 20 finished with value: 0.1276431319057451 and parameters: {'n_estimators': 64, 'subsample': 0.9259701326378406, 'max_depth': 8, 'max_features': 12, 'learning_rate': 0.18220882753397885, 'min_samples_leaf': 34}. Best is trial 15 with value: 0.14224012577845194.[0m


MCC: [0.11842787879516382, 0.11068899576388593, 0.1365996435456194, 0.14485600951831124]
ROC: [0.7685009000685593, 0.7787701140575678, 0.8091995247315614, 0.7992064190576776]
PR_AUC: [0.04567804544598524, 0.061012076306006746, 0.05668442790937832, 0.06151229498534141]


[32m[I 2021-12-10 16:42:34,622][0m Trial 21 finished with value: 0.13563872388491718 and parameters: {'n_estimators': 10, 'subsample': 0.9997510486778296, 'max_depth': 3, 'max_features': 11, 'learning_rate': 0.11732559914588175, 'min_samples_leaf': 50}. Best is trial 15 with value: 0.14224012577845194.[0m


MCC: [0.14203987702633664, 0.13836184662090298, 0.1317254488111449, 0.13042772308128422]
ROC: [0.7805555861077809, 0.8307962765391905, 0.8132682772985977, 0.8019549815173385]
PR_AUC: [0.05121378777429145, 0.06682080362066352, 0.0770350914866753, 0.06841067486609222]


[32m[I 2021-12-10 16:42:35,065][0m Trial 22 finished with value: 0.1325422489796072 and parameters: {'n_estimators': 11, 'subsample': 0.9870268605510224, 'max_depth': 5, 'max_features': 12, 'learning_rate': 0.0897646969739741, 'min_samples_leaf': 45}. Best is trial 15 with value: 0.14224012577845194.[0m


MCC: [0.13227493105665947, 0.13854675610790615, 0.13125700088702563, 0.12809030786683762]
ROC: [0.7808818838746577, 0.8237423787473832, 0.8224307633632577, 0.7926568092471983]
PR_AUC: [0.043501321286075936, 0.06327166839477566, 0.07099221032124606, 0.07167874695454998]


[32m[I 2021-12-10 16:42:35,780][0m Trial 23 finished with value: 0.132632445979766 and parameters: {'n_estimators': 33, 'subsample': 0.9535286334172862, 'max_depth': 20, 'max_features': 13, 'learning_rate': 0.10315109276860635, 'min_samples_leaf': 46}. Best is trial 15 with value: 0.14224012577845194.[0m


MCC: [0.1263403784713069, 0.1328018922482015, 0.13580584799672415, 0.1355816652028314]
ROC: [0.7867130816074381, 0.8254215290533332, 0.8227534765006161, 0.8021621780203017]
PR_AUC: [0.04528041922431006, 0.06384203794479809, 0.07727539580702096, 0.0546963362238164]


[32m[I 2021-12-10 16:42:36,383][0m Trial 24 finished with value: 0.13509763866199953 and parameters: {'n_estimators': 26, 'subsample': 0.9989266796360254, 'max_depth': 3, 'max_features': 11, 'learning_rate': 0.11584077733057935, 'min_samples_leaf': 41}. Best is trial 15 with value: 0.14224012577845194.[0m


MCC: [0.13405172085693623, 0.1358495972438917, 0.12953555456754662, 0.1409536819796236]
ROC: [0.7975230699853716, 0.8217002680041209, 0.8185472041307282, 0.8102428416358622]
PR_AUC: [0.052775827729810994, 0.061661423286779496, 0.07391426357171185, 0.0588941833787083]
CPU times: user 24.5 s, sys: 498 ms, total: 25 s
Wall time: 24.8 s


In [14]:
best = study.best_trial
best

FrozenTrial(number=15, values=[0.14224012577845194], datetime_start=datetime.datetime(2021, 12, 10, 16, 42, 28, 437918), datetime_complete=datetime.datetime(2021, 12, 10, 16, 42, 28, 988676), params={'n_estimators': 18, 'subsample': 0.9997416814239573, 'max_depth': 5, 'max_features': 11, 'learning_rate': 0.09501972433788973, 'min_samples_leaf': 50}, distributions={'n_estimators': IntUniformDistribution(high=200, low=10, step=1), 'subsample': UniformDistribution(high=1.0, low=0.8), 'max_depth': IntUniformDistribution(high=20, low=3, step=1), 'max_features': IntUniformDistribution(high=20, low=10, step=1), 'learning_rate': UniformDistribution(high=0.2, low=0.08), 'min_samples_leaf': IntUniformDistribution(high=50, low=1, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=15, state=TrialState.COMPLETE, value=None)

In [15]:
%%time
gb_params = study.best_params
#gb_params['random_state'] = 42
gb = GradientBoostingClassifier(**gb_params)
gb.fit(X_rus, y_rus)
y_pred = gb.predict(X_test)
probs = gb.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[3171 1572]
 [  23   60]]
              precision    recall  f1-score   support

           0       0.99      0.67      0.80      4743
           1       0.04      0.72      0.07        83

    accuracy                           0.67      4826
   macro avg       0.51      0.70      0.43      4826
weighted avg       0.98      0.67      0.79      4826

ROC AUC prob1: 0.7644251896898155
MCC: 0.10757733218811995
F2: 0.15274949083503053
PR_AUC: 0.05385710590966763
CPU times: user 67.1 ms, sys: 4.03 ms, total: 71.2 ms
Wall time: 69.4 ms


In [16]:
%%time
gb_params = study.best_params
gb_params['random_state'] = 42
gb_weight = GradientBoostingClassifier(**gb_params)
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[3342 1401]
 [  26   57]]
              precision    recall  f1-score   support

           0       0.99      0.70      0.82      4743
           1       0.04      0.69      0.07        83

    accuracy                           0.70      4826
   macro avg       0.52      0.70      0.45      4826
weighted avg       0.98      0.70      0.81      4826

ROC AUC prob1: 0.7889432492779467
MCC: 0.11081078538090128
F2: 0.15921787709497204
PR_AUC: 0.06573972225385845
CPU times: user 198 ms, sys: 4.96 ms, total: 203 ms
Wall time: 201 ms


In [17]:
# No Sitenum - 5
coeffs_wgt = pd.Series(data=gb_weight.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs_wgt[0:20]

Insurance         0.275285
Admefface         0.082232
Delmode           0.081384
TrialLabor        0.051283
uscar             0.040650
new_BMI           0.037744
Lac_None          0.037406
Dilat_lst         0.028431
high_Gravidity    0.026375
Induction         0.026211
Analgesia         0.024487
new_age           0.023397
Admpresent        0.018030
Anteprevia        0.015537
Episiotomy        0.014173
spontlabor        0.011386
Lac_Min           0.011226
CS_UScar          0.010528
Admreason         0.010038
MthInd_Miso       0.009872
dtype: float64

In [1]:
plt.rcParams["figure.figsize"] = (14, 8)
x_labels = coeffs_wgt.index.values[0:25]
print(x_labels)
fig, ax = plt.subplots(1,1)  # Create a figure and an axes.
ax.set_title('Top Predictors: All Sites combined; target: High_EBLoss, vars: PI')
ax.bar(x_labels, coeffs_wgt.values[0:25])
ax.set_ylabel('Coeff')
ax.set_xlabel('Variable')
plt.draw()
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=16)
#plt.xlim(1,20)
plt.show()

NameError: name 'plt' is not defined

In [19]:
# No Sitenum - 5
coeffs = pd.Series(data=gb.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs[0:20]

Insurance      0.366219
prelaborCD     0.074164
Delmode        0.068049
Dilat_lst      0.059152
Lac_None       0.054710
TrialLabor     0.045693
new_age        0.037368
uscar          0.035613
HxnumCS        0.035379
ROMmeth        0.033584
Admpresent     0.033037
Admefface      0.030473
Admreason      0.029785
new_BMI        0.028797
IUPC           0.015767
Hxcsection     0.013559
AdmDBP         0.012503
Admcontract    0.006731
ChronicHBP     0.004507
GAmethod       0.004402
dtype: float64

In [21]:
gb_params # 5 runs

{'n_estimators': 18,
 'subsample': 0.9997416814239573,
 'max_depth': 5,
 'max_features': 11,
 'learning_rate': 0.09501972433788973,
 'min_samples_leaf': 50,
 'random_state': 42}

In [2]:
from optuna.visualization import plot_optimization_history

plotly_config = {"staticPlot": True}

fig = plot_optimization_history(study)
fig.show(config=plotly_config)

NameError: name 'study' is not defined

In [None]:
from optuna.visualization import plot_param_importances

fig = plot_param_importances(study)
fig.show(config=plotly_config)

In [24]:
np.mean([0.3149689279176914, 0.31449457458602254, 0.31066798538596907, 0.3219519143488398])

0.3155208505596307

<h1 align='center'>Thanks and make sure to learn!</h1>