In [1]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
#import plotly.express as px
import seaborn as sns

from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import StratifiedKFold
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, \
                            accuracy_score, f1_score, average_precision_score, \
                            fbeta_score, precision_recall_curve, auc
from sklearn.metrics import matthews_corrcoef

from sklearn.model_selection import train_test_split

import optuna
from optuna.samplers import TPESampler
sns.set(style='white', context='notebook', palette='deep')
%config Completer.use_jedi = False

In [2]:
csl_df = pd.read_csv('../../data/csl/Sites/CSL_he_PI_s46.csv', index_col=0)
X = csl_df.drop(['high_EBL','Sitenum'], axis=1, inplace=False)
y = csl_df['high_EBL'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=7)
X_train                                                    

Unnamed: 0_level_0,transfus_yes,Accrete,AdmBishop,Admcervpos,Admconsistency,Admcontract,AdmDBP,Admefface,Admpresent,AdmSBP,...,TD_nos,ThreatenedPB,threatpb9,UnspecHBP,uscar,version9,new_age,new_BMI,new_high_Age,new_high_BMI
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
46-00490,0,0,8,8,8,99.0,79,0.0,1,120.0,...,0,0,0,0,0,0,35,20,1,0
46-03267,0,0,8,8,8,99.0,88,0.0,1,122.0,...,0,0,0,0,0,0,29,33,1,1
46-05746,0,0,8,8,8,99.0,63,0.0,1,130.0,...,0,0,0,0,0,0,22,24,1,1
46-04895,0,0,8,8,8,99.0,91,0.0,1,152.0,...,0,0,0,0,0,0,25,30,1,1
46-06459,0,0,8,8,8,99.0,74,0.0,1,127.0,...,0,0,0,0,0,0,21,24,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46-00421,0,0,8,8,8,2.0,69,0.0,1,121.0,...,0,0,0,0,0,0,22,31,1,1
46-01335,0,0,8,8,8,99.0,85,0.0,1,136.0,...,0,0,0,0,0,0,23,31,1,1
46-04625,0,0,8,8,8,99.0,100,0.0,1,151.0,...,0,0,0,0,0,0,26,44,1,2
46-05080,0,0,8,8,8,2.5,63,0.0,1,109.0,...,0,0,0,0,0,0,33,31,1,1


In [3]:
X.shape

(5805, 195)

<h1 align='center'>Exploratory Data Analysis</h1>

<h1 align='center'>Modelling</h1>

In [8]:
print(X_train.shape)
print(X.shape)
print(y.shape)

(4063, 195)
(5805, 195)
(5805,)


Here we define our validation set

<h1 align='center'>Modelling Using Undersampling</h1>

We will use `imblearn`'s RandomUnderSampler to undersample from the majority class so that they match

In [9]:
sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
X_rus, y_rus = sampler.fit_resample(X_train, y_train)

# Try with no undersampling
#X_rus, y_rus = X_train, y_train

In [10]:
print(X_rus.shape)
print(y_rus.shape)
print(np.bincount(y_rus))

(260, 195)
(260,)
[130 130]


<h1 align='center'>Basic Gradient Boosting</h1>

In [11]:
%%time
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_rus = GradientBoostingClassifier()   # If not undersampling
gb_rus.fit(X_rus, y_rus)
print(gb_rus.get_params())
y_pred = gb_rus.predict(X_test)
probs = gb_rus.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[1309  377]
 [   9   47]]
              precision    recall  f1-score   support

           0       0.99      0.78      0.87      1686
           1       0.11      0.84      0.20        56

    accuracy                           0.78      1742
   macro avg       0.55      0.81      0.53      1742
weighted avg       0.96      0.78      0.85      1742

ROC AUC prob1: 0.8567562701237079
MCC: 0.2530678007252819
F2: 0.36265432098765443
PR_AUC: 0.13549053779939826
CPU times: user 162 ms, sys: 2.07 ms, tota

In [12]:
%%time
# Not undersampled
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_weight = GradientBoostingClassifier()  
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
print(gb_weight.get_params())
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[1442  244]
 [  17   39]]
              precision    recall  f1-score   support

           0       0.99      0.86      0.92      1686
           1       0.14      0.70      0.23        56

    accuracy                           0.85      1742
   macro avg       0.56      0.78      0.57      1742
weighted avg       0.96      0.85      0.89      1742

ROC AUC prob1: 0.8497712252160652
MCC: 0.2638218096977305
F2: 0.3846153846153846
PR_AUC: 0.18852169889904058
CPU times: user 1.05 s, sys: 3.73 ms, total

<h1 align='center'>Hyperparameter Tuning with Cross Validation</h1>

In [13]:
%%time
def create_model(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    subsample = trial.suggest_float('subsample', 0.8, 1.0)
    max_depth = trial.suggest_int('max_depth',3,20)
    max_features = trial.suggest_int('max_features',10,20)
    learning_rate = trial.suggest_uniform('learning_rate',0.08,0.2)
    min_samples_leaf = trial.suggest_int('min_samples_leaf',1,50)
    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
                           max_depth=max_depth, learning_rate=learning_rate, 
                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
#                           learning_rate=learning_rate, 
#                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate) 
    return model

def objective(trial):
    model = create_model(trial)
    
    mcc_list = []
    roc_list = []
    pr_list = []
    f2_list = []
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=11)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train_c, X_test_c = X.iloc[train_idx], X.iloc[test_idx]
        y_train_c, y_test_c = y[train_idx], y[test_idx]
        sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
        X_res, y_res = sampler.fit_resample(X_train_c, y_train_c)

#        weights = class_weight.compute_sample_weight('balanced', y=y_train)
#        model.fit(X_train, y_train, sample_weight=weights)
        model.fit(X_res, y_res)
        y_pred = model.predict(X_test_c)
        probs = model.predict_proba(X_test_c)
        prob1 = probs[:, 1]  # Only positives
        precision, recall, pr_thresh = precision_recall_curve(y_test_c, prob1)
        roc = roc_auc_score(y_test_c, prob1)
        roc_list.append(roc)
        mcc = matthews_corrcoef(y_test_c, y_pred)  # MWB
        mcc_list.append(mcc)
        f2 = fbeta_score(y_test_c, y_pred, beta=2.0, average=None)[1]
        pr_auc = auc(recall, precision)
        pr_list.append(pr_auc)
        avg_precision = average_precision_score(y_test_c, prob1)
#        print(f'ROC: {roc}')
#        print(f'PR_AUC: {pr_auc}')
#        print(f'avg_prec: {avg_precision}')
        
    print(f'MCC: {mcc_list}')
    print(f'ROC: {roc_list}')
    print(f'PR_AUC: {pr_list}')
    return np.mean(mcc_list)

sampler = TPESampler(seed=7)
study = optuna.create_study(sampler=sampler,direction='maximize')
#study.optimize(objective,n_trials=60)
study.optimize(objective,n_trials=25)
#study.optimize(objective,n_trials=5)

[32m[I 2021-12-10 17:00:14,407][0m A new study created in memory with name: no-name-256eac24-d686-45cd-a14b-f54c07ede565[0m
[32m[I 2021-12-10 17:00:14,775][0m Trial 0 finished with value: 0.28638151657661026 and parameters: {'n_estimators': 24, 'subsample': 0.955983758448023, 'max_depth': 10, 'max_features': 17, 'learning_rate': 0.1973587414395923, 'min_samples_leaf': 27}. Best is trial 0 with value: 0.28638151657661026.[0m


MCC: [0.30564699604649737, 0.3074230828968115, 0.2695662766728247, 0.26288971069030764]
ROC: [0.8878549254183388, 0.8728918458920006, 0.8537211821135695, 0.8154664484451718]
PR_AUC: [0.21174375648411567, 0.11391911428393207, 0.11943774453643656, 0.10796089462368812]


[32m[I 2021-12-10 17:00:15,385][0m Trial 1 finished with value: 0.27750364589252063 and parameters: {'n_estimators': 105, 'subsample': 0.8144102266719524, 'max_depth': 7, 'max_features': 15, 'learning_rate': 0.16150759953451288, 'min_samples_leaf': 41}. Best is trial 0 with value: 0.28638151657661026.[0m


MCC: [0.2773782408833501, 0.31280523508710295, 0.25501715853810003, 0.26481394906152955]
ROC: [0.8810176421594609, 0.8896178245396874, 0.849002011449791, 0.8234982117960842]
PR_AUC: [0.19859072731377486, 0.1360634724770983, 0.13353289660387305, 0.12231766446422083]


[32m[I 2021-12-10 17:00:16,041][0m Trial 2 finished with value: 0.2800571664895577 and parameters: {'n_estimators': 82, 'subsample': 0.813187269381181, 'max_depth': 8, 'max_features': 20, 'learning_rate': 0.10560624242958987, 'min_samples_leaf': 23}. Best is trial 0 with value: 0.28638151657661026.[0m


MCC: [0.30072766327696615, 0.3028622148131213, 0.2595724776179845, 0.25706631025015886]
ROC: [0.8757931400015144, 0.8738511527154572, 0.851152715457218, 0.8127841425713767]
PR_AUC: [0.17453576314632532, 0.11967299576027038, 0.12917406086619937, 0.12118153883307348]


[32m[I 2021-12-10 17:00:17,173][0m Trial 3 finished with value: 0.2659566751114252 and parameters: {'n_estimators': 187, 'subsample': 0.8049798455100696, 'max_depth': 13, 'max_features': 20, 'learning_rate': 0.10763634548251577, 'min_samples_leaf': 28}. Best is trial 0 with value: 0.28638151657661026.[0m


MCC: [0.283348774594879, 0.3114421247815641, 0.23081299734533872, 0.23822280372391888]
ROC: [0.8804119027788294, 0.8763267832276034, 0.8382330187219557, 0.7950688003879494]
PR_AUC: [0.18406420596844794, 0.14169398299675154, 0.11508342518668971, 0.13239290793948033]


[32m[I 2021-12-10 17:00:18,392][0m Trial 4 finished with value: 0.26289392299010517 and parameters: {'n_estimators': 183, 'subsample': 0.8266338891518501, 'max_depth': 12, 'max_features': 18, 'learning_rate': 0.16028158890606967, 'min_samples_leaf': 24}. Best is trial 0 with value: 0.28638151657661026.[0m


MCC: [0.2739049406284071, 0.2832468432262327, 0.25016824128780485, 0.24425566681797606]
ROC: [0.8694631634739154, 0.8709732322450875, 0.843694878539378, 0.790719524762078]
PR_AUC: [0.16942338277913183, 0.12435985285046519, 0.11754659466137361, 0.11394364437670512]


[32m[I 2021-12-10 17:00:18,848][0m Trial 5 finished with value: 0.2881502092719719 and parameters: {'n_estimators': 49, 'subsample': 0.8981531778182141, 'max_depth': 9, 'max_features': 15, 'learning_rate': 0.12390684629367116, 'min_samples_leaf': 42}. Best is trial 5 with value: 0.2881502092719719.[0m


MCC: [0.2889949203108434, 0.3272568687521583, 0.2695662766728247, 0.2667827713520612]
ROC: [0.8908760505792382, 0.8894785703233793, 0.8545102893393162, 0.8414711765775595]
PR_AUC: [0.1921833835779145, 0.13353530269831138, 0.14538755819220972, 0.1265279606177426]


[32m[I 2021-12-10 17:00:20,193][0m Trial 6 finished with value: 0.2653098239338548 and parameters: {'n_estimators': 156, 'subsample': 0.8627989354425325, 'max_depth': 13, 'max_features': 13, 'learning_rate': 0.13434115190556806, 'min_samples_leaf': 18}. Best is trial 5 with value: 0.2881502092719719.[0m


MCC: [0.2896005266272599, 0.27015817065381187, 0.24569227831435164, 0.2557883201399959]
ROC: [0.871901264480957, 0.8693331270307907, 0.8334983753674764, 0.7947808692489543]
PR_AUC: [0.16640952723065733, 0.11844134921045493, 0.11331439679841056, 0.11929506622552678]


[32m[I 2021-12-10 17:00:20,986][0m Trial 7 finished with value: 0.280838704001411 and parameters: {'n_estimators': 135, 'subsample': 0.8740702165976071, 'max_depth': 11, 'max_features': 17, 'learning_rate': 0.12955901949366017, 'min_samples_leaf': 46}. Best is trial 5 with value: 0.2881502092719719.[0m


MCC: [0.2797385149484949, 0.33105492854125085, 0.24514225182797791, 0.26741912068792023]
ROC: [0.8772317710305142, 0.8875290112950642, 0.8512919696735263, 0.8340758925865308]
PR_AUC: [0.19005361226200096, 0.13255169906775877, 0.11994163782686902, 0.12285939485588286]


[32m[I 2021-12-10 17:00:21,445][0m Trial 8 finished with value: 0.28426236628066226 and parameters: {'n_estimators': 44, 'subsample': 0.9482237745826528, 'max_depth': 10, 'max_features': 14, 'learning_rate': 0.15612558423606065, 'min_samples_leaf': 27}. Best is trial 5 with value: 0.2881502092719719.[0m


MCC: [0.29143175781897634, 0.3058547968119677, 0.275593157398989, 0.26416975309271595]
ROC: [0.8764140228666616, 0.8800711743772242, 0.8472690700912889, 0.8110565557374069]
PR_AUC: [0.1986954562573955, 0.12105158885544196, 0.1103951108926426, 0.12552688895890338]


[32m[I 2021-12-10 17:00:22,099][0m Trial 9 finished with value: 0.2772222194233315 and parameters: {'n_estimators': 89, 'subsample': 0.8002853761125517, 'max_depth': 4, 'max_features': 17, 'learning_rate': 0.14292147161182367, 'min_samples_leaf': 35}. Best is trial 5 with value: 0.2881502092719719.[0m


MCC: [0.26816044385035576, 0.33717064681497627, 0.24903640062218735, 0.2545213864058067]
ROC: [0.8776557885969561, 0.8818041157357264, 0.8402444685130744, 0.8020094562647754]
PR_AUC: [0.171020037840858, 0.1257302784291565, 0.1337419874683443, 0.1303263096467813]


[32m[I 2021-12-10 17:00:23,187][0m Trial 10 finished with value: 0.2722770327940858 and parameters: {'n_estimators': 55, 'subsample': 0.924763336987846, 'max_depth': 18, 'max_features': 10, 'learning_rate': 0.09172353354166263, 'min_samples_leaf': 4}. Best is trial 5 with value: 0.2881502092719719.[0m


MCC: [0.3136269595785885, 0.23708240205691286, 0.2696596878789847, 0.2687390816618573]
ROC: [0.8721132732641781, 0.843014080148538, 0.8537830728763731, 0.8116930350972905]
PR_AUC: [0.15260063939365415, 0.10784648148468982, 0.11794764128879322, 0.12801705692250234]


[32m[I 2021-12-10 17:00:23,541][0m Trial 11 finished with value: 0.27469553790475526 and parameters: {'n_estimators': 16, 'subsample': 0.9924233767166261, 'max_depth': 17, 'max_features': 12, 'learning_rate': 0.19602237319055124, 'min_samples_leaf': 11}. Best is trial 5 with value: 0.2881502092719719.[0m


MCC: [0.31676413639788886, 0.2678972443651847, 0.26548730955636696, 0.2486334612995805]
ROC: [0.8777012190505035, 0.8602816029707566, 0.8460467275259167, 0.7950384918470025]
PR_AUC: [0.16508978837171898, 0.11218327584169645, 0.11610725143945672, 0.09862933684209299]


[32m[I 2021-12-10 17:00:23,864][0m Trial 12 finished with value: 0.29693911595138367 and parameters: {'n_estimators': 10, 'subsample': 0.9584495026303259, 'max_depth': 3, 'max_features': 16, 'learning_rate': 0.19834120666704083, 'min_samples_leaf': 37}. Best is trial 12 with value: 0.29693911595138367.[0m


MCC: [0.31300693644398986, 0.3123602063773523, 0.28536963110328467, 0.2770196898809078]
ROC: [0.8824941318997501, 0.8862138325854866, 0.8573263190468823, 0.8295523428502153]
PR_AUC: [0.18991168842629513, 0.1262747165827653, 0.12773348694010794, 0.12546650936864523]


[32m[I 2021-12-10 17:00:24,404][0m Trial 13 finished with value: 0.28881526846544214 and parameters: {'n_estimators': 57, 'subsample': 0.9076814971706537, 'max_depth': 3, 'max_features': 15, 'learning_rate': 0.17357284938843362, 'min_samples_leaf': 49}. Best is trial 12 with value: 0.29693911595138367.[0m


MCC: [0.29143175781897634, 0.33863858269171027, 0.2725494220140698, 0.25264131133701206]
ROC: [0.8851896721435603, 0.8899118056630049, 0.855067306204549, 0.8518518518518519]
PR_AUC: [0.17894938815497488, 0.13858376201823155, 0.12896589854261398, 0.12147929387845702]


[32m[I 2021-12-10 17:00:24,734][0m Trial 14 finished with value: 0.2981591994406853 and parameters: {'n_estimators': 11, 'subsample': 0.9856209318135866, 'max_depth': 3, 'max_features': 12, 'learning_rate': 0.18022130213787646, 'min_samples_leaf': 47}. Best is trial 14 with value: 0.2981591994406853.[0m


MCC: [0.31424942901532776, 0.3123602063773523, 0.2890074724891535, 0.2770196898809078]
ROC: [0.8773529189066405, 0.9067460931455981, 0.8610629738511527, 0.864141965205795]
PR_AUC: [0.16067972498768313, 0.18514549498878874, 0.12978628602025918, 0.12093543986386726]


[32m[I 2021-12-10 17:00:25,173][0m Trial 15 finished with value: 0.28915252446101947 and parameters: {'n_estimators': 18, 'subsample': 0.9998277518480155, 'max_depth': 5, 'max_features': 11, 'learning_rate': 0.18167158322018723, 'min_samples_leaf': 36}. Best is trial 14 with value: 0.2981591994406853.[0m


MCC: [0.31300693644398986, 0.3136711607209051, 0.2637726777754649, 0.266159322903718]
ROC: [0.8898008631786174, 0.8851152715457218, 0.8516401052142967, 0.8108822816269625]
PR_AUC: [0.1746705277044396, 0.12902650311364286, 0.12129484399976402, 0.11932911529598647]


[32m[I 2021-12-10 17:00:25,850][0m Trial 16 finished with value: 0.26736732515205314 and parameters: {'n_estimators': 75, 'subsample': 0.9682757969285061, 'max_depth': 6, 'max_features': 13, 'learning_rate': 0.18272928189632393, 'min_samples_leaf': 35}. Best is trial 14 with value: 0.2981591994406853.[0m


MCC: [0.2658987476193031, 0.3093173878304099, 0.23909968222112538, 0.2551534829373742]
ROC: [0.8753994094041039, 0.8714528856568158, 0.8420393006343804, 0.8090713463053888]
PR_AUC: [0.18229816360998197, 0.11937964118575975, 0.11816470524661335, 0.12806792916366633]


[32m[I 2021-12-10 17:00:26,180][0m Trial 17 finished with value: 0.2983154329226311 and parameters: {'n_estimators': 11, 'subsample': 0.9803712313231584, 'max_depth': 3, 'max_features': 10, 'learning_rate': 0.1832536562134998, 'min_samples_leaf': 50}. Best is trial 17 with value: 0.2983154329226311.[0m


MCC: [0.3148743629431107, 0.3123602063773523, 0.2890074724891535, 0.2770196898809078]
ROC: [0.883001438631029, 0.8885811542627262, 0.8571483831038218, 0.8579590228526397]
PR_AUC: [0.16024961151071324, 0.15463232655637052, 0.1493048171628013, 0.11607790109348942]


[32m[I 2021-12-10 17:00:26,589][0m Trial 18 finished with value: 0.2860094291796264 and parameters: {'n_estimators': 36, 'subsample': 0.9806692309066943, 'max_depth': 6, 'max_features': 10, 'learning_rate': 0.17521314425168996, 'min_samples_leaf': 50}. Best is trial 17 with value: 0.2983154329226311.[0m


MCC: [0.2854104781455702, 0.3176713038008203, 0.27541746662263034, 0.2655384681494848]
ROC: [0.8917543726811539, 0.8871112486461395, 0.8495977100417762, 0.851692732011881]
PR_AUC: [0.1866074445970391, 0.13373986898699158, 0.11682233347566653, 0.1242549586335781]


[32m[I 2021-12-10 17:00:27,399][0m Trial 19 finished with value: 0.28161481186421145 and parameters: {'n_estimators': 115, 'subsample': 0.9347810722414596, 'max_depth': 15, 'max_features': 11, 'learning_rate': 0.15068865800455267, 'min_samples_leaf': 44}. Best is trial 17 with value: 0.2983154329226311.[0m


MCC: [0.28153298172148183, 0.3408660169853871, 0.25016824128780485, 0.253892007462172]
ROC: [0.8763534489285985, 0.8867244313786168, 0.8500541544174531, 0.8141480269139844]
PR_AUC: [0.1932155637677079, 0.13288800437179094, 0.13799648069244008, 0.1215687407876066]


[32m[I 2021-12-10 17:00:27,924][0m Trial 20 finished with value: 0.2869659654221257 and parameters: {'n_estimators': 64, 'subsample': 0.9759250412281357, 'max_depth': 5, 'max_features': 12, 'learning_rate': 0.18627552525260013, 'min_samples_leaf': 50}. Best is trial 17 with value: 0.2983154329226311.[0m


MCC: [0.2865960379163445, 0.3364416994558284, 0.2632054780498593, 0.2616206462664706]
ROC: [0.8860452790187021, 0.8898499149002012, 0.849303728918459, 0.8375916833363642]
PR_AUC: [0.1961300446117215, 0.13523964162906543, 0.11940929087575686, 0.11897219439999893]


[32m[I 2021-12-10 17:00:28,283][0m Trial 21 finished with value: 0.2975461385423037 and parameters: {'n_estimators': 10, 'subsample': 0.956789178953835, 'max_depth': 3, 'max_features': 10, 'learning_rate': 0.19621793637947094, 'min_samples_leaf': 39}. Best is trial 17 with value: 0.2983154329226311.[0m


MCC: [0.3136269595785885, 0.3123602063773523, 0.2871776983323663, 0.2770196898809078]
ROC: [0.8811690770046188, 0.8980117592449327, 0.8601423487544485, 0.801918530641935]
PR_AUC: [0.14659414886367986, 0.17369762709797562, 0.13492502871597675, 0.10744188683341845]


[32m[I 2021-12-10 17:00:28,716][0m Trial 22 finished with value: 0.2862832899639957 and parameters: {'n_estimators': 34, 'subsample': 0.9837955856439575, 'max_depth': 3, 'max_features': 10, 'learning_rate': 0.16959633554912465, 'min_samples_leaf': 42}. Best is trial 17 with value: 0.2983154329226311.[0m


MCC: [0.3031999306678697, 0.3136711607209051, 0.26664142220073755, 0.2616206462664706]
ROC: [0.8896039978799122, 0.8854711434318427, 0.8518335138480582, 0.8145117294053463]
PR_AUC: [0.18453309775139434, 0.1323306240331567, 0.13736964808675753, 0.12405138731171858]


[32m[I 2021-12-10 17:00:29,025][0m Trial 23 finished with value: 0.2983154329226311 and parameters: {'n_estimators': 10, 'subsample': 0.9397223442136705, 'max_depth': 20, 'max_features': 11, 'learning_rate': 0.18885759166233912, 'min_samples_leaf': 45}. Best is trial 17 with value: 0.2983154329226311.[0m


MCC: [0.3148743629431107, 0.3123602063773523, 0.2890074724891535, 0.2770196898809078]
ROC: [0.8798440221094873, 0.9056165867244313, 0.8503171901593687, 0.8599063466084743]
PR_AUC: [0.13950454331950254, 0.18673299320683023, 0.1317132429892332, 0.11677633882152202]


[32m[I 2021-12-10 17:00:29,451][0m Trial 24 finished with value: 0.2873656249449844 and parameters: {'n_estimators': 31, 'subsample': 0.9312220930123101, 'max_depth': 16, 'max_features': 12, 'learning_rate': 0.1866563240882886, 'min_samples_leaf': 46}. Best is trial 17 with value: 0.2983154329226311.[0m


MCC: [0.29341404627307777, 0.31632653068859906, 0.27541746662263034, 0.26430445619563053]
ROC: [0.8924282577421065, 0.8837536747640415, 0.8516787869410491, 0.8555116081711825]
PR_AUC: [0.215738744105224, 0.12912321789301826, 0.1146087670953535, 0.14299037641886148]
CPU times: user 15 s, sys: 249 ms, total: 15.2 s
Wall time: 15 s


In [14]:
best = study.best_trial
best

FrozenTrial(number=17, values=[0.2983154329226311], datetime_start=datetime.datetime(2021, 12, 10, 17, 0, 25, 851732), datetime_complete=datetime.datetime(2021, 12, 10, 17, 0, 26, 179753), params={'n_estimators': 11, 'subsample': 0.9803712313231584, 'max_depth': 3, 'max_features': 10, 'learning_rate': 0.1832536562134998, 'min_samples_leaf': 50}, distributions={'n_estimators': IntUniformDistribution(high=200, low=10, step=1), 'subsample': UniformDistribution(high=1.0, low=0.8), 'max_depth': IntUniformDistribution(high=20, low=3, step=1), 'max_features': IntUniformDistribution(high=20, low=10, step=1), 'learning_rate': UniformDistribution(high=0.2, low=0.08), 'min_samples_leaf': IntUniformDistribution(high=50, low=1, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=17, state=TrialState.COMPLETE, value=None)

In [15]:
%%time
gb_params = study.best_params
#gb_params['random_state'] = 42
gb = GradientBoostingClassifier(**gb_params)
gb.fit(X_rus, y_rus)
y_pred = gb.predict(X_test)
probs = gb.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[1350  336]
 [   5   51]]
              precision    recall  f1-score   support

           0       1.00      0.80      0.89      1686
           1       0.13      0.91      0.23        56

    accuracy                           0.80      1742
   macro avg       0.56      0.86      0.56      1742
weighted avg       0.97      0.80      0.87      1742

ROC AUC prob1: 0.8833990001694627
MCC: 0.30187518867674695
F2: 0.4173486088379706
PR_AUC: 0.17095294177771772
CPU times: user 32 ms, sys: 1 ms, total: 33 ms
Wall time: 31.6 ms


In [16]:
%%time
gb_params = study.best_params
gb_params['random_state'] = 42
gb_weight = GradientBoostingClassifier(**gb_params)
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[1370  316]
 [  17   39]]
              precision    recall  f1-score   support

           0       0.99      0.81      0.89      1686
           1       0.11      0.70      0.19        56

    accuracy                           0.81      1742
   macro avg       0.55      0.75      0.54      1742
weighted avg       0.96      0.81      0.87      1742

ROC AUC prob1: 0.8421136248093544
MCC: 0.22288995029478373
F2: 0.3367875647668394
PR_AUC: 0.11925043631444184
CPU times: user 64.7 ms, sys: 5.04 ms, total: 69.7 ms
Wall time: 67.3 ms


In [17]:
# No Sitenum - 5
coeffs_wgt = pd.Series(data=gb_weight.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs_wgt[0:20]

Delmode           0.291215
spontlabor        0.195732
Parity            0.090745
TrialLabor        0.078018
Induction         0.047108
MthInd_Oxy        0.035118
HxnumCS           0.027300
prelaborCD        0.026633
new_high_BMI      0.024017
CS_FTP            0.023671
new_BMI           0.021933
Delfetalpos       0.013585
Dilat_lst         0.012538
AdmSBP            0.011563
new_age           0.010891
uscar             0.010769
Lac_None          0.010673
CS_NRFHT          0.010667
Admreason         0.010346
high_Gravidity    0.007625
dtype: float64

In [1]:
plt.rcParams["figure.figsize"] = (14, 8)
x_labels = coeffs_wgt.index.values[0:25]
print(x_labels)
fig, ax = plt.subplots(1,1)  # Create a figure and an axes.
ax.set_title('Top Predictors: All Sites combined; target: High_EBLoss, vars: PI')
ax.bar(x_labels, coeffs_wgt.values[0:25])
ax.set_ylabel('Coeff')
ax.set_xlabel('Variable')
plt.draw()
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=16)
#plt.xlim(1,20)
plt.show()

NameError: name 'plt' is not defined

In [19]:
# No Sitenum - 5
coeffs = pd.Series(data=gb.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs[0:20]

Delmode           0.827671
spontlabor        0.097039
new_age           0.032086
Augment           0.012469
Lac_None          0.009450
BESTGA            0.006233
ROMmeth           0.005280
Admcontract       0.005178
high_Gravidity    0.003397
momrace_new       0.001198
Hxcsection        0.000000
Hxanemia          0.000000
Hxasthma          0.000000
HxGIdis           0.000000
Hostype           0.000000
HospElectInd      0.000000
Hxdepression      0.000000
HxnumCS           0.000000
Hxheartdis        0.000000
Hxmacrosomia      0.000000
dtype: float64

In [21]:
gb_params # 5 runs

{'n_estimators': 11,
 'subsample': 0.9803712313231584,
 'max_depth': 3,
 'max_features': 10,
 'learning_rate': 0.1832536562134998,
 'min_samples_leaf': 50,
 'random_state': 42}

In [2]:
from optuna.visualization import plot_optimization_history

plotly_config = {"staticPlot": True}

fig = plot_optimization_history(study)
fig.show(config=plotly_config)

NameError: name 'study' is not defined

In [None]:
from optuna.visualization import plot_param_importances

fig = plot_param_importances(study)
fig.show(config=plotly_config)

In [24]:
np.mean([0.3149689279176914, 0.31449457458602254, 0.31066798538596907, 0.3219519143488398])

0.3155208505596307

<h1 align='center'>Thanks and make sure to learn!</h1>