In [1]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
#import plotly.express as px
import seaborn as sns

from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import StratifiedKFold
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, \
                            accuracy_score, f1_score, average_precision_score, \
                            fbeta_score, precision_recall_curve, auc
from sklearn.metrics import matthews_corrcoef

from sklearn.model_selection import train_test_split

import optuna
from optuna.samplers import TPESampler
sns.set(style='white', context='notebook', palette='deep')
%config Completer.use_jedi = False

In [2]:
csl_df = pd.read_csv('../../data/csl/Sites/CSL_he_PI_s51.csv', index_col=0)
X = csl_df.drop(['high_EBL','Sitenum'], axis=1, inplace=False)
y = csl_df['high_EBL'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=7)
X_train                                                    

Unnamed: 0_level_0,transfus_yes,Accrete,AdmBishop,Admcervpos,Admconsistency,Admcontract,AdmDBP,Admefface,Admpresent,AdmSBP,...,TD_nos,ThreatenedPB,threatpb9,UnspecHBP,uscar,version9,new_age,new_BMI,new_high_Age,new_high_BMI
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
51-05748,0,0,8,8,8,99.0,84,0.0,77,120.0,...,0,0,0,0,1,0,40,24,2,1
51-15063,0,0,8,8,8,99.0,74,70.0,1,124.4,...,0,0,0,0,0,0,25,28,1,1
51-14676,0,0,8,8,8,99.0,74,70.0,1,124.4,...,0,0,0,0,0,0,33,23,1,1
51-03203,0,0,8,8,8,99.0,74,0.0,77,124.4,...,0,0,0,0,0,0,34,18,1,0
51-08914,0,0,8,8,8,2.5,80,60.0,1,130.0,...,0,0,0,0,1,0,40,27,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51-03948,0,0,8,8,8,99.0,74,100.0,77,124.4,...,0,0,0,0,0,0,43,21,2,1
51-00031,0,0,8,8,8,99.0,74,100.0,1,124.4,...,0,0,0,0,0,0,23,17,1,0
51-07760,0,0,8,8,8,99.0,74,80.0,1,124.4,...,0,0,0,0,0,0,32,21,1,1
51-10263,0,0,8,8,8,2.0,74,90.0,77,124.4,...,0,0,0,0,0,0,24,24,1,1


In [3]:
X.shape

(14959, 195)

<h1 align='center'>Exploratory Data Analysis</h1>

<h1 align='center'>Modelling</h1>

In [8]:
print(X_train.shape)
print(X.shape)
print(y.shape)

(10471, 195)
(14959, 195)
(14959,)


Here we define our validation set

<h1 align='center'>Modelling Using Undersampling</h1>

We will use `imblearn`'s RandomUnderSampler to undersample from the majority class so that they match

In [9]:
sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
X_rus, y_rus = sampler.fit_resample(X_train, y_train)

# Try with no undersampling
#X_rus, y_rus = X_train, y_train

In [10]:
print(X_rus.shape)
print(y_rus.shape)
print(np.bincount(y_rus))

(200, 195)
(200,)
[100 100]


<h1 align='center'>Basic Gradient Boosting</h1>

In [11]:
%%time
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_rus = GradientBoostingClassifier()   # If not undersampling
gb_rus.fit(X_rus, y_rus)
print(gb_rus.get_params())
y_pred = gb_rus.predict(X_test)
probs = gb_rus.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[3012 1433]
 [  11   32]]
              precision    recall  f1-score   support

           0       1.00      0.68      0.81      4445
           1       0.02      0.74      0.04        43

    accuracy                           0.68      4488
   macro avg       0.51      0.71      0.42      4488
weighted avg       0.99      0.68      0.80      4488

ROC AUC prob1: 0.7528631595469172
MCC: 0.08762745214593817
F2: 0.09773976786805132
PR_AUC: 0.03476844401727625
CPU times: user 170 ms, sys: 6.98 ms, tot

In [34]:
%%time
# Not undersampled
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_weight = GradientBoostingClassifier(random_state=7, verbose=1, n_estimators=400, learning_rate=.1)  
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
print(gb_weight.get_params())
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

      Iter       Train Loss   Remaining Time 
         1           1.3170           12.95s
         2           1.2613           11.67s
         3           1.2163           11.22s
         4           1.1764           10.97s
         5           1.1439           10.72s
         6           1.1145           10.57s
         7           1.0868           10.50s
         8           1.0655           10.43s
         9           1.0451           10.43s
        10           1.0255           10.39s
        20           0.9123           10.11s
        30           0.8403           10.01s
        40           0.7902            9.73s
        50           0.7511            9.50s
        60           0.7133            9.21s
        70           0.6761            8.97s
        80           0.6462            8.72s
        90           0.6191            8.42s
       100           0.5940            8.18s
       200           0.4049            5.43s
       300           0.2923            2.69s
       40

<h1 align='center'>Hyperparameter Tuning with Cross Validation</h1>

In [13]:
%%time
def create_model(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    subsample = trial.suggest_float('subsample', 0.8, 1.0)
    max_depth = trial.suggest_int('max_depth',3,20)
    max_features = trial.suggest_int('max_features',10,20)
    learning_rate = trial.suggest_uniform('learning_rate',0.08,0.2)
    min_samples_leaf = trial.suggest_int('min_samples_leaf',1,50)
    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
                           max_depth=max_depth, learning_rate=learning_rate, 
                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
#                           learning_rate=learning_rate, 
#                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate) 
    return model

def objective(trial):
    model = create_model(trial)
    
    mcc_list = []
    roc_list = []
    pr_list = []
    f2_list = []
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=11)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train_c, X_test_c = X.iloc[train_idx], X.iloc[test_idx]
        y_train_c, y_test_c = y[train_idx], y[test_idx]
        sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
        X_res, y_res = sampler.fit_resample(X_train_c, y_train_c)

#        weights = class_weight.compute_sample_weight('balanced', y=y_train)
#        model.fit(X_train, y_train, sample_weight=weights)
        model.fit(X_res, y_res)
        y_pred = model.predict(X_test_c)
        probs = model.predict_proba(X_test_c)
        prob1 = probs[:, 1]  # Only positives
        precision, recall, pr_thresh = precision_recall_curve(y_test_c, prob1)
        roc = roc_auc_score(y_test_c, prob1)
        roc_list.append(roc)
        mcc = matthews_corrcoef(y_test_c, y_pred)  # MWB
        mcc_list.append(mcc)
        f2 = fbeta_score(y_test_c, y_pred, beta=2.0, average=None)[1]
        pr_auc = auc(recall, precision)
        pr_list.append(pr_auc)
        avg_precision = average_precision_score(y_test_c, prob1)
#        print(f'ROC: {roc}')
#        print(f'PR_AUC: {pr_auc}')
#        print(f'avg_prec: {avg_precision}')
        
    print(f'MCC: {mcc_list}')
    print(f'ROC: {roc_list}')
    print(f'PR_AUC: {pr_list}')
    return np.mean(mcc_list)

sampler = TPESampler(seed=7)
study = optuna.create_study(sampler=sampler,direction='maximize')
#study.optimize(objective,n_trials=60)
study.optimize(objective,n_trials=25)
#study.optimize(objective,n_trials=5)

[32m[I 2021-12-10 16:06:29,566][0m A new study created in memory with name: no-name-cd99b30b-1a1e-4915-9e3a-3d52715ac898[0m
[32m[I 2021-12-10 16:06:30,023][0m Trial 0 finished with value: 0.09877687612773779 and parameters: {'n_estimators': 24, 'subsample': 0.955983758448023, 'max_depth': 10, 'max_features': 17, 'learning_rate': 0.1973587414395923, 'min_samples_leaf': 27}. Best is trial 0 with value: 0.09877687612773779.[0m


MCC: [0.09345554365545881, 0.1207972495510677, 0.08013528913425619, 0.10071942217016845]
ROC: [0.7621265298776098, 0.7688909887209023, 0.6829103671706264, 0.7392008639308856]
PR_AUC: [0.023715263948287393, 0.02532114163205567, 0.02033336421774882, 0.031037953921590505]


[32m[I 2021-12-10 16:06:30,749][0m Trial 1 finished with value: 0.09603059469416647 and parameters: {'n_estimators': 105, 'subsample': 0.8144102266719524, 'max_depth': 7, 'max_features': 15, 'learning_rate': 0.16150759953451288, 'min_samples_leaf': 41}. Best is trial 0 with value: 0.09877687612773779.[0m


MCC: [0.10006831162634375, 0.11304472779599739, 0.06912057299161206, 0.1018887663627127]
ROC: [0.7436255099592032, 0.7751417386609071, 0.6605246580273578, 0.7558238198087011]
PR_AUC: [0.024544059094195114, 0.026369186640705183, 0.021021157958250835, 0.025903559418023074]


[32m[I 2021-12-10 16:06:31,419][0m Trial 2 finished with value: 0.08878718647504173 and parameters: {'n_estimators': 82, 'subsample': 0.813187269381181, 'max_depth': 8, 'max_features': 20, 'learning_rate': 0.10560624242958987, 'min_samples_leaf': 23}. Best is trial 0 with value: 0.09877687612773779.[0m


MCC: [0.0800884600952329, 0.10799135893257612, 0.07490788562314811, 0.09216104124920982]
ROC: [0.7358711303095752, 0.7681035517158628, 0.6688114950803935, 0.7500231410058624]
PR_AUC: [0.021317314226503157, 0.023324107727558295, 0.01929473364908045, 0.025292143043985485]


[32m[I 2021-12-10 16:06:32,541][0m Trial 3 finished with value: 0.07874330161224209 and parameters: {'n_estimators': 187, 'subsample': 0.8049798455100696, 'max_depth': 13, 'max_features': 20, 'learning_rate': 0.10763634548251577, 'min_samples_leaf': 28}. Best is trial 0 with value: 0.09877687612773779.[0m


MCC: [0.07823760196024632, 0.09495403666345324, 0.06377604169083569, 0.07800552613443314]
ROC: [0.7233696304295657, 0.7464002879769618, 0.6488930885529157, 0.7397562480715828]
PR_AUC: [0.01918017798199114, 0.021631973573335232, 0.023162383824927923, 0.023894895846822257]


[32m[I 2021-12-10 16:06:33,734][0m Trial 4 finished with value: 0.061957919225733876 and parameters: {'n_estimators': 183, 'subsample': 0.8266338891518501, 'max_depth': 12, 'max_features': 18, 'learning_rate': 0.16028158890606967, 'min_samples_leaf': 24}. Best is trial 0 with value: 0.09877687612773779.[0m


MCC: [0.052793616428204626, 0.07871210393649618, 0.06190907986852824, 0.054416876669706446]
ROC: [0.6957718382529398, 0.7204898608111351, 0.646553275737941, 0.7122724467756865]
PR_AUC: [0.01710736672272439, 0.01952632208156295, 0.02082195469765876, 0.024764862267923814]


[32m[I 2021-12-10 16:06:34,234][0m Trial 5 finished with value: 0.10711949820396768 and parameters: {'n_estimators': 49, 'subsample': 0.8981531778182141, 'max_depth': 9, 'max_features': 15, 'learning_rate': 0.12390684629367116, 'min_samples_leaf': 42}. Best is trial 5 with value: 0.10711949820396768.[0m


MCC: [0.10825171311060451, 0.12464938496231993, 0.09144361413542253, 0.10413328060752375]
ROC: [0.7562694984401248, 0.7803575713942884, 0.7009501739860811, 0.7488815180499846]
PR_AUC: [0.02722507447417112, 0.02647205060511469, 0.025376946840630382, 0.057736528800613064]


[32m[I 2021-12-10 16:06:35,403][0m Trial 6 finished with value: 0.07345826378610718 and parameters: {'n_estimators': 156, 'subsample': 0.8627989354425325, 'max_depth': 13, 'max_features': 13, 'learning_rate': 0.13434115190556806, 'min_samples_leaf': 18}. Best is trial 5 with value: 0.10711949820396768.[0m


MCC: [0.08740815542713692, 0.08522671517552687, 0.06066253894682077, 0.06053564559494409]
ROC: [0.7287991960643149, 0.7143778497720182, 0.6594522438204944, 0.7264116013576056]
PR_AUC: [0.01945438150776192, 0.01848980883175195, 0.02138362278765688, 0.02225660890284511]


[32m[I 2021-12-10 16:06:36,195][0m Trial 7 finished with value: 0.1020499541484101 and parameters: {'n_estimators': 135, 'subsample': 0.8740702165976071, 'max_depth': 11, 'max_features': 17, 'learning_rate': 0.12955901949366017, 'min_samples_leaf': 46}. Best is trial 5 with value: 0.10711949820396768.[0m


MCC: [0.10682417339875595, 0.11946917961800371, 0.0740068337653269, 0.10789962981155382]
ROC: [0.7384059275257979, 0.7784714722822174, 0.660134689224862, 0.7586933045356371]
PR_AUC: [0.023299259805989306, 0.026775963649040013, 0.020625826776789723, 0.03018258477236989]


[32m[I 2021-12-10 16:06:36,719][0m Trial 8 finished with value: 0.09628798100171593 and parameters: {'n_estimators': 44, 'subsample': 0.9482237745826528, 'max_depth': 10, 'max_features': 14, 'learning_rate': 0.15612558423606065, 'min_samples_leaf': 27}. Best is trial 5 with value: 0.10711949820396768.[0m


MCC: [0.09334864751848389, 0.12508858095786646, 0.06599527336034496, 0.10071942217016845]
ROC: [0.757949364050876, 0.7635514158867289, 0.6589947804175665, 0.7547670472076519]
PR_AUC: [0.0251086908513205, 0.024472816196685215, 0.021580766856910346, 0.02698332763467279]


[32m[I 2021-12-10 16:06:37,375][0m Trial 9 finished with value: 0.09625387572940011 and parameters: {'n_estimators': 89, 'subsample': 0.8002853761125517, 'max_depth': 4, 'max_features': 17, 'learning_rate': 0.14292147161182367, 'min_samples_leaf': 35}. Best is trial 5 with value: 0.10711949820396768.[0m


MCC: [0.09814592642666467, 0.11393533285029439, 0.07295779934118837, 0.09997644429945303]
ROC: [0.7437230021598273, 0.7634539236861051, 0.6725011999040076, 0.7575208269052762]
PR_AUC: [0.02347234181561583, 0.022902169616947944, 0.020455513569683805, 0.025657947382519357]


[32m[I 2021-12-10 16:06:38,412][0m Trial 10 finished with value: 0.08826609375790545 and parameters: {'n_estimators': 55, 'subsample': 0.924763336987846, 'max_depth': 18, 'max_features': 10, 'learning_rate': 0.09172353354166263, 'min_samples_leaf': 4}. Best is trial 5 with value: 0.10711949820396768.[0m


MCC: [0.08196482698192745, 0.11296422663939026, 0.07810938765253791, 0.08002593375776615]
ROC: [0.7362161027117831, 0.8180345572354212, 0.7046661267098632, 0.7389385991977785]
PR_AUC: [0.02744401810023067, 0.047021186463637024, 0.03101204991471714, 0.027708907958135662]


[32m[I 2021-12-10 16:06:39,186][0m Trial 11 finished with value: 0.10499081934068019 and parameters: {'n_estimators': 140, 'subsample': 0.8806671160017245, 'max_depth': 17, 'max_features': 13, 'learning_rate': 0.12218635350714935, 'min_samples_leaf': 49}. Best is trial 5 with value: 0.10711949820396768.[0m


MCC: [0.10918864026163656, 0.12121661371972173, 0.08097024573936248, 0.10858777764200003]
ROC: [0.7328338732901367, 0.7829598632109431, 0.6668916486681064, 0.7565643319962975]
PR_AUC: [0.021482534182377683, 0.028320579317428915, 0.02215338486542846, 0.03424977352391308]


[32m[I 2021-12-10 16:06:39,999][0m Trial 12 finished with value: 0.09745275427294982 and parameters: {'n_estimators': 139, 'subsample': 0.9983488269913793, 'max_depth': 20, 'max_features': 12, 'learning_rate': 0.11731011779041688, 'min_samples_leaf': 48}. Best is trial 5 with value: 0.10711949820396768.[0m


MCC: [0.089315190594211, 0.11768010422837598, 0.07922674206046074, 0.10358898020875155]
ROC: [0.7327288816894649, 0.7765853731701464, 0.6793856491480681, 0.7514308855291576]
PR_AUC: [0.022276625178666753, 0.02575143187676276, 0.021284588850505437, 0.027780939051430076]


[32m[I 2021-12-10 16:06:40,565][0m Trial 13 finished with value: 0.10524295265311535 and parameters: {'n_estimators': 65, 'subsample': 0.8871894694183349, 'max_depth': 16, 'max_features': 11, 'learning_rate': 0.08283159061684342, 'min_samples_leaf': 38}. Best is trial 5 with value: 0.10711949820396768.[0m


MCC: [0.10876145307831238, 0.12274394047633493, 0.08525129062195315, 0.10421512643586094]
ROC: [0.756138258939285, 0.7673086153107751, 0.6918571514278856, 0.7563676334464672]
PR_AUC: [0.024105862370953784, 0.022933320991678923, 0.021385748913678185, 0.06697165915335768]


[32m[I 2021-12-10 16:06:40,936][0m Trial 14 finished with value: 0.10660603320394925 and parameters: {'n_estimators': 11, 'subsample': 0.900795069380725, 'max_depth': 17, 'max_features': 10, 'learning_rate': 0.08368180211045036, 'min_samples_leaf': 39}. Best is trial 5 with value: 0.10711949820396768.[0m


MCC: [0.11009822804007668, 0.12013059276906828, 0.09412383251568268, 0.10207147949096933]
ROC: [0.7964962802975762, 0.7960200683945283, 0.7076021418286537, 0.7910829990743599]
PR_AUC: [0.07733149678289637, 0.040724524099221476, 0.06397391272996579, 0.09003879271350271]


[32m[I 2021-12-10 16:06:41,402][0m Trial 15 finished with value: 0.10640420928803475 and parameters: {'n_estimators': 27, 'subsample': 0.9188326849130758, 'max_depth': 15, 'max_features': 10, 'learning_rate': 0.09646367993844544, 'min_samples_leaf': 33}. Best is trial 5 with value: 0.10711949820396768.[0m


MCC: [0.10944601302736129, 0.12274394047633493, 0.09075312730730595, 0.10267375634113686]
ROC: [0.7798551115910727, 0.765467512598992, 0.7176700863930885, 0.7648488120950323]
PR_AUC: [0.028004134276603632, 0.022747388623089823, 0.024285266311471817, 0.033989189744643364]


[32m[I 2021-12-10 16:06:41,806][0m Trial 16 finished with value: 0.10694997643707634 and parameters: {'n_estimators': 16, 'subsample': 0.9072222961136597, 'max_depth': 4, 'max_features': 15, 'learning_rate': 0.18432802855034425, 'min_samples_leaf': 42}. Best is trial 5 with value: 0.10711949820396768.[0m


MCC: [0.11293549968109234, 0.12071362755961727, 0.08829174531709406, 0.10585903319050166]
ROC: [0.7794201463882889, 0.7750704943604512, 0.702967512598992, 0.7667579450786794]
PR_AUC: [0.029962773057236637, 0.024576287729439705, 0.024136862500317442, 0.07464888836896615]


[32m[I 2021-12-10 16:06:42,414][0m Trial 17 finished with value: 0.0905398850615406 and parameters: {'n_estimators': 37, 'subsample': 0.844657081309488, 'max_depth': 3, 'max_features': 15, 'learning_rate': 0.19766585955873117, 'min_samples_leaf': 13}. Best is trial 5 with value: 0.10711949820396768.[0m


MCC: [0.1038807939496881, 0.10178720732967979, 0.0699692978653452, 0.0865222411014493]
ROC: [0.7595017398608112, 0.7710958123350131, 0.6902147828173746, 0.7330414995371799]
PR_AUC: [0.02791732243047985, 0.032982819234597026, 0.06295330579167623, 0.023088566541293584]


[32m[I 2021-12-10 16:06:42,746][0m Trial 18 finished with value: 0.1058672680452148 and parameters: {'n_estimators': 10, 'subsample': 0.9144676074902556, 'max_depth': 5, 'max_features': 16, 'learning_rate': 0.18082454795888583, 'min_samples_leaf': 44}. Best is trial 5 with value: 0.10711949820396768.[0m


MCC: [0.11009822804007668, 0.12013059276906828, 0.09116877188074493, 0.10207147949096933]
ROC: [0.7839235361171107, 0.8067817074634028, 0.707013438924886, 0.7614123727244677]
PR_AUC: [0.056681978631156385, 0.026723242757989308, 0.03689870849418428, 0.05817028400765567]


[32m[I 2021-12-10 16:06:43,370][0m Trial 19 finished with value: 0.08637926222209447 and parameters: {'n_estimators': 65, 'subsample': 0.9435750491575882, 'max_depth': 7, 'max_features': 14, 'learning_rate': 0.17895011146457307, 'min_samples_leaf': 32}. Best is trial 5 with value: 0.10711949820396768.[0m


MCC: [0.08628110803953518, 0.11097611449559185, 0.06669633997799818, 0.08156348637525268]
ROC: [0.7451478881689465, 0.7601916846652268, 0.6867425605951524, 0.7529774760876273]
PR_AUC: [0.021625111333674597, 0.023211070878170443, 0.024371475918852845, 0.026363106988403766]


[32m[I 2021-12-10 16:06:44,026][0m Trial 20 finished with value: 0.09868669679162713 and parameters: {'n_estimators': 85, 'subsample': 0.9754969163535497, 'max_depth': 5, 'max_features': 19, 'learning_rate': 0.14721355785167314, 'min_samples_leaf': 43}. Best is trial 5 with value: 0.10711949820396768.[0m


MCC: [0.08605212193630399, 0.12163808511052075, 0.08558736459122934, 0.10146921552845445]
ROC: [0.7472552195824335, 0.7659812215022798, 0.683844042476602, 0.7600624807158285]
PR_AUC: [0.02270137863782078, 0.023161476256730512, 0.02649335362103268, 0.028879225859612292]


[32m[I 2021-12-10 16:06:44,390][0m Trial 21 finished with value: 0.10607970660832099 and parameters: {'n_estimators': 10, 'subsample': 0.9018984752484351, 'max_depth': 8, 'max_features': 12, 'learning_rate': 0.1706011746715841, 'min_samples_leaf': 38}. Best is trial 5 with value: 0.10711949820396768.[0m


MCC: [0.11009822804007668, 0.12013059276906828, 0.09201852613316962, 0.10207147949096933]
ROC: [0.7876169906407487, 0.7846659767218622, 0.7191699664026877, 0.7578756556618328]
PR_AUC: [0.031985687247391294, 0.03873207899645022, 0.02375204291459101, 0.02735459970506501]


[32m[I 2021-12-10 16:06:44,791][0m Trial 22 finished with value: 0.10672253270772755 and parameters: {'n_estimators': 25, 'subsample': 0.8985517696641264, 'max_depth': 20, 'max_features': 15, 'learning_rate': 0.11212377763763448, 'min_samples_leaf': 39}. Best is trial 5 with value: 0.10711949820396768.[0m


MCC: [0.10690747351253803, 0.12088095491280633, 0.09387281593065631, 0.10522888647490955]
ROC: [0.7784902207823374, 0.76532502399808, 0.7169613930885529, 0.7655931811169392]
PR_AUC: [0.03105470592022238, 0.02242797904477649, 0.02555994587172878, 0.0652852567622307]


[32m[I 2021-12-10 16:06:45,255][0m Trial 23 finished with value: 0.10615605175328258 and parameters: {'n_estimators': 46, 'subsample': 0.8608267291762682, 'max_depth': 20, 'max_features': 15, 'learning_rate': 0.11482662289828563, 'min_samples_leaf': 49}. Best is trial 5 with value: 0.10711949820396768.[0m


MCC: [0.10842127921860473, 0.12146924220577644, 0.08792042836192847, 0.10681325722682064]
ROC: [0.7560632649388048, 0.7742118130549556, 0.7039874310055195, 0.7555808392471459]
PR_AUC: [0.03430521099807913, 0.025816201847752282, 0.02200666887287144, 0.056267795219155904]


[32m[I 2021-12-10 16:06:45,669][0m Trial 24 finished with value: 0.10948886104872017 and parameters: {'n_estimators': 28, 'subsample': 0.9290306649374853, 'max_depth': 14, 'max_features': 16, 'learning_rate': 0.12914925927473922, 'min_samples_leaf': 35}. Best is trial 24 with value: 0.10948886104872017.[0m


MCC: [0.11378866086505647, 0.12248744297291562, 0.0894117676464688, 0.1122675727104398]
ROC: [0.7707133429325654, 0.7726444384449244, 0.7065522258219343, 0.7630013884603517]
PR_AUC: [0.026935398775932668, 0.024248432368101733, 0.022465231570272613, 0.03970435209421735]
CPU times: user 15.9 s, sys: 325 ms, total: 16.2 s
Wall time: 16.1 s


In [14]:
best = study.best_trial
best

FrozenTrial(number=24, values=[0.10948886104872017], datetime_start=datetime.datetime(2021, 12, 10, 16, 6, 45, 256743), datetime_complete=datetime.datetime(2021, 12, 10, 16, 6, 45, 669113), params={'n_estimators': 28, 'subsample': 0.9290306649374853, 'max_depth': 14, 'max_features': 16, 'learning_rate': 0.12914925927473922, 'min_samples_leaf': 35}, distributions={'n_estimators': IntUniformDistribution(high=200, low=10, step=1), 'subsample': UniformDistribution(high=1.0, low=0.8), 'max_depth': IntUniformDistribution(high=20, low=3, step=1), 'max_features': IntUniformDistribution(high=20, low=10, step=1), 'learning_rate': UniformDistribution(high=0.2, low=0.08), 'min_samples_leaf': IntUniformDistribution(high=50, low=1, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=24, state=TrialState.COMPLETE, value=None)

In [15]:
%%time
gb_params = study.best_params
#gb_params['random_state'] = 42
gb = GradientBoostingClassifier(**gb_params)
gb.fit(X_rus, y_rus)
y_pred = gb.predict(X_test)
probs = gb.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[3052 1393]
 [  10   33]]
              precision    recall  f1-score   support

           0       1.00      0.69      0.81      4445
           1       0.02      0.77      0.04        43

    accuracy                           0.69      4488
   macro avg       0.51      0.73      0.43      4488
weighted avg       0.99      0.69      0.81      4488

ROC AUC prob1: 0.7417767546498548
MCC: 0.09499860011710942
F2: 0.1032540675844806
PR_AUC: 0.024459407859954145
CPU times: user 66.9 ms, sys: 3.58 ms, total: 70.5 ms
Wall time: 68.2 ms


In [16]:
%%time
gb_params = study.best_params
gb_params['random_state'] = 42
gb_weight = GradientBoostingClassifier(**gb_params)
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[4236  209]
 [  35    8]]
              precision    recall  f1-score   support

           0       0.99      0.95      0.97      4445
           1       0.04      0.19      0.06        43

    accuracy                           0.95      4488
   macro avg       0.51      0.57      0.52      4488
weighted avg       0.98      0.95      0.96      4488

ROC AUC prob1: 0.6980249561828027
MCC: 0.06313581986103019
F2: 0.10282776349614395
PR_AUC: 0.03747984192857726
CPU times: user 510 ms, sys: 4.99 ms, total: 515 ms
Wall time: 512 ms


In [17]:
# No Sitenum - 5
coeffs_wgt = pd.Series(data=gb_weight.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs_wgt[0:20]

Delmode             0.088513
new_age             0.074347
BESTGA              0.064599
Admefface           0.063229
new_BMI             0.045447
Lac_Min             0.039832
CS_FTP              0.036352
Dilat_lst           0.031011
Lac_None            0.028408
Delfetalpos         0.026131
momrace_new         0.025598
Admpresent          0.025554
Admreason           0.022888
Analgesia           0.022414
Hxpreterm           0.018354
high_Gravidity      0.018100
ROMmeth             0.016925
Antehospital        0.013785
Malpresentation9    0.013620
Insurance           0.013303
dtype: float64

In [1]:
plt.rcParams["figure.figsize"] = (14, 8)
x_labels = coeffs_wgt.index.values[0:25]
print(x_labels)
fig, ax = plt.subplots(1,1)  # Create a figure and an axes.
ax.set_title('Top Predictors: All Sites combined; target: High_EBLoss, vars: PI')
ax.bar(x_labels, coeffs_wgt.values[0:25])
ax.set_ylabel('Coeff')
ax.set_xlabel('Variable')
plt.draw()
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=16)
#plt.xlim(1,20)
plt.show()

NameError: name 'plt' is not defined

In [19]:
# No Sitenum - 5
coeffs = pd.Series(data=gb.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs[0:20]

Delmode           0.258989
CS_FTP            0.122801
Dilat_lst         0.089096
Delfetalpos       0.081375
Admefface         0.073909
spontlabor        0.052047
new_BMI           0.042307
Lac_Min           0.034317
momrace_new       0.033286
BESTGA            0.032297
Insurance         0.030733
Parity            0.024060
MthInd_Oxy        0.023034
Lac_None          0.021233
uscar             0.019975
Induction         0.019037
high_Gravidity    0.009494
Admpresent        0.009227
Ind_Unkn          0.007132
prelaborCD        0.004863
dtype: float64

In [21]:
gb_params # 5 runs

{'n_estimators': 28,
 'subsample': 0.9290306649374853,
 'max_depth': 14,
 'max_features': 16,
 'learning_rate': 0.12914925927473922,
 'min_samples_leaf': 35,
 'random_state': 42}

In [2]:
from optuna.visualization import plot_optimization_history

plotly_config = {"staticPlot": True}

fig = plot_optimization_history(study)
fig.show(config=plotly_config)

NameError: name 'study' is not defined

In [None]:
from optuna.visualization import plot_param_importances

fig = plot_param_importances(study)
fig.show(config=plotly_config)

In [24]:
np.mean([0.3149689279176914, 0.31449457458602254, 0.31066798538596907, 0.3219519143488398])

0.3155208505596307

<h1 align='center'>Thanks and make sure to learn!</h1>