In [1]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
#import plotly.express as px
import seaborn as sns

from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import StratifiedKFold
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, \
                            accuracy_score, f1_score, average_precision_score, \
                            fbeta_score, precision_recall_curve, auc
from sklearn.metrics import matthews_corrcoef

from sklearn.model_selection import train_test_split

import optuna
from optuna.samplers import TPESampler
sns.set(style='white', context='notebook', palette='deep')
%config Completer.use_jedi = False

In [2]:
csl_df = pd.read_csv('../../data/csl/Sites/CSL_he_PI_s52.csv', index_col=0)
X = csl_df.drop(['high_EBL','Sitenum'], axis=1, inplace=False)
y = csl_df['high_EBL'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=7)
X_train                                                    

Unnamed: 0_level_0,transfus_yes,Accrete,AdmBishop,Admcervpos,Admconsistency,Admcontract,AdmDBP,Admefface,Admpresent,AdmSBP,...,TD_nos,ThreatenedPB,threatpb9,UnspecHBP,uscar,version9,new_age,new_BMI,new_high_Age,new_high_BMI
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
52-00970,0,0,8,8,8,66.0,74,90.0,77,124.4,...,0,0,0,0,0,0,27,24,1,1
52-03198,0,0,8,8,8,66.0,74,70.0,77,124.4,...,0,0,0,0,0,0,26,24,1,1
52-05045,0,0,8,8,8,77.0,74,20.0,77,124.4,...,0,0,0,0,0,0,21,37,1,1
52-01722,0,0,8,8,8,99.0,74,0.0,77,124.4,...,0,0,0,0,0,0,24,28,1,1
52-00989,0,0,8,8,8,77.0,74,60.0,1,124.4,...,0,0,0,0,0,0,34,38,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52-03765,0,0,8,8,8,77.0,74,0.0,77,124.4,...,0,0,0,0,1,0,36,24,1,1
52-00172,0,0,8,8,8,77.0,74,50.0,1,124.4,...,0,0,0,0,0,0,41,21,2,1
52-04394,0,0,8,8,8,99.0,74,0.0,77,124.4,...,0,0,0,0,1,0,23,24,1,1
52-05451,0,0,8,8,8,77.0,74,0.0,1,124.4,...,0,0,0,0,0,0,23,21,1,1


In [3]:
X.shape

(6020, 195)

<h1 align='center'>Exploratory Data Analysis</h1>

<h1 align='center'>Modelling</h1>

In [8]:
print(X_train.shape)
print(X.shape)
print(y.shape)

(4214, 195)
(6020, 195)
(6020,)


Here we define our validation set

<h1 align='center'>Modelling Using Undersampling</h1>

We will use `imblearn`'s RandomUnderSampler to undersample from the majority class so that they match

In [9]:
sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
X_rus, y_rus = sampler.fit_resample(X_train, y_train)

# Try with no undersampling
#X_rus, y_rus = X_train, y_train

In [10]:
print(X_rus.shape)
print(y_rus.shape)
print(np.bincount(y_rus))

(646, 195)
(646,)
[323 323]


<h1 align='center'>Basic Gradient Boosting</h1>

In [11]:
%%time
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_rus = GradientBoostingClassifier()   # If not undersampling
gb_rus.fit(X_rus, y_rus)
print(gb_rus.get_params())
y_pred = gb_rus.predict(X_test)
probs = gb_rus.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[1217  450]
 [  16  123]]
              precision    recall  f1-score   support

           0       0.99      0.73      0.84      1667
           1       0.21      0.88      0.35       139

    accuracy                           0.74      1806
   macro avg       0.60      0.81      0.59      1806
weighted avg       0.93      0.74      0.80      1806

ROC AUC prob1: 0.8478074169338794
MCC: 0.3521708333566601
F2: 0.5447298494242693
PR_AUC: 0.2858238860040616
CPU times: user 268 ms, sys: 0 ns, total: 26

In [12]:
%%time
# Not undersampled
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_weight = GradientBoostingClassifier()  
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
print(gb_weight.get_params())
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[1239  428]
 [  17  122]]
              precision    recall  f1-score   support

           0       0.99      0.74      0.85      1667
           1       0.22      0.88      0.35       139

    accuracy                           0.75      1806
   macro avg       0.60      0.81      0.60      1806
weighted avg       0.93      0.75      0.81      1806

ROC AUC prob1: 0.8582233193649039
MCC: 0.3596293280150705
F2: 0.5515370705244123
PR_AUC: 0.30447775875825994
CPU times: user 1.13 s, sys: 5.49 ms, total

<h1 align='center'>Hyperparameter Tuning with Cross Validation</h1>

In [13]:
%%time
def create_model(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    subsample = trial.suggest_float('subsample', 0.8, 1.0)
    max_depth = trial.suggest_int('max_depth',3,20)
    max_features = trial.suggest_int('max_features',10,20)
    learning_rate = trial.suggest_uniform('learning_rate',0.08,0.2)
    min_samples_leaf = trial.suggest_int('min_samples_leaf',1,50)
    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
                           max_depth=max_depth, learning_rate=learning_rate, 
                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
#                           learning_rate=learning_rate, 
#                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate) 
    return model

def objective(trial):
    model = create_model(trial)
    
    mcc_list = []
    roc_list = []
    pr_list = []
    f2_list = []
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=11)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train_c, X_test_c = X.iloc[train_idx], X.iloc[test_idx]
        y_train_c, y_test_c = y[train_idx], y[test_idx]
        sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
        X_res, y_res = sampler.fit_resample(X_train_c, y_train_c)

#        weights = class_weight.compute_sample_weight('balanced', y=y_train)
#        model.fit(X_train, y_train, sample_weight=weights)
        model.fit(X_res, y_res)
        y_pred = model.predict(X_test_c)
        probs = model.predict_proba(X_test_c)
        prob1 = probs[:, 1]  # Only positives
        precision, recall, pr_thresh = precision_recall_curve(y_test_c, prob1)
        roc = roc_auc_score(y_test_c, prob1)
        roc_list.append(roc)
        mcc = matthews_corrcoef(y_test_c, y_pred)  # MWB
        mcc_list.append(mcc)
        f2 = fbeta_score(y_test_c, y_pred, beta=2.0, average=None)[1]
        pr_auc = auc(recall, precision)
        pr_list.append(pr_auc)
        avg_precision = average_precision_score(y_test_c, prob1)
#        print(f'ROC: {roc}')
#        print(f'PR_AUC: {pr_auc}')
#        print(f'avg_prec: {avg_precision}')
        
    print(f'MCC: {mcc_list}')
    print(f'ROC: {roc_list}')
    print(f'PR_AUC: {pr_list}')
    return np.mean(mcc_list)

sampler = TPESampler(seed=7)
study = optuna.create_study(sampler=sampler,direction='maximize')
#study.optimize(objective,n_trials=60)
study.optimize(objective,n_trials=25)
#study.optimize(objective,n_trials=5)

[32m[I 2021-12-10 17:24:17,700][0m A new study created in memory with name: no-name-cbb9348f-72d4-4886-b7a8-f915172baa87[0m
[32m[I 2021-12-10 17:24:18,307][0m Trial 0 finished with value: 0.36147334514803375 and parameters: {'n_estimators': 24, 'subsample': 0.955983758448023, 'max_depth': 10, 'max_features': 17, 'learning_rate': 0.1973587414395923, 'min_samples_leaf': 27}. Best is trial 0 with value: 0.36147334514803375.[0m


MCC: [0.3446576100619373, 0.3701820554869729, 0.36037798214316813, 0.3706757329000567]
ROC: [0.8466499843603379, 0.8573913043478261, 0.8487934758322783, 0.8640860455301507]
PR_AUC: [0.23900539023781747, 0.2903254718299719, 0.2770902795969193, 0.24345882511315797]


[32m[I 2021-12-10 17:24:19,633][0m Trial 1 finished with value: 0.3626606208438515 and parameters: {'n_estimators': 105, 'subsample': 0.8144102266719524, 'max_depth': 7, 'max_features': 15, 'learning_rate': 0.16150759953451288, 'min_samples_leaf': 41}. Best is trial 1 with value: 0.3626606208438515.[0m


MCC: [0.3504470831860561, 0.3613459385912328, 0.3728695450004163, 0.36597991659770074]
ROC: [0.8288645605254926, 0.8600250234594933, 0.8545592214691791, 0.8633412775253841]
PR_AUC: [0.2105331102698293, 0.324399737600382, 0.22871758588525656, 0.25365058344000596]


[32m[I 2021-12-10 17:24:21,012][0m Trial 2 finished with value: 0.3698028368020847 and parameters: {'n_estimators': 82, 'subsample': 0.813187269381181, 'max_depth': 8, 'max_features': 20, 'learning_rate': 0.10560624242958987, 'min_samples_leaf': 23}. Best is trial 2 with value: 0.3698028368020847.[0m


MCC: [0.3632810050265504, 0.3866225967968583, 0.36385845302093783, 0.36544929236399226]
ROC: [0.842352205192368, 0.862414763841101, 0.8500161366401033, 0.8731101511879051]
PR_AUC: [0.2480000356418052, 0.3025299250672792, 0.2462654272961379, 0.27240149933558166]


[32m[I 2021-12-10 17:24:23,947][0m Trial 3 finished with value: 0.3611779220011739 and parameters: {'n_estimators': 187, 'subsample': 0.8049798455100696, 'max_depth': 13, 'max_features': 20, 'learning_rate': 0.10763634548251577, 'min_samples_leaf': 28}. Best is trial 2 with value: 0.3698028368020847.[0m


MCC: [0.3508372577002082, 0.37030475595051277, 0.35826347005712605, 0.3653062042968485]
ROC: [0.8372349077259933, 0.8559899906162027, 0.8522628534544823, 0.8761512872073682]
PR_AUC: [0.21332853384504957, 0.3029708178118584, 0.222558815987981, 0.27790563094305143]


[32m[I 2021-12-10 17:24:27,094][0m Trial 4 finished with value: 0.35098293195777236 and parameters: {'n_estimators': 183, 'subsample': 0.8266338891518501, 'max_depth': 12, 'max_features': 18, 'learning_rate': 0.16028158890606967, 'min_samples_leaf': 24}. Best is trial 2 with value: 0.3698028368020847.[0m


MCC: [0.34436625191909936, 0.3588547662138211, 0.3498986896554734, 0.3508120200426957]
ROC: [0.8408758210822647, 0.8598748827025338, 0.8570728134852661, 0.8592450534991684]
PR_AUC: [0.23107138203226468, 0.30509011863506325, 0.2394681210430952, 0.2607181710937906]


[32m[I 2021-12-10 17:24:27,969][0m Trial 5 finished with value: 0.370607360915226 and parameters: {'n_estimators': 49, 'subsample': 0.8981531778182141, 'max_depth': 9, 'max_features': 15, 'learning_rate': 0.12390684629367116, 'min_samples_leaf': 42}. Best is trial 5 with value: 0.370607360915226.[0m


MCC: [0.36068088979269397, 0.37739004719530805, 0.36544929236399226, 0.37890921430890967]
ROC: [0.8381607757272442, 0.8610634970284643, 0.8495196246369257, 0.8731908343884213]
PR_AUC: [0.22557651032789433, 0.27762215683555647, 0.22133926503915047, 0.255908336231096]


[32m[I 2021-12-10 17:24:31,015][0m Trial 6 finished with value: 0.36263072636724974 and parameters: {'n_estimators': 156, 'subsample': 0.8627989354425325, 'max_depth': 13, 'max_features': 13, 'learning_rate': 0.13434115190556806, 'min_samples_leaf': 18}. Best is trial 5 with value: 0.370607360915226.[0m


MCC: [0.3683800990417092, 0.3594261953862824, 0.3512782283202046, 0.3714383827208027]
ROC: [0.8465561463872381, 0.867225523928683, 0.8573893398872918, 0.8760271592065739]
PR_AUC: [0.24654941352233867, 0.2989405721640927, 0.24808650879210342, 0.3136020868062333]


[32m[I 2021-12-10 17:24:32,848][0m Trial 7 finished with value: 0.3540949122757857 and parameters: {'n_estimators': 135, 'subsample': 0.8740702165976071, 'max_depth': 11, 'max_features': 17, 'learning_rate': 0.12955901949366017, 'min_samples_leaf': 46}. Best is trial 5 with value: 0.370607360915226.[0m


MCC: [0.3539869151041672, 0.3452748045916619, 0.34149972263561396, 0.37561820677169966]
ROC: [0.8385048482952768, 0.8568720675633406, 0.8472480822223878, 0.8692683895633176]
PR_AUC: [0.22154084494736326, 0.31214489358842246, 0.22513696382498305, 0.24721920535245337]


[32m[I 2021-12-10 17:24:33,815][0m Trial 8 finished with value: 0.3559944242383313 and parameters: {'n_estimators': 44, 'subsample': 0.9482237745826528, 'max_depth': 10, 'max_features': 14, 'learning_rate': 0.15612558423606065, 'min_samples_leaf': 27}. Best is trial 5 with value: 0.370607360915226.[0m


MCC: [0.3429367939853348, 0.34603793626463086, 0.37283999831245906, 0.3621629683909005]
ROC: [0.8421832968407882, 0.8573662808883329, 0.8591147190983344, 0.8678595367543009]
PR_AUC: [0.23214482641959622, 0.28737031028474674, 0.2636928653003998, 0.26774661524883697]


[32m[I 2021-12-10 17:24:34,897][0m Trial 9 finished with value: 0.36019301859524167 and parameters: {'n_estimators': 89, 'subsample': 0.8002853761125517, 'max_depth': 4, 'max_features': 17, 'learning_rate': 0.14292147161182367, 'min_samples_leaf': 35}. Best is trial 5 with value: 0.370607360915226.[0m


MCC: [0.3518953726031772, 0.35688992822291943, 0.3514799646373634, 0.38050680891750666]
ROC: [0.8380606818892712, 0.8608007507037848, 0.8433380501973635, 0.8775042824160274]
PR_AUC: [0.22825878784547307, 0.3224160938057595, 0.21341783215121235, 0.2713141483081782]


[32m[I 2021-12-10 17:24:37,203][0m Trial 10 finished with value: 0.36173089271774117 and parameters: {'n_estimators': 55, 'subsample': 0.924763336987846, 'max_depth': 18, 'max_features': 10, 'learning_rate': 0.09172353354166263, 'min_samples_leaf': 4}. Best is trial 5 with value: 0.370607360915226.[0m


MCC: [0.3505188821368794, 0.3435538718392845, 0.3661119324171131, 0.3867388844776877]
ROC: [0.8439099155458242, 0.8522552392868314, 0.8559060102777984, 0.8683684615575582]
PR_AUC: [0.23762521921068766, 0.26882600073741564, 0.2432669814412409, 0.25396326993139884]


[32m[I 2021-12-10 17:24:38,593][0m Trial 11 finished with value: 0.36045320103736916 and parameters: {'n_estimators': 71, 'subsample': 0.9924233767166261, 'max_depth': 6, 'max_features': 12, 'learning_rate': 0.10567843309335981, 'min_samples_leaf': 12}. Best is trial 5 with value: 0.370607360915226.[0m


MCC: [0.33766907425782167, 0.3418569417198152, 0.3776619062241067, 0.3846248819477331]
ROC: [0.8396621832968408, 0.8695214263371912, 0.8479618182269556, 0.8763312728085202]
PR_AUC: [0.2317153037282645, 0.3285169107349486, 0.27707064989938607, 0.30641260386697294]


[32m[I 2021-12-10 17:24:38,944][0m Trial 12 finished with value: 0.36714949220336845 and parameters: {'n_estimators': 10, 'subsample': 0.8827011620638204, 'max_depth': 7, 'max_features': 19, 'learning_rate': 0.08034296707314204, 'min_samples_leaf': 37}. Best is trial 5 with value: 0.370607360915226.[0m


MCC: [0.3564994981942729, 0.3677440543379747, 0.378707462044946, 0.36564695423628013]
ROC: [0.8342539881138566, 0.8474695026587425, 0.8353380005461633, 0.8687284327598619]
PR_AUC: [0.24411609783562452, 0.24698750091412808, 0.2152719706113946, 0.278410777949455]


[32m[I 2021-12-10 17:24:40,057][0m Trial 13 finished with value: 0.363435919485745 and parameters: {'n_estimators': 79, 'subsample': 0.8449858762275286, 'max_depth': 16, 'max_features': 15, 'learning_rate': 0.11562625731777346, 'min_samples_leaf': 49}. Best is trial 5 with value: 0.370607360915226.[0m


MCC: [0.3637432664541949, 0.3537563500871728, 0.36665551268278723, 0.3695885487188251]
ROC: [0.8485267438223334, 0.8617391304347827, 0.8503823142424469, 0.8697649015664956]
PR_AUC: [0.23645626003386466, 0.32943240408532654, 0.22074635421492786, 0.2454263917410542]


[32m[I 2021-12-10 17:24:41,277][0m Trial 14 finished with value: 0.3674626755247012 and parameters: {'n_estimators': 109, 'subsample': 0.9059678479504827, 'max_depth': 3, 'max_features': 11, 'learning_rate': 0.12067373789737436, 'min_samples_leaf': 16}. Best is trial 5 with value: 0.370607360915226.[0m


MCC: [0.35543935404745414, 0.37574844343663905, 0.35998300749157697, 0.3786798971231346]
ROC: [0.8461933062245856, 0.8644166406005631, 0.845795784613093, 0.8818549688438717]
PR_AUC: [0.25548726122661153, 0.3064623554248126, 0.2503248619478185, 0.32623609843615464]


[32m[I 2021-12-10 17:24:42,027][0m Trial 15 finished with value: 0.37498398545776834 and parameters: {'n_estimators': 41, 'subsample': 0.8483757495448924, 'max_depth': 8, 'max_features': 20, 'learning_rate': 0.09614761637067157, 'min_samples_leaf': 35}. Best is trial 15 with value: 0.37498398545776834.[0m


MCC: [0.35881286538168194, 0.3677481945629599, 0.39099932051489367, 0.38237556137153783]
ROC: [0.838210822646231, 0.8642915233030966, 0.8463047094163502, 0.8732963431890965]
PR_AUC: [0.2220359960216589, 0.35567074906642254, 0.22401425616606582, 0.26147697699658407]


[32m[I 2021-12-10 17:24:42,784][0m Trial 16 finished with value: 0.3737901685379942 and parameters: {'n_estimators': 35, 'subsample': 0.8460741138153358, 'max_depth': 15, 'max_features': 16, 'learning_rate': 0.09222689677431806, 'min_samples_leaf': 36}. Best is trial 15 with value: 0.37498398545776834.[0m


MCC: [0.364771414722262, 0.375931502420471, 0.38456445135395373, 0.36989330565529005]
ROC: [0.8351079136690648, 0.8645605254926494, 0.8413147637844146, 0.8757354584047068]
PR_AUC: [0.2307877317954979, 0.3061178189658323, 0.2194164409201393, 0.2604474055326717]


[32m[I 2021-12-10 17:24:43,434][0m Trial 17 finished with value: 0.3700988932642058 and parameters: {'n_estimators': 28, 'subsample': 0.844657081309488, 'max_depth': 15, 'max_features': 18, 'learning_rate': 0.08033171450109255, 'min_samples_leaf': 35}. Best is trial 15 with value: 0.37498398545776834.[0m


MCC: [0.3573681481706981, 0.3677440543379747, 0.3855068731016043, 0.36977649744654606]
ROC: [0.841332499218017, 0.8644103847356897, 0.8447841414066186, 0.8700503959683225]
PR_AUC: [0.24741535483703575, 0.30640262309509136, 0.22377761322451292, 0.25489946812825753]


[32m[I 2021-12-10 17:24:43,809][0m Trial 18 finished with value: 0.3670334670221982 and parameters: {'n_estimators': 10, 'subsample': 0.841212937001562, 'max_depth': 19, 'max_features': 16, 'learning_rate': 0.09424803924968712, 'min_samples_leaf': 32}. Best is trial 15 with value: 0.37498398545776834.[0m


MCC: [0.3564994981942729, 0.36431673932475755, 0.3799298078016317, 0.36738782276813053]
ROC: [0.8310509852987176, 0.8615045355020332, 0.839545939773094, 0.8578672326903503]
PR_AUC: [0.22993278958254237, 0.31201481792009117, 0.24136785329760974, 0.25014593322864065]


[32m[I 2021-12-10 17:24:44,697][0m Trial 19 finished with value: 0.3723769888148154 and parameters: {'n_estimators': 56, 'subsample': 0.8659909718931323, 'max_depth': 16, 'max_features': 19, 'learning_rate': 0.09239754470885392, 'min_samples_leaf': 43}. Best is trial 15 with value: 0.37498398545776834.[0m


MCC: [0.3647394648665929, 0.37863138135681085, 0.36978299262793035, 0.37635411640792726]
ROC: [0.8453612761964341, 0.8662058179543323, 0.8474032422233808, 0.8713102951763859]
PR_AUC: [0.23915597289622856, 0.34377415353842955, 0.22682947559617303, 0.24876231902169835]


[32m[I 2021-12-10 17:24:45,369][0m Trial 20 finished with value: 0.3669334699553328 and parameters: {'n_estimators': 32, 'subsample': 0.8940857521211301, 'max_depth': 20, 'max_features': 13, 'learning_rate': 0.18220882753397885, 'min_samples_leaf': 50}. Best is trial 15 with value: 0.37498398545776834.[0m


MCC: [0.3606008944528476, 0.36083022604021253, 0.375214237847416, 0.37108852148085536]
ROC: [0.8469940569283703, 0.8612324053800438, 0.8480859462277501, 0.8755554728035551]
PR_AUC: [0.24766811796536206, 0.33649770376514115, 0.21803695312368665, 0.257325332971351]


[32m[I 2021-12-10 17:24:46,290][0m Trial 21 finished with value: 0.377204797975063 and parameters: {'n_estimators': 58, 'subsample': 0.8592355833508208, 'max_depth': 16, 'max_features': 19, 'learning_rate': 0.09319382600103734, 'min_samples_leaf': 41}. Best is trial 21 with value: 0.377204797975063.[0m


MCC: [0.3689621015107697, 0.38050529592832355, 0.37850857853799524, 0.3808432159231634]
ROC: [0.8458617453862995, 0.8599249296215203, 0.8504381718428043, 0.8785841960229388]
PR_AUC: [0.22889961843550263, 0.3029652193979159, 0.22657125534418832, 0.2608340685654756]


[32m[I 2021-12-10 17:24:47,332][0m Trial 22 finished with value: 0.3728453923166124 and parameters: {'n_estimators': 67, 'subsample': 0.8549248473833944, 'max_depth': 15, 'max_features': 19, 'learning_rate': 0.09576643425523988, 'min_samples_leaf': 38}. Best is trial 21 with value: 0.377204797975063.[0m


MCC: [0.3619012441539235, 0.37445256163143475, 0.3715818227041864, 0.38344594077690497]
ROC: [0.8396746950265874, 0.8591429465123552, 0.847490131823937, 0.8718254263796827]
PR_AUC: [0.22101096137072074, 0.3270945421719344, 0.22666985092404934, 0.25272434672345745]


[32m[I 2021-12-10 17:24:48,009][0m Trial 23 finished with value: 0.3683072337416541 and parameters: {'n_estimators': 36, 'subsample': 0.831722675533267, 'max_depth': 17, 'max_features': 20, 'learning_rate': 0.11087704486314144, 'min_samples_leaf': 31}. Best is trial 21 with value: 0.377204797975063.[0m


MCC: [0.357037742736113, 0.36300536717503823, 0.37998852512795234, 0.3731972999275129]
ROC: [0.8314670003127934, 0.8582546137003441, 0.8426429333929149, 0.8678222983540627]
PR_AUC: [0.21711793044917913, 0.3208209540952861, 0.22326890878007005, 0.25603767236812713]


[32m[I 2021-12-10 17:24:49,532][0m Trial 24 finished with value: 0.36624743801062654 and parameters: {'n_estimators': 99, 'subsample': 0.8799731267520511, 'max_depth': 14, 'max_features': 18, 'learning_rate': 0.08691890839385083, 'min_samples_leaf': 39}. Best is trial 21 with value: 0.377204797975063.[0m


MCC: [0.34943594215992924, 0.37316791616035233, 0.36453976018166423, 0.37784613354056024]
ROC: [0.8411448232718174, 0.8598748827025336, 0.8472853206226261, 0.8729922295871503]
PR_AUC: [0.2213757834872554, 0.32754785988108426, 0.223647243667564, 0.2570076071817302]
CPU times: user 31.7 s, sys: 232 ms, total: 32 s
Wall time: 31.8 s


In [14]:
best = study.best_trial
best

FrozenTrial(number=21, values=[0.377204797975063], datetime_start=datetime.datetime(2021, 12, 10, 17, 24, 45, 370578), datetime_complete=datetime.datetime(2021, 12, 10, 17, 24, 46, 290307), params={'n_estimators': 58, 'subsample': 0.8592355833508208, 'max_depth': 16, 'max_features': 19, 'learning_rate': 0.09319382600103734, 'min_samples_leaf': 41}, distributions={'n_estimators': IntUniformDistribution(high=200, low=10, step=1), 'subsample': UniformDistribution(high=1.0, low=0.8), 'max_depth': IntUniformDistribution(high=20, low=3, step=1), 'max_features': IntUniformDistribution(high=20, low=10, step=1), 'learning_rate': UniformDistribution(high=0.2, low=0.08), 'min_samples_leaf': IntUniformDistribution(high=50, low=1, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=21, state=TrialState.COMPLETE, value=None)

In [15]:
%%time
gb_params = study.best_params
#gb_params['random_state'] = 42
gb = GradientBoostingClassifier(**gb_params)
gb.fit(X_rus, y_rus)
y_pred = gb.predict(X_test)
probs = gb.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[1201  466]
 [  10  129]]
              precision    recall  f1-score   support

           0       0.99      0.72      0.83      1667
           1       0.22      0.93      0.35       139

    accuracy                           0.74      1806
   macro avg       0.60      0.82      0.59      1806
weighted avg       0.93      0.74      0.80      1806

ROC AUC prob1: 0.8548980851311753
MCC: 0.3677592811415039
F2: 0.5603822762814944
PR_AUC: 0.24886704241749635
CPU times: user 80.7 ms, sys: 5 ms, total: 85.7 ms
Wall time: 83.2 ms


In [16]:
%%time
gb_params = study.best_params
gb_params['random_state'] = 42
gb_weight = GradientBoostingClassifier(**gb_params)
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[1334  333]
 [  32  107]]
              precision    recall  f1-score   support

           0       0.98      0.80      0.88      1667
           1       0.24      0.77      0.37       139

    accuracy                           0.80      1806
   macro avg       0.61      0.79      0.62      1806
weighted avg       0.92      0.80      0.84      1806

ROC AUC prob1: 0.856883299599073
MCC: 0.3539294885253634
F2: 0.5371485943775101
PR_AUC: 0.29758072889850945
CPU times: user 352 ms, sys: 9.07 ms, total: 361 ms
Wall time: 356 ms


In [17]:
# No Sitenum - 5
coeffs_wgt = pd.Series(data=gb_weight.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs_wgt[0:20]

Delfetalpos    0.136056
Delmode        0.117158
Presentdel     0.081490
Lac_Min        0.068870
Admefface      0.048784
TrialLabor     0.046878
new_age        0.033182
BESTGA         0.032594
CS_FTP         0.032448
Analgesia      0.028164
Admreason      0.023778
Lac_None       0.022372
prelaborCD     0.018640
new_BMI        0.018010
spontlabor     0.017460
Augment        0.015708
Dilat_lst      0.012437
Episiotomy     0.011962
IUPC           0.011245
ROMmeth        0.010898
dtype: float64

In [1]:
plt.rcParams["figure.figsize"] = (14, 8)
x_labels = coeffs_wgt.index.values[0:25]
print(x_labels)
fig, ax = plt.subplots(1,1)  # Create a figure and an axes.
ax.set_title('Top Predictors: All Sites combined; target: High_EBLoss, vars: PI')
ax.bar(x_labels, coeffs_wgt.values[0:25])
ax.set_ylabel('Coeff')
ax.set_xlabel('Variable')
plt.draw()
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=16)
#plt.xlim(1,20)
plt.show()

NameError: name 'plt' is not defined

In [19]:
# No Sitenum - 5
coeffs = pd.Series(data=gb.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs[0:20]

Delmode        0.397347
Delfetalpos    0.090754
CS_FTP         0.065424
Presentdel     0.055248
Admefface      0.037554
Admreason      0.033011
TrialLabor     0.028965
Lac_None       0.026507
new_age        0.022852
BESTGA         0.018979
Analgesia      0.018239
CS_UScar       0.017569
CS_Elect       0.017411
Lac_Min        0.014694
uscar          0.013476
DMControl      0.010380
Admcontract    0.010124
prelaborCD     0.009512
MthInd_Oxy     0.009113
AnteGDM        0.008989
dtype: float64

In [2]:
plt.rcParams["figure.figsize"] = (14, 8)
x_labels = coeffs.index.values[0:25]
print(x_labels)
fig, ax = plt.subplots(1,1)  # Create a figure and an axes.
ax.set_title('Top Predictors: All Sites combined; target: High_EBLoss, vars: PI')
ax.bar(x_labels, coeffs.values[0:25])
ax.set_ylabel('Coeff')
ax.set_xlabel('Variable')
plt.draw()
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=16)
#plt.xlim(1,20)
plt.show()

NameError: name 'plt' is not defined

In [22]:
gb_params # 5 runs

{'n_estimators': 58,
 'subsample': 0.8592355833508208,
 'max_depth': 16,
 'max_features': 19,
 'learning_rate': 0.09319382600103734,
 'min_samples_leaf': 41,
 'random_state': 42}

In [3]:
from optuna.visualization import plot_optimization_history

plotly_config = {"staticPlot": True}

fig = plot_optimization_history(study)
fig.show(config=plotly_config)

NameError: name 'study' is not defined

In [None]:
from optuna.visualization import plot_param_importances

fig = plot_param_importances(study)
fig.show(config=plotly_config)

In [25]:
np.mean([0.3149689279176914, 0.31449457458602254, 0.31066798538596907, 0.3219519143488398])

0.3155208505596307

<h1 align='center'>Thanks and make sure to learn!</h1>