In [1]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
#import plotly.express as px
import seaborn as sns

from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import StratifiedKFold
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, \
                            accuracy_score, f1_score, average_precision_score, \
                            fbeta_score, precision_recall_curve, auc
from sklearn.metrics import matthews_corrcoef

from sklearn.model_selection import train_test_split

import optuna
from optuna.samplers import TPESampler
sns.set(style='white', context='notebook', palette='deep')
%config Completer.use_jedi = False

In [2]:
csl_df = pd.read_csv('../../data/csl/CSL_hety_PI_comp.csv', index_col=0)
X = csl_df.drop(['high_EBL','Sitenum'], axis=1, inplace=False)
y = csl_df['high_EBL'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=7)
X_train                                                    

Unnamed: 0_level_0,transfus_yes,Accrete,AdmBishop,Admcervpos,Admconsistency,Admcontract,AdmDBP,Admefface,Admpresent,AdmSBP,...,TD_nos,ThreatenedPB,threatpb9,UnspecHBP,uscar,version9,new_age,new_BMI,new_high_Age,new_high_BMI
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
44-01170,0,0,8,8,8,2.5,76,50.0,77,139.0,...,0,0,0,0,0,0,25,24,1,1
41-42094,0,0,11,1,3,2.9,79,90.0,77,122.0,...,0,0,0,0,0,0,27,21,1,1
48-15713,0,0,8,8,8,66.0,90,100.0,77,127.0,...,0,0,0,0,0,0,18,26,0,1
49-04689,1,0,8,2,3,3.0,75,70.0,1,118.0,...,0,0,0,0,0,0,30,24,1,1
49-02986,0,0,8,1,3,1.0,81,100.0,1,128.0,...,0,0,0,0,0,0,26,24,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49-22617,0,0,8,8,8,1.0,88,0.0,77,125.0,...,0,0,0,0,1,0,31,33,1,1
45-01038,0,0,8,3,3,2.0,79,50.0,1,106.0,...,0,0,0,0,0,0,31,24,1,1
52-04194,0,0,8,8,8,66.0,74,70.0,1,124.4,...,0,0,0,0,0,0,30,21,1,1
44-11143,0,0,8,8,8,99.0,76,100.0,77,133.0,...,0,0,0,0,0,0,15,24,0,1


<h1 align='center'>Exploratory Data Analysis</h1>

In [8]:
plt.figure(figsize=(20,10))
sns.barplot(csl_df['new_age'],csl_df['high_EBL'])
plt.show()

NameError: name 'plt' is not defined

In [7]:
sns.boxplot(csl_df['new_age'])
plt.show()

NameError: name 'sns' is not defined

In [6]:
plt.figure(figsize=(20,10))
sns.barplot(csl_df['new_BMI'],csl_df['high_EBL'])
plt.show()

NameError: name 'plt' is not defined

In [5]:
sns.boxplot(csl_df['new_BMI'])
plt.show()

NameError: name 'sns' is not defined

<h1 align='center'>Modelling</h1>

In [7]:
print(X_train.shape)
print(X.shape)
print(y.shape)

(95540, 195)
(136487, 195)
(136487,)


Here we define our validation set

<h1 align='center'>Modelling Using Undersampling</h1>

We will use `imblearn`'s RandomUnderSampler to undersample from the majority class so that they match

In [8]:
sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
X_rus, y_rus = sampler.fit_resample(X_train, y_train)

# Try with no undersampling
#X_rus, y_rus = X_train, y_train

In [9]:
print(X_rus.shape)
print(y_rus.shape)
print(np.bincount(y_rus))

(8064, 195)
(8064,)
[4032 4032]


<h1 align='center'>Basic Gradient Boosting</h1>

In [10]:
%%time
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_rus = GradientBoostingClassifier()   # If not undersampling
gb_rus.fit(X_rus, y_rus)
print(gb_rus.get_params())
y_pred = gb_rus.predict(X_test)
probs = gb_rus.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[30535  8684]
 [  169  1559]]
              precision    recall  f1-score   support

           0       0.99      0.78      0.87     39219
           1       0.15      0.90      0.26      1728

    accuracy                           0.78     40947
   macro avg       0.57      0.84      0.57     40947
weighted avg       0.96      0.78      0.85     40947

ROC AUC prob1: 0.8912720889841753
MCC: 0.31601897576278604
F2: 0.4543864762459925
PR_AUC: 0.2663190042652628
CPU times: user 3.82 s, sys: 91.5 ms, t

In [11]:
%%time
# Not undersampled
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_weight = GradientBoostingClassifier()  
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
print(gb_weight.get_params())
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[30530  8689]
 [  169  1559]]
              precision    recall  f1-score   support

           0       0.99      0.78      0.87     39219
           1       0.15      0.90      0.26      1728

    accuracy                           0.78     40947
   macro avg       0.57      0.84      0.57     40947
weighted avg       0.96      0.78      0.85     40947

ROC AUC prob1: 0.8949700512459475
MCC: 0.3159084300584172
F2: 0.45425407925407923
PR_AUC: 0.28249431802839825
CPU times: user 42.8 s, sys: 159 ms, t

In [12]:
%%time
# Not undersampled - many parameters
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_weight = GradientBoostingClassifier(learning_rate=0.2, max_features = 15, max_leaf_nodes=100)  
#'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
print(gb_weight.get_params())
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.2, 'loss': 'deviance', 'max_depth': 3, 'max_features': 15, 'max_leaf_nodes': 100, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[30642  8577]
 [  180  1548]]
              precision    recall  f1-score   support

           0       0.99      0.78      0.87     39219
           1       0.15      0.90      0.26      1728

    accuracy                           0.79     40947
   macro avg       0.57      0.84      0.57     40947
weighted avg       0.96      0.79      0.85     40947

ROC AUC prob1: 0.8907457606290601
MCC: 0.3155510213978344
F2: 0.45430533544638135
PR_AUC: 0.2628961420578432
CPU times: user 5.71 s, sys: 188 ms, total

<h1 align='center'>Hyperparameter Tuning with Cross Validation</h1>

In [13]:
%%time
def create_model(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    subsample = trial.suggest_float('subsample', 0.8, 1.0)
    max_depth = trial.suggest_int('max_depth',3,20)
    max_features = trial.suggest_int('max_features',10,20)
    learning_rate = trial.suggest_uniform('learning_rate',0.08,0.2)
    min_samples_leaf = trial.suggest_int('min_samples_leaf',1,50)
    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
                           max_depth=max_depth, learning_rate=learning_rate, 
                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
#                           learning_rate=learning_rate, 
#                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate) 
    return model

def objective(trial):
    model = create_model(trial)
    
    mcc_list = []
    roc_list = []
    pr_list = []
    f2_list = []
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=11)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train_c, X_test_c = X.iloc[train_idx], X.iloc[test_idx]
        y_train_c, y_test_c = y[train_idx], y[test_idx]
        sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
        X_res, y_res = sampler.fit_resample(X_train_c, y_train_c)

#        weights = class_weight.compute_sample_weight('balanced', y=y_train)
#        model.fit(X_train, y_train, sample_weight=weights)
        model.fit(X_res, y_res)
        y_pred = model.predict(X_test_c)
        probs = model.predict_proba(X_test_c)
        prob1 = probs[:, 1]  # Only positives
        precision, recall, pr_thresh = precision_recall_curve(y_test_c, prob1)
        roc = roc_auc_score(y_test_c, prob1)
        roc_list.append(roc)
        mcc = matthews_corrcoef(y_test_c, y_pred)  # MWB
        mcc_list.append(mcc)
        f2 = fbeta_score(y_test_c, y_pred, beta=2.0, average=None)[1]
        pr_auc = auc(recall, precision)
        pr_list.append(pr_auc)
        avg_precision = average_precision_score(y_test_c, prob1)
#        print(f'ROC: {roc}')
#        print(f'PR_AUC: {pr_auc}')
#        print(f'avg_prec: {avg_precision}')
        
    print(f'MCC: {mcc_list}')
    print(f'ROC: {roc_list}')
    print(f'PR_AUC: {pr_list}')
    return np.mean(mcc_list)

sampler = TPESampler(seed=7)
study = optuna.create_study(sampler=sampler,direction='maximize')
#study.optimize(objective,n_trials=60)
study.optimize(objective,n_trials=25)
#study.optimize(objective,n_trials=5)

[32m[I 2021-12-09 11:23:25,197][0m A new study created in memory with name: no-name-83ac6fac-bd87-4e13-a472-b95d747494ba[0m
[32m[I 2021-12-09 11:23:37,400][0m Trial 0 finished with value: 0.3152829671332036 and parameters: {'n_estimators': 24, 'subsample': 0.955983758448023, 'max_depth': 10, 'max_features': 17, 'learning_rate': 0.1973587414395923, 'min_samples_leaf': 27}. Best is trial 0 with value: 0.3152829671332036.[0m


MCC: [0.3138936185267518, 0.31420967625081936, 0.32035485850998396, 0.31267371524525933]
ROC: [0.889038584779933, 0.88843264046128, 0.8933235738836873, 0.8892968412669272]
PR_AUC: [0.25017462769277027, 0.2507082785687574, 0.2508721096895517, 0.25410747744698314]


[32m[I 2021-12-09 11:24:07,170][0m Trial 1 finished with value: 0.31563438937732824 and parameters: {'n_estimators': 105, 'subsample': 0.8144102266719524, 'max_depth': 7, 'max_features': 15, 'learning_rate': 0.16150759953451288, 'min_samples_leaf': 41}. Best is trial 1 with value: 0.31563438937732824.[0m


MCC: [0.3146507782962976, 0.3149759738994252, 0.3201994417363983, 0.3127113635771919]
ROC: [0.8906296321794532, 0.8855576294120447, 0.8952697798312357, 0.8874103497105011]
PR_AUC: [0.26807612257059, 0.2376509234007379, 0.2467480186133173, 0.25513550793652606]


[32m[I 2021-12-09 11:24:33,109][0m Trial 2 finished with value: 0.32035225259792793 and parameters: {'n_estimators': 82, 'subsample': 0.813187269381181, 'max_depth': 8, 'max_features': 20, 'learning_rate': 0.10560624242958987, 'min_samples_leaf': 23}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.3199162653601873, 0.3167278710442086, 0.3250726728418134, 0.3196922011455024]
ROC: [0.8928373119080161, 0.8905349274830181, 0.8968487368174123, 0.891043789459727]
PR_AUC: [0.26925765446585365, 0.2609829160462497, 0.2549562581277983, 0.2711168370161262]


[32m[I 2021-12-09 11:25:52,379][0m Trial 3 finished with value: 0.31408737850015656 and parameters: {'n_estimators': 187, 'subsample': 0.8049798455100696, 'max_depth': 13, 'max_features': 20, 'learning_rate': 0.10763634548251577, 'min_samples_leaf': 28}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.3134111221345986, 0.3112484963357451, 0.31920184579949984, 0.31248804973078276]
ROC: [0.8872197531430825, 0.8832559461885239, 0.8920482265127252, 0.883968577562906]
PR_AUC: [0.25052838401047917, 0.2306936841639274, 0.2456176578314017, 0.2381964610844519]


[32m[I 2021-12-09 11:27:09,005][0m Trial 4 finished with value: 0.3116463771041588 and parameters: {'n_estimators': 183, 'subsample': 0.8266338891518501, 'max_depth': 12, 'max_features': 18, 'learning_rate': 0.16028158890606967, 'min_samples_leaf': 24}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.31280106480357883, 0.31013566863762687, 0.31441111571655345, 0.3092376592588762]
ROC: [0.8870790028830006, 0.8839340930107636, 0.8881839689193507, 0.8826122424174426]
PR_AUC: [0.24422871457433917, 0.24315888171887176, 0.23102388026789175, 0.23840228080647002]


[32m[I 2021-12-09 11:27:28,007][0m Trial 5 finished with value: 0.3187437529489088 and parameters: {'n_estimators': 49, 'subsample': 0.8981531778182141, 'max_depth': 9, 'max_features': 15, 'learning_rate': 0.12390684629367116, 'min_samples_leaf': 42}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.3181746179722698, 0.31889655397445227, 0.3200604546088654, 0.3178433852400475]
ROC: [0.8944534750695252, 0.8910643133495162, 0.8963759145367141, 0.8909975831182916]
PR_AUC: [0.27813950306306445, 0.2664411889998704, 0.2559678509404887, 0.2594274785111417]


[32m[I 2021-12-09 11:28:41,884][0m Trial 6 finished with value: 0.3121698763711181 and parameters: {'n_estimators': 156, 'subsample': 0.8627989354425325, 'max_depth': 13, 'max_features': 13, 'learning_rate': 0.13434115190556806, 'min_samples_leaf': 18}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.3123050152366151, 0.3099568462078223, 0.3146173565268046, 0.31180028751323036]
ROC: [0.8875334451855932, 0.8828229861493585, 0.8884565450570822, 0.8842243326907582]
PR_AUC: [0.2501124971544412, 0.245341556634164, 0.24394570431338175, 0.23427986699844638]


[32m[I 2021-12-09 11:29:35,993][0m Trial 7 finished with value: 0.3126011234989732 and parameters: {'n_estimators': 135, 'subsample': 0.8740702165976071, 'max_depth': 11, 'max_features': 17, 'learning_rate': 0.12955901949366017, 'min_samples_leaf': 46}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.30967409974197835, 0.31015356363815355, 0.31880127474683184, 0.31177555586892897]
ROC: [0.8870125587309359, 0.8841201876330158, 0.891735426908458, 0.8842538265522951]
PR_AUC: [0.25105763582434576, 0.23723241120257244, 0.24298092328400575, 0.24517450495291537]


[32m[I 2021-12-09 11:29:55,616][0m Trial 8 finished with value: 0.31555412342470923 and parameters: {'n_estimators': 44, 'subsample': 0.9482237745826528, 'max_depth': 10, 'max_features': 14, 'learning_rate': 0.15612558423606065, 'min_samples_leaf': 27}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.31333118526184917, 0.3129387868882539, 0.3215506519354949, 0.31439586961323895]
ROC: [0.8918953433422405, 0.8880106021663301, 0.8938172197233951, 0.8891287071319048]
PR_AUC: [0.25313198235115797, 0.2600553906845552, 0.24955463332384956, 0.2556274980075258]


[32m[I 2021-12-09 11:30:11,381][0m Trial 9 finished with value: 0.31816975084184407 and parameters: {'n_estimators': 89, 'subsample': 0.8002853761125517, 'max_depth': 4, 'max_features': 17, 'learning_rate': 0.14292147161182367, 'min_samples_leaf': 35}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.3184256330367897, 0.31479248689347905, 0.31961465249360493, 0.31984623094350273]
ROC: [0.8956328853293353, 0.8920367841795347, 0.9001234433327213, 0.891846965957114]
PR_AUC: [0.2865464614528554, 0.2675354912744491, 0.2680505913482358, 0.2604620845900611]


[32m[I 2021-12-09 11:31:14,518][0m Trial 10 finished with value: 0.31159321208617335 and parameters: {'n_estimators': 78, 'subsample': 0.9869602664638994, 'max_depth': 18, 'max_features': 10, 'learning_rate': 0.09172353354166263, 'min_samples_leaf': 4}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.3138161541159152, 0.3087751887620664, 0.3169955344602467, 0.3067859710064651]
ROC: [0.8857425766136984, 0.881829808627243, 0.889855675737239, 0.8834629320808217]
PR_AUC: [0.24542094134327236, 0.23083401870597198, 0.24708570010263997, 0.23425477327491287]


[32m[I 2021-12-09 11:31:32,102][0m Trial 11 finished with value: 0.31800238321670815 and parameters: {'n_estimators': 59, 'subsample': 0.913378595521485, 'max_depth': 6, 'max_features': 12, 'learning_rate': 0.10567843309335981, 'min_samples_leaf': 12}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.31730755903580155, 0.31752785985732573, 0.32104396627138615, 0.3161301477023192]
ROC: [0.8956716532715936, 0.8903834467154872, 0.8985199442948548, 0.8916613224979516]
PR_AUC: [0.27701758730308845, 0.2630299622489142, 0.26419687241080114, 0.2680968270264322]


[32m[I 2021-12-09 11:31:37,225][0m Trial 12 finished with value: 0.31561254766985064 and parameters: {'n_estimators': 10, 'subsample': 0.8700428616176594, 'max_depth': 7, 'max_features': 19, 'learning_rate': 0.08034296707314204, 'min_samples_leaf': 37}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.3148135409952374, 0.31622373138890536, 0.31538775942972425, 0.31602515886553567]
ROC: [0.8902435782693837, 0.8837910160366904, 0.8934760002107854, 0.8874507762750357]
PR_AUC: [0.24373334903585886, 0.2450869348051489, 0.25484446778008657, 0.24614093899564693]


[32m[I 2021-12-09 11:32:05,154][0m Trial 13 finished with value: 0.3194684083307261 and parameters: {'n_estimators': 55, 'subsample': 0.9076814971706537, 'max_depth': 16, 'max_features': 15, 'learning_rate': 0.11562625731777346, 'min_samples_leaf': 49}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.31730464507083583, 0.31822993194883564, 0.3235941253547109, 0.318744930948522]
ROC: [0.8912269495950879, 0.8873191962616187, 0.8943408153655766, 0.8891707592586926]
PR_AUC: [0.2582339968968553, 0.24201043820702847, 0.24933235667748604, 0.25117678352198375]


[32m[I 2021-12-09 11:32:48,175][0m Trial 14 finished with value: 0.3129983554246025 and parameters: {'n_estimators': 75, 'subsample': 0.8403871437580014, 'max_depth': 17, 'max_features': 11, 'learning_rate': 0.11284056296551773, 'min_samples_leaf': 16}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.31048739817652454, 0.31022434930310205, 0.3177396477991388, 0.31354202641964457]
ROC: [0.88780999479836, 0.8842188658044864, 0.8898119037662594, 0.8849801022680526]
PR_AUC: [0.2500374801747404, 0.23210746335704674, 0.2369947666035628, 0.237872148110495]


[32m[I 2021-12-09 11:34:18,213][0m Trial 15 finished with value: 0.3128665877521826 and parameters: {'n_estimators': 127, 'subsample': 0.9121780388310307, 'max_depth': 16, 'max_features': 20, 'learning_rate': 0.09625971787342757, 'min_samples_leaf': 3}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.3175002426678758, 0.30922317103462305, 0.316624769653458, 0.30811816765277367]
ROC: [0.888922121589186, 0.8840768831296875, 0.8902192805757841, 0.8851124634089124]
PR_AUC: [0.2529436280003533, 0.24034167533902268, 0.2510022391520761, 0.24252355537417647]


[32m[I 2021-12-09 11:35:19,837][0m Trial 16 finished with value: 0.3128520647830608 and parameters: {'n_estimators': 109, 'subsample': 0.9406334366958918, 'max_depth': 20, 'max_features': 16, 'learning_rate': 0.11797660761955531, 'min_samples_leaf': 49}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.3112271431522205, 0.3098094049230841, 0.3179767109109325, 0.3123950001460063]
ROC: [0.8886233247659262, 0.8835278211247783, 0.8912160278508728, 0.8850820345834649]
PR_AUC: [0.2474482931231553, 0.23640565183250617, 0.2363021431704712, 0.23701729456841308]


[32m[I 2021-12-09 11:35:36,534][0m Trial 17 finished with value: 0.31886634684111476 and parameters: {'n_estimators': 33, 'subsample': 0.844657081309488, 'max_depth': 15, 'max_features': 13, 'learning_rate': 0.08038437038246449, 'min_samples_leaf': 32}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.31940493020617344, 0.3163453905631556, 0.3227403687598206, 0.3169746978353094]
ROC: [0.8930771865586901, 0.8881592993764832, 0.8937097127878749, 0.8912470378643385]
PR_AUC: [0.25233222568508956, 0.24538376108884238, 0.24552599780069304, 0.26109603518737434]


[32m[I 2021-12-09 11:35:47,094][0m Trial 18 finished with value: 0.3121460376789501 and parameters: {'n_estimators': 65, 'subsample': 0.8931260292704439, 'max_depth': 3, 'max_features': 14, 'learning_rate': 0.18656989022946982, 'min_samples_leaf': 21}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.3126730149386404, 0.31045075735218275, 0.3143410094386351, 0.31111936898634207]
ROC: [0.8956149196975569, 0.8899795971618764, 0.8993258903983845, 0.8917295323650507]
PR_AUC: [0.2854119101936469, 0.2688828042961129, 0.267363358861766, 0.2708767166302575]


[32m[I 2021-12-09 11:36:48,305][0m Trial 19 finished with value: 0.31452160727090805 and parameters: {'n_estimators': 99, 'subsample': 0.9269591326190438, 'max_depth': 15, 'max_features': 18, 'learning_rate': 0.1442383626335499, 'min_samples_leaf': 7}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.31251085117391364, 0.31505715657610706, 0.3198340100869046, 0.31068441124670676]
ROC: [0.8868108039423671, 0.8844996013775847, 0.8888101630867146, 0.8841035098545197]
PR_AUC: [0.25198692887952945, 0.23781803821542538, 0.23800332107518252, 0.2401037346518432]


[32m[I 2021-12-09 11:38:16,750][0m Trial 20 finished with value: 0.31212397101582884 and parameters: {'n_estimators': 123, 'subsample': 0.9666012370478584, 'max_depth': 20, 'max_features': 16, 'learning_rate': 0.09516443398500667, 'min_samples_leaf': 11}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.31413817832304636, 0.31036721235155335, 0.313295770458798, 0.3106947229299177]
ROC: [0.8875965958155695, 0.8840439691573343, 0.8896101702262205, 0.8844498502357808]
PR_AUC: [0.24312034643646108, 0.23185240723325717, 0.24163823043697236, 0.2440584336620562]


[32m[I 2021-12-09 11:38:30,074][0m Trial 21 finished with value: 0.31793837940715775 and parameters: {'n_estimators': 26, 'subsample': 0.8422960162108752, 'max_depth': 15, 'max_features': 13, 'learning_rate': 0.0870304757548705, 'min_samples_leaf': 33}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.3176269085144189, 0.31863611859286245, 0.3194779160842686, 0.316012574437081]
ROC: [0.8924464983273155, 0.8871628708293386, 0.895769970218061, 0.8910764494490511]
PR_AUC: [0.25125533517934595, 0.23879311429442132, 0.2521773078345739, 0.26034958484836973]


[32m[I 2021-12-09 11:38:47,929][0m Trial 22 finished with value: 0.3181297485194846 and parameters: {'n_estimators': 36, 'subsample': 0.8496302102601551, 'max_depth': 14, 'max_features': 13, 'learning_rate': 0.09892424992504256, 'min_samples_leaf': 31}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.318453810015736, 0.31820750097023204, 0.32128512539987825, 0.3145725576920922]
ROC: [0.8914366407094629, 0.8879781344130985, 0.8934805473961203, 0.892269633392151]
PR_AUC: [0.25323851815539183, 0.2494129906659161, 0.24715354026872283, 0.26404838420760207]


[32m[I 2021-12-09 11:39:16,110][0m Trial 23 finished with value: 0.31856386681302135 and parameters: {'n_estimators': 58, 'subsample': 0.8265540110211977, 'max_depth': 18, 'max_features': 14, 'learning_rate': 0.08190386388453622, 'min_samples_leaf': 40}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.31710164019681464, 0.3175834766910986, 0.3217024938288767, 0.3178678565352953]
ROC: [0.8920012247652461, 0.8866777668985307, 0.8957414334428059, 0.8898455907101985]
PR_AUC: [0.2540962794681403, 0.2441543272593494, 0.2548644739892797, 0.251754640289544]


[32m[I 2021-12-09 11:39:56,259][0m Trial 24 finished with value: 0.3165383440163901 and parameters: {'n_estimators': 79, 'subsample': 0.8799731267520511, 'max_depth': 17, 'max_features': 11, 'learning_rate': 0.11764679805898987, 'min_samples_leaf': 49}. Best is trial 2 with value: 0.32035225259792793.[0m


MCC: [0.3171920886697995, 0.3135512030302957, 0.3200714342494366, 0.3153386501160285]
ROC: [0.8905542423964261, 0.8866425368364509, 0.8915925305468863, 0.8863563053966117]
PR_AUC: [0.2517535011202005, 0.24690433907681875, 0.24431934366495422, 0.24838861276610702]
CPU times: user 16min 9s, sys: 22.3 s, total: 16min 31s
Wall time: 16min 31s


In [14]:
best = study.best_trial
best

FrozenTrial(number=2, values=[0.32035225259792793], datetime_start=datetime.datetime(2021, 12, 9, 11, 24, 7, 171647), datetime_complete=datetime.datetime(2021, 12, 9, 11, 24, 33, 109668), params={'n_estimators': 82, 'subsample': 0.813187269381181, 'max_depth': 8, 'max_features': 20, 'learning_rate': 0.10560624242958987, 'min_samples_leaf': 23}, distributions={'n_estimators': IntUniformDistribution(high=200, low=10, step=1), 'subsample': UniformDistribution(high=1.0, low=0.8), 'max_depth': IntUniformDistribution(high=20, low=3, step=1), 'max_features': IntUniformDistribution(high=20, low=10, step=1), 'learning_rate': UniformDistribution(high=0.2, low=0.08), 'min_samples_leaf': IntUniformDistribution(high=50, low=1, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=2, state=TrialState.COMPLETE, value=None)

In [15]:
%%time
gb_params = study.best_params
#gb_params['random_state'] = 42
gb = GradientBoostingClassifier(**gb_params)
gb.fit(X_rus, y_rus)
y_pred = gb.predict(X_test)
probs = gb.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[30774  8445]
 [  184  1544]]
              precision    recall  f1-score   support

           0       0.99      0.78      0.88     39219
           1       0.15      0.89      0.26      1728

    accuracy                           0.79     40947
   macro avg       0.57      0.84      0.57     40947
weighted avg       0.96      0.79      0.85     40947

ROC AUC prob1: 0.8929236056810146
MCC: 0.3174852482145634
F2: 0.45677770546121543
PR_AUC: 0.26745242409145414
CPU times: user 1.69 s, sys: 82.4 ms, total: 1.78 s
Wall time: 1.77 s


In [16]:
%%time
gb_params = study.best_params
gb_params['random_state'] = 42
gb_weight = GradientBoostingClassifier(**gb_params)
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[31741  7478]
 [  258  1470]]
              precision    recall  f1-score   support

           0       0.99      0.81      0.89     39219
           1       0.16      0.85      0.28      1728

    accuracy                           0.81     40947
   macro avg       0.58      0.83      0.58     40947
weighted avg       0.96      0.81      0.87     40947

ROC AUC prob1: 0.8902099886274888
MCC: 0.3211049909932943
F2: 0.4634300126103405
PR_AUC: 0.27181562205926146
CPU times: user 12.9 s, sys: 146 ms, total: 13 s
Wall time: 13 s


In [24]:
# No Sitenum - 5
coeffs_wgt = pd.Series(data=gb_weight.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs_wgt[0:20]

Delmode           0.231738
CS_FTP            0.080058
Admefface         0.072331
TrialLabor        0.053035
Lac_Min           0.034673
Delfetalpos       0.031561
transfus_yes      0.031359
CS_NRFHT          0.026901
prelaborCD        0.024939
Insurance         0.020519
BESTGA            0.020229
Hxanemia          0.019345
new_BMI           0.015959
HospElectInd      0.014446
new_age           0.013785
Admpresent        0.011479
AdmSBP            0.011131
Admcontract       0.011049
AdmDBP            0.010503
Inoxy_incrdose    0.010442
dtype: float64

In [1]:
plt.rcParams["figure.figsize"] = (14, 8)
x_labels = coeffs_wgt.index.values[0:25]
print(x_labels)
fig, ax = plt.subplots(1,1)  # Create a figure and an axes.
ax.set_title('Top Predictors: All Sites combined; target: High_EBLoss, vars: PI')
ax.bar(x_labels, coeffs_wgt.values[0:25])
ax.set_ylabel('Coeff')
ax.set_xlabel('Variable')
plt.draw()
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=16)
#plt.xlim(1,20)
plt.show()

NameError: name 'plt' is not defined

In [26]:
# No Sitenum - 5
coeffs = pd.Series(data=gb.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs[0:20]

Delmode           0.261331
Lac_Min           0.096959
TrialLabor        0.042401
Delfetalpos       0.039474
Analgesia         0.037037
CS_FTP            0.036993
transfus_yes      0.029821
CS_NRFHT          0.025537
Insurance         0.021843
Dilat_lst         0.020394
Hxanemia          0.018626
new_BMI           0.018430
BESTGA            0.017936
HospElectInd      0.015323
Admcontract       0.014507
Inoxy_incrdose    0.014255
prelaborCD        0.014023
Augment           0.013633
AdmSBP            0.013415
new_age           0.012889
dtype: float64

In [4]:
plt.rcParams["figure.figsize"] = (6, 4)
sns.heatmap(confusion_matrix(y_test, y_pred), cmap='magma', annot=True, fmt='g') # 25
plt.show()

NameError: name 'plt' is not defined

In [20]:
gb_params # 5 runs

{'n_estimators': 82,
 'subsample': 0.813187269381181,
 'max_depth': 8,
 'max_features': 20,
 'learning_rate': 0.10560624242958987,
 'min_samples_leaf': 23,
 'random_state': 42}

In [3]:
from optuna.visualization import plot_optimization_history

plotly_config = {"staticPlot": True}

fig = plot_optimization_history(study)
fig.show(config=plotly_config)

NameError: name 'study' is not defined

In [2]:
from optuna.visualization import plot_param_importances

fig = plot_param_importances(study)
fig.show(config=plotly_config)

NameError: name 'study' is not defined

In [23]:
np.mean([0.3149689279176914, 0.31449457458602254, 0.31066798538596907, 0.3219519143488398])

0.3155208505596307

<h1 align='center'>Thanks and make sure to learn!</h1>