In [1]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
#import plotly.express as px
import seaborn as sns

from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import StratifiedKFold
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, \
                            accuracy_score, f1_score, average_precision_score, \
                            fbeta_score, precision_recall_curve, auc
from sklearn.metrics import matthews_corrcoef

from sklearn.model_selection import train_test_split

import optuna
from optuna.samplers import TPESampler
sns.set(style='white', context='notebook', palette='deep')
%config Completer.use_jedi = False

In [2]:
csl_df = pd.read_csv('../../data/csl/Sites/CSL_he_PI_s50.csv', index_col=0)
X = csl_df.drop(['high_EBL','Sitenum'], axis=1, inplace=False)
y = csl_df['high_EBL'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=7)
X_train                                                    

Unnamed: 0_level_0,transfus_yes,Accrete,AdmBishop,Admcervpos,Admconsistency,Admcontract,AdmDBP,Admefface,Admpresent,AdmSBP,...,TD_nos,ThreatenedPB,threatpb9,UnspecHBP,uscar,version9,new_age,new_BMI,new_high_Age,new_high_BMI
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
50-13687,0,0,8,2,3,2.0,75,70.0,1,123.0,...,0,0,0,0,0,0,24,24,1,1
50-08549,0,0,9,2,3,3.0,77,80.0,1,128.0,...,0,0,0,0,0,0,35,24,1,1
50-14151,0,0,13,1,3,2.0,78,90.0,1,124.0,...,0,0,0,0,0,0,22,24,1,1
50-00959,0,0,8,8,8,2.0,60,0.0,77,107.0,...,0,0,0,0,1,0,41,24,2,1
50-15186,0,0,8,8,8,2.0,77,0.0,77,124.0,...,0,0,0,0,0,0,34,24,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50-10286,0,0,8,2,3,1.0,73,80.0,1,104.0,...,0,0,0,0,0,0,22,20,1,1
50-10663,0,0,8,8,8,3.0,62,0.0,77,117.0,...,0,0,0,0,0,0,29,24,1,1
50-07713,0,0,8,8,8,3.0,72,90.0,1,125.0,...,0,0,0,0,0,0,29,24,1,1
50-13792,0,0,8,3,3,2.0,67,70.0,1,122.0,...,0,0,0,0,0,0,24,24,1,1


In [3]:
X.shape

(11175, 195)

<h1 align='center'>Exploratory Data Analysis</h1>

<h1 align='center'>Modelling</h1>

In [8]:
print(X_train.shape)
print(X.shape)
print(y.shape)

(7822, 195)
(11175, 195)
(11175,)


Here we define our validation set

<h1 align='center'>Modelling Using Undersampling</h1>

We will use `imblearn`'s RandomUnderSampler to undersample from the majority class so that they match

In [9]:
sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
X_rus, y_rus = sampler.fit_resample(X_train, y_train)

# Try with no undersampling
#X_rus, y_rus = X_train, y_train

In [10]:
print(X_rus.shape)
print(y_rus.shape)
print(np.bincount(y_rus))

(938, 195)
(938,)
[469 469]


<h1 align='center'>Basic Gradient Boosting</h1>

In [11]:
%%time
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_rus = GradientBoostingClassifier()   # If not undersampling
gb_rus.fit(X_rus, y_rus)
print(gb_rus.get_params())
y_pred = gb_rus.predict(X_test)
probs = gb_rus.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[2468  684]
 [  24  177]]
              precision    recall  f1-score   support

           0       0.99      0.78      0.87      3152
           1       0.21      0.88      0.33       201

    accuracy                           0.79      3353
   macro avg       0.60      0.83      0.60      3353
weighted avg       0.94      0.79      0.84      3353

ROC AUC prob1: 0.894348845872162
MCC: 0.36059245889417924
F2: 0.5315315315315315
PR_AUC: 0.34201454182809427
CPU times: user 389 ms, sys: 2.86 ms, total

In [12]:
%%time
# Not undersampled
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_weight = GradientBoostingClassifier()  
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
print(gb_weight.get_params())
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[2516  636]
 [  26  175]]
              precision    recall  f1-score   support

           0       0.99      0.80      0.88      3152
           1       0.22      0.87      0.35       201

    accuracy                           0.80      3353
   macro avg       0.60      0.83      0.61      3353
weighted avg       0.94      0.80      0.85      3353

ROC AUC prob1: 0.8926449604768038
MCC: 0.37079573149315526
F2: 0.5417956656346749
PR_AUC: 0.35054167764247335
CPU times: user 2.42 s, sys: 5.2 ms, total

<h1 align='center'>Hyperparameter Tuning with Cross Validation</h1>

In [13]:
%%time
def create_model(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    subsample = trial.suggest_float('subsample', 0.8, 1.0)
    max_depth = trial.suggest_int('max_depth',3,20)
    max_features = trial.suggest_int('max_features',10,20)
    learning_rate = trial.suggest_uniform('learning_rate',0.08,0.2)
    min_samples_leaf = trial.suggest_int('min_samples_leaf',1,50)
    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
                           max_depth=max_depth, learning_rate=learning_rate, 
                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
#                           learning_rate=learning_rate, 
#                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate) 
    return model

def objective(trial):
    model = create_model(trial)
    
    mcc_list = []
    roc_list = []
    pr_list = []
    f2_list = []
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=11)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train_c, X_test_c = X.iloc[train_idx], X.iloc[test_idx]
        y_train_c, y_test_c = y[train_idx], y[test_idx]
        sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
        X_res, y_res = sampler.fit_resample(X_train_c, y_train_c)

#        weights = class_weight.compute_sample_weight('balanced', y=y_train)
#        model.fit(X_train, y_train, sample_weight=weights)
        model.fit(X_res, y_res)
        y_pred = model.predict(X_test_c)
        probs = model.predict_proba(X_test_c)
        prob1 = probs[:, 1]  # Only positives
        precision, recall, pr_thresh = precision_recall_curve(y_test_c, prob1)
        roc = roc_auc_score(y_test_c, prob1)
        roc_list.append(roc)
        mcc = matthews_corrcoef(y_test_c, y_pred)  # MWB
        mcc_list.append(mcc)
        f2 = fbeta_score(y_test_c, y_pred, beta=2.0, average=None)[1]
        pr_auc = auc(recall, precision)
        pr_list.append(pr_auc)
        avg_precision = average_precision_score(y_test_c, prob1)
#        print(f'ROC: {roc}')
#        print(f'PR_AUC: {pr_auc}')
#        print(f'avg_prec: {avg_precision}')
        
    print(f'MCC: {mcc_list}')
    print(f'ROC: {roc_list}')
    print(f'PR_AUC: {pr_list}')
    return np.mean(mcc_list)

sampler = TPESampler(seed=7)
study = optuna.create_study(sampler=sampler,direction='maximize')
#study.optimize(objective,n_trials=60)
study.optimize(objective,n_trials=25)
#study.optimize(objective,n_trials=5)

[32m[I 2021-12-10 14:35:37,116][0m A new study created in memory with name: no-name-b7b22ce5-207e-4a74-8088-0f0ca8377269[0m
[32m[I 2021-12-10 14:35:38,122][0m Trial 0 finished with value: 0.38311164347611826 and parameters: {'n_estimators': 24, 'subsample': 0.955983758448023, 'max_depth': 10, 'max_features': 17, 'learning_rate': 0.1973587414395923, 'min_samples_leaf': 27}. Best is trial 0 with value: 0.38311164347611826.[0m


MCC: [0.37716894682933133, 0.3876087228162542, 0.3889104192862239, 0.37875848497266384]
ROC: [0.8980303572527575, 0.8878069125593878, 0.9086379791825338, 0.8911164723105198]
PR_AUC: [0.32098475408182586, 0.26752594545025743, 0.30960392965705963, 0.3038204947368855]


[32m[I 2021-12-10 14:35:40,652][0m Trial 1 finished with value: 0.38062334119182795 and parameters: {'n_estimators': 105, 'subsample': 0.8144102266719524, 'max_depth': 7, 'max_features': 15, 'learning_rate': 0.16150759953451288, 'min_samples_leaf': 41}. Best is trial 0 with value: 0.38311164347611826.[0m


MCC: [0.3716694202750342, 0.3757038838803931, 0.3872849104928098, 0.38783515011907455]
ROC: [0.8900501243421036, 0.8848012548507598, 0.8998386102346498, 0.8867816537526623]
PR_AUC: [0.2878372365326239, 0.26390388399314535, 0.2873027030673795, 0.29200991819820094]


[32m[I 2021-12-10 14:35:42,842][0m Trial 2 finished with value: 0.3858550059689128 and parameters: {'n_estimators': 82, 'subsample': 0.813187269381181, 'max_depth': 8, 'max_features': 20, 'learning_rate': 0.10560624242958987, 'min_samples_leaf': 23}. Best is trial 2 with value: 0.3858550059689128.[0m


MCC: [0.37362614498882735, 0.38723140952966767, 0.3889104192862239, 0.39365205007093246]
ROC: [0.8981785192462429, 0.8856535378812604, 0.8991495303376492, 0.8946554719958407]
PR_AUC: [0.3130284654754564, 0.2568259891655635, 0.2813883028109565, 0.29816754995657685]


[32m[I 2021-12-10 14:35:47,713][0m Trial 3 finished with value: 0.3714953478041795 and parameters: {'n_estimators': 187, 'subsample': 0.8049798455100696, 'max_depth': 13, 'max_features': 20, 'learning_rate': 0.10763634548251577, 'min_samples_leaf': 28}. Best is trial 2 with value: 0.3858550059689128.[0m


MCC: [0.3606412802379647, 0.35576849575650427, 0.369133989413383, 0.400437625808866]
ROC: [0.8910530670672359, 0.8830037536720705, 0.8903070938962028, 0.8929087749862042]
PR_AUC: [0.2877274381156708, 0.25637317224974376, 0.261765064350853, 0.2835179395606865]


[32m[I 2021-12-10 14:35:52,602][0m Trial 4 finished with value: 0.37954478096743954 and parameters: {'n_estimators': 183, 'subsample': 0.8266338891518501, 'max_depth': 12, 'max_features': 18, 'learning_rate': 0.16028158890606967, 'min_samples_leaf': 24}. Best is trial 2 with value: 0.3858550059689128.[0m


MCC: [0.3744834124345514, 0.3868345528533507, 0.38462823696576287, 0.3722329216160933]
ROC: [0.891210346721859, 0.8862383491096363, 0.8959557356834584, 0.8868705848014555]
PR_AUC: [0.28881638346868715, 0.254925529522089, 0.26307052752805765, 0.2762358949869827]


[32m[I 2021-12-10 14:35:53,912][0m Trial 5 finished with value: 0.3825323022577879 and parameters: {'n_estimators': 49, 'subsample': 0.8981531778182141, 'max_depth': 9, 'max_features': 15, 'learning_rate': 0.12390684629367116, 'min_samples_leaf': 42}. Best is trial 2 with value: 0.3858550059689128.[0m


MCC: [0.3768040942275017, 0.3838059716375483, 0.38574395807934736, 0.38377518508675423]
ROC: [0.9012899211094371, 0.8906312334530155, 0.8980139085337107, 0.8903480168376121]
PR_AUC: [0.3272949826951657, 0.27903772284034867, 0.27883314184602015, 0.2827074602104866]


[32m[I 2021-12-10 14:35:59,053][0m Trial 6 finished with value: 0.38364536561570184 and parameters: {'n_estimators': 156, 'subsample': 0.8627989354425325, 'max_depth': 13, 'max_features': 13, 'learning_rate': 0.13434115190556806, 'min_samples_leaf': 18}. Best is trial 2 with value: 0.3858550059689128.[0m


MCC: [0.37558112665313514, 0.38491593598719326, 0.382327123398736, 0.3917572764237429]
ROC: [0.8897674768468392, 0.8845859173829471, 0.895520527327458, 0.8921038349804579]
PR_AUC: [0.2647441625409147, 0.2596331760341017, 0.2796966327069434, 0.28738300475873285]


[32m[I 2021-12-10 14:36:02,371][0m Trial 7 finished with value: 0.3865833832605876 and parameters: {'n_estimators': 135, 'subsample': 0.8740702165976071, 'max_depth': 11, 'max_features': 17, 'learning_rate': 0.12955901949366017, 'min_samples_leaf': 46}. Best is trial 7 with value: 0.3865833832605876.[0m


MCC: [0.3780877378456175, 0.3810919362444566, 0.3993187088332017, 0.38783515011907455]
ROC: [0.8958603539020171, 0.8841031081130092, 0.8931971493852681, 0.8867679720528479]
PR_AUC: [0.30135353239282614, 0.2654339729072892, 0.26494491335263126, 0.27847514807177853]


[32m[I 2021-12-10 14:36:03,947][0m Trial 8 finished with value: 0.37984180531362466 and parameters: {'n_estimators': 44, 'subsample': 0.9482237745826528, 'max_depth': 10, 'max_features': 14, 'learning_rate': 0.15612558423606065, 'min_samples_leaf': 27}. Best is trial 7 with value: 0.3865833832605876.[0m


MCC: [0.37034054594522997, 0.39447564402457314, 0.3715000350543422, 0.38305099623035316]
ROC: [0.8975516800430354, 0.8902028252275778, 0.9000970151960251, 0.893549534594178]
PR_AUC: [0.31577116031610614, 0.28080881093596116, 0.27495081810571287, 0.28916608628154455]


[32m[I 2021-12-10 14:36:05,563][0m Trial 9 finished with value: 0.3801980202941383 and parameters: {'n_estimators': 89, 'subsample': 0.8002853761125517, 'max_depth': 4, 'max_features': 17, 'learning_rate': 0.14292147161182367, 'min_samples_leaf': 35}. Best is trial 7 with value: 0.3865833832605876.[0m


MCC: [0.3657881566846982, 0.38493535889365765, 0.3776262539005808, 0.3924423116976165]
ROC: [0.8996077126295561, 0.8874193051173249, 0.898453650310086, 0.8906832184830644]
PR_AUC: [0.32674617667453726, 0.27667255316173256, 0.27242688228868145, 0.28608155294802984]


[32m[I 2021-12-10 14:36:14,597][0m Trial 10 finished with value: 0.38317468212351313 and parameters: {'n_estimators': 136, 'subsample': 0.9087175647948915, 'max_depth': 18, 'max_features': 10, 'learning_rate': 0.09172353354166263, 'min_samples_leaf': 4}. Best is trial 7 with value: 0.3865833832605876.[0m


MCC: [0.36928354462821367, 0.38734913257291786, 0.3853514432249526, 0.39071460806796854]
ROC: [0.8924389515601457, 0.8962390744568962, 0.8933059514742684, 0.8945574198138376]
PR_AUC: [0.27709725450531686, 0.2800275214780988, 0.27009628097027144, 0.28285268953134984]


[32m[I 2021-12-10 14:36:16,518][0m Trial 11 finished with value: 0.38533147372657645 and parameters: {'n_estimators': 84, 'subsample': 0.8616479235360296, 'max_depth': 17, 'max_features': 20, 'learning_rate': 0.0820677705499854, 'min_samples_leaf': 48}. Best is trial 7 with value: 0.3865833832605876.[0m


MCC: [0.3828959521173896, 0.38882719512521186, 0.3858183415505269, 0.3837844061131774]
ROC: [0.8984588873262229, 0.8942194356798319, 0.9019625176803395, 0.8939394630388879]
PR_AUC: [0.31688439658176065, 0.2731131607908945, 0.2813267117169207, 0.31123719743065253]


[32m[I 2021-12-10 14:36:19,671][0m Trial 12 finished with value: 0.38423132564511836 and parameters: {'n_estimators': 130, 'subsample': 0.9983488269913793, 'max_depth': 5, 'max_features': 18, 'learning_rate': 0.11112453890800206, 'min_samples_leaf': 13}. Best is trial 7 with value: 0.3865833832605876.[0m


MCC: [0.37535091980341395, 0.38569852581352043, 0.38153637077385316, 0.394339486189686]
ROC: [0.8954842503800925, 0.8934963551300183, 0.8947407790229572, 0.898306205562979]
PR_AUC: [0.3099934375360088, 0.30362367702383924, 0.2869214879066779, 0.31515404370791045]


[32m[I 2021-12-10 14:36:22,338][0m Trial 13 finished with value: 0.3849897867065429 and parameters: {'n_estimators': 69, 'subsample': 0.8492197673991577, 'max_depth': 15, 'max_features': 19, 'learning_rate': 0.10608646327289792, 'min_samples_leaf': 12}. Best is trial 7 with value: 0.3865833832605876.[0m


MCC: [0.37571328221664374, 0.38418177001119685, 0.38542089889891484, 0.3946431956994161]
ROC: [0.8972166059962298, 0.8871336996336996, 0.8947611794146448, 0.8983016449963743]
PR_AUC: [0.31388298104464996, 0.2655668087577261, 0.27659051067456386, 0.296341895429119]


[32m[I 2021-12-10 14:36:25,859][0m Trial 14 finished with value: 0.3795886949132491 and parameters: {'n_estimators': 133, 'subsample': 0.8967899887012859, 'max_depth': 8, 'max_features': 17, 'learning_rate': 0.12341180929200884, 'min_samples_leaf': 36}. Best is trial 7 with value: 0.3865833832605876.[0m


MCC: [0.36897211291989357, 0.3883653544056773, 0.3793087795237028, 0.3817085328037226]
ROC: [0.8912787291803906, 0.8869750299205744, 0.8957403982156457, 0.8907356649990195]
PR_AUC: [0.29794475381578256, 0.28877347564060385, 0.2659982193886359, 0.2929620017263046]


[32m[I 2021-12-10 14:36:28,213][0m Trial 15 finished with value: 0.3766033397051845 and parameters: {'n_estimators': 109, 'subsample': 0.8401823443116015, 'max_depth': 6, 'max_features': 10, 'learning_rate': 0.1808280525874309, 'min_samples_leaf': 47}. Best is trial 7 with value: 0.3865833832605876.[0m


MCC: [0.3876655758072028, 0.36222366901949715, 0.38781678040204143, 0.36870733359199664]
ROC: [0.8901185068006355, 0.8774163130598772, 0.8927030065643926, 0.8850281158931185]
PR_AUC: [0.2810485241860044, 0.24245821038344598, 0.26619728487391714, 0.2767567656755675]


[32m[I 2021-12-10 14:36:33,458][0m Trial 16 finished with value: 0.3811794967991 and parameters: {'n_estimators': 156, 'subsample': 0.8786217093617273, 'max_depth': 15, 'max_features': 12, 'learning_rate': 0.09359443598916382, 'min_samples_leaf': 18}. Best is trial 7 with value: 0.3865833832605876.[0m


MCC: [0.37019865229716764, 0.38569161931043866, 0.3791816554237538, 0.3896460601650399]
ROC: [0.8898723299499214, 0.8846017843542595, 0.8950739854205201, 0.8911164723105198]
PR_AUC: [0.27407046998277734, 0.25903235845664324, 0.26566236239537244, 0.2803052966826101]


[32m[I 2021-12-10 14:36:34,654][0m Trial 17 finished with value: 0.38535581907626926 and parameters: {'n_estimators': 63, 'subsample': 0.924580558278454, 'max_depth': 3, 'max_features': 19, 'learning_rate': 0.13930173816515404, 'min_samples_leaf': 32}. Best is trial 7 with value: 0.3865833832605876.[0m


MCC: [0.37767019274406216, 0.389274379241052, 0.389524370917243, 0.3849543334027198]
ROC: [0.9078979460188872, 0.8974041634932723, 0.9049863090704674, 0.8974875838574183]
PR_AUC: [0.34353349216399875, 0.2981446985043001, 0.3061519018659528, 0.3106431662188092]


[32m[I 2021-12-10 14:36:35,160][0m Trial 18 finished with value: 0.38386618405606415 and parameters: {'n_estimators': 10, 'subsample': 0.8293633716154828, 'max_depth': 11, 'max_features': 16, 'learning_rate': 0.11961751882400314, 'min_samples_leaf': 21}. Best is trial 7 with value: 0.3865833832605876.[0m


MCC: [0.376132393323029, 0.38117888290289137, 0.3857921830642953, 0.39236127693404105]
ROC: [0.9046623160226939, 0.8898254179813586, 0.8996878740071809, 0.8960270624022328]
PR_AUC: [0.31130155127289855, 0.2597070660258781, 0.2802851818335626, 0.2991893435946177]


[32m[I 2021-12-10 14:36:41,113][0m Trial 19 finished with value: 0.37568679696362467 and parameters: {'n_estimators': 162, 'subsample': 0.8633168552618011, 'max_depth': 8, 'max_features': 19, 'learning_rate': 0.09781664319532736, 'min_samples_leaf': 1}. Best is trial 7 with value: 0.3865833832605876.[0m


MCC: [0.3626405727333516, 0.38542252308325503, 0.372419777148104, 0.3822643148897882]
ROC: [0.8925734370619249, 0.8966720161027091, 0.8976217676712727, 0.8937433586748816]
PR_AUC: [0.30262484326774897, 0.27862660017698015, 0.30098779593109115, 0.29466157100855794]


[32m[I 2021-12-10 14:36:43,779][0m Trial 20 finished with value: 0.37253760626403576 and parameters: {'n_estimators': 115, 'subsample': 0.8822469757571317, 'max_depth': 15, 'max_features': 16, 'learning_rate': 0.15109563069647927, 'min_samples_leaf': 41}. Best is trial 7 with value: 0.3865833832605876.[0m


MCC: [0.36335997828759226, 0.3710911237261358, 0.382327123398736, 0.373372199643679]
ROC: [0.8894118880624742, 0.881439723642694, 0.8917464548652667, 0.8858991841146344]
PR_AUC: [0.2828985617055114, 0.2439714993842777, 0.26374660806169065, 0.2867164909647322]


[32m[I 2021-12-10 14:36:44,866][0m Trial 21 finished with value: 0.3890382077666189 and parameters: {'n_estimators': 61, 'subsample': 0.9247900894485763, 'max_depth': 3, 'max_features': 19, 'learning_rate': 0.13522323478552553, 'min_samples_leaf': 35}. Best is trial 21 with value: 0.3890382077666189.[0m


MCC: [0.379486108072869, 0.3948904313118821, 0.38877270379209494, 0.39300358788962964]
ROC: [0.9058487516782195, 0.8981521778551482, 0.9076156928879702, 0.8952164216882306]
PR_AUC: [0.3343847877485201, 0.28739177949526373, 0.328728889246385, 0.30654864226365897]


[32m[I 2021-12-10 14:36:46,238][0m Trial 22 finished with value: 0.3836158868466414 and parameters: {'n_estimators': 86, 'subsample': 0.928260701201433, 'max_depth': 3, 'max_features': 20, 'learning_rate': 0.13168219152594374, 'min_samples_leaf': 50}. Best is trial 21 with value: 0.3890382077666189.[0m


MCC: [0.37730886336295505, 0.3823093654222866, 0.381465634429466, 0.3933796841718578]
ROC: [0.9008910234346685, 0.8973678961302725, 0.9005412903927756, 0.8942655435511309]
PR_AUC: [0.3245296734698713, 0.2942314830337145, 0.29419100704059975, 0.30060887826731203]


[32m[I 2021-12-10 14:36:48,289][0m Trial 23 finished with value: 0.3834886998035195 and parameters: {'n_estimators': 69, 'subsample': 0.977104756139798, 'max_depth': 20, 'max_features': 18, 'learning_rate': 0.11496449307660377, 'min_samples_leaf': 32}. Best is trial 21 with value: 0.3890382077666189.[0m


MCC: [0.3645929049890338, 0.3856813117574368, 0.3952217485144742, 0.38845883395313313]
ROC: [0.8941485130234392, 0.8863290175171363, 0.8992265984840243, 0.8920947138472485]
PR_AUC: [0.3176102940578714, 0.2698962460027142, 0.274441838160545, 0.3064146598172]


[32m[I 2021-12-10 14:36:49,455][0m Trial 24 finished with value: 0.3852080294502233 and parameters: {'n_estimators': 46, 'subsample': 0.9243927389568112, 'max_depth': 6, 'max_features': 19, 'learning_rate': 0.14670437395550778, 'min_samples_leaf': 43}. Best is trial 21 with value: 0.3890382077666189.[0m


MCC: [0.37863465687650566, 0.3857287862856787, 0.3865311671369369, 0.3899375075017719]
ROC: [0.9021401430105149, 0.890848837631016, 0.9021891886990897, 0.8908268763311153]
PR_AUC: [0.3031070265324341, 0.2820028581279868, 0.2819837892098025, 0.2793317795314956]
CPU times: user 1min 12s, sys: 279 ms, total: 1min 12s
Wall time: 1min 12s


In [14]:
best = study.best_trial
best

FrozenTrial(number=21, values=[0.3890382077666189], datetime_start=datetime.datetime(2021, 12, 10, 14, 36, 43, 781671), datetime_complete=datetime.datetime(2021, 12, 10, 14, 36, 44, 866571), params={'n_estimators': 61, 'subsample': 0.9247900894485763, 'max_depth': 3, 'max_features': 19, 'learning_rate': 0.13522323478552553, 'min_samples_leaf': 35}, distributions={'n_estimators': IntUniformDistribution(high=200, low=10, step=1), 'subsample': UniformDistribution(high=1.0, low=0.8), 'max_depth': IntUniformDistribution(high=20, low=3, step=1), 'max_features': IntUniformDistribution(high=20, low=10, step=1), 'learning_rate': UniformDistribution(high=0.2, low=0.08), 'min_samples_leaf': IntUniformDistribution(high=50, low=1, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=21, state=TrialState.COMPLETE, value=None)

In [15]:
%%time
gb_params = study.best_params
#gb_params['random_state'] = 42
gb = GradientBoostingClassifier(**gb_params)
gb.fit(X_rus, y_rus)
y_pred = gb.predict(X_test)
probs = gb.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[2451  701]
 [  18  183]]
              precision    recall  f1-score   support

           0       0.99      0.78      0.87      3152
           1       0.21      0.91      0.34       201

    accuracy                           0.79      3353
   macro avg       0.60      0.84      0.60      3353
weighted avg       0.95      0.79      0.84      3353

ROC AUC prob1: 0.8941586483824533
MCC: 0.3707012094152902
F2: 0.5420616113744074
PR_AUC: 0.3468718489818416
CPU times: user 108 ms, sys: 0 ns, total: 108 ms
Wall time: 106 ms


In [16]:
%%time
gb_params = study.best_params
gb_params['random_state'] = 42
gb_weight = GradientBoostingClassifier(**gb_params)
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[2454  698]
 [  19  182]]
              precision    recall  f1-score   support

           0       0.99      0.78      0.87      3152
           1       0.21      0.91      0.34       201

    accuracy                           0.79      3353
   macro avg       0.60      0.84      0.60      3353
weighted avg       0.95      0.79      0.84      3353

ROC AUC prob1: 0.8939061039977775
MCC: 0.3690713284913456
F2: 0.5403800475059383
PR_AUC: 0.33108666410017273
CPU times: user 318 ms, sys: 2.01 ms, total: 320 ms
Wall time: 318 ms


In [17]:
# No Sitenum - 5
coeffs_wgt = pd.Series(data=gb_weight.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs_wgt[0:20]

Delmode            0.217712
Delfetalpos        0.120239
Lac_Min            0.106153
Hxanemia           0.076912
Episiotomy         0.073830
Hxcsection         0.050294
Antefetdistress    0.040191
CS_NRFHT           0.035949
CS_FTP             0.033980
ROMmeth            0.026279
uscar              0.023722
CS_UScar           0.021648
new_age            0.018952
transfus_yes       0.015455
Admefface          0.013004
new_BMI            0.012977
CS_Breech          0.009977
TrialLabor         0.009367
prelaborCD         0.008360
Admcervpos         0.008231
dtype: float64

In [1]:
plt.rcParams["figure.figsize"] = (14, 8)
x_labels = coeffs_wgt.index.values[0:25]
print(x_labels)
fig, ax = plt.subplots(1,1)  # Create a figure and an axes.
ax.set_title('Top Predictors: All Sites combined; target: High_EBLoss, vars: PI')
ax.bar(x_labels, coeffs_wgt.values[0:25])
ax.set_ylabel('Coeff')
ax.set_xlabel('Variable')
plt.draw()
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=16)
#plt.xlim(1,20)
plt.show()

NameError: name 'plt' is not defined

In [19]:
# No Sitenum - 5
coeffs = pd.Series(data=gb.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs[0:20]

Delmode            0.425730
Hxanemia           0.110664
Delfetalpos        0.051138
Lac_Min            0.036525
Lac_None           0.036066
CS_NRFHT           0.033599
Admpresent         0.029036
CS_UScar           0.027319
Admefface          0.023339
prelaborCD         0.020943
CS_FTP             0.018013
TrialLabor         0.017209
new_BMI            0.014104
AdmDBP             0.013791
Dilat_lst          0.013083
Episiotomy         0.012295
BESTGA             0.011769
Antefetdistress    0.010323
transfus_yes       0.009052
Hxcsection         0.007714
dtype: float64

In [21]:
gb_params # 5 runs

{'n_estimators': 61,
 'subsample': 0.9247900894485763,
 'max_depth': 3,
 'max_features': 19,
 'learning_rate': 0.13522323478552553,
 'min_samples_leaf': 35,
 'random_state': 42}

In [2]:
from optuna.visualization import plot_optimization_history

plotly_config = {"staticPlot": True}

fig = plot_optimization_history(study)
fig.show(config=plotly_config)

NameError: name 'study' is not defined

In [None]:
from optuna.visualization import plot_param_importances

fig = plot_param_importances(study)
fig.show(config=plotly_config)

In [24]:
np.mean([0.3149689279176914, 0.31449457458602254, 0.31066798538596907, 0.3219519143488398])

0.3155208505596307

<h1 align='center'>Thanks and make sure to learn!</h1>