In [1]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
#import plotly.express as px
import seaborn as sns

from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import StratifiedKFold
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, \
                            accuracy_score, f1_score, average_precision_score, \
                            fbeta_score, precision_recall_curve, auc
from sklearn.metrics import matthews_corrcoef

from sklearn.model_selection import train_test_split

import optuna
from optuna.samplers import TPESampler
sns.set(style='white', context='notebook', palette='deep')
%config Completer.use_jedi = False

In [2]:
csl_df = pd.read_csv('../../data/csl/Sites/CSL_he_PI_s42.csv', index_col=0)
X = csl_df.drop(['high_EBL','Sitenum'], axis=1, inplace=False)
y = csl_df['high_EBL'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=7)
X_train                                                    

Unnamed: 0_level_0,transfus_yes,Accrete,AdmBishop,Admcervpos,Admconsistency,Admcontract,AdmDBP,Admefface,Admpresent,AdmSBP,...,TD_nos,ThreatenedPB,threatpb9,UnspecHBP,uscar,version9,new_age,new_BMI,new_high_Age,new_high_BMI
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42-04780,0,0,8,8,8,99.0,77,100.0,1,121.0,...,0,0,0,0,0,0,35,25,1,1
42-02059,0,0,8,8,8,99.0,75,70.0,1,126.0,...,0,0,0,0,0,0,20,25,1,1
42-03840,0,0,8,8,8,99.0,74,100.0,1,121.0,...,0,0,0,0,0,0,21,26,1,1
42-00258,0,0,8,8,8,99.0,74,50.0,1,124.4,...,0,0,0,0,0,0,23,28,1,1
42-04735,0,0,8,8,8,99.0,79,100.0,1,151.0,...,1,0,0,0,0,0,20,38,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42-04313,0,0,8,8,8,99.0,81,90.0,1,115.0,...,0,0,0,0,0,0,17,24,0,1
42-05627,0,0,8,8,8,99.0,89,100.0,1,137.0,...,0,0,0,0,0,0,30,19,1,0
42-02719,0,0,8,8,8,99.0,76,100.0,1,127.0,...,0,0,0,0,0,0,24,24,1,1
42-03467,0,0,8,8,8,99.0,74,50.0,77,124.4,...,0,0,0,0,0,0,33,33,1,1


In [3]:
X.shape

(5784, 195)

<h1 align='center'>Exploratory Data Analysis</h1>

<h1 align='center'>Modelling</h1>

In [8]:
print(X_train.shape)
print(X.shape)
print(y.shape)

(4048, 195)
(5784, 195)
(5784,)


Here we define our validation set

<h1 align='center'>Modelling Using Undersampling</h1>

We will use `imblearn`'s RandomUnderSampler to undersample from the majority class so that they match

In [9]:
sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
X_rus, y_rus = sampler.fit_resample(X_train, y_train)

# Try with no undersampling
#X_rus, y_rus = X_train, y_train

In [10]:
print(X_rus.shape)
print(y_rus.shape)
print(np.bincount(y_rus))

(398, 195)
(398,)
[199 199]


<h1 align='center'>Basic Gradient Boosting</h1>

In [11]:
%%time
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_rus = GradientBoostingClassifier()   # If not undersampling
gb_rus.fit(X_rus, y_rus)
print(gb_rus.get_params())
y_pred = gb_rus.predict(X_test)
probs = gb_rus.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[1294  356]
 [  12   74]]
              precision    recall  f1-score   support

           0       0.99      0.78      0.88      1650
           1       0.17      0.86      0.29        86

    accuracy                           0.79      1736
   macro avg       0.58      0.82      0.58      1736
weighted avg       0.95      0.79      0.85      1736

ROC AUC prob1: 0.8597392529950669
MCC: 0.3240769412945344
F2: 0.47803617571059437
PR_AUC: 0.23617949195964885
CPU times: user 216 ms, sys: 1.16 ms, tota

In [12]:
%%time
# Not undersampled
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_weight = GradientBoostingClassifier()  
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
print(gb_weight.get_params())
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[1371  279]
 [  16   70]]
              precision    recall  f1-score   support

           0       0.99      0.83      0.90      1650
           1       0.20      0.81      0.32        86

    accuracy                           0.83      1736
   macro avg       0.59      0.82      0.61      1736
weighted avg       0.95      0.83      0.87      1736

ROC AUC prob1: 0.8586962649753348
MCC: 0.3491462884650192
F2: 0.5050505050505051
PR_AUC: 0.20716533440804316
CPU times: user 1.23 s, sys: 4.98 ms, total

<h1 align='center'>Hyperparameter Tuning with Cross Validation</h1>

In [13]:
%%time
def create_model(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    subsample = trial.suggest_float('subsample', 0.8, 1.0)
    max_depth = trial.suggest_int('max_depth',3,20)
    max_features = trial.suggest_int('max_features',10,20)
    learning_rate = trial.suggest_uniform('learning_rate',0.08,0.2)
    min_samples_leaf = trial.suggest_int('min_samples_leaf',1,50)
    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
                           max_depth=max_depth, learning_rate=learning_rate, 
                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
#                           learning_rate=learning_rate, 
#                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate) 
    return model

def objective(trial):
    model = create_model(trial)
    
    mcc_list = []
    roc_list = []
    pr_list = []
    f2_list = []
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=11)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train_c, X_test_c = X.iloc[train_idx], X.iloc[test_idx]
        y_train_c, y_test_c = y[train_idx], y[test_idx]
        sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
        X_res, y_res = sampler.fit_resample(X_train_c, y_train_c)

#        weights = class_weight.compute_sample_weight('balanced', y=y_train)
#        model.fit(X_train, y_train, sample_weight=weights)
        model.fit(X_res, y_res)
        y_pred = model.predict(X_test_c)
        probs = model.predict_proba(X_test_c)
        prob1 = probs[:, 1]  # Only positives
        precision, recall, pr_thresh = precision_recall_curve(y_test_c, prob1)
        roc = roc_auc_score(y_test_c, prob1)
        roc_list.append(roc)
        mcc = matthews_corrcoef(y_test_c, y_pred)  # MWB
        mcc_list.append(mcc)
        f2 = fbeta_score(y_test_c, y_pred, beta=2.0, average=None)[1]
        pr_auc = auc(recall, precision)
        pr_list.append(pr_auc)
        avg_precision = average_precision_score(y_test_c, prob1)
#        print(f'ROC: {roc}')
#        print(f'PR_AUC: {pr_auc}')
#        print(f'avg_prec: {avg_precision}')
        
    print(f'MCC: {mcc_list}')
    print(f'ROC: {roc_list}')
    print(f'PR_AUC: {pr_list}')
    return np.mean(mcc_list)

sampler = TPESampler(seed=7)
study = optuna.create_study(sampler=sampler,direction='maximize')
#study.optimize(objective,n_trials=60)
study.optimize(objective,n_trials=25)
#study.optimize(objective,n_trials=5)

[32m[I 2021-12-10 17:12:01,051][0m A new study created in memory with name: no-name-8048c74c-fd79-4fe4-9aea-b76ae2dc9827[0m
[32m[I 2021-12-10 17:12:01,512][0m Trial 0 finished with value: 0.3449656297807747 and parameters: {'n_estimators': 24, 'subsample': 0.955983758448023, 'max_depth': 10, 'max_features': 17, 'learning_rate': 0.1973587414395923, 'min_samples_leaf': 27}. Best is trial 0 with value: 0.3449656297807747.[0m


MCC: [0.3109672292879325, 0.35225932063795257, 0.3461078586784821, 0.3705281105187316]
ROC: [0.8495877080665812, 0.8798668373879641, 0.8690601792573623, 0.8889040514313439]
PR_AUC: [0.2542466127523184, 0.2650230299097522, 0.23002338928981045, 0.2507584793954582]


[32m[I 2021-12-10 17:12:02,455][0m Trial 1 finished with value: 0.3376963176858768 and parameters: {'n_estimators': 105, 'subsample': 0.8144102266719524, 'max_depth': 7, 'max_features': 15, 'learning_rate': 0.16150759953451288, 'min_samples_leaf': 41}. Best is trial 0 with value: 0.3449656297807747.[0m


MCC: [0.3190562309948463, 0.3485013459659645, 0.3281256479779031, 0.3551020458047932]
ROC: [0.8469551856594111, 0.8736389244558258, 0.8736696542893726, 0.8758187772925764]
PR_AUC: [0.23563687114886772, 0.2597903206089115, 0.23043793508677818, 0.260297260538613]


[32m[I 2021-12-10 17:12:03,466][0m Trial 2 finished with value: 0.3392141740477944 and parameters: {'n_estimators': 82, 'subsample': 0.813187269381181, 'max_depth': 8, 'max_features': 20, 'learning_rate': 0.10560624242958987, 'min_samples_leaf': 23}. Best is trial 0 with value: 0.3449656297807747.[0m


MCC: [0.31353745700337815, 0.3576298183354836, 0.3359519520929719, 0.3497374687593441]
ROC: [0.8470576184379002, 0.8764455825864276, 0.8749398207426375, 0.874282306323791]
PR_AUC: [0.21034946274648925, 0.2332334060816672, 0.23082003348822727, 0.25833278543036703]


[32m[I 2021-12-10 17:12:05,230][0m Trial 3 finished with value: 0.32126601147400874 and parameters: {'n_estimators': 187, 'subsample': 0.8049798455100696, 'max_depth': 13, 'max_features': 20, 'learning_rate': 0.10763634548251577, 'min_samples_leaf': 28}. Best is trial 0 with value: 0.3449656297807747.[0m


MCC: [0.28467814473106384, 0.3401487451967217, 0.31651201825678665, 0.3437251377114627]
ROC: [0.8397848911651729, 0.871887323943662, 0.8733930857874521, 0.8744541484716158]
PR_AUC: [0.22015330623361867, 0.2526346990301474, 0.2249627783183427, 0.25266913233461225]


[32m[I 2021-12-10 17:12:07,142][0m Trial 4 finished with value: 0.3357622485582103 and parameters: {'n_estimators': 183, 'subsample': 0.8266338891518501, 'max_depth': 12, 'max_features': 18, 'learning_rate': 0.16028158890606967, 'min_samples_leaf': 24}. Best is trial 0 with value: 0.3449656297807747.[0m


MCC: [0.301471505901395, 0.35838959900254114, 0.34529082994515337, 0.33789705938375175]
ROC: [0.8393854033290653, 0.8747656850192062, 0.8805941101152369, 0.8697941937570759]
PR_AUC: [0.21725108250967237, 0.25567092194219204, 0.22579521897255914, 0.25046826500064223]


[32m[I 2021-12-10 17:12:07,769][0m Trial 5 finished with value: 0.35074558493173535 and parameters: {'n_estimators': 49, 'subsample': 0.8981531778182141, 'max_depth': 9, 'max_features': 15, 'learning_rate': 0.12390684629367116, 'min_samples_leaf': 42}. Best is trial 5 with value: 0.35074558493173535.[0m


MCC: [0.32752238519236, 0.362222399147225, 0.3429766964662433, 0.37026085892111327]
ROC: [0.8538898847631241, 0.8887477592829707, 0.8788937259923176, 0.8952672650816755]
PR_AUC: [0.2646909913026046, 0.26020968216136087, 0.22812132287173068, 0.2770844296615063]


[32m[I 2021-12-10 17:12:09,734][0m Trial 6 finished with value: 0.33441919141429843 and parameters: {'n_estimators': 156, 'subsample': 0.8627989354425325, 'max_depth': 13, 'max_features': 13, 'learning_rate': 0.13434115190556806, 'min_samples_leaf': 18}. Best is trial 5 with value: 0.35074558493173535.[0m


MCC: [0.29168140268406806, 0.35387561506383003, 0.35225932063795257, 0.33986042727134297]
ROC: [0.8409423815620998, 0.8763738796414853, 0.8817516005121638, 0.875040433446547]
PR_AUC: [0.19811666565110925, 0.2679191696744137, 0.2518164029123269, 0.23631042813207626]


[32m[I 2021-12-10 17:12:10,895][0m Trial 7 finished with value: 0.34013024850784146 and parameters: {'n_estimators': 135, 'subsample': 0.8740702165976071, 'max_depth': 11, 'max_features': 17, 'learning_rate': 0.12955901949366017, 'min_samples_leaf': 46}. Best is trial 5 with value: 0.35074558493173535.[0m


MCC: [0.32162346957835014, 0.3431047926271489, 0.3406906860210734, 0.3551020458047932]
ROC: [0.8419769526248398, 0.8791293213828425, 0.8709756722151087, 0.8826115963124697]
PR_AUC: [0.23629969387763194, 0.2668424047097139, 0.21713332694009935, 0.2709982461870448]


[32m[I 2021-12-10 17:12:11,549][0m Trial 8 finished with value: 0.34298014458964887 and parameters: {'n_estimators': 44, 'subsample': 0.9482237745826528, 'max_depth': 10, 'max_features': 14, 'learning_rate': 0.15612558423606065, 'min_samples_leaf': 27}. Best is trial 5 with value: 0.35074558493173535.[0m


MCC: [0.3046467356326966, 0.36615816337706814, 0.34529082994515337, 0.3558248494036772]
ROC: [0.8411165172855313, 0.8748886043533931, 0.881628681177977, 0.8847646773410967]
PR_AUC: [0.2734004289718773, 0.2595930939808364, 0.2605497575866468, 0.2479631476581579]


[32m[I 2021-12-10 17:12:12,397][0m Trial 9 finished with value: 0.3415413565105966 and parameters: {'n_estimators': 89, 'subsample': 0.8002853761125517, 'max_depth': 4, 'max_features': 17, 'learning_rate': 0.14292147161182367, 'min_samples_leaf': 35}. Best is trial 5 with value: 0.35074558493173535.[0m


MCC: [0.3142765872181906, 0.35611926476185246, 0.3233864831269125, 0.3723830909354308]
ROC: [0.8321331626120358, 0.8784327784891165, 0.8776747759282971, 0.8881509784894064]
PR_AUC: [0.22710149986173983, 0.2754525371837554, 0.2281670511488119, 0.25525421079289223]


[32m[I 2021-12-10 17:12:14,036][0m Trial 10 finished with value: 0.34790034713208684 and parameters: {'n_estimators': 55, 'subsample': 0.924763336987846, 'max_depth': 18, 'max_features': 10, 'learning_rate': 0.09172353354166263, 'min_samples_leaf': 4}. Best is trial 5 with value: 0.35074558493173535.[0m


MCC: [0.3235002061013286, 0.3591524129233262, 0.3386879105825793, 0.37026085892111327]
ROC: [0.844629961587708, 0.8867708066581306, 0.872573623559539, 0.8830664725861233]
PR_AUC: [0.19181249879496767, 0.2571550350274446, 0.20901645122075513, 0.23174161202447563]


[32m[I 2021-12-10 17:12:15,570][0m Trial 11 finished with value: 0.3443553810740385 and parameters: {'n_estimators': 52, 'subsample': 0.9151706593411547, 'max_depth': 19, 'max_features': 10, 'learning_rate': 0.080537399193323, 'min_samples_leaf': 4}. Best is trial 5 with value: 0.35074558493173535.[0m


MCC: [0.32067496925243283, 0.35225932063795257, 0.34336216373049816, 0.36112507067527033]
ROC: [0.8452138284250961, 0.8850499359795135, 0.8633341869398208, 0.8808931748342228]
PR_AUC: [0.21865983023517943, 0.2579237462566188, 0.19146050212859747, 0.23285960847085857]


[32m[I 2021-12-10 17:12:16,093][0m Trial 12 finished with value: 0.30684510629854234 and parameters: {'n_estimators': 10, 'subsample': 0.9983488269913793, 'max_depth': 18, 'max_features': 10, 'learning_rate': 0.08009443937396477, 'min_samples_leaf': 3}. Best is trial 5 with value: 0.35074558493173535.[0m


MCC: [0.30967853104693616, 0.30630554404854893, 0.26919248395090034, 0.3422038661477841]
ROC: [0.8155441741357234, 0.8652291933418694, 0.840460947503201, 0.8615356622998545]
PR_AUC: [0.1919330950906456, 0.1904023371189437, 0.197776452689267, 0.19305695248268065]


[32m[I 2021-12-10 17:12:17,154][0m Trial 13 finished with value: 0.35336337529732464 and parameters: {'n_estimators': 65, 'subsample': 0.9111013369255444, 'max_depth': 16, 'max_features': 12, 'learning_rate': 0.10608646327289792, 'min_samples_leaf': 15}. Best is trial 13 with value: 0.35336337529732464.[0m


MCC: [0.2993487172740654, 0.36680307200447015, 0.3707054682399415, 0.3765962436708214]
ROC: [0.8466376440460948, 0.8871088348271448, 0.8940025608194623, 0.8869278667313603]
PR_AUC: [0.21941374799800165, 0.26847511442187705, 0.2746112804967424, 0.27514060371383303]


[32m[I 2021-12-10 17:12:18,389][0m Trial 14 finished with value: 0.33725207681505626 and parameters: {'n_estimators': 75, 'subsample': 0.8753751428035348, 'max_depth': 14, 'max_features': 12, 'learning_rate': 0.11584383198552786, 'min_samples_leaf': 13}. Best is trial 13 with value: 0.35336337529732464.[0m


MCC: [0.29168140268406806, 0.3546205705936933, 0.3431047926271489, 0.3596015413553149]
ROC: [0.8424788732394366, 0.8828169014084507, 0.8836261203585148, 0.8783660844250365]
PR_AUC: [0.21143451116619882, 0.2464979608138986, 0.25109824360333305, 0.23923462159736322]


[32m[I 2021-12-10 17:12:19,645][0m Trial 15 finished with value: 0.3489181098291606 and parameters: {'n_estimators': 131, 'subsample': 0.8988893224452837, 'max_depth': 16, 'max_features': 12, 'learning_rate': 0.11977795375288411, 'min_samples_leaf': 50}. Best is trial 13 with value: 0.35336337529732464.[0m


MCC: [0.3383061072497245, 0.34775859582571667, 0.35000619488588625, 0.3596015413553149]
ROC: [0.8463610755441742, 0.8791702944942381, 0.8779411011523687, 0.8866751576904416]
PR_AUC: [0.2559252614079299, 0.26681453671062944, 0.22774168024118524, 0.27138910415994405]


[32m[I 2021-12-10 17:12:20,392][0m Trial 16 finished with value: 0.3487842995579099 and parameters: {'n_estimators': 35, 'subsample': 0.9480224980809973, 'max_depth': 16, 'max_features': 15, 'learning_rate': 0.09791492352756925, 'min_samples_leaf': 12}. Best is trial 13 with value: 0.35336337529732464.[0m


MCC: [0.3337397971845689, 0.3546205705936933, 0.34924701447502227, 0.3575298159783551]
ROC: [0.8405633802816901, 0.8748373879641484, 0.8827656850192062, 0.8910419699175158]
PR_AUC: [0.21177087606711278, 0.23420330253979096, 0.23509654798737137, 0.27176430951831076]


[32m[I 2021-12-10 17:12:21,061][0m Trial 17 finished with value: 0.3526584226048634 and parameters: {'n_estimators': 62, 'subsample': 0.8452509098400132, 'max_depth': 3, 'max_features': 12, 'learning_rate': 0.14302287802722008, 'min_samples_leaf': 36}. Best is trial 13 with value: 0.35336337529732464.[0m


MCC: [0.33753726926341265, 0.357648514301569, 0.3399348183116986, 0.3755130885427733]
ROC: [0.8521690140845071, 0.8855723431498079, 0.8723175416133162, 0.8838245997088792]
PR_AUC: [0.2671945443087588, 0.3061928016996379, 0.22733311563492895, 0.25537249342484647]


[32m[I 2021-12-10 17:12:21,798][0m Trial 18 finished with value: 0.3332648592749712 and parameters: {'n_estimators': 69, 'subsample': 0.841212937001562, 'max_depth': 3, 'max_features': 12, 'learning_rate': 0.18656989022946982, 'min_samples_leaf': 33}. Best is trial 13 with value: 0.35336337529732464.[0m


MCC: [0.30361931767215716, 0.3453526048140493, 0.32902687151745, 0.35506064309622815]
ROC: [0.8525480153649169, 0.8851011523687581, 0.8720819462227913, 0.8814592430858806]
PR_AUC: [0.25862731982511994, 0.30378822704434405, 0.23188440196006083, 0.24256977690517248]


[32m[I 2021-12-10 17:12:22,968][0m Trial 19 finished with value: 0.3324832442889991 and parameters: {'n_estimators': 109, 'subsample': 0.8368982476435596, 'max_depth': 6, 'max_features': 11, 'learning_rate': 0.15021425991536821, 'min_samples_leaf': 35}. Best is trial 13 with value: 0.35336337529732464.[0m


MCC: [0.30873158206748785, 0.3461078586784821, 0.3328422585475191, 0.3422512778625075]
ROC: [0.8313341869398209, 0.8789142125480154, 0.8822125480153649, 0.8809234999191331]
PR_AUC: [0.244192578590286, 0.27937419881540104, 0.22422953645346588, 0.2501685562813181]


[32m[I 2021-12-10 17:12:24,742][0m Trial 20 finished with value: 0.33408681760827474 and parameters: {'n_estimators': 101, 'subsample': 0.8554427249183266, 'max_depth': 15, 'max_features': 13, 'learning_rate': 0.17373590758783994, 'min_samples_leaf': 12}. Best is trial 13 with value: 0.35336337529732464.[0m


MCC: [0.30361931767215716, 0.3419190614117703, 0.350747195814845, 0.3400616955343266]
ROC: [0.8486555697823304, 0.8778898847631242, 0.878719590268886, 0.8779617499595667]
PR_AUC: [0.2213338753741271, 0.2583476194544274, 0.24797671961080459, 0.2287482897521413]


[32m[I 2021-12-10 17:12:25,485][0m Trial 21 finished with value: 0.3457559295273579 and parameters: {'n_estimators': 62, 'subsample': 0.8953536926758056, 'max_depth': 5, 'max_features': 15, 'learning_rate': 0.12390361441843967, 'min_samples_leaf': 41}. Best is trial 13 with value: 0.35336337529732464.[0m


MCC: [0.32353526845259134, 0.36068723399404395, 0.3282731051440648, 0.3705281105187316]
ROC: [0.853572343149808, 0.886606914212548, 0.8841382842509603, 0.8873928513666506]
PR_AUC: [0.3009620514208685, 0.2738843825328518, 0.2427127845942643, 0.2857753327923758]


[32m[I 2021-12-10 17:12:26,024][0m Trial 22 finished with value: 0.3514339082736922 and parameters: {'n_estimators': 33, 'subsample': 0.9210028609307378, 'max_depth': 8, 'max_features': 14, 'learning_rate': 0.13912169109050807, 'min_samples_leaf': 40}. Best is trial 13 with value: 0.35336337529732464.[0m


MCC: [0.32308071394446697, 0.3553684464547116, 0.3591524129233262, 0.36813405977226393]
ROC: [0.8547195902688861, 0.8924967989756722, 0.8691318822023048, 0.8911632702571567]
PR_AUC: [0.30458486768316834, 0.28794964853200034, 0.20271454104738915, 0.2678708626010756]


[32m[I 2021-12-10 17:12:26,471][0m Trial 23 finished with value: 0.36416430137851385 and parameters: {'n_estimators': 25, 'subsample': 0.928784331939467, 'max_depth': 3, 'max_features': 13, 'learning_rate': 0.13938806574215398, 'min_samples_leaf': 34}. Best is trial 23 with value: 0.36416430137851385.[0m


MCC: [0.34481634207253226, 0.3607194793435652, 0.35997241328162954, 0.39114897081632843]
ROC: [0.8502637644046095, 0.8952061459667093, 0.8726760563380281, 0.8932253760310528]
PR_AUC: [0.25538781759738954, 0.28023084505471535, 0.2333472032776976, 0.2655697190533627]


[32m[I 2021-12-10 17:12:26,827][0m Trial 24 finished with value: 0.36054423817143805 and parameters: {'n_estimators': 15, 'subsample': 0.9676952127097791, 'max_depth': 4, 'max_features': 11, 'learning_rate': 0.16777006678480033, 'min_samples_leaf': 31}. Best is trial 23 with value: 0.36416430137851385.[0m


MCC: [0.3401487451967217, 0.3568730478616745, 0.35997241328162954, 0.38518274634572647]
ROC: [0.8488450704225352, 0.8887426376440462, 0.8765275288092189, 0.887236171761281]
PR_AUC: [0.24871423451365715, 0.23918796732512776, 0.2488327671612125, 0.25770578226214796]
CPU times: user 25.7 s, sys: 287 ms, total: 25.9 s
Wall time: 25.8 s


In [14]:
best = study.best_trial
best

FrozenTrial(number=23, values=[0.36416430137851385], datetime_start=datetime.datetime(2021, 12, 10, 17, 12, 26, 26077), datetime_complete=datetime.datetime(2021, 12, 10, 17, 12, 26, 470392), params={'n_estimators': 25, 'subsample': 0.928784331939467, 'max_depth': 3, 'max_features': 13, 'learning_rate': 0.13938806574215398, 'min_samples_leaf': 34}, distributions={'n_estimators': IntUniformDistribution(high=200, low=10, step=1), 'subsample': UniformDistribution(high=1.0, low=0.8), 'max_depth': IntUniformDistribution(high=20, low=3, step=1), 'max_features': IntUniformDistribution(high=20, low=10, step=1), 'learning_rate': UniformDistribution(high=0.2, low=0.08), 'min_samples_leaf': IntUniformDistribution(high=50, low=1, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=23, state=TrialState.COMPLETE, value=None)

In [9]:
%%time
gb_params = study.best_params
#gb_params['random_state'] = 42
gb = GradientBoostingClassifier(**gb_params)
gb.fit(X_rus, y_rus)
y_pred = gb.predict(X_test)
probs = gb.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

NameError: name 'study' is not defined

In [16]:
%%time
gb_params = study.best_params
gb_params['random_state'] = 42
gb_weight = GradientBoostingClassifier(**gb_params)
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[1310  340]
 [   8   78]]
              precision    recall  f1-score   support

           0       0.99      0.79      0.88      1650
           1       0.19      0.91      0.31        86

    accuracy                           0.80      1736
   macro avg       0.59      0.85      0.60      1736
weighted avg       0.95      0.80      0.85      1736

ROC AUC prob1: 0.8849999999999999
MCC: 0.3557225616740703
F2: 0.5118110236220472
PR_AUC: 0.2684872379620049
CPU times: user 96.9 ms, sys: 4.02 ms, total: 101 ms
Wall time: 98.9 ms


In [17]:
# No Sitenum - 5
coeffs_wgt = pd.Series(data=gb_weight.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs_wgt[0:20]

Delmode           0.270206
Lac_None          0.218308
Lac_Min           0.169190
CS_FTP            0.094985
ROMmeth           0.053427
CS_UScar          0.027081
vertex            0.018877
Antesteroid       0.018405
new_BMI           0.015608
CS_Elect          0.015287
prelaborCD        0.008262
intra_abruptio    0.007106
Admefface         0.006385
Admpresent        0.006126
BESTGA            0.005656
high_height       0.005648
IntraMgSO         0.005608
momrace_new       0.005472
high_Gravidity    0.004037
AdmDBP            0.003416
dtype: float64

In [12]:
plt.rcParams["figure.figsize"] = (14, 8)
x_labels = coeffs_wgt.index.values[0:25]
print(x_labels)
fig, ax = plt.subplots(1,1)  # Create a figure and an axes.
ax.set_title('Top Predictors: All Sites combined; target: High_EBLoss, vars: PI')
ax.bar(x_labels, coeffs_wgt.values[0:25])
ax.set_ylabel('Coeff')
ax.set_xlabel('Variable')
plt.draw()
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=16)
#plt.xlim(1,20)
plt.show()

NameError: name 'coeffs_wgt' is not defined

In [19]:
# No Sitenum - 5
coeffs = pd.Series(data=gb.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs[0:20]

Lac_None       0.254690
HxnumCS        0.136208
Lac_Min        0.111217
Delmode        0.090414
CS_FTP         0.083102
new_BMI        0.057152
Presentdel     0.039371
CS_Elect       0.034893
Dilat_lst      0.032201
Admpresent     0.027259
IUPC           0.024720
prelaborCD     0.017480
BESTGA         0.009879
IntraMgSO      0.009760
momrace_new    0.009092
Admreason      0.008747
Admefface      0.008615
Insurance      0.007905
Hxcsection     0.006252
Antesteroid    0.005026
dtype: float64

In [21]:
gb_params # 5 runs

{'n_estimators': 25,
 'subsample': 0.928784331939467,
 'max_depth': 3,
 'max_features': 13,
 'learning_rate': 0.13938806574215398,
 'min_samples_leaf': 34,
 'random_state': 42}

In [10]:
from optuna.visualization import plot_optimization_history

plotly_config = {"staticPlot": True}

fig = plot_optimization_history(study)
fig.show(config=plotly_config)

NameError: name 'study' is not defined

In [11]:
from optuna.visualization import plot_param_importances

fig = plot_param_importances(study)
fig.show(config=plotly_config)

NameError: name 'study' is not defined

In [24]:
np.mean([0.3149689279176914, 0.31449457458602254, 0.31066798538596907, 0.3219519143488398])

0.3155208505596307

<h1 align='center'>Thanks and make sure to learn!</h1>