In [1]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
#import plotly.express as px
import seaborn as sns

from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import StratifiedKFold
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, \
                            accuracy_score, f1_score, average_precision_score, \
                            fbeta_score, precision_recall_curve, auc
from sklearn.metrics import matthews_corrcoef

from sklearn.model_selection import train_test_split

import optuna
from optuna.samplers import TPESampler
sns.set(style='white', context='notebook', palette='deep')
%config Completer.use_jedi = False

In [2]:
csl_df = pd.read_csv('../../data/csl/Sites/CSL_he_PI_s45.csv', index_col=0)
X = csl_df.drop(['high_EBL','Sitenum'], axis=1, inplace=False)
y = csl_df['high_EBL'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=7)
X_train                                                    

Unnamed: 0_level_0,transfus_yes,Accrete,AdmBishop,Admcervpos,Admconsistency,Admcontract,AdmDBP,Admefface,Admpresent,AdmSBP,...,TD_nos,ThreatenedPB,threatpb9,UnspecHBP,uscar,version9,new_age,new_BMI,new_high_Age,new_high_BMI
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45-01244,0,0,9,3,2,99.0,87,100.0,1,131.0,...,0,0,0,0,0,0,28,24,1,1
45-11378,0,0,8,3,3,10.0,79,60.0,1,125.0,...,0,0,0,0,1,0,38,24,1,1
45-14054,0,0,1,3,1,10.0,68,30.0,1,129.0,...,0,0,0,0,0,0,25,23,1,1
45-01134,0,0,8,8,8,0.0,84,0.0,77,146.0,...,0,0,0,0,1,0,26,24,1,1
45-12058,0,0,8,8,8,99.0,67,0.0,77,117.0,...,0,0,0,0,1,0,34,24,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45-14585,0,0,9,1,3,2.0,62,100.0,1,126.0,...,0,0,0,0,0,0,23,24,1,1
45-14580,0,0,10,2,3,2.0,75,100.0,1,142.0,...,0,0,0,0,0,0,30,37,1,1
45-12005,0,0,5,3,2,10.0,81,70.0,1,120.0,...,0,0,0,0,0,0,29,24,1,1
45-06743,0,0,8,8,8,99.0,67,0.0,77,103.0,...,0,0,0,0,1,0,31,24,1,1


In [3]:
X.shape

(11879, 195)

<h1 align='center'>Exploratory Data Analysis</h1>

<h1 align='center'>Modelling</h1>

In [8]:
print(X_train.shape)
print(X.shape)
print(y.shape)

(8315, 195)
(11879, 195)
(11879,)


Here we define our validation set

<h1 align='center'>Modelling Using Undersampling</h1>

We will use `imblearn`'s RandomUnderSampler to undersample from the majority class so that they match

In [9]:
sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
X_rus, y_rus = sampler.fit_resample(X_train, y_train)

# Try with no undersampling
#X_rus, y_rus = X_train, y_train

In [10]:
print(X_rus.shape)
print(y_rus.shape)
print(np.bincount(y_rus))

(910, 195)
(910,)
[455 455]


<h1 align='center'>Basic Gradient Boosting</h1>

In [11]:
%%time
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_rus = GradientBoostingClassifier()   # If not undersampling
gb_rus.fit(X_rus, y_rus)
print(gb_rus.get_params())
y_pred = gb_rus.predict(X_test)
probs = gb_rus.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[2516  853]
 [  15  180]]
              precision    recall  f1-score   support

           0       0.99      0.75      0.85      3369
           1       0.17      0.92      0.29       195

    accuracy                           0.76      3564
   macro avg       0.58      0.83      0.57      3564
weighted avg       0.95      0.76      0.82      3564

ROC AUC prob1: 0.8699423857037393
MCC: 0.33579368284763844
F2: 0.4964147821290679
PR_AUC: 0.2388490561225601
CPU times: user 356 ms, sys: 2.89 ms, total

In [12]:
%%time
# Not undersampled
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_weight = GradientBoostingClassifier()  
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
print(gb_weight.get_params())
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[2585  784]
 [  23  172]]
              precision    recall  f1-score   support

           0       0.99      0.77      0.86      3369
           1       0.18      0.88      0.30       195

    accuracy                           0.77      3564
   macro avg       0.59      0.82      0.58      3564
weighted avg       0.95      0.77      0.83      3564

ROC AUC prob1: 0.8722865340852876
MCC: 0.3333174753313989
F2: 0.4953917050691244
PR_AUC: 0.25901781882089486
CPU times: user 2.43 s, sys: 4.73 ms, total

<h1 align='center'>Hyperparameter Tuning with Cross Validation</h1>

In [13]:
%%time
def create_model(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    subsample = trial.suggest_float('subsample', 0.8, 1.0)
    max_depth = trial.suggest_int('max_depth',3,20)
    max_features = trial.suggest_int('max_features',10,20)
    learning_rate = trial.suggest_uniform('learning_rate',0.08,0.2)
    min_samples_leaf = trial.suggest_int('min_samples_leaf',1,50)
    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
                           max_depth=max_depth, learning_rate=learning_rate, 
                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
#                           learning_rate=learning_rate, 
#                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate) 
    return model

def objective(trial):
    model = create_model(trial)
    
    mcc_list = []
    roc_list = []
    pr_list = []
    f2_list = []
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=11)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train_c, X_test_c = X.iloc[train_idx], X.iloc[test_idx]
        y_train_c, y_test_c = y[train_idx], y[test_idx]
        sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
        X_res, y_res = sampler.fit_resample(X_train_c, y_train_c)

#        weights = class_weight.compute_sample_weight('balanced', y=y_train)
#        model.fit(X_train, y_train, sample_weight=weights)
        model.fit(X_res, y_res)
        y_pred = model.predict(X_test_c)
        probs = model.predict_proba(X_test_c)
        prob1 = probs[:, 1]  # Only positives
        precision, recall, pr_thresh = precision_recall_curve(y_test_c, prob1)
        roc = roc_auc_score(y_test_c, prob1)
        roc_list.append(roc)
        mcc = matthews_corrcoef(y_test_c, y_pred)  # MWB
        mcc_list.append(mcc)
        f2 = fbeta_score(y_test_c, y_pred, beta=2.0, average=None)[1]
        pr_auc = auc(recall, precision)
        pr_list.append(pr_auc)
        avg_precision = average_precision_score(y_test_c, prob1)
#        print(f'ROC: {roc}')
#        print(f'PR_AUC: {pr_auc}')
#        print(f'avg_prec: {avg_precision}')
        
    print(f'MCC: {mcc_list}')
    print(f'ROC: {roc_list}')
    print(f'PR_AUC: {pr_list}')
    return np.mean(mcc_list)

sampler = TPESampler(seed=7)
study = optuna.create_study(sampler=sampler,direction='maximize')
#study.optimize(objective,n_trials=60)
study.optimize(objective,n_trials=25)
#study.optimize(objective,n_trials=5)

[32m[I 2021-12-10 14:36:34,126][0m A new study created in memory with name: no-name-ecf0f3f9-6128-463b-ab5b-0f45b2b8b571[0m
[32m[I 2021-12-10 14:36:35,117][0m Trial 0 finished with value: 0.33710350552060636 and parameters: {'n_estimators': 24, 'subsample': 0.955983758448023, 'max_depth': 10, 'max_features': 17, 'learning_rate': 0.1973587414395923, 'min_samples_leaf': 27}. Best is trial 0 with value: 0.33710350552060636.[0m


MCC: [0.33177477443291925, 0.3377050812811679, 0.3572127099169114, 0.32172145645142697]
ROC: [0.8650021103724806, 0.8657497360892248, 0.883953569188335, 0.8618159187568997]
PR_AUC: [0.22835811001551834, 0.23224578396845674, 0.27644156659960956, 0.273563877740759]


[32m[I 2021-12-10 14:36:37,445][0m Trial 1 finished with value: 0.3350170315799119 and parameters: {'n_estimators': 105, 'subsample': 0.8144102266719524, 'max_depth': 7, 'max_features': 15, 'learning_rate': 0.16150759953451288, 'min_samples_leaf': 41}. Best is trial 0 with value: 0.33710350552060636.[0m


MCC: [0.34486724625444226, 0.3345071846055076, 0.3462679340494135, 0.3144257614102843]
ROC: [0.8564023425134537, 0.8550359421341477, 0.8781420681425272, 0.8437130278360536]
PR_AUC: [0.20319419618828943, 0.1899569645436858, 0.28221838077149436, 0.1969877995037051]


[32m[I 2021-12-10 14:36:39,617][0m Trial 2 finished with value: 0.3291949994384282 and parameters: {'n_estimators': 82, 'subsample': 0.813187269381181, 'max_depth': 8, 'max_features': 20, 'learning_rate': 0.10560624242958987, 'min_samples_leaf': 23}. Best is trial 0 with value: 0.33710350552060636.[0m


MCC: [0.3298014257956001, 0.3323073020850431, 0.32909163415169224, 0.32557963572137727]
ROC: [0.8661012627062009, 0.8603076008488857, 0.880705772815988, 0.8619676558163674]
PR_AUC: [0.2196980460636211, 0.21400612592125898, 0.3057579250282096, 0.2424630341192562]


[32m[I 2021-12-10 14:36:44,353][0m Trial 3 finished with value: 0.3283109407728715 and parameters: {'n_estimators': 187, 'subsample': 0.8049798455100696, 'max_depth': 13, 'max_features': 20, 'learning_rate': 0.10763634548251577, 'min_samples_leaf': 28}. Best is trial 0 with value: 0.33710350552060636.[0m


MCC: [0.33716507573733706, 0.3324333970471642, 0.34256662163462953, 0.3010786686723551]
ROC: [0.8562528577960677, 0.8634548597830578, 0.8851665752358805, 0.8543588119647969]
PR_AUC: [0.20485462874786228, 0.22370588295416705, 0.2713417699970563, 0.22145902341314502]


[32m[I 2021-12-10 14:36:49,540][0m Trial 4 finished with value: 0.32943007559807946 and parameters: {'n_estimators': 183, 'subsample': 0.8266338891518501, 'max_depth': 12, 'max_features': 18, 'learning_rate': 0.16028158890606967, 'min_samples_leaf': 24}. Best is trial 0 with value: 0.33710350552060636.[0m


MCC: [0.33111154531366693, 0.33131110577305556, 0.3502783671127074, 0.30501928419288815]
ROC: [0.8605131722415672, 0.8638264111850085, 0.8793966005232318, 0.8561972493809566]
PR_AUC: [0.20731259698060317, 0.21755367990670108, 0.24445812688553414, 0.21824598127169115]


[32m[I 2021-12-10 14:36:50,978][0m Trial 5 finished with value: 0.3360580253362999 and parameters: {'n_estimators': 49, 'subsample': 0.8981531778182141, 'max_depth': 9, 'max_features': 15, 'learning_rate': 0.12390684629367116, 'min_samples_leaf': 42}. Best is trial 0 with value: 0.33710350552060636.[0m


MCC: [0.33261792933894563, 0.3398961277127669, 0.3494078194965979, 0.32231022479688914]
ROC: [0.8638787766874189, 0.8482911913904984, 0.8868472989305877, 0.851121754696152]
PR_AUC: [0.23117743625380444, 0.17612337656532873, 0.2668792146093514, 0.2180693718968982]


[32m[I 2021-12-10 14:36:56,309][0m Trial 6 finished with value: 0.33065594258196723 and parameters: {'n_estimators': 156, 'subsample': 0.8627989354425325, 'max_depth': 13, 'max_features': 13, 'learning_rate': 0.13434115190556806, 'min_samples_leaf': 18}. Best is trial 0 with value: 0.33710350552060636.[0m


MCC: [0.3325416886020444, 0.33387618529761487, 0.3469308116320749, 0.3092750847961348]
ROC: [0.8576136083852133, 0.8587361569782817, 0.8778207854596638, 0.8570482963666671]
PR_AUC: [0.20956350470814247, 0.21573413047124243, 0.2538768134765713, 0.228560715720146]


[32m[I 2021-12-10 14:36:59,362][0m Trial 7 finished with value: 0.3326490280594889 and parameters: {'n_estimators': 135, 'subsample': 0.8740702165976071, 'max_depth': 11, 'max_features': 17, 'learning_rate': 0.12955901949366017, 'min_samples_leaf': 46}. Best is trial 0 with value: 0.33710350552060636.[0m


MCC: [0.32144362324863046, 0.3449817426203047, 0.34732717712266326, 0.31684356924635715]
ROC: [0.8515462875030776, 0.8514930902367219, 0.8767279872186319, 0.8443507633033818]
PR_AUC: [0.19726527583937878, 0.19120968674029037, 0.27220741448486957, 0.19835159671426247]


[32m[I 2021-12-10 14:37:00,842][0m Trial 8 finished with value: 0.33086117624150624 and parameters: {'n_estimators': 44, 'subsample': 0.9482237745826528, 'max_depth': 10, 'max_features': 14, 'learning_rate': 0.15612558423606065, 'min_samples_leaf': 27}. Best is trial 0 with value: 0.33710350552060636.[0m


MCC: [0.3175359971739469, 0.3358346023414679, 0.33816118920904037, 0.3319129162415698]
ROC: [0.8567320882135697, 0.8584607718215417, 0.8794403124528731, 0.8563621809673347]
PR_AUC: [0.2025994366819993, 0.21665881700721187, 0.28194450137950744, 0.2528045529411525]


[32m[I 2021-12-10 14:37:02,474][0m Trial 9 finished with value: 0.33287659499847067 and parameters: {'n_estimators': 89, 'subsample': 0.8002853761125517, 'max_depth': 4, 'max_features': 17, 'learning_rate': 0.14292147161182367, 'min_samples_leaf': 35}. Best is trial 0 with value: 0.33710350552060636.[0m


MCC: [0.3202137293225191, 0.3414989494497528, 0.3414989494497528, 0.32829475177185796]
ROC: [0.8621201329534663, 0.856353856812832, 0.8833831285065165, 0.8497737138634893]
PR_AUC: [0.21518245838477287, 0.21886087226194215, 0.29021146847572654, 0.2079138673063165]


[32m[I 2021-12-10 14:37:05,027][0m Trial 10 finished with value: 0.3232023536700065 and parameters: {'n_estimators': 36, 'subsample': 0.9869602664638994, 'max_depth': 18, 'max_features': 10, 'learning_rate': 0.18696473435792457, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.33710350552060636.[0m


MCC: [0.32534264193863105, 0.30516063527696874, 0.34183869079378015, 0.320467446670646]
ROC: [0.8553999155851008, 0.8540895788574138, 0.8861828776000401, 0.8603953080262307]
PR_AUC: [0.19468319584292001, 0.19741946234508853, 0.2673782138072693, 0.19925782926729008]


[32m[I 2021-12-10 14:37:05,909][0m Trial 11 finished with value: 0.33840726923658554 and parameters: {'n_estimators': 16, 'subsample': 0.9278127244571726, 'max_depth': 17, 'max_features': 12, 'learning_rate': 0.08188480856163091, 'min_samples_leaf': 11}. Best is trial 11 with value: 0.33840726923658554.[0m


MCC: [0.33193834621100693, 0.3356691045820556, 0.35323411742128846, 0.33278750873199114]
ROC: [0.8609528331750554, 0.8519214671472066, 0.8796960272412745, 0.8608109356239032]
PR_AUC: [0.19428677691680699, 0.18918702509891977, 0.2529238610544861, 0.23041734001693004]


[32m[I 2021-12-10 14:37:06,595][0m Trial 12 finished with value: 0.3355374156943714 and parameters: {'n_estimators': 10, 'subsample': 0.9466266340063076, 'max_depth': 18, 'max_features': 11, 'learning_rate': 0.0801224364535209, 'min_samples_leaf': 10}. Best is trial 11 with value: 0.33840726923658554.[0m


MCC: [0.3196456108734051, 0.3359681249187649, 0.3549556779142119, 0.33158024907110367]
ROC: [0.8503240301079807, 0.853646995569796, 0.8789212332883829, 0.8581280484854882]
PR_AUC: [0.1956617468090631, 0.18517766435238142, 0.2596811063721533, 0.2531672209540626]


[32m[I 2021-12-10 14:37:07,459][0m Trial 13 finished with value: 0.3348540385793838 and parameters: {'n_estimators': 16, 'subsample': 0.9421854113075142, 'max_depth': 16, 'max_features': 13, 'learning_rate': 0.1976336096355517, 'min_samples_leaf': 14}. Best is trial 11 with value: 0.33840726923658554.[0m


MCC: [0.3403582748681327, 0.32914917135225497, 0.3410271446743139, 0.3288815634228337]
ROC: [0.8594425978685237, 0.8610397756703773, 0.8876974959621106, 0.862596594932422]
PR_AUC: [0.22055921458530559, 0.19962320649340232, 0.2830598566631494, 0.2486782510799054]


[32m[I 2021-12-10 14:37:14,081][0m Trial 14 finished with value: 0.304917829786564 and parameters: {'n_estimators': 62, 'subsample': 0.9856209318135866, 'max_depth': 20, 'max_features': 12, 'learning_rate': 0.18243831976122055, 'min_samples_leaf': 1}. Best is trial 11 with value: 0.33840726923658554.[0m


MCC: [0.29160613307612093, 0.2943927614893436, 0.3255509640390853, 0.3081214605417061]
ROC: [0.8372496130983785, 0.8367239220091751, 0.8661300298771039, 0.8419834452669032]
PR_AUC: [0.18022441416049123, 0.1623811083972344, 0.20308732927066267, 0.2086199813203196]


[32m[I 2021-12-10 14:37:15,066][0m Trial 15 finished with value: 0.339650339053743 and parameters: {'n_estimators': 26, 'subsample': 0.9219374918449694, 'max_depth': 15, 'max_features': 17, 'learning_rate': 0.08082787196209995, 'min_samples_leaf': 32}. Best is trial 15 with value: 0.339650339053743.[0m


MCC: [0.3390715424015106, 0.3383639136504087, 0.3484632734216517, 0.3327026267414008]
ROC: [0.8583093718124581, 0.8565177765489869, 0.8798555757844652, 0.8569845228199342]
PR_AUC: [0.22180983636648288, 0.18759321348421443, 0.3032472925284074, 0.21655067063101874]


[32m[I 2021-12-10 14:37:17,136][0m Trial 16 finished with value: 0.3390864752252334 and parameters: {'n_estimators': 71, 'subsample': 0.9164558986672625, 'max_depth': 15, 'max_features': 18, 'learning_rate': 0.08082710284297405, 'min_samples_leaf': 34}. Best is trial 15 with value: 0.339650339053743.[0m


MCC: [0.33626015045507257, 0.33422442922565493, 0.3483063877182737, 0.3375549335019323]
ROC: [0.8603109282121629, 0.8610660028281619, 0.8841633864506132, 0.8640172056630909]
PR_AUC: [0.21678766203816555, 0.20964463334421513, 0.2849779309173226, 0.2701054788131597]


[32m[I 2021-12-10 14:37:19,075][0m Trial 17 finished with value: 0.3389052618111148 and parameters: {'n_estimators': 66, 'subsample': 0.8919799915410579, 'max_depth': 15, 'max_features': 19, 'learning_rate': 0.09715405938781194, 'min_samples_leaf': 33}. Best is trial 15 with value: 0.339650339053743.[0m


MCC: [0.33355091607961007, 0.33759712676837467, 0.3525913602502055, 0.3318816441462688]
ROC: [0.8633973479652491, 0.8658917998605591, 0.8835164498919222, 0.863922644886901]
PR_AUC: [0.2206851166198297, 0.22990024881383372, 0.2747711045589792, 0.26955365651611]


[32m[I 2021-12-10 14:37:22,069][0m Trial 18 finished with value: 0.3324656785425615 and parameters: {'n_estimators': 128, 'subsample': 0.9222476421814431, 'max_depth': 14, 'max_features': 16, 'learning_rate': 0.0936167501746739, 'min_samples_leaf': 50}. Best is trial 15 with value: 0.339650339053743.[0m


MCC: [0.3268823227010646, 0.34403368195330464, 0.33847926284523094, 0.320467446670646]
ROC: [0.8526894059301465, 0.8568040896881374, 0.8797703375216648, 0.8501673505829781]
PR_AUC: [0.18552129617792118, 0.19178327526867542, 0.2693018928200534, 0.21235527512259075]


[32m[I 2021-12-10 14:37:24,424][0m Trial 19 finished with value: 0.32994757853726997 and parameters: {'n_estimators': 80, 'subsample': 0.9168014347216491, 'max_depth': 19, 'max_features': 18, 'learning_rate': 0.11613096671364652, 'min_samples_leaf': 36}. Best is trial 15 with value: 0.339650339053743.[0m


MCC: [0.3215113030153332, 0.33759712676837467, 0.3450405642771867, 0.3156413200881854]
ROC: [0.8531884210896556, 0.85806736445477, 0.8816412081103114, 0.8554957403668959]
PR_AUC: [0.21656769927819788, 0.209251769527431, 0.308787578085691, 0.2533072889127994]


[32m[I 2021-12-10 14:37:26,144][0m Trial 20 finished with value: 0.3395561868584756 and parameters: {'n_estimators': 64, 'subsample': 0.8504786531918825, 'max_depth': 15, 'max_features': 19, 'learning_rate': 0.09413089237925701, 'min_samples_leaf': 33}. Best is trial 15 with value: 0.339650339053743.[0m


MCC: [0.33532618962690963, 0.33392059597545676, 0.3502237899366307, 0.3387541718949055]
ROC: [0.8594799690478702, 0.8618484463687408, 0.881765787109789, 0.8618752941279957]
PR_AUC: [0.21860646705921266, 0.2282536322697416, 0.2790303599140745, 0.24716845212823033]


[32m[I 2021-12-10 14:37:27,822][0m Trial 21 finished with value: 0.3438941189228 and parameters: {'n_estimators': 58, 'subsample': 0.8498502075713545, 'max_depth': 15, 'max_features': 19, 'learning_rate': 0.09378400776091556, 'min_samples_leaf': 31}. Best is trial 21 with value: 0.3438941189228.[0m


MCC: [0.3375122267496339, 0.34617067484123304, 0.3468913173895773, 0.3450022567107555]
ROC: [0.8643426189722485, 0.8647793312511882, 0.8807188863948805, 0.8643228788698448]
PR_AUC: [0.2303146189536853, 0.21226554361511085, 0.28028734171259545, 0.2684034011771712]


[32m[I 2021-12-10 14:37:29,137][0m Trial 22 finished with value: 0.3352342451971856 and parameters: {'n_estimators': 51, 'subsample': 0.8423952669510999, 'max_depth': 16, 'max_features': 19, 'learning_rate': 0.0935342764967663, 'min_samples_leaf': 39}. Best is trial 21 with value: 0.3438941189228.[0m


MCC: [0.32230392265835117, 0.334774684036634, 0.3486420783218614, 0.3352162957718959]
ROC: [0.8613815025852063, 0.853235010632927, 0.8894590867266539, 0.84952961511565]
PR_AUC: [0.22293376380518018, 0.1846623138693883, 0.282316130314597, 0.2063613221264443]


[32m[I 2021-12-10 14:37:30,211][0m Trial 23 finished with value: 0.3391669506901568 and parameters: {'n_estimators': 33, 'subsample': 0.8504413878499427, 'max_depth': 14, 'max_features': 19, 'learning_rate': 0.10790897273562051, 'min_samples_leaf': 32}. Best is trial 21 with value: 0.3438941189228.[0m


MCC: [0.3366922183467397, 0.3365578564637117, 0.34555750217237924, 0.3378602257777966]
ROC: [0.8584225845028314, 0.8577701233332096, 0.8825591586327782, 0.8637731069152514]
PR_AUC: [0.22556252289009704, 0.21368525784819106, 0.2749260070532586, 0.251691211098758]


[32m[I 2021-12-10 14:37:33,514][0m Trial 24 finished with value: 0.32744371968335356 and parameters: {'n_estimators': 101, 'subsample': 0.8799731267520511, 'max_depth': 17, 'max_features': 16, 'learning_rate': 0.09533328289091661, 'min_samples_leaf': 21}. Best is trial 21 with value: 0.3438941189228.[0m


MCC: [0.3230233920535266, 0.33324716138739163, 0.3375649963243928, 0.3159393289681031]
ROC: [0.8561693222187049, 0.8608845983201505, 0.8860058442849931, 0.8618796923036325]
PR_AUC: [0.2151144361678626, 0.2118860697301815, 0.26785356805155225, 0.273090292465858]
CPU times: user 59.4 s, sys: 110 ms, total: 59.6 s
Wall time: 59.4 s


In [14]:
best = study.best_trial
best

FrozenTrial(number=21, values=[0.3438941189228], datetime_start=datetime.datetime(2021, 12, 10, 14, 37, 26, 145911), datetime_complete=datetime.datetime(2021, 12, 10, 14, 37, 27, 822275), params={'n_estimators': 58, 'subsample': 0.8498502075713545, 'max_depth': 15, 'max_features': 19, 'learning_rate': 0.09378400776091556, 'min_samples_leaf': 31}, distributions={'n_estimators': IntUniformDistribution(high=200, low=10, step=1), 'subsample': UniformDistribution(high=1.0, low=0.8), 'max_depth': IntUniformDistribution(high=20, low=3, step=1), 'max_features': IntUniformDistribution(high=20, low=10, step=1), 'learning_rate': UniformDistribution(high=0.2, low=0.08), 'min_samples_leaf': IntUniformDistribution(high=50, low=1, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=21, state=TrialState.COMPLETE, value=None)

In [16]:
%%time
gb_params = study.best_params
#gb_params['random_state'] = 42
gb_weight = GradientBoostingClassifier(**gb_params)
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[2833  536]
 [  63  132]]
              precision    recall  f1-score   support

           0       0.98      0.84      0.90      3369
           1       0.20      0.68      0.31       195

    accuracy                           0.83      3564
   macro avg       0.59      0.76      0.61      3564
weighted avg       0.94      0.83      0.87      3564

ROC AUC prob1: 0.8727173094047538
MCC: 0.30176138809434633
F2: 0.45580110497237575
PR_AUC: 0.23143854093057512
CPU times: user 889 ms, sys: 980 µs, total: 890 ms
Wall time: 887 ms


In [15]:
%%time
gb_params = study.best_params
#gb_params['random_state'] = 42
gb = GradientBoostingClassifier(**gb_params)
gb.fit(X_rus, y_rus)
y_pred = gb.predict(X_test)
probs = gb.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[2551  818]
 [  16  179]]
              precision    recall  f1-score   support

           0       0.99      0.76      0.86      3369
           1       0.18      0.92      0.30       195

    accuracy                           0.77      3564
   macro avg       0.59      0.84      0.58      3564
weighted avg       0.95      0.77      0.83      3564

ROC AUC prob1: 0.8702726975211391
MCC: 0.3420624851489895
F2: 0.5036578503095104
PR_AUC: 0.21139062898627556
CPU times: user 138 ms, sys: 3.02 ms, total: 141 ms
Wall time: 139 ms


In [17]:
coeffs_wgt = pd.Series(data=gb_weight.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs_wgt[0:20]

Presentdel     0.285902
Delmode        0.063847
Lac_None       0.058451
CS_FTP         0.055910
AdmSBP         0.038917
AdmDBP         0.034367
TrialLabor     0.033963
new_age        0.033229
Admefface      0.020668
Delfetalpos    0.019940
AdmBishop      0.019468
CS_NRFHT       0.019427
Dilat_lst      0.017265
momrace_new    0.016374
prelaborCD     0.016193
BESTGA         0.016166
Admcontract    0.015027
Lac_Min        0.012726
Admcervpos     0.011324
Analgesia      0.010709
dtype: float64

In [26]:
coeffs = pd.Series(data=gb.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs[0:20]

Presentdel        0.248502
Delfetalpos       0.142930
CS_FTP            0.072793
Lac_Min           0.057240
Delmode           0.055049
Dilat_lst         0.043160
AdmSBP            0.035636
Admefface         0.025463
AdmDBP            0.025085
Admconsistency    0.024490
BESTGA            0.019899
new_age           0.018413
TrialLabor        0.016620
Admcontract       0.016097
Lac_None          0.015849
spontlabor        0.013282
AdmBishop         0.011382
ROMmeth           0.010288
new_BMI           0.009608
prelaborCD        0.009599
dtype: float64

In [1]:
plt.rcParams["figure.figsize"] = (14, 8)
x_labels = coeffs_wgt.index.values[0:25]
print(x_labels)
fig, ax = plt.subplots(1,1)  # Create a figure and an axes.
ax.set_title('Top Predictors: All Sites combined; target: High_EBLoss, vars: PI')
ax.bar(x_labels, coeffs_wgt.values[0:25])
ax.set_ylabel('Coeff')
ax.set_xlabel('Variable')
plt.draw()
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=16)
#plt.xlim(1,20)
plt.show()

NameError: name 'plt' is not defined

In [19]:
# No Sitenum - 5
coeffs = pd.Series(data=gb.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs[0:20]

Presentdel        0.248502
Delfetalpos       0.142930
CS_FTP            0.072793
Lac_Min           0.057240
Delmode           0.055049
Dilat_lst         0.043160
AdmSBP            0.035636
Admefface         0.025463
AdmDBP            0.025085
Admconsistency    0.024490
BESTGA            0.019899
new_age           0.018413
TrialLabor        0.016620
Admcontract       0.016097
Lac_None          0.015849
spontlabor        0.013282
AdmBishop         0.011382
ROMmeth           0.010288
new_BMI           0.009608
prelaborCD        0.009599
dtype: float64

In [21]:
gb_params # 5 runs

{'n_estimators': 58,
 'subsample': 0.8498502075713545,
 'max_depth': 15,
 'max_features': 19,
 'learning_rate': 0.09378400776091556,
 'min_samples_leaf': 31,
 'random_state': 42}

In [3]:
from optuna.visualization import plot_optimization_history

plotly_config = {"staticPlot": True}

fig = plot_optimization_history(study)
fig.show(config=plotly_config)

NameError: name 'study' is not defined

In [None]:
from optuna.visualization import plot_param_importances

fig = plot_param_importances(study)
fig.show(config=plotly_config)

In [24]:
np.mean([0.3149689279176914, 0.31449457458602254, 0.31066798538596907, 0.3219519143488398])

0.3155208505596307

<h1 align='center'>Thanks and make sure to learn!</h1>