In [1]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
#import plotly.express as px
import seaborn as sns

from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import StratifiedKFold
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, \
                            accuracy_score, f1_score, average_precision_score, \
                            fbeta_score, precision_recall_curve, auc
from sklearn.metrics import matthews_corrcoef

from sklearn.model_selection import train_test_split

import optuna
from optuna.samplers import TPESampler
sns.set(style='white', context='notebook', palette='deep')
%config Completer.use_jedi = False

In [2]:
csl_df = pd.read_csv('../../data/csl/Sites/CSL_he_PI_s49.csv', index_col=0)
X = csl_df.drop(['high_EBL','Sitenum'], axis=1, inplace=False)
y = csl_df['high_EBL'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=7)
X_train                                                    

Unnamed: 0_level_0,transfus_yes,Accrete,AdmBishop,Admcervpos,Admconsistency,Admcontract,AdmDBP,Admefface,Admpresent,AdmSBP,...,TD_nos,ThreatenedPB,threatpb9,UnspecHBP,uscar,version9,new_age,new_BMI,new_high_Age,new_high_BMI
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
49-06751,0,0,8,2,3,3.0,90,60.0,1,150.0,...,0,0,0,0,0,0,26,24,1,1
49-23935,0,0,8,8,8,3.0,65,0.0,77,114.0,...,0,0,0,0,1,0,39,24,1,1
49-06958,0,0,8,2,2,0.0,85,20.0,1,140.0,...,0,0,0,0,0,0,24,21,1,1
49-14728,0,0,8,8,8,2.0,72,0.0,77,115.0,...,0,0,0,0,0,0,34,24,1,1
49-24757,0,0,8,8,8,2.0,74,70.0,1,119.0,...,0,0,0,0,0,0,31,23,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49-10591,0,0,8,3,2,0.0,78,20.0,1,124.0,...,0,0,0,0,0,0,19,21,0,1
49-18110,0,0,8,2,3,1.0,77,90.0,1,115.0,...,0,0,0,0,0,0,18,26,0,1
49-00602,0,0,8,8,8,99.0,80,0.0,77,127.0,...,0,0,0,0,1,0,34,24,1,1
49-15932,0,0,8,2,2,1.0,82,60.0,1,136.0,...,0,0,0,0,0,0,22,31,1,1


In [3]:
X.shape

(21322, 195)

<h1 align='center'>Exploratory Data Analysis</h1>

<h1 align='center'>Modelling</h1>

In [8]:
print(X_train.shape)
print(X.shape)
print(y.shape)

(14925, 195)
(21322, 195)
(21322,)


Here we define our validation set

<h1 align='center'>Modelling Using Undersampling</h1>

We will use `imblearn`'s RandomUnderSampler to undersample from the majority class so that they match

In [9]:
sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
X_rus, y_rus = sampler.fit_resample(X_train, y_train)

# Try with no undersampling
#X_rus, y_rus = X_train, y_train

In [10]:
print(X_rus.shape)
print(y_rus.shape)
print(np.bincount(y_rus))

(1608, 195)
(1608,)
[804 804]


<h1 align='center'>Basic Gradient Boosting</h1>

In [11]:
%%time
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_rus = GradientBoostingClassifier()   # If not undersampling
gb_rus.fit(X_rus, y_rus)
print(gb_rus.get_params())
y_pred = gb_rus.predict(X_test)
probs = gb_rus.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[4438 1614]
 [  32  313]]
              precision    recall  f1-score   support

           0       0.99      0.73      0.84      6052
           1       0.16      0.91      0.28       345

    accuracy                           0.74      6397
   macro avg       0.58      0.82      0.56      6397
weighted avg       0.95      0.74      0.81      6397

ROC AUC prob1: 0.8688599768192573
MCC: 0.3153716353279921
F2: 0.4732385848200787
PR_AUC: 0.2205171513715284
CPU times: user 608 ms, sys: 5.86 ms, total:

In [12]:
%%time
# Not undersampled
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_weight = GradientBoostingClassifier()  
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
print(gb_weight.get_params())
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[4490 1562]
 [  38  307]]
              precision    recall  f1-score   support

           0       0.99      0.74      0.85      6052
           1       0.16      0.89      0.28       345

    accuracy                           0.75      6397
   macro avg       0.58      0.82      0.56      6397
weighted avg       0.95      0.75      0.82      6397

ROC AUC prob1: 0.8727331724091688
MCC: 0.3137995331532645
F2: 0.4724530624807633
PR_AUC: 0.2435054511986828
CPU times: user 4.99 s, sys: 6.66 ms, total:

<h1 align='center'>Hyperparameter Tuning with Cross Validation</h1>

In [13]:
%%time
def create_model(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    subsample = trial.suggest_float('subsample', 0.8, 1.0)
    max_depth = trial.suggest_int('max_depth',3,20)
    max_features = trial.suggest_int('max_features',10,20)
    learning_rate = trial.suggest_uniform('learning_rate',0.08,0.2)
    min_samples_leaf = trial.suggest_int('min_samples_leaf',1,50)
    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
                           max_depth=max_depth, learning_rate=learning_rate, 
                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
#                           learning_rate=learning_rate, 
#                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate) 
    return model

def objective(trial):
    model = create_model(trial)
    
    mcc_list = []
    roc_list = []
    pr_list = []
    f2_list = []
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=11)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train_c, X_test_c = X.iloc[train_idx], X.iloc[test_idx]
        y_train_c, y_test_c = y[train_idx], y[test_idx]
        sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
        X_res, y_res = sampler.fit_resample(X_train_c, y_train_c)

#        weights = class_weight.compute_sample_weight('balanced', y=y_train)
#        model.fit(X_train, y_train, sample_weight=weights)
        model.fit(X_res, y_res)
        y_pred = model.predict(X_test_c)
        probs = model.predict_proba(X_test_c)
        prob1 = probs[:, 1]  # Only positives
        precision, recall, pr_thresh = precision_recall_curve(y_test_c, prob1)
        roc = roc_auc_score(y_test_c, prob1)
        roc_list.append(roc)
        mcc = matthews_corrcoef(y_test_c, y_pred)  # MWB
        mcc_list.append(mcc)
        f2 = fbeta_score(y_test_c, y_pred, beta=2.0, average=None)[1]
        pr_auc = auc(recall, precision)
        pr_list.append(pr_auc)
        avg_precision = average_precision_score(y_test_c, prob1)
#        print(f'ROC: {roc}')
#        print(f'PR_AUC: {pr_auc}')
#        print(f'avg_prec: {avg_precision}')
        
    print(f'MCC: {mcc_list}')
    print(f'ROC: {roc_list}')
    print(f'PR_AUC: {pr_list}')
    return np.mean(mcc_list)

sampler = TPESampler(seed=7)
study = optuna.create_study(sampler=sampler,direction='maximize')
#study.optimize(objective,n_trials=60)
study.optimize(objective,n_trials=25)
#study.optimize(objective,n_trials=5)

[32m[I 2021-12-10 14:30:13,218][0m A new study created in memory with name: no-name-0150141f-e408-4915-be77-9b99a2492e26[0m
[32m[I 2021-12-10 14:30:15,013][0m Trial 0 finished with value: 0.32435021248539964 and parameters: {'n_estimators': 24, 'subsample': 0.955983758448023, 'max_depth': 10, 'max_features': 17, 'learning_rate': 0.1973587414395923, 'min_samples_leaf': 27}. Best is trial 0 with value: 0.32435021248539964.[0m


MCC: [0.33864621685917395, 0.3258690257197002, 0.3120587371594714, 0.32082687020325285]
ROC: [0.8842147291983852, 0.8750034426157269, 0.8618093455516012, 0.873624805764502]
PR_AUC: [0.23255702238537396, 0.23414921130400373, 0.2013871898174007, 0.20961815662203806]


[32m[I 2021-12-10 14:30:19,426][0m Trial 1 finished with value: 0.32110738936992844 and parameters: {'n_estimators': 105, 'subsample': 0.8144102266719524, 'max_depth': 7, 'max_features': 15, 'learning_rate': 0.16150759953451288, 'min_samples_leaf': 41}. Best is trial 0 with value: 0.32435021248539964.[0m


MCC: [0.33526767522541384, 0.31484822036817267, 0.3154734966375594, 0.3188401652485679]
ROC: [0.8771065494726545, 0.8642652356401612, 0.8589302728244415, 0.873085195541341]
PR_AUC: [0.21851472435568095, 0.2266558526169103, 0.19820987117942546, 0.2243810758754444]


[32m[I 2021-12-10 14:30:23,526][0m Trial 2 finished with value: 0.32838807160890915 and parameters: {'n_estimators': 82, 'subsample': 0.813187269381181, 'max_depth': 8, 'max_features': 20, 'learning_rate': 0.10560624242958987, 'min_samples_leaf': 23}. Best is trial 2 with value: 0.32838807160890915.[0m


MCC: [0.3451961734073835, 0.321652653759179, 0.3171594618794151, 0.3295439973896589]
ROC: [0.8857828116062966, 0.8770139302002776, 0.8666388915950007, 0.8731114505842094]
PR_AUC: [0.24652948739438416, 0.2576839440971523, 0.22010213912137408, 0.21594066376862503]


[32m[I 2021-12-10 14:30:34,255][0m Trial 3 finished with value: 0.3204708819510114 and parameters: {'n_estimators': 187, 'subsample': 0.8049798455100696, 'max_depth': 13, 'max_features': 20, 'learning_rate': 0.10763634548251577, 'min_samples_leaf': 28}. Best is trial 2 with value: 0.32838807160890915.[0m


MCC: [0.3281712136517826, 0.30482698437140954, 0.3215548506568585, 0.327330479123995]
ROC: [0.8788632162406365, 0.8686001773635622, 0.8618010544854324, 0.8735605500016927]
PR_AUC: [0.22705064682751896, 0.2255368844077668, 0.2100873121433827, 0.21293453237485138]


[32m[I 2021-12-10 14:30:44,617][0m Trial 4 finished with value: 0.31940300321926274 and parameters: {'n_estimators': 183, 'subsample': 0.8266338891518501, 'max_depth': 12, 'max_features': 18, 'learning_rate': 0.16028158890606967, 'min_samples_leaf': 24}. Best is trial 2 with value: 0.32838807160890915.[0m


MCC: [0.3238104451763635, 0.320854484216632, 0.3142788182736737, 0.31866826521038183]
ROC: [0.8735628213878149, 0.8688872915151916, 0.8603708455712925, 0.8735729866009462]
PR_AUC: [0.22203748468705894, 0.22476600903305244, 0.20154145026034526, 0.21239938087496824]


[32m[I 2021-12-10 14:30:47,275][0m Trial 5 finished with value: 0.3250933646133319 and parameters: {'n_estimators': 49, 'subsample': 0.8981531778182141, 'max_depth': 9, 'max_features': 15, 'learning_rate': 0.12390684629367116, 'min_samples_leaf': 42}. Best is trial 2 with value: 0.32838807160890915.[0m


MCC: [0.3389347099957521, 0.319335431419953, 0.31509691979246884, 0.32700639724515357]
ROC: [0.8833395043478022, 0.8755115726970278, 0.8675681819281011, 0.8723749275395363]
PR_AUC: [0.23037517235500324, 0.24837688567464855, 0.21001533056741162, 0.220351150003655]


[32m[I 2021-12-10 14:30:57,704][0m Trial 6 finished with value: 0.3198018391648879 and parameters: {'n_estimators': 156, 'subsample': 0.8627989354425325, 'max_depth': 13, 'max_features': 13, 'learning_rate': 0.13434115190556806, 'min_samples_leaf': 18}. Best is trial 2 with value: 0.32838807160890915.[0m


MCC: [0.33440025180769845, 0.3174016288406667, 0.3081771255488055, 0.319228350462381]
ROC: [0.8811179391390606, 0.8681705389208364, 0.8598146532157936, 0.8719292827329564]
PR_AUC: [0.22203388091567744, 0.21065706687531466, 0.1965572171097773, 0.20431883005046117]


[32m[I 2021-12-10 14:31:04,114][0m Trial 7 finished with value: 0.3195511283471792 and parameters: {'n_estimators': 135, 'subsample': 0.8740702165976071, 'max_depth': 11, 'max_features': 17, 'learning_rate': 0.12955901949366017, 'min_samples_leaf': 46}. Best is trial 2 with value: 0.32838807160890915.[0m


MCC: [0.33522262853486773, 0.31332278184459295, 0.31568665560029885, 0.3139724474089572]
ROC: [0.880161201634674, 0.8674999173772224, 0.8611059867716039, 0.8697929513500965]
PR_AUC: [0.2124148961842672, 0.22002199721753007, 0.19535139677244126, 0.21309019952997466]


[32m[I 2021-12-10 14:31:06,892][0m Trial 8 finished with value: 0.3268232956692334 and parameters: {'n_estimators': 44, 'subsample': 0.9482237745826528, 'max_depth': 10, 'max_features': 14, 'learning_rate': 0.15612558423606065, 'min_samples_leaf': 27}. Best is trial 2 with value: 0.32838807160890915.[0m


MCC: [0.33944736913000423, 0.32213726167287454, 0.32462876628022647, 0.32107978559382844]
ROC: [0.8832952940948917, 0.876436259281292, 0.8667874398638606, 0.8723307085199687]
PR_AUC: [0.23119010398516035, 0.25177439196764656, 0.21046415722104564, 0.20718082312019126]


[32m[I 2021-12-10 14:31:09,593][0m Trial 9 finished with value: 0.3282631980098683 and parameters: {'n_estimators': 89, 'subsample': 0.8002853761125517, 'max_depth': 4, 'max_features': 17, 'learning_rate': 0.14292147161182367, 'min_samples_leaf': 35}. Best is trial 2 with value: 0.32838807160890915.[0m


MCC: [0.3496904040808406, 0.31557275405768404, 0.3269627636976958, 0.32082687020325285]
ROC: [0.879719789890773, 0.87431836208606, 0.8682439038208687, 0.8730948684518713]
PR_AUC: [0.21600821103568918, 0.25803376748167023, 0.20809204461613445, 0.2288938866312372]


[32m[I 2021-12-10 14:31:19,769][0m Trial 10 finished with value: 0.32323315265011143 and parameters: {'n_estimators': 78, 'subsample': 0.9869602664638994, 'max_depth': 18, 'max_features': 10, 'learning_rate': 0.09172353354166263, 'min_samples_leaf': 4}. Best is trial 2 with value: 0.32838807160890915.[0m


MCC: [0.34006414923698497, 0.3159611667434705, 0.3240867101509378, 0.3128205844690525]
ROC: [0.8835425951971086, 0.8683977515588164, 0.8596094493281129, 0.8652121372917647]
PR_AUC: [0.2402638878675469, 0.21177294405956829, 0.18372739510918415, 0.20443413654171944]


[32m[I 2021-12-10 14:31:22,258][0m Trial 11 finished with value: 0.328537045265035 and parameters: {'n_estimators': 89, 'subsample': 0.8518380886504129, 'max_depth': 3, 'max_features': 20, 'learning_rate': 0.0820677705499854, 'min_samples_leaf': 12}. Best is trial 11 with value: 0.328537045265035.[0m


MCC: [0.3502504509556626, 0.317333708423675, 0.32206619735185593, 0.3244978243289465]
ROC: [0.8865012282160887, 0.8826450167448828, 0.8745112589223963, 0.8739878853704829]
PR_AUC: [0.2458078973522967, 0.26993756233750893, 0.22949616995864114, 0.23057077494074962]


[32m[I 2021-12-10 14:31:25,555][0m Trial 12 finished with value: 0.33390160754165166 and parameters: {'n_estimators': 129, 'subsample': 0.845528398647246, 'max_depth': 3, 'max_features': 20, 'learning_rate': 0.08006789436397342, 'min_samples_leaf': 10}. Best is trial 12 with value: 0.33390160754165166.[0m


MCC: [0.3551615626823937, 0.3254765987380499, 0.3248729878465611, 0.33009528089960216]
ROC: [0.8887697668185472, 0.8830736224028908, 0.8768970132125049, 0.8741644159876629]
PR_AUC: [0.2534099635696359, 0.26733102626036787, 0.2393971798448672, 0.239164560840815]


[32m[I 2021-12-10 14:31:28,865][0m Trial 13 finished with value: 0.33077094616501324 and parameters: {'n_estimators': 131, 'subsample': 0.8534856832953535, 'max_depth': 3, 'max_features': 19, 'learning_rate': 0.08061551154657245, 'min_samples_leaf': 8}. Best is trial 12 with value: 0.33390160754165166.[0m


MCC: [0.3492773855583111, 0.32088097748902833, 0.32732185123871055, 0.325603570374003]
ROC: [0.8858166600811811, 0.8801770055302179, 0.8720066660171998, 0.8735508770911623]
PR_AUC: [0.2441054624633069, 0.2649750307106648, 0.22922993133825814, 0.2356969560210276]


[32m[I 2021-12-10 14:31:34,450][0m Trial 14 finished with value: 0.32289546273374925 and parameters: {'n_estimators': 137, 'subsample': 0.8993236446634258, 'max_depth': 5, 'max_features': 18, 'learning_rate': 0.08096592158289725, 'min_samples_leaf': 1}. Best is trial 12 with value: 0.33390160754165166.[0m


MCC: [0.3302900754253778, 0.31668144477790705, 0.3240867101509378, 0.32052362058077427]
ROC: [0.8802454774292844, 0.8709046643311963, 0.8695069095672685, 0.8674590162235437]
PR_AUC: [0.22590548164231028, 0.235926455977329, 0.2156873133286227, 0.23388532452423086]


[32m[I 2021-12-10 14:31:40,312][0m Trial 15 finished with value: 0.3216071702334051 and parameters: {'n_estimators': 132, 'subsample': 0.8466557426823317, 'max_depth': 6, 'max_features': 19, 'learning_rate': 0.10366157991532525, 'min_samples_leaf': 10}. Best is trial 12 with value: 0.33390160754165166.[0m


MCC: [0.3392887825928066, 0.31571666739365667, 0.3169952781072408, 0.3144279528399164]
ROC: [0.8849939349059289, 0.8706092879018221, 0.8700202647475612, 0.8725372942520112]
PR_AUC: [0.24005424896378277, 0.23541346780040376, 0.21956436994250114, 0.24381223891665796]


[32m[I 2021-12-10 14:31:53,397][0m Trial 16 finished with value: 0.32598355356771874 and parameters: {'n_estimators': 159, 'subsample': 0.8841883054873618, 'max_depth': 16, 'max_features': 12, 'learning_rate': 0.09572114049970852, 'min_samples_leaf': 14}. Best is trial 12 with value: 0.33390160754165166.[0m


MCC: [0.33998767304604793, 0.3201167327143533, 0.31769667272788893, 0.3261331357825848]
ROC: [0.8819786575004076, 0.8733234461409655, 0.8655182158178343, 0.8716722596817198]
PR_AUC: [0.23262783530703193, 0.23053661486765678, 0.2124727081804476, 0.2191107444990913]


[32m[I 2021-12-10 14:31:56,439][0m Trial 17 finished with value: 0.3278803401366042 and parameters: {'n_estimators': 116, 'subsample': 0.924580558278454, 'max_depth': 3, 'max_features': 18, 'learning_rate': 0.11532603266017688, 'min_samples_leaf': 7}. Best is trial 12 with value: 0.33390160754165166.[0m


MCC: [0.355722724343596, 0.31700199947033414, 0.31891247221293734, 0.31988416451954943]
ROC: [0.8850388359440408, 0.8821527226959262, 0.8714919289925456, 0.8720557214920326]
PR_AUC: [0.24573771457366111, 0.26140176206156196, 0.24275714364221282, 0.22934373925100626]


[32m[I 2021-12-10 14:32:03,255][0m Trial 18 finished with value: 0.31796391077550623 and parameters: {'n_estimators': 166, 'subsample': 0.841212937001562, 'max_depth': 6, 'max_features': 16, 'learning_rate': 0.18656989022946982, 'min_samples_leaf': 17}. Best is trial 12 with value: 0.33390160754165166.[0m


MCC: [0.3207978338829286, 0.31791573623211405, 0.3140717238323438, 0.3190703491546385]
ROC: [0.8744415001644069, 0.8704887963513781, 0.8636554896185488, 0.8730147214789051]
PR_AUC: [0.22533109075272315, 0.22542635263314187, 0.20785923946177096, 0.21537137333503048]


[32m[I 2021-12-10 14:32:14,111][0m Trial 19 finished with value: 0.3283076872606138 and parameters: {'n_estimators': 110, 'subsample': 0.9239318054357577, 'max_depth': 15, 'max_features': 19, 'learning_rate': 0.09284764829847633, 'min_samples_leaf': 7}. Best is trial 12 with value: 0.33390160754165166.[0m


MCC: [0.3432049192451292, 0.3175850987462439, 0.32710652448159994, 0.32533420656948236]
ROC: [0.8836959495118911, 0.8683206369665322, 0.8655942172577161, 0.8693065421348528]
PR_AUC: [0.23122903092276811, 0.2160963234310595, 0.21229821471453036, 0.20350789823333337]


[32m[I 2021-12-10 14:32:19,629][0m Trial 20 finished with value: 0.32404323514377664 and parameters: {'n_estimators': 144, 'subsample': 0.8355016970848046, 'max_depth': 5, 'max_features': 19, 'learning_rate': 0.11597741513850611, 'min_samples_leaf': 17}. Best is trial 12 with value: 0.33390160754165166.[0m


MCC: [0.33295698745060714, 0.32214248391336464, 0.31406286710264253, 0.32701060210849225]
ROC: [0.8790331494002603, 0.8765429803688282, 0.8689970089978796, 0.8740835780925159]
PR_AUC: [0.2280492050750762, 0.23876748518101573, 0.2152746136933956, 0.22231822265586992]


[32m[I 2021-12-10 14:32:22,217][0m Trial 21 finished with value: 0.33027027457764635 and parameters: {'n_estimators': 96, 'subsample': 0.8564994809384565, 'max_depth': 3, 'max_features': 20, 'learning_rate': 0.08202589208971212, 'min_samples_leaf': 11}. Best is trial 12 with value: 0.33390160754165166.[0m


MCC: [0.3509207714976065, 0.3212819475895668, 0.3225850021081047, 0.3262933771153074]
ROC: [0.8894059799893343, 0.8815912320708573, 0.8759881050837363, 0.8735232402039326]
PR_AUC: [0.2508478857512608, 0.2611573507516366, 0.2341032314656386, 0.2259897747120648]


[32m[I 2021-12-10 14:32:24,033][0m Trial 22 finished with value: 0.32914276143870225 and parameters: {'n_estimators': 60, 'subsample': 0.8668483140550334, 'max_depth': 3, 'max_features': 20, 'learning_rate': 0.08334208356942746, 'min_samples_leaf': 1}. Best is trial 12 with value: 0.33390160754165166.[0m


MCC: [0.35009721322138015, 0.31885288333368256, 0.3237236569345528, 0.3238972922651934]
ROC: [0.884441997529752, 0.8801683989909006, 0.874806282693574, 0.8676490198232483]
PR_AUC: [0.24192298006630064, 0.2615143634276929, 0.2374655434074323, 0.21861942366341264]


[32m[I 2021-12-10 14:32:35,707][0m Trial 23 finished with value: 0.3272808914583817 and parameters: {'n_estimators': 121, 'subsample': 0.8847667467206112, 'max_depth': 20, 'max_features': 19, 'learning_rate': 0.0942433137579249, 'min_samples_leaf': 9}. Best is trial 12 with value: 0.33390160754165166.[0m


MCC: [0.34538176306802654, 0.3179507077250074, 0.32035458881926165, 0.3254365062212313]
ROC: [0.8840482499647699, 0.8705955174389143, 0.8651969370037884, 0.8703090702191121]
PR_AUC: [0.22947330901048357, 0.2122693186996103, 0.20682132583730034, 0.21765794264793922]


[32m[I 2021-12-10 14:32:39,248][0m Trial 24 finished with value: 0.32817972028124953 and parameters: {'n_estimators': 101, 'subsample': 0.8551963642689071, 'max_depth': 5, 'max_features': 20, 'learning_rate': 0.0809593886171461, 'min_samples_leaf': 20}. Best is trial 12 with value: 0.33390160754165166.[0m


MCC: [0.3418141229028901, 0.3235876163049037, 0.31971090455694817, 0.3276062373602563]
ROC: [0.8808146844355041, 0.8829916881485889, 0.8727756624043678, 0.8790126169299426]
PR_AUC: [0.22639883440527078, 0.2702863426642678, 0.21640790789180112, 0.24068260403748162]
CPU times: user 2min 25s, sys: 380 ms, total: 2min 26s
Wall time: 2min 26s


In [14]:
best = study.best_trial
best

FrozenTrial(number=12, values=[0.33390160754165166], datetime_start=datetime.datetime(2021, 12, 10, 14, 31, 22, 259735), datetime_complete=datetime.datetime(2021, 12, 10, 14, 31, 25, 554835), params={'n_estimators': 129, 'subsample': 0.845528398647246, 'max_depth': 3, 'max_features': 20, 'learning_rate': 0.08006789436397342, 'min_samples_leaf': 10}, distributions={'n_estimators': IntUniformDistribution(high=200, low=10, step=1), 'subsample': UniformDistribution(high=1.0, low=0.8), 'max_depth': IntUniformDistribution(high=20, low=3, step=1), 'max_features': IntUniformDistribution(high=20, low=10, step=1), 'learning_rate': UniformDistribution(high=0.2, low=0.08), 'min_samples_leaf': IntUniformDistribution(high=50, low=1, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=12, state=TrialState.COMPLETE, value=None)

In [15]:
%%time
gb_params = study.best_params
#gb_params['random_state'] = 42
gb = GradientBoostingClassifier(**gb_params)
gb.fit(X_rus, y_rus)
y_pred = gb.predict(X_test)
probs = gb.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[4477 1575]
 [  33  312]]
              precision    recall  f1-score   support

           0       0.99      0.74      0.85      6052
           1       0.17      0.90      0.28       345

    accuracy                           0.75      6397
   macro avg       0.58      0.82      0.56      6397
weighted avg       0.95      0.75      0.82      6397

ROC AUC prob1: 0.8728847572248245
MCC: 0.31903644411605014
F2: 0.47750229568411384
PR_AUC: 0.23795067997920405
CPU times: user 233 ms, sys: 2.01 ms, total: 235 ms
Wall time: 232 ms


In [16]:
%%time
gb_params = study.best_params
gb_params['random_state'] = 42
gb_weight = GradientBoostingClassifier(**gb_params)
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[4441 1611]
 [  33  312]]
              precision    recall  f1-score   support

           0       0.99      0.73      0.84      6052
           1       0.16      0.90      0.28       345

    accuracy                           0.74      6397
   macro avg       0.58      0.82      0.56      6397
weighted avg       0.95      0.74      0.81      6397

ROC AUC prob1: 0.8703842064427137
MCC: 0.31437459088501823
F2: 0.47229791099000906
PR_AUC: 0.2256715279314253
CPU times: user 1.1 s, sys: 3.98 ms, total: 1.1 s
Wall time: 1.1 s


In [17]:
# No Sitenum - 5
coeffs_wgt = pd.Series(data=gb_weight.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs_wgt[0:20]

Delmode         0.358040
Hxanemia        0.095787
Delfetalpos     0.079318
Lac_Min         0.053093
Admefface       0.052234
TrialLabor      0.043442
Lac_None        0.033449
transfus_yes    0.019314
uscar           0.017466
CS_FTP          0.015495
BESTGA          0.012806
Intrafever      0.012616
Admcervpos      0.012163
CS_UScar        0.012133
Admpresent      0.011141
new_age         0.011053
CS_NRFHT        0.008042
new_BMI         0.006668
AdmSBP          0.006603
AdmDBP          0.006562
dtype: float64

In [1]:
plt.rcParams["figure.figsize"] = (14, 8)
x_labels = coeffs_wgt.index.values[0:25]
print(x_labels)
fig, ax = plt.subplots(1,1)  # Create a figure and an axes.
ax.set_title('Top Predictors: All Sites combined; target: High_EBLoss, vars: PI')
ax.bar(x_labels, coeffs_wgt.values[0:25])
ax.set_ylabel('Coeff')
ax.set_xlabel('Variable')
plt.draw()
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=16)
#plt.xlim(1,20)
plt.show()

NameError: name 'plt' is not defined

In [19]:
# No Sitenum - 5
coeffs = pd.Series(data=gb.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs[0:20]

Delmode         0.296163
Lac_None        0.082153
Hxanemia        0.074459
Lac_Min         0.067637
CS_FTP          0.058739
Admpresent      0.046754
Delfetalpos     0.036141
Intrafever      0.025021
Admefface       0.024864
new_age         0.023217
AdmDBP          0.019894
BESTGA          0.018348
uscar           0.018166
Dilat_lst       0.017962
prelaborCD      0.017912
transfus_yes    0.014563
AdmSBP          0.011502
Anteanemia      0.009878
CS_NRFHT        0.009177
Admreason       0.007574
dtype: float64

In [21]:
gb_params # 5 runs

{'n_estimators': 129,
 'subsample': 0.845528398647246,
 'max_depth': 3,
 'max_features': 20,
 'learning_rate': 0.08006789436397342,
 'min_samples_leaf': 10,
 'random_state': 42}

In [2]:
from optuna.visualization import plot_optimization_history

plotly_config = {"staticPlot": True}

fig = plot_optimization_history(study)
fig.show(config=plotly_config)

NameError: name 'study' is not defined

In [None]:
from optuna.visualization import plot_param_importances

fig = plot_param_importances(study)
fig.show(config=plotly_config)

In [24]:
np.mean([0.3149689279176914, 0.31449457458602254, 0.31066798538596907, 0.3219519143488398])

0.3155208505596307

<h1 align='center'>Thanks and make sure to learn!</h1>