In [2]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
#import plotly.express as px
import seaborn as sns

from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import StratifiedKFold
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, \
                            accuracy_score, f1_score, average_precision_score, \
                            fbeta_score, precision_recall_curve, auc
from sklearn.metrics import matthews_corrcoef

from sklearn.model_selection import train_test_split

import optuna
from optuna.samplers import TPESampler
sns.set(style='white', context='notebook', palette='deep')
%config Completer.use_jedi = False

In [3]:
csl_df = pd.read_csv('../../data/csl/Sites/CSL_he_PI_s43.csv', index_col=0)
X = csl_df.drop(['high_EBL','Sitenum'], axis=1, inplace=False)
y = csl_df['high_EBL'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=7)
X_train                                                    

Unnamed: 0_level_0,transfus_yes,Accrete,AdmBishop,Admcervpos,Admconsistency,Admcontract,AdmDBP,Admefface,Admpresent,AdmSBP,...,TD_nos,ThreatenedPB,threatpb9,UnspecHBP,uscar,version9,new_age,new_BMI,new_high_Age,new_high_BMI
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
43-05445,0,0,8,8,8,66.0,74,100.0,1,124.4,...,0,0,0,0,0,0,20,24,1,1
43-00861,0,0,8,8,8,99.0,60,0.0,1,115.0,...,0,0,0,0,0,0,16,23,0,1
43-02991,0,0,8,8,8,99.0,74,0.0,1,124.4,...,0,0,0,0,0,0,24,34,1,1
43-10097,0,0,8,8,8,66.0,65,0.0,1,114.0,...,0,0,0,0,0,0,37,24,1,1
43-10609,0,0,8,8,8,88.0,74,0.0,1,124.4,...,0,0,0,0,0,0,21,24,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43-00936,0,0,8,8,8,66.0,83,0.0,1,132.0,...,0,0,0,0,0,0,18,24,0,1
43-04353,0,0,8,2,2,77.0,89,100.0,1,125.0,...,0,0,0,0,0,0,36,26,1,1
43-07748,0,0,8,8,8,88.0,56,0.0,1,117.0,...,0,0,0,0,0,0,20,23,1,1
43-05532,0,0,8,8,8,66.0,96,0.0,1,147.0,...,0,1,1,0,0,0,35,24,1,1


In [4]:
X.shape

(9658, 195)

<h1 align='center'>Modelling</h1>

In [9]:
print(X_train.shape)
print(X.shape)
print(y.shape)

(6760, 195)
(9658, 195)
(9658,)


Here we define our validation set

<h1 align='center'>Modelling Using Undersampling</h1>

We will use `imblearn`'s RandomUnderSampler to undersample from the majority class so that they match

In [10]:
sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
X_rus, y_rus = sampler.fit_resample(X_train, y_train)

# Try with no undersampling
#X_rus, y_rus = X_train, y_train

In [11]:
print(X_rus.shape)
print(y_rus.shape)
print(np.bincount(y_rus))

(770, 195)
(770,)
[385 385]


<h1 align='center'>Basic Gradient Boosting</h1>

In [12]:
%%time
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_rus = GradientBoostingClassifier()   # If not undersampling
gb_rus.fit(X_rus, y_rus)
print(gb_rus.get_params())
y_pred = gb_rus.predict(X_test)
probs = gb_rus.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[2147  586]
 [  19  146]]
              precision    recall  f1-score   support

           0       0.99      0.79      0.88      2733
           1       0.20      0.88      0.33       165

    accuracy                           0.79      2898
   macro avg       0.60      0.84      0.60      2898
weighted avg       0.95      0.79      0.85      2898

ROC AUC prob1: 0.8792591114215703
MCC: 0.35754591162238036
F2: 0.5244252873563219
PR_AUC: 0.2770742396675366
CPU times: user 333 ms, sys: 0 ns, total: 3

In [13]:
%%time
# Not undersampled
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import class_weight
gb_weight = GradientBoostingClassifier()  
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
print(gb_weight.get_params())
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
[[2207  526]
 [  29  136]]
              precision    recall  f1-score   support

           0       0.99      0.81      0.89      2733
           1       0.21      0.82      0.33       165

    accuracy                           0.81      2898
   macro avg       0.60      0.82      0.61      2898
weighted avg       0.94      0.81      0.86      2898

ROC AUC prob1: 0.8772034283560077
MCC: 0.3487086481233762
F2: 0.5143721633888049
PR_AUC: 0.3344901250758451
CPU times: user 2.07 s, sys: 0 ns, total: 2.

<h1 align='center'>Hyperparameter Tuning with Cross Validation</h1>

In [14]:
%%time
def create_model(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    subsample = trial.suggest_float('subsample', 0.8, 1.0)
    max_depth = trial.suggest_int('max_depth',3,20)
    max_features = trial.suggest_int('max_features',10,20)
    learning_rate = trial.suggest_uniform('learning_rate',0.08,0.2)
    min_samples_leaf = trial.suggest_int('min_samples_leaf',1,50)
    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
                           max_depth=max_depth, learning_rate=learning_rate, 
                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, subsample=subsample, 
#                           learning_rate=learning_rate, 
#                           min_samples_leaf=min_samples_leaf)
#    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate) 
    return model

def objective(trial):
    model = create_model(trial)
    
    mcc_list = []
    roc_list = []
    pr_list = []
    f2_list = []
    cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=11)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train_c, X_test_c = X.iloc[train_idx], X.iloc[test_idx]
        y_train_c, y_test_c = y[train_idx], y[test_idx]
        sampler = RandomUnderSampler(sampling_strategy=1.0, random_state=7)
        X_res, y_res = sampler.fit_resample(X_train_c, y_train_c)

#        weights = class_weight.compute_sample_weight('balanced', y=y_train)
#        model.fit(X_train, y_train, sample_weight=weights)
        model.fit(X_res, y_res)
        y_pred = model.predict(X_test_c)
        probs = model.predict_proba(X_test_c)
        prob1 = probs[:, 1]  # Only positives
        precision, recall, pr_thresh = precision_recall_curve(y_test_c, prob1)
        roc = roc_auc_score(y_test_c, prob1)
        roc_list.append(roc)
        mcc = matthews_corrcoef(y_test_c, y_pred)  # MWB
        mcc_list.append(mcc)
        f2 = fbeta_score(y_test_c, y_pred, beta=2.0, average=None)[1]
        pr_auc = auc(recall, precision)
        pr_list.append(pr_auc)
        avg_precision = average_precision_score(y_test_c, prob1)
#        print(f'ROC: {roc}')
#        print(f'PR_AUC: {pr_auc}')
#        print(f'avg_prec: {avg_precision}')
        
    print(f'MCC: {mcc_list}')
    print(f'ROC: {roc_list}')
    print(f'PR_AUC: {pr_list}')
    return np.mean(mcc_list)

sampler = TPESampler(seed=7)
study = optuna.create_study(sampler=sampler,direction='maximize')
#study.optimize(objective,n_trials=60)
study.optimize(objective,n_trials=25)
#study.optimize(objective,n_trials=5)

[32m[I 2021-12-14 09:25:16,992][0m A new study created in memory with name: no-name-8a57b7a0-f475-4996-b844-11765abb4e54[0m
[32m[I 2021-12-14 09:25:17,804][0m Trial 0 finished with value: 0.37173546584472394 and parameters: {'n_estimators': 24, 'subsample': 0.955983758448023, 'max_depth': 10, 'max_features': 17, 'learning_rate': 0.1973587414395923, 'min_samples_leaf': 27}. Best is trial 0 with value: 0.37173546584472394.[0m


MCC: [0.336925886180751, 0.3826973023215862, 0.37384624442139636, 0.3934724304551621]
ROC: [0.849216805738545, 0.8953555721041544, 0.8928414580588493, 0.8835290384005077]
PR_AUC: [0.23995091754927236, 0.31430345945822313, 0.3442879449004834, 0.30686381517799977]


[32m[I 2021-12-14 09:25:19,740][0m Trial 1 finished with value: 0.3540340566285872 and parameters: {'n_estimators': 105, 'subsample': 0.8144102266719524, 'max_depth': 7, 'max_features': 15, 'learning_rate': 0.16150759953451288, 'min_samples_leaf': 41}. Best is trial 0 with value: 0.37173546584472394.[0m


MCC: [0.3142458381379295, 0.3495866744777997, 0.36672160805924564, 0.38558210583937397]
ROC: [0.8471514133139841, 0.8767352160546867, 0.8821730475173826, 0.8772139035547477]
PR_AUC: [0.2478804304278904, 0.28726660098643564, 0.35198767209262727, 0.27904546853120554]


[32m[I 2021-12-14 09:25:21,654][0m Trial 2 finished with value: 0.3659085690946857 and parameters: {'n_estimators': 82, 'subsample': 0.813187269381181, 'max_depth': 8, 'max_features': 20, 'learning_rate': 0.10560624242958987, 'min_samples_leaf': 23}. Best is trial 0 with value: 0.37173546584472394.[0m


MCC: [0.3397794598756002, 0.3636253756944109, 0.37569638389617677, 0.3845330569125548]
ROC: [0.854047723612941, 0.8867152940876949, 0.8852184171130537, 0.8876515071373846]
PR_AUC: [0.2424774988901391, 0.3215804868508943, 0.33971465744006957, 0.33225069539393287]


[32m[I 2021-12-14 09:25:25,853][0m Trial 3 finished with value: 0.3570451519243594 and parameters: {'n_estimators': 187, 'subsample': 0.8049798455100696, 'max_depth': 13, 'max_features': 20, 'learning_rate': 0.10763634548251577, 'min_samples_leaf': 28}. Best is trial 0 with value: 0.37173546584472394.[0m


MCC: [0.32250237615802385, 0.36062206097128974, 0.3586090844592548, 0.3864470861088694]
ROC: [0.850066512637401, 0.8808755481723346, 0.8788551974842043, 0.8825929879563646]
PR_AUC: [0.24335306714965388, 0.3043149552182727, 0.30832039289594604, 0.27540664630730105]


[32m[I 2021-12-14 09:25:30,063][0m Trial 4 finished with value: 0.34958545472450053 and parameters: {'n_estimators': 183, 'subsample': 0.8266338891518501, 'max_depth': 12, 'max_features': 18, 'learning_rate': 0.16028158890606967, 'min_samples_leaf': 24}. Best is trial 0 with value: 0.37173546584472394.[0m


MCC: [0.31008934481763073, 0.36106377924918787, 0.35440246623056626, 0.37278622860061716]
ROC: [0.8476796955057824, 0.8795898493441026, 0.8791308835739176, 0.8789513670503832]
PR_AUC: [0.23088546678689187, 0.3016742174557992, 0.3123145011934804, 0.2712837184098913]


[32m[I 2021-12-14 09:25:31,192][0m Trial 5 finished with value: 0.36637710359074843 and parameters: {'n_estimators': 49, 'subsample': 0.8981531778182141, 'max_depth': 9, 'max_features': 15, 'learning_rate': 0.12390684629367116, 'min_samples_leaf': 42}. Best is trial 0 with value: 0.37173546584472394.[0m


MCC: [0.3471649592870157, 0.36046838669474235, 0.3663241719219492, 0.3915508964592865]
ROC: [0.8617460044681217, 0.8941462514241342, 0.8904949206440796, 0.8832245014409406]
PR_AUC: [0.26634109516189985, 0.306471989474509, 0.33348821474382545, 0.3146273341945407]


[32m[I 2021-12-14 09:25:35,486][0m Trial 6 finished with value: 0.3579292449648335 and parameters: {'n_estimators': 156, 'subsample': 0.8627989354425325, 'max_depth': 13, 'max_features': 13, 'learning_rate': 0.13434115190556806, 'min_samples_leaf': 18}. Best is trial 0 with value: 0.37173546584472394.[0m


MCC: [0.3092683483562735, 0.36817593212659444, 0.3716360916060689, 0.38263660777039715]
ROC: [0.8442840503331998, 0.883357837989218, 0.8852152114608477, 0.8781435426944789]
PR_AUC: [0.21778620811218685, 0.2766656391048849, 0.3263761848709425, 0.2519183431863237]


[32m[I 2021-12-14 09:25:38,107][0m Trial 7 finished with value: 0.35129549463330667 and parameters: {'n_estimators': 135, 'subsample': 0.8740702165976071, 'max_depth': 11, 'max_features': 17, 'learning_rate': 0.12955901949366017, 'min_samples_leaf': 46}. Best is trial 0 with value: 0.37173546584472394.[0m


MCC: [0.3304622646696201, 0.33853952520300934, 0.3676278353350551, 0.36855235332554226]
ROC: [0.8573797203286806, 0.8809041899779139, 0.8859493058160147, 0.8799707644518816]
PR_AUC: [0.2592039360031255, 0.29725499074942097, 0.33861233488806697, 0.30320074243459827]


[32m[I 2021-12-14 09:25:39,582][0m Trial 8 finished with value: 0.36102836657613213 and parameters: {'n_estimators': 44, 'subsample': 0.9482237745826528, 'max_depth': 10, 'max_features': 14, 'learning_rate': 0.15612558423606065, 'min_samples_leaf': 27}. Best is trial 0 with value: 0.37173546584472394.[0m


MCC: [0.3416355684669336, 0.35081696454529177, 0.3657528283772126, 0.3859081049150904]
ROC: [0.852917963503975, 0.8879246147677149, 0.888135560620486, 0.8841060557975822]
PR_AUC: [0.2658435038177294, 0.29113476140204925, 0.3223068964415177, 0.2916793007796131]


[32m[I 2021-12-14 09:25:41,023][0m Trial 9 finished with value: 0.3692178219393937 and parameters: {'n_estimators': 89, 'subsample': 0.8002853761125517, 'max_depth': 4, 'max_features': 17, 'learning_rate': 0.14292147161182367, 'min_samples_leaf': 35}. Best is trial 0 with value: 0.37173546584472394.[0m


MCC: [0.3399102589791743, 0.37198732585821137, 0.3710180979441975, 0.3939556049759915]
ROC: [0.8570296538160432, 0.8867693952760116, 0.8879977175756293, 0.8894498780249337]
PR_AUC: [0.26178249943476034, 0.3341492834590018, 0.3492648179570545, 0.33616535028027933]


[32m[I 2021-12-14 09:25:43,492][0m Trial 10 finished with value: 0.36474196498143513 and parameters: {'n_estimators': 36, 'subsample': 0.9869602664638994, 'max_depth': 18, 'max_features': 10, 'learning_rate': 0.18696473435792457, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.37173546584472394.[0m


MCC: [0.3331241157735506, 0.3591806954516671, 0.3802902989977914, 0.38637274970273133]
ROC: [0.8474410137926206, 0.8706886126545863, 0.8870969293057518, 0.8674751321530122]
PR_AUC: [0.2054747640293738, 0.24657148933892895, 0.305971973623076, 0.2608355362992377]


[32m[I 2021-12-14 09:25:43,997][0m Trial 11 finished with value: 0.37547886406432746 and parameters: {'n_estimators': 18, 'subsample': 0.9452811664619694, 'max_depth': 3, 'max_features': 17, 'learning_rate': 0.19685189794973873, 'min_samples_leaf': 38}. Best is trial 11 with value: 0.37547886406432746.[0m


MCC: [0.3542115675571874, 0.37525882301038865, 0.38048406151934305, 0.39196100417039076]
ROC: [0.8671338463398954, 0.892069720519626, 0.8920160026158122, 0.8845147764538434]
PR_AUC: [0.29552377293695864, 0.306781157506395, 0.34169867441640583, 0.328733156647923]


[32m[I 2021-12-14 09:25:44,396][0m Trial 12 finished with value: 0.37883091374517597 and parameters: {'n_estimators': 10, 'subsample': 0.9508577202851777, 'max_depth': 3, 'max_features': 18, 'learning_rate': 0.19834120666704083, 'min_samples_leaf': 13}. Best is trial 12 with value: 0.37883091374517597.[0m


MCC: [0.35499430137574417, 0.384043846365215, 0.38150589417215147, 0.3947796130675933]
ROC: [0.8501333435170864, 0.8983024956559928, 0.9013027770565061, 0.8784144203058833]
PR_AUC: [0.23812191545373074, 0.3244086343799207, 0.33807915950389467, 0.25903067429879595]


[32m[I 2021-12-14 09:25:44,917][0m Trial 13 finished with value: 0.37989264655725846 and parameters: {'n_estimators': 16, 'subsample': 0.9386551526685136, 'max_depth': 3, 'max_features': 19, 'learning_rate': 0.1807474826128703, 'min_samples_leaf': 10}. Best is trial 13 with value: 0.37989264655725846.[0m


MCC: [0.35661955640531373, 0.38530845768347033, 0.38150589417215147, 0.39613667796809837]
ROC: [0.8542673107890498, 0.8893567050466861, 0.897512093322947, 0.8888872860627859]
PR_AUC: [0.22670117096393388, 0.29524547746455604, 0.3471977449913048, 0.3130251761481257]


[32m[I 2021-12-14 09:25:45,417][0m Trial 14 finished with value: 0.3769303946148321 and parameters: {'n_estimators': 10, 'subsample': 0.9192474490295567, 'max_depth': 5, 'max_features': 19, 'learning_rate': 0.17871235017246065, 'min_samples_leaf': 8}. Best is trial 13 with value: 0.37989264655725846.[0m


MCC: [0.33801008274692956, 0.39044484326733525, 0.37992878719351963, 0.3993378652515441]
ROC: [0.8492931838867567, 0.8942608186464519, 0.8930690593654732, 0.8802913296724786]
PR_AUC: [0.19397734065445785, 0.3200863332643693, 0.3052908942363965, 0.2545470836166173]


[32m[I 2021-12-14 09:25:46,997][0m Trial 15 finished with value: 0.3618066659877539 and parameters: {'n_estimators': 60, 'subsample': 0.999495161092392, 'max_depth': 6, 'max_features': 19, 'learning_rate': 0.17721128548941745, 'min_samples_leaf': 13}. Best is trial 13 with value: 0.37989264655725846.[0m


MCC: [0.3265825909320253, 0.3705399979848684, 0.3743345198551014, 0.3757695551790206]
ROC: [0.8451146626950029, 0.872397573720825, 0.88687573930354, 0.8780794296503596]
PR_AUC: [0.20523209994544997, 0.24697690820660378, 0.3331734323380174, 0.2973020691527125]


[32m[I 2021-12-14 09:25:52,037][0m Trial 16 finished with value: 0.3621505409357787 and parameters: {'n_estimators': 69, 'subsample': 0.9682757969285061, 'max_depth': 16, 'max_features': 12, 'learning_rate': 0.17315835178275174, 'min_samples_leaf': 2}. Best is trial 13 with value: 0.37989264655725846.[0m


MCC: [0.33963732330120683, 0.3545899805909892, 0.3872914269131094, 0.3670834329378093]
ROC: [0.8489112931456977, 0.8787846963650365, 0.884670250585833, 0.8690426960817313]
PR_AUC: [0.19266023348937594, 0.2612571752731539, 0.3235547470398963, 0.23816463891675524]


[32m[I 2021-12-14 09:25:53,797][0m Trial 17 finished with value: 0.3725792003990935 and parameters: {'n_estimators': 118, 'subsample': 0.924580558278454, 'max_depth': 3, 'max_features': 18, 'learning_rate': 0.08974148211872215, 'min_samples_leaf': 12}. Best is trial 13 with value: 0.37989264655725846.[0m


MCC: [0.3363868591377634, 0.3709768354403102, 0.3845330569125548, 0.3984200501057456]
ROC: [0.860100691858726, 0.8847135501199773, 0.8951912011258252, 0.8852280340696717]
PR_AUC: [0.21985015790109896, 0.28732747662901975, 0.36719384304290054, 0.3241061112982618]


[32m[I 2021-12-14 09:25:54,286][0m Trial 18 finished with value: 0.37546091670198584 and parameters: {'n_estimators': 10, 'subsample': 0.9259945995262191, 'max_depth': 6, 'max_features': 16, 'learning_rate': 0.19854160944703997, 'min_samples_leaf': 17}. Best is trial 13 with value: 0.37989264655725846.[0m


MCC: [0.3529877961641858, 0.3701212717111831, 0.3803163715333473, 0.3984182273992271]
ROC: [0.8455474722015366, 0.8938677894254454, 0.8901166536837752, 0.883506598835066]
PR_AUC: [0.22579975470042812, 0.27227785675375277, 0.32504762092684203, 0.2978565410125891]


[32m[I 2021-12-14 09:25:55,836][0m Trial 19 finished with value: 0.3700528940256513 and parameters: {'n_estimators': 32, 'subsample': 0.8937183240571254, 'max_depth': 15, 'max_features': 19, 'learning_rate': 0.1859836323390793, 'min_samples_leaf': 8}. Best is trial 13 with value: 0.37989264655725846.[0m


MCC: [0.34336998331769747, 0.36935320678196687, 0.36855235332554226, 0.3989360326773986]
ROC: [0.8566891345719323, 0.877323964280486, 0.8893024180234589, 0.8777492474731446]
PR_AUC: [0.23230218820392343, 0.24600861638903898, 0.3241090155323976, 0.2401607625501933]


[32m[I 2021-12-14 09:25:57,335][0m Trial 20 finished with value: 0.36523621881911017 and parameters: {'n_estimators': 64, 'subsample': 0.9742922232762813, 'max_depth': 5, 'max_features': 20, 'learning_rate': 0.1528915085775708, 'min_samples_leaf': 17}. Best is trial 13 with value: 0.37989264655725846.[0m


MCC: [0.32946017559186747, 0.37317498039916525, 0.37376676428081274, 0.3845429550045953]
ROC: [0.8516768185955332, 0.8823712869081488, 0.8929344219728224, 0.8822051040394423]
PR_AUC: [0.2217509713560651, 0.29033033992116075, 0.31452129118739963, 0.30963159150303543]


[32m[I 2021-12-14 09:25:57,736][0m Trial 21 finished with value: 0.3791521372310711 and parameters: {'n_estimators': 10, 'subsample': 0.9247900894485763, 'max_depth': 3, 'max_features': 19, 'learning_rate': 0.17326155964611412, 'min_samples_leaf': 8}. Best is trial 13 with value: 0.37989264655725846.[0m


MCC: [0.35499430137574417, 0.3848860692876502, 0.3819485651932968, 0.3947796130675933]
ROC: [0.8545505464220019, 0.8868744152298027, 0.8982397763737021, 0.8811536501158843]
PR_AUC: [0.23121396903934613, 0.30676467672121416, 0.3554343513287952, 0.2549000832707351]


[32m[I 2021-12-14 09:25:58,442][0m Trial 22 finished with value: 0.3776765277419568 and parameters: {'n_estimators': 34, 'subsample': 0.9356776976036455, 'max_depth': 3, 'max_features': 18, 'learning_rate': 0.16738913512043332, 'min_samples_leaf': 7}. Best is trial 13 with value: 0.37989264655725846.[0m


MCC: [0.35503147408972985, 0.3773983446016477, 0.3803163715333473, 0.39795992074310227]
ROC: [0.8631271759816184, 0.888688396249833, 0.8927677280581121, 0.880977339244556]
PR_AUC: [0.2283567757147428, 0.3212922964552959, 0.3582212725499506, 0.2947036513117472]


[32m[I 2021-12-14 09:26:00,442][0m Trial 23 finished with value: 0.3654116832306583 and parameters: {'n_estimators': 49, 'subsample': 0.9651639462440296, 'max_depth': 20, 'max_features': 19, 'learning_rate': 0.18665550084892357, 'min_samples_leaf': 12}. Best is trial 13 with value: 0.37989264655725846.[0m


MCC: [0.33482079378578417, 0.3784796662444787, 0.3643082784465239, 0.3840379944458464]
ROC: [0.843889429900772, 0.8742147371636974, 0.8900397180308319, 0.8853819053755583]
PR_AUC: [0.2031816362676297, 0.2408784487685069, 0.3082622337767169, 0.2592544752730361]


[32m[I 2021-12-14 09:26:01,264][0m Trial 24 finished with value: 0.3656814832358799 and parameters: {'n_estimators': 26, 'subsample': 0.9099795358361916, 'max_depth': 5, 'max_features': 16, 'learning_rate': 0.18633617118122323, 'min_samples_leaf': 4}. Best is trial 13 with value: 0.37989264655725846.[0m


MCC: [0.34371959992018786, 0.34262732316596894, 0.37888818515971534, 0.3974908246976475]
ROC: [0.8428901491283345, 0.8809201020921248, 0.8902785391201766, 0.8808170566342576]
PR_AUC: [0.1902565066369948, 0.27589547705426604, 0.2994579483426194, 0.2753663684489146]
CPU times: user 44.4 s, sys: 0 ns, total: 44.4 s
Wall time: 44.3 s


In [15]:
best = study.best_trial
best

FrozenTrial(number=13, values=[0.37989264655725846], datetime_start=datetime.datetime(2021, 12, 14, 9, 25, 44, 399093), datetime_complete=datetime.datetime(2021, 12, 14, 9, 25, 44, 917156), params={'n_estimators': 16, 'subsample': 0.9386551526685136, 'max_depth': 3, 'max_features': 19, 'learning_rate': 0.1807474826128703, 'min_samples_leaf': 10}, distributions={'n_estimators': IntUniformDistribution(high=200, low=10, step=1), 'subsample': UniformDistribution(high=1.0, low=0.8), 'max_depth': IntUniformDistribution(high=20, low=3, step=1), 'max_features': IntUniformDistribution(high=20, low=10, step=1), 'learning_rate': UniformDistribution(high=0.2, low=0.08), 'min_samples_leaf': IntUniformDistribution(high=50, low=1, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=13, state=TrialState.COMPLETE, value=None)

In [16]:
print(type(study.best_params))
study.best_params

<class 'dict'>


{'n_estimators': 16,
 'subsample': 0.9386551526685136,
 'max_depth': 3,
 'max_features': 19,
 'learning_rate': 0.1807474826128703,
 'min_samples_leaf': 10}

In [22]:
%%time
gb_params = study.best_params
gb_params['random_state'] = 42
gb = GradientBoostingClassifier(**gb_params)
gb.fit(X_rus, y_rus)
y_pred = gb.predict(X_test)
probs = gb.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[2194  539]
 [  21  144]]
              precision    recall  f1-score   support

           0       0.99      0.80      0.89      2733
           1       0.21      0.87      0.34       165

    accuracy                           0.81      2898
   macro avg       0.60      0.84      0.61      2898
weighted avg       0.95      0.81      0.86      2898

ROC AUC prob1: 0.8864307177150206
MCC: 0.3688035829631495
F2: 0.5361131794489948
PR_AUC: 0.27018095986999474
CPU times: user 63.7 ms, sys: 0 ns, total: 63.7 ms
Wall time: 59.9 ms


In [30]:
mod_params = study.best_params.copy()
print(mod_params)
mod_params['n_estimators'] = 2000
mod_params['learning_rate'] = 0.001
print(mod_params)
print(study.best_params)


{'n_estimators': 16, 'subsample': 0.9386551526685136, 'max_depth': 3, 'max_features': 19, 'learning_rate': 0.1807474826128703, 'min_samples_leaf': 10}
{'n_estimators': 2000, 'subsample': 0.9386551526685136, 'max_depth': 3, 'max_features': 19, 'learning_rate': 0.001, 'min_samples_leaf': 10}
{'n_estimators': 16, 'subsample': 0.9386551526685136, 'max_depth': 3, 'max_features': 19, 'learning_rate': 0.1807474826128703, 'min_samples_leaf': 10}


In [31]:
%%time
gb_params = study.best_params
#gb_params['random_state'] = 42
gb = GradientBoostingClassifier(**mod_params)
gb.fit(X_rus, y_rus)
y_pred = gb.predict(X_test)
probs = gb.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[2192  541]
 [  18  147]]
              precision    recall  f1-score   support

           0       0.99      0.80      0.89      2733
           1       0.21      0.89      0.34       165

    accuracy                           0.81      2898
   macro avg       0.60      0.85      0.62      2898
weighted avg       0.95      0.81      0.86      2898

ROC AUC prob1: 0.8936832651431992
MCC: 0.3773796009739648
F2: 0.5452522255192879
PR_AUC: 0.3302866008382411
CPU times: user 2.36 s, sys: 0 ns, total: 2.36 s
Wall time: 2.36 s


In [32]:
%%time
gb_params = study.best_params
#gb_params['random_state'] = 42
gb = GradientBoostingClassifier(**mod_params)
gb.fit(X_rus, y_rus)
y_pred = gb.predict(X_test)
probs = gb.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[2195  538]
 [  18  147]]
              precision    recall  f1-score   support

           0       0.99      0.80      0.89      2733
           1       0.21      0.89      0.35       165

    accuracy                           0.81      2898
   macro avg       0.60      0.85      0.62      2898
weighted avg       0.95      0.81      0.86      2898

ROC AUC prob1: 0.8936699597511891
MCC: 0.3785473350803167
F2: 0.5464684014869887
PR_AUC: 0.32760750584061205
CPU times: user 2.23 s, sys: 0 ns, total: 2.23 s
Wall time: 2.22 s


In [16]:
%%time
gb_params = study.best_params
gb_params['random_state'] = 42
gb_weight = GradientBoostingClassifier(**gb_params)
weights = class_weight.compute_sample_weight('balanced', y=y_train)
gb_weight.fit(X_train, y_train, sample_weight=weights)
y_pred = gb_weight.predict(X_test)
probs = gb_weight.predict_proba(X_test)
prob1 = probs[:, 1]  # Only positives
precision, recall, pr_thresh = precision_recall_curve(y_test, prob1)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))
print(classification_report(y_test, y_pred))
print(f'ROC AUC prob1: {roc_auc_score(y_test, prob1)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(f'F2: {fbeta_score(y_test, y_pred, beta=2.0, average=None)[1]}')
print(f'PR_AUC: {auc(recall, precision)}')

[[2409  324]
 [  59  106]]
              precision    recall  f1-score   support

           0       0.98      0.88      0.93      2733
           1       0.25      0.64      0.36       165

    accuracy                           0.87      2898
   macro avg       0.61      0.76      0.64      2898
weighted avg       0.93      0.87      0.89      2898

ROC AUC prob1: 0.8750823271130627
MCC: 0.34149249514721947
F2: 0.4862385321100917
PR_AUC: 0.2830320569037005
CPU times: user 278 ms, sys: 4 ms, total: 282 ms
Wall time: 280 ms


In [17]:
# No Sitenum - 5
coeffs_wgt = pd.Series(data=gb_weight.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs_wgt[0:20]

Delmode             0.100120
prelaborCD          0.048542
transfus_yes        0.039994
Delfetalpos         0.037433
CS_FTP              0.037259
AdmSBP              0.035369
uscar               0.033633
CS_NRFHT            0.031166
new_age             0.030934
new_BMI             0.030619
Admpresent          0.029334
AdmDBP              0.025946
Analgesia           0.025284
HxnumCS             0.024544
Intrafever          0.023452
BESTGA              0.023151
Lac_Min             0.022055
spontlabor          0.019795
Intrafetdistress    0.019520
CS_Other            0.018983
dtype: float64

In [33]:
plt.rcParams["figure.figsize"] = (14, 8)
x_labels = coeffs_wgt.index.values[0:25]
print(x_labels)
fig, ax = plt.subplots(1,1)  # Create a figure and an axes.
ax.set_title('Top Predictors: All Sites combined; target: High_EBLoss, vars: PI')
ax.bar(x_labels, coeffs_wgt.values[0:25])
ax.set_ylabel('Coeff')
ax.set_xlabel('Variable')
plt.draw()
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=16)
#plt.xlim(1,20)
plt.show()

NameError: name 'coeffs_wgt' is not defined

In [19]:
# No Sitenum - 5
coeffs = pd.Series(data=gb.feature_importances_,
                   index=X_test.columns.values).sort_values(ascending=False)
coeffs[0:20]

Delmode         0.203047
Lac_Min         0.107583
new_BMI         0.059018
Analgesia       0.055636
spontlabor      0.050641
BESTGA          0.044505
Delfetalpos     0.025792
transfus_yes    0.024336
AdmDBP          0.023741
TrialLabor      0.023611
Intrafever      0.022488
new_age         0.022091
AdmSBP          0.019205
Dilat_lst       0.017751
CS_FTP          0.017589
CS_NRFHT        0.017574
Admreason       0.016142
IUPC            0.015457
HxnumCS         0.013869
Education       0.013595
dtype: float64

In [35]:
np.round(confusion_matrix(y_test, y_pred, normalize='true'), decimals=2) # 25

array([[0.8 , 0.2 ],
       [0.11, 0.89]])

In [21]:
gb_params # 5 runs

{'n_estimators': 16,
 'subsample': 0.9601643679270631,
 'max_depth': 16,
 'max_features': 13,
 'learning_rate': 0.19727312624322152,
 'min_samples_leaf': 13,
 'random_state': 42}

In [37]:
#from optuna.visualization import plot_optimization_history

#plotly_config = {"staticPlot": True}

#fig = plot_optimization_history(study)
#fig.show(config=plotly_config)

In [38]:
#from optuna.visualization import plot_param_importances

#fig = plot_param_importances(study)
#fig.show(config=plotly_config)

In [24]:
np.mean([0.3149689279176914, 0.31449457458602254, 0.31066798538596907, 0.3219519143488398])

0.3155208505596307

<h1 align='center'>Thanks and make sure to learn!</h1>