In [1]:
# Data stuff
import numpy as np
import pandas as pd

# Visual stuff
from IPython.display import display
# Configs 
pd.options.display.max_columns = None
pd.options.display.max_rows = None
import matplotlib.pyplot as plt

# Random state seed
rand_state=42

# ML stuff
import sklearn
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV,StratifiedKFold
import lightgbm as lgb
import xgboost as xgb
from xgboost import XGBClassifier
import shap

In [2]:
dataset = pd.read_csv('data_gen/dataset_KNN_entire.csv')
# Ensure correct types
dataset = dataset.astype(dtype={
    'age':float,
    'gender':int,
    'expired':int,
    'P-glucose':float,
    'blood_pressure_systoliskt':float,
    'blood_pressure_diastoliskt':float,
    'BMI':float
    
})
# Order ints(categorical variables, first) and floats(number variables, last)
dataset = dataset[[
    'gender',
    'I109',
    'E119',
    'E669',
    'I259',
    'I252',
    'I209',
    'E660',
    'E118',
    'I639',
    'E113',
    'expired',
    'age',
    'P-glucose',
    'blood_pressure_systoliskt',
    'blood_pressure_diastoliskt',
    'BMI',
]]

In [3]:
# Generate data set withouth categories(all numbers)
Y_no_cat = dataset.expired.values
X_no_cat = dataset.drop(columns=['expired']).values

# dataset as is, but target variable dropped(hospital expire flag)
dataset_no_target = dataset.drop(columns=['expired'])

# Feture names and categorical feature names
feature_names = dataset_no_target.select_dtypes(include='int').columns.values.tolist() + dataset_no_target.select_dtypes(exclude='int').columns.values.tolist() 
cat_feature_names = dataset_no_target.select_dtypes(include='int').columns.values.tolist() 

# Generate data set with categories(int type required)
dataframe_int_list = dataset_no_target.select_dtypes(include='int').values.tolist()
dataframe_no_int_list = dataset_no_target.select_dtypes(exclude='int').values.tolist()
Y = dataset.expired.values.tolist()
X = []
for i,v in enumerate(dataframe_int_list):
    X = X + [v+dataframe_no_int_list[i]]

# Generate categorical feature indicies
cat_features_indices=list(range(0,len(dataframe_int_list[0])))

In [4]:
# Function that w
def strat_cv_it(classifier, params, uses_cat, param_comb ):
    folds = 3
    skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = rand_state)
    
    # Perform fit and scoring
    if uses_cat:
        random_search = RandomizedSearchCV( classifier, return_train_score=True, param_distributions=params ,scoring='roc_auc', n_iter=param_comb, n_jobs=4, cv=skf.split(X, Y),random_state = rand_state, refit=True, verbose=3, error_score=0.0 )
        random_search.fit(X, Y)     
    else:
        random_search = RandomizedSearchCV( classifier, return_train_score=True, param_distributions=params ,scoring='roc_auc', n_iter=param_comb, n_jobs=4, cv=skf.split(X_no_cat, Y_no_cat),random_state = rand_state, refit=True, verbose=3,error_score=0.0)
        random_search.fit(X_no_cat, Y_no_cat)
    
    # Display results and return best model
    display(random_search.best_score_)
    display(random_search.best_params_)
    display(pd.DataFrame(random_search.cv_results_))
    return random_search.best_estimator_

# Scale of negative class to the positive class(#survived/#died)
scale_pos_weight_min = int( (dataset[dataset.expired==0].shape[0] / dataset[dataset.expired==1].shape[0]) )

# XGboost

In [5]:
xgb_classifier = xgb.XGBClassifier(objective = "binary:logistic",random_state=rand_state)
xgb_params = {
        'learning_rate': (0.01, 0.05,0.1),
        'min_child_weight': [3, 5, 10],
        'gamma': [0.5, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': list(range(5,30)),
        'scale_pos_weight':  list(range(scale_pos_weight_min,3*scale_pos_weight_min))
}
model = strat_cv_it(xgb_classifier ,xgb_params,False,50)
explainer = shap.TreeExplainer(model)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    3.0s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:   12.0s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:   14.9s finished


0.8826014525504093

{'subsample': 0.6,
 'scale_pos_weight': 25,
 'min_child_weight': 10,
 'max_depth': 28,
 'learning_rate': 0.01,
 'gamma': 2,
 'colsample_bytree': 0.8}

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_scale_pos_weight,param_min_child_weight,param_max_depth,param_learning_rate,param_gamma,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.24976,0.08046,0.00548,0.00212,1.0,22,5,7,0.01,1.5,1.0,"{'subsample': 1.0, 'scale_pos_weight': 22, 'mi...",0.873059,0.871975,0.873566,0.872867,0.000664,21,0.951642,0.944139,0.951851,0.949211,0.003587
1,0.400412,0.054569,0.008778,0.00107,1.0,25,10,20,0.1,2.0,1.0,"{'subsample': 1.0, 'scale_pos_weight': 25, 'mi...",0.865119,0.863688,0.850365,0.859724,0.006644,41,0.998181,0.998471,0.998213,0.998288,0.00013
2,0.36077,0.101539,0.007631,0.003017,0.8,27,10,12,0.1,1.5,1.0,"{'subsample': 0.8, 'scale_pos_weight': 27, 'mi...",0.872302,0.861734,0.8559,0.863312,0.006789,38,0.996802,0.996943,0.997066,0.996937,0.000108
3,0.185498,0.038045,0.006703,0.000214,1.0,14,3,6,0.1,5.0,0.8,"{'subsample': 1.0, 'scale_pos_weight': 14, 'mi...",0.883621,0.868229,0.872473,0.874774,0.006491,16,0.991887,0.991001,0.988071,0.990319,0.001631
4,0.437001,0.101296,0.011666,0.005029,1.0,29,10,20,0.1,0.5,1.0,"{'subsample': 1.0, 'scale_pos_weight': 29, 'mi...",0.866252,0.85583,0.853029,0.85837,0.005689,43,0.998565,0.998814,0.998364,0.998581,0.000184
5,0.385068,0.111729,0.007786,0.001118,0.6,26,5,17,0.01,0.5,1.0,"{'subsample': 0.6, 'scale_pos_weight': 26, 'mi...",0.880127,0.873643,0.878518,0.877429,0.002757,9,0.968351,0.969454,0.969444,0.969083,0.000518
6,0.308564,0.063147,0.008276,0.004849,0.8,25,10,9,0.01,0.5,0.8,"{'subsample': 0.8, 'scale_pos_weight': 25, 'mi...",0.879765,0.878258,0.884633,0.880885,0.002721,2,0.964823,0.962222,0.967942,0.964996,0.002339
7,0.298025,0.082823,0.011855,0.008487,0.6,29,10,17,0.01,2.0,1.0,"{'subsample': 0.6, 'scale_pos_weight': 29, 'mi...",0.880465,0.877505,0.882876,0.880282,0.002197,3,0.954652,0.954812,0.955698,0.955054,0.00046
8,0.370421,0.05511,0.009773,0.001742,0.6,16,3,16,0.05,2.0,0.8,"{'subsample': 0.6, 'scale_pos_weight': 16, 'mi...",0.870281,0.867083,0.868405,0.86859,0.001312,28,0.997113,0.996791,0.996387,0.996764,0.000297
9,0.163301,0.014174,0.006456,0.000393,1.0,25,3,6,0.05,0.5,1.0,"{'subsample': 1.0, 'scale_pos_weight': 25, 'mi...",0.875302,0.874381,0.870489,0.873391,0.002086,19,0.986284,0.986432,0.98037,0.984362,0.002823


In [None]:
shap_values = explainer.shap_values( X_no_cat)
shap.summary_plot(shap_values,X_no_cat ,feature_names=feature_names, show=False)
#plt.savefig("shap_summary.svg", format='svg', dpi=300, bbox_inches='tight')
for predictor in feature_names:
    if predictor != 'age':
        save = shap.dependence_plot(predictor, shap_values,X_no_cat ,feature_names=feature_names,interaction_index='age', show=False)
    else:
        save = shap.dependence_plot(predictor, shap_values,X_no_cat ,feature_names=feature_names, interaction_index='gender', show=False)
        
    # plt.savefig(predictor+".svg", format='svg', dpi=300, bbox_inches='tight')

# LightGBM

In [7]:
LGB_classifier = lgb.LGBMClassifier()
LGB_params = {
             'num_leaves': [1,5,8,10,15,20,35,40], 
             'min_child_samples': [1,5,10,20,50,100,200,300,400,500], 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': [0.2,0,4,0.5, 0.6, 0.8, 1.0],
             'colsample_bytree': [0.6, 0.8, 1.0],
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
             'scale_pos_weight':  list(range(scale_pos_weight_min,3*scale_pos_weight_min))

}
model = strat_cv_it(LGB_classifier ,LGB_params,True,50)
explainer = shap.TreeExplainer(model)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   14.6s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:  2.4min finished


0.8848895693625888

{'subsample': 0.5,
 'scale_pos_weight': 12,
 'reg_lambda': 20,
 'reg_alpha': 50,
 'num_leaves': 40,
 'min_child_weight': 10.0,
 'min_child_samples': 50,
 'colsample_bytree': 0.6}

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_scale_pos_weight,param_reg_lambda,param_reg_alpha,param_num_leaves,param_min_child_weight,param_min_child_samples,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.008769,0.000608,0.0,0.0,0.0,21,0.0,10.0,15,0.001,10,1.0,"{'subsample': 0, 'scale_pos_weight': 21, 'reg_...",0.0,0.0,0.0,0.0,0.0,33,0.0,0.0,0.0,0.0,0.0
1,0.499964,0.031273,0.022771,0.004652,0.2,14,0.0,5.0,10,1e-05,500,1.0,"{'subsample': 0.2, 'scale_pos_weight': 14, 're...",0.885697,0.877911,0.877381,0.88033,0.003802,15,0.913298,0.911421,0.916962,0.913894,0.002301
2,0.016527,0.003775,0.0,0.0,0.2,26,5.0,100.0,1,0.1,400,0.8,"{'subsample': 0.2, 'scale_pos_weight': 26, 're...",0.0,0.0,0.0,0.0,0.0,33,0.0,0.0,0.0,0.0,0.0
3,1.782938,0.782728,0.032204,0.006127,0.8,22,0.0,10.0,20,1e-05,20,0.6,"{'subsample': 0.8, 'scale_pos_weight': 22, 're...",0.883207,0.873159,0.874949,0.877105,0.004376,22,0.973411,0.971853,0.974276,0.97318,0.001003
4,6.51942,1.309696,0.041123,0.005218,1.0,10,5.0,0.0,40,1e-05,1,1.0,"{'subsample': 1.0, 'scale_pos_weight': 10, 're...",0.87744,0.864385,0.853904,0.865243,0.009628,26,0.997975,0.998057,0.997605,0.997879,0.000197
5,2.201499,0.574707,0.036922,0.005405,0.8,12,100.0,0.1,10,0.01,20,0.8,"{'subsample': 0.8, 'scale_pos_weight': 12, 're...",0.888733,0.876958,0.882439,0.88271,0.004811,5,0.933889,0.937557,0.934733,0.935393,0.001569
6,1.660735,0.638652,0.027732,0.009275,0.6,19,100.0,0.0,15,0.01,500,1.0,"{'subsample': 0.6, 'scale_pos_weight': 19, 're...",0.887832,0.878585,0.877738,0.881385,0.004572,11,0.904325,0.906329,0.910395,0.907016,0.002525
7,4.072236,0.177566,0.034332,0.006773,0.8,25,100.0,1.0,15,1.0,50,0.8,"{'subsample': 0.8, 'scale_pos_weight': 25, 're...",0.887183,0.874649,0.884301,0.882044,0.00536,8,0.940215,0.942472,0.939569,0.940752,0.001245
8,1.083135,0.610418,0.031514,0.002399,0.6,13,20.0,0.1,5,0.1,500,1.0,"{'subsample': 0.6, 'scale_pos_weight': 13, 're...",0.888428,0.87689,0.878022,0.881113,0.005193,14,0.911473,0.910207,0.916343,0.912674,0.002645
9,7.887479,1.274937,0.040674,0.016573,0.2,26,5.0,7.0,35,0.001,1,0.8,"{'subsample': 0.2, 'scale_pos_weight': 26, 're...",0.879511,0.871028,0.866929,0.872489,0.00524,24,0.990291,0.989538,0.990974,0.990268,0.000586


In [None]:
shap_values = explainer.shap_values(dataset.drop(columns=['expired']))
shap.summary_plot(shap_values[1],dataset.drop(columns=['expired']) ,feature_names=feature_names, show=False)
#plt.savefig("shap_summary.svg", format='svg', dpi=300, bbox_inches='tight')
for predictor in feature_names:
    if predictor != 'age':
        save = shap.dependence_plot(predictor, shap_values[1],dataset.drop(columns=['expired']) ,feature_names=feature_names,interaction_index='age', show=False)
    else:
        save = shap.dependence_plot(predictor, shap_values[1],dataset.drop(columns=['expired']) ,feature_names=feature_names, interaction_index='gender', show=False)
        
  #  plt.savefig(predictor+".svg", format='svg', dpi=300, bbox_inches='tight')