In [1]:
# Data stuff
import numpy as np
import pandas as pd

# Visual stuff
from IPython.display import display
# Configs 
pd.options.display.max_columns = None
pd.options.display.max_rows = None
import matplotlib.pyplot as plt

# Random state seed
rand_state=42

# ML stuff
import sklearn
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV,StratifiedKFold
import lightgbm as lgb
import xgboost as xgb
from xgboost import XGBClassifier
import shap

# Imbalanced learn 
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTENC

In [2]:
dataset = pd.read_csv('data_gen/dataset_KNN_entire.csv')
# Ensure correct types
dataset = dataset.astype(dtype={
    'age':float,
    'gender':int,
    'expired':int,
    'P-glucose':float,
    'blood_pressure_systoliskt':float,
    'blood_pressure_diastoliskt':float,
    'BMI':float
    
})
# Order ints(categorical variables, first) and floats(number variables, last)
dataset = dataset[[
    'gender',
    'I109',
    'E119',
    'E669',
    'I259',
    'I252',
    'I209',
    'E660',
    'E118',
    'I639',
    'E113',
    'expired',
    'age',
    'P-glucose',
    'blood_pressure_systoliskt',
    'blood_pressure_diastoliskt',
    'BMI',
]]

In [3]:
# Generate data set withouth categories(all numbers)
Y_no_cat = dataset.expired.values
X_no_cat = dataset.drop(columns=['expired']).values

# dataset as is, but target variable dropped(hospital expire flag)
dataset_no_target = dataset.drop(columns=['expired'])

# Feture names and categorical feature names
feature_names = dataset_no_target.select_dtypes(include='int').columns.values.tolist() + dataset_no_target.select_dtypes(exclude='int').columns.values.tolist() 
cat_feature_names = dataset_no_target.select_dtypes(include='int').columns.values.tolist() 

# Generate data set with categories(int type required)
dataframe_int_list = dataset_no_target.select_dtypes(include='int').values.tolist()
dataframe_no_int_list = dataset_no_target.select_dtypes(exclude='int').values.tolist()
Y = dataset.expired.values.tolist()
X = []
for i,v in enumerate(dataframe_int_list):
    X = X + [v+dataframe_no_int_list[i]]

# Generate categorical feature indicies
cat_features_indices=list(range(0,len(dataframe_int_list[0])))

In [4]:
def strat_cv_it(classifier, params, uses_cat, param_comb ):
    folds = 3
    skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = rand_state)
    
    imba_pipeline = make_pipeline(SMOTENC(random_state=42,categorical_features=cat_features_indices), 
                                 classifier)
    
    # Perform fit and scoring
    if uses_cat:
        random_search = RandomizedSearchCV(imba_pipeline, return_train_score=True, param_distributions=params ,scoring='roc_auc', n_iter=param_comb, n_jobs=4, cv=skf.split(X, Y),random_state = rand_state, refit=True, verbose=3,  )
        random_search.fit(X, Y)     
    else:
        random_search = RandomizedSearchCV(imba_pipeline, return_train_score=True, param_distributions=params ,scoring='roc_auc', n_iter=param_comb, n_jobs=4, cv=skf.split(X_no_cat, Y_no_cat),random_state = rand_state, refit=True, verbose=3)
        random_search.fit(X_no_cat, Y_no_cat)
    
    # Display results and return best model
    display(random_search.best_score_)
    display(random_search.best_params_)
    display(pd.DataFrame(random_search.cv_results_))
    return random_search.best_estimator_

# XGboost

In [5]:
xgb_classifier = xgb.XGBClassifier(objective = "binary:logistic",random_state=rand_state)
xgb_params = {
        'xgbclassifier__learning_rate': (0.01, 0.05,0.1),
        'xgbclassifier__min_child_weight': [3, 5, 10],
        'xgbclassifier__gamma': [0.5, 1.5, 2, 5],
        'xgbclassifier__subsample': [0.6, 0.8, 1.0],
        'xgbclassifier__colsample_bytree': [0.6, 0.8, 1.0],
        'xgbclassifier__max_depth': list(range(5,30)),
        'xgbclassifier__scale_pos_weight':  [0.5,0.75,0.9,1, 1.1, 1.25,1.5]
}
model = strat_cv_it(xgb_classifier ,xgb_params,False,50)
explainer = shap.TreeExplainer(model['xgbclassifier'])

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    9.3s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:  5.3min
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:  7.9min finished


0.8797792436311543

{'xgbclassifier__subsample': 0.6,
 'xgbclassifier__scale_pos_weight': 0.5,
 'xgbclassifier__min_child_weight': 3,
 'xgbclassifier__max_depth': 8,
 'xgbclassifier__learning_rate': 0.01,
 'xgbclassifier__gamma': 0.5,
 'xgbclassifier__colsample_bytree': 0.6}

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xgbclassifier__subsample,param_xgbclassifier__scale_pos_weight,param_xgbclassifier__min_child_weight,param_xgbclassifier__max_depth,param_xgbclassifier__learning_rate,param_xgbclassifier__gamma,param_xgbclassifier__colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,1.312709,0.422485,0.012707,0.00412,0.8,1.25,5,25,0.1,5.0,1.0,"{'xgbclassifier__subsample': 0.8, 'xgbclassifi...",0.882746,0.865569,0.85503,0.867782,0.011423,40,0.972895,0.972463,0.97145,0.972269,0.000606
1,0.775646,0.076591,0.011011,0.001551,0.6,0.75,10,5,0.05,5.0,0.6,"{'xgbclassifier__subsample': 0.6, 'xgbclassifi...",0.889982,0.874288,0.868514,0.877595,0.009071,5,0.911852,0.91515,0.914489,0.91383,0.001425
2,1.408443,0.407123,0.01259,0.00185,1.0,1.5,5,18,0.01,0.5,0.6,"{'xgbclassifier__subsample': 1.0, 'xgbclassifi...",0.880759,0.8714,0.862882,0.87168,0.007301,25,0.965066,0.967734,0.964886,0.965895,0.001302
3,0.998044,0.261104,0.007298,0.002434,0.8,0.5,10,10,0.01,0.5,1.0,"{'xgbclassifier__subsample': 0.8, 'xgbclassifi...",0.882959,0.872727,0.859996,0.871894,0.009393,23,0.917042,0.91588,0.912257,0.915059,0.002038
4,1.527052,0.134057,0.017146,0.00338,0.8,1.25,5,17,0.05,5.0,1.0,"{'xgbclassifier__subsample': 0.8, 'xgbclassifi...",0.88167,0.867747,0.860675,0.870031,0.008722,32,0.961207,0.962917,0.961505,0.961877,0.000746
5,1.556201,0.358091,0.013152,0.004958,1.0,0.5,3,15,0.05,1.5,1.0,"{'xgbclassifier__subsample': 1.0, 'xgbclassifi...",0.878632,0.863789,0.855907,0.866109,0.009422,45,0.975316,0.974903,0.972803,0.974341,0.001101
6,0.973145,0.11178,0.014202,0.006693,0.8,0.9,3,9,0.05,2.0,0.6,"{'xgbclassifier__subsample': 0.8, 'xgbclassifi...",0.885321,0.870482,0.859465,0.871756,0.010594,24,0.966688,0.968659,0.96609,0.967146,0.001098
7,1.824373,0.179995,0.014804,0.003453,0.8,1.1,3,26,0.05,5.0,1.0,"{'xgbclassifier__subsample': 0.8, 'xgbclassifi...",0.881198,0.871755,0.857985,0.870313,0.009532,30,0.967207,0.966646,0.964248,0.966034,0.001283
8,1.147542,0.175899,0.01094,0.002949,0.8,0.9,5,29,0.01,1.5,0.6,"{'xgbclassifier__subsample': 0.8, 'xgbclassifi...",0.885942,0.872385,0.867068,0.875132,0.007946,8,0.951572,0.953076,0.952108,0.952252,0.000623
9,1.418147,0.398159,0.01347,0.005053,1.0,0.9,5,22,0.05,5.0,0.6,"{'xgbclassifier__subsample': 1.0, 'xgbclassifi...",0.888141,0.871159,0.864605,0.874635,0.009918,12,0.958604,0.95921,0.958117,0.958644,0.000447


In [None]:
shap_values = explainer.shap_values( X_no_cat)
shap.summary_plot(shap_values,X_no_cat ,feature_names=feature_names, show=False)
#plt.savefig("shap_summary.svg", format='svg', dpi=300, bbox_inches='tight')
for predictor in feature_names:
    if predictor != 'age':
        save = shap.dependence_plot(predictor, shap_values,X_no_cat ,feature_names=feature_names,interaction_index='age', show=False)
    else:
        save = shap.dependence_plot(predictor, shap_values,X_no_cat ,feature_names=feature_names, interaction_index='gender', show=False)
        
    # plt.savefig(predictor+".svg", format='svg', dpi=300, bbox_inches='tight')

# LightGBM

In [7]:
LGB_classifier = lgb.LGBMClassifier()
LGB_params = {
             'lgbmclassifier__num_leaves': [1,5,8,10,15,20,35,40], 
             'lgbmclassifier__min_child_samples': [1,5,10,20,50,100,200,300,400,500], 
             'lgbmclassifier__min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'lgbmclassifier__subsample': [0.2,0,4,0.5, 0.6, 0.8, 1.0],
             'lgbmclassifier__colsample_bytree': [0.6, 0.8, 1.0],
             'lgbmclassifier__reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'lgbmclassifier__reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
             'lgbmclassifier__scale_pos_weight':  [0.5,0.75,0.9,1, 1.1, 1.25,1.5]

}
model = strat_cv_it(LGB_classifier ,LGB_params,True,50)
explainer = shap.TreeExplainer(model['lgbmclassifier'])

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    9.5s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:   30.9s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:   38.2s finished


0.8832860648148824

{'lgbmclassifier__subsample': 0.2,
 'lgbmclassifier__scale_pos_weight': 0.5,
 'lgbmclassifier__reg_lambda': 50,
 'lgbmclassifier__reg_alpha': 50,
 'lgbmclassifier__num_leaves': 5,
 'lgbmclassifier__min_child_weight': 0.01,
 'lgbmclassifier__min_child_samples': 100,
 'lgbmclassifier__colsample_bytree': 1.0}

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lgbmclassifier__subsample,param_lgbmclassifier__scale_pos_weight,param_lgbmclassifier__reg_lambda,param_lgbmclassifier__reg_alpha,param_lgbmclassifier__num_leaves,param_lgbmclassifier__min_child_weight,param_lgbmclassifier__min_child_samples,param_lgbmclassifier__colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.910986,0.099226,0.029231,0.003053,0.2,0.5,50.0,50.0,5,0.01,100,1.0,"{'lgbmclassifier__subsample': 0.2, 'lgbmclassi...",0.894117,0.877452,0.878288,0.883286,0.007667,1,0.890562,0.898088,0.893667,0.894105,0.003088
1,2.374017,0.596381,0.030353,0.000502,0.8,0.5,50.0,5.0,20,0.1,300,1.0,"{'lgbmclassifier__subsample': 0.8, 'lgbmclassi...",0.892251,0.870852,0.864813,0.875972,0.011772,13,0.904211,0.906182,0.907207,0.905867,0.001243
2,2.25228,0.396184,0.030118,0.009349,1.0,1.0,0.1,10.0,15,0.01,100,1.0,"{'lgbmclassifier__subsample': 1.0, 'lgbmclassi...",0.886515,0.87289,0.856861,0.872089,0.01212,21,0.92866,0.927751,0.926374,0.927595,0.00094
3,0.148847,0.049742,0.0,0.0,1.0,0.9,50.0,10.0,1,1000.0,100,1.0,"{'lgbmclassifier__subsample': 1.0, 'lgbmclassi...",,,,,,31,,,,,
4,1.804796,0.238229,0.028414,0.004135,0.5,1.25,0.1,50.0,40,10.0,200,0.6,"{'lgbmclassifier__subsample': 0.5, 'lgbmclassi...",0.893164,0.878055,0.870401,0.88054,0.009458,2,0.896209,0.897952,0.894226,0.896129,0.001522
5,0.45439,0.133974,0.020288,0.003121,1.0,0.9,0.0,0.1,15,10000.0,200,0.8,"{'lgbmclassifier__subsample': 1.0, 'lgbmclassi...",0.5,0.5,0.5,0.5,0.0,23,0.5,0.5,0.5,0.5,0.0
6,1.484874,0.071276,0.028836,0.008819,0.5,1.0,10.0,2.0,15,0.1,500,0.8,"{'lgbmclassifier__subsample': 0.5, 'lgbmclassi...",0.890602,0.870229,0.863991,0.874941,0.011363,14,0.909329,0.91554,0.915904,0.913591,0.003018
7,0.154524,0.030736,0.0,0.0,1.0,1.1,10.0,0.0,1,0.001,500,1.0,"{'lgbmclassifier__subsample': 1.0, 'lgbmclassi...",,,,,,42,,,,,
8,0.351499,0.124634,0.019904,0.002484,0.8,1.25,0.1,2.0,5,1000.0,400,0.6,"{'lgbmclassifier__subsample': 0.8, 'lgbmclassi...",0.5,0.5,0.5,0.5,0.0,23,0.5,0.5,0.5,0.5,0.0
9,0.882899,0.573641,0.022742,0.003975,1.0,1.5,1.0,10.0,8,100.0,400,1.0,"{'lgbmclassifier__subsample': 1.0, 'lgbmclassi...",0.890792,0.869689,0.868365,0.876282,0.010274,12,0.89949,0.903583,0.900943,0.901339,0.001694


In [None]:
shap_values = explainer.shap_values(dataset.drop(columns=['expired']))
shap.summary_plot(shap_values[1],dataset.drop(columns=['expired']) ,feature_names=feature_names, show=False)
#plt.savefig("shap_summary.svg", format='svg', dpi=300, bbox_inches='tight')
for predictor in feature_names:
    if predictor != 'age':
        save = shap.dependence_plot(predictor, shap_values[1],dataset.drop(columns=['expired']) ,feature_names=feature_names,interaction_index='age', show=False)
    else:
        save = shap.dependence_plot(predictor, shap_values[1],dataset.drop(columns=['expired']) ,feature_names=feature_names, interaction_index='gender', show=False)
        
  #  plt.savefig(predictor+".svg", format='svg', dpi=300, bbox_inches='tight')