In [1]:
# Data stuff
import numpy as np
import pandas as pd

# Visual stuff
from IPython.display import display
# Configs 
pd.options.display.max_columns = None
pd.options.display.max_rows = None
import matplotlib.pyplot as plt

# Random state seed
rand_state=42

# ML stuff
import sklearn
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV,StratifiedKFold
import lightgbm as lgb
import xgboost as xgb
from xgboost import XGBClassifier
import shap

In [2]:
dataset = pd.read_csv('data_gen/dataset_KNN_undersampled.csv')
# Ensure correct types
dataset = dataset.astype(dtype={
    'age':float,
    'gender':int,
    'expired':int,
    'P-glucose':float,
    'blood_pressure_systoliskt':float,
    'blood_pressure_diastoliskt':float,
    'BMI':float
    
})
# Order ints(categorical variables, first) and floats(number variables, last)
dataset = dataset[[
    'gender',
    'I109',
    'E119',
    'E669',
    'I259',
    'I252',
    'I209',
    'E660',
    'E118',
    'I639',
    'E113',
    'expired',
    'age',
    'P-glucose',
    'blood_pressure_systoliskt',
    'blood_pressure_diastoliskt',
    'BMI',
]]

In [3]:
# Generate data set withouth categories(all numbers)
Y_no_cat = dataset.expired.values
X_no_cat = dataset.drop(columns=['expired']).values

# dataset as is, but target variable dropped(hospital expire flag)
dataset_no_target = dataset.drop(columns=['expired'])

# Feture names and categorical feature names
feature_names = dataset_no_target.select_dtypes(include='int').columns.values.tolist() + dataset_no_target.select_dtypes(exclude='int').columns.values.tolist() 
cat_feature_names = dataset_no_target.select_dtypes(include='int').columns.values.tolist() 

# Generate data set with categories(int type required)
dataframe_int_list = dataset_no_target.select_dtypes(include='int').values.tolist()
dataframe_no_int_list = dataset_no_target.select_dtypes(exclude='int').values.tolist()
Y = dataset.expired.values.tolist()
X = []
for i,v in enumerate(dataframe_int_list):
    X = X + [v+dataframe_no_int_list[i]]

# Generate categorical feature indicies
cat_features_indices=list(range(0,len(dataframe_int_list[0])))

In [4]:
# Function that w
def strat_cv_it(classifier, params, uses_cat, param_comb ):
    folds = 3
    skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = rand_state)
    
    # Perform fit and scoring
    if uses_cat:
        random_search = RandomizedSearchCV( classifier, return_train_score=True, param_distributions=params ,scoring='roc_auc', n_iter=param_comb, n_jobs=4, cv=skf.split(X, Y),random_state = rand_state, refit=True, verbose=3, error_score=0.0 )
        random_search.fit(X, Y)     
    else:
        random_search = RandomizedSearchCV( classifier, return_train_score=True, param_distributions=params ,scoring='roc_auc', n_iter=param_comb, n_jobs=4, cv=skf.split(X_no_cat, Y_no_cat),random_state = rand_state, refit=True, verbose=3,error_score=0.0)
        random_search.fit(X_no_cat, Y_no_cat)
    
    # Display results and return best model
    display(random_search.best_score_)
    display(random_search.best_params_)
    display(pd.DataFrame(random_search.cv_results_))
    return random_search.best_estimator_

# Scale of negative class to the positive class(#survived/#died)
scale_pos_weight_min = int( (dataset[dataset.expired==0].shape[0] / dataset[dataset.expired==1].shape[0]) )

# XGboost

In [5]:
xgb_classifier = xgb.XGBClassifier(objective = "binary:logistic",random_state=rand_state)
xgb_params = {
        'learning_rate': (0.01, 0.05,0.1),
        'min_child_weight': [3, 5, 10],
        'gamma': [0.5, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': list(range(5,30)),
      #  'scale_pos_weight':  [0.5,0.75,0.9,1, 1.1, 1.25,1.5]
}
model = strat_cv_it(xgb_classifier ,xgb_params,False,50)
explainer = shap.TreeExplainer(model)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    4.0s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:   17.5s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:   27.7s finished


0.9029858923409315

{'subsample': 1.0,
 'min_child_weight': 3,
 'max_depth': 16,
 'learning_rate': 0.05,
 'gamma': 5,
 'colsample_bytree': 0.8}

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_min_child_weight,param_max_depth,param_learning_rate,param_gamma,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.251415,0.012418,0.017172,0.002113,0.8,10,12,0.1,2.0,1.0,"{'subsample': 0.8, 'min_child_weight': 10, 'ma...",0.869582,0.902477,0.915729,0.895929,0.0194,28,0.94856,0.943399,0.942537,0.944832,0.002659
1,0.435372,0.089287,0.011874,0.00281,0.8,10,24,0.01,5.0,1.0,"{'subsample': 0.8, 'min_child_weight': 10, 'ma...",0.875946,0.90768,0.907419,0.897015,0.014898,20,0.920939,0.906083,0.910701,0.912575,0.006208
2,0.647126,0.274019,0.013542,0.000323,1.0,5,25,0.01,1.5,0.6,"{'subsample': 1.0, 'min_child_weight': 5, 'max...",0.882396,0.898736,0.903956,0.895029,0.009184,33,0.945885,0.940067,0.940274,0.942075,0.002695
3,0.491509,0.146841,0.015664,0.006095,1.0,10,28,0.1,5.0,0.8,"{'subsample': 1.0, 'min_child_weight': 10, 'ma...",0.883406,0.904519,0.907938,0.898621,0.010849,7,0.926736,0.917024,0.921666,0.921809,0.003966
4,0.288954,0.060624,0.010016,0.005422,0.6,10,10,0.1,5.0,0.8,"{'subsample': 0.6, 'min_child_weight': 10, 'ma...",0.879837,0.902885,0.894694,0.892472,0.009539,43,0.923824,0.913196,0.909841,0.915621,0.00596
5,0.311251,0.054076,0.011604,0.000205,0.8,10,6,0.1,5.0,0.8,"{'subsample': 0.8, 'min_child_weight': 10, 'ma...",0.881751,0.906927,0.912374,0.900351,0.013339,3,0.930824,0.923738,0.918194,0.924252,0.005169
6,0.63678,0.113461,0.015016,0.005233,0.8,3,24,0.05,1.5,0.8,"{'subsample': 0.8, 'min_child_weight': 3, 'max...",0.87543,0.900606,0.911357,0.895798,0.015056,29,0.979168,0.97768,0.975286,0.977378,0.001599
7,0.517692,0.067204,0.01505,0.001719,1.0,5,23,0.05,0.5,0.8,"{'subsample': 1.0, 'min_child_weight': 5, 'max...",0.872592,0.897854,0.910102,0.893516,0.015618,41,0.976084,0.974137,0.973276,0.974499,0.001175
8,0.833455,0.292299,0.014843,0.003299,0.8,3,17,0.05,0.5,1.0,"{'subsample': 0.8, 'min_child_weight': 3, 'max...",0.872076,0.903487,0.90915,0.894904,0.016307,35,0.982674,0.982463,0.981951,0.982363,0.000303
9,0.734811,0.265565,0.020335,0.001926,0.8,3,26,0.01,1.5,1.0,"{'subsample': 0.8, 'min_child_weight': 3, 'max...",0.868636,0.899553,0.901446,0.889878,0.01504,49,0.951656,0.944845,0.947627,0.948042,0.002796


In [None]:
shap_values = explainer.shap_values( X_no_cat)
shap.summary_plot(shap_values,X_no_cat ,feature_names=feature_names, show=False)
#plt.savefig("shap_summary.svg", format='svg', dpi=300, bbox_inches='tight')
for predictor in feature_names:
    if predictor != 'age':
        save = shap.dependence_plot(predictor, shap_values,X_no_cat ,feature_names=feature_names,interaction_index='age', show=False)
    else:
        save = shap.dependence_plot(predictor, shap_values,X_no_cat ,feature_names=feature_names, interaction_index='gender', show=False)
        
    # plt.savefig(predictor+".svg", format='svg', dpi=300, bbox_inches='tight')

# LightGBM

In [7]:
LGB_classifier = lgb.LGBMClassifier()
LGB_params = {
             'num_leaves': [1,5,8,10,15,20,35,40], 
             'min_child_samples': [1,5,10,20,50,100,200,300,400,500], 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': [0.2,0,4,0.5, 0.6, 0.8, 1.0],
             'colsample_bytree': [0.6, 0.8, 1.0],
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
           #  'scale_pos_weight':  [0.5,0.75,0.9,1, 1.1, 1.25,1.5]

}
model = strat_cv_it(LGB_classifier ,LGB_params,True,50)
explainer = shap.TreeExplainer(model)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    8.6s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:   44.1s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:   56.9s finished


0.9005497166548984

{'subsample': 0.8,
 'reg_lambda': 0.1,
 'reg_alpha': 0,
 'num_leaves': 10,
 'min_child_weight': 0.001,
 'min_child_samples': 100,
 'colsample_bytree': 1.0}

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_reg_lambda,param_reg_alpha,param_num_leaves,param_min_child_weight,param_min_child_samples,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.965261,0.811417,0.032455,0.023976,0.6,50.0,100.0,5,0.1,20,0.6,"{'subsample': 0.6, 'reg_lambda': 50, 'reg_alph...",0.831355,0.845975,0.863162,0.846831,0.012999,19,0.871355,0.835693,0.851492,0.852847,0.01459
1,0.006099,0.004801,0.0,0.0,4.0,100.0,7.0,10,1.0,400,0.8,"{'subsample': 4, 'reg_lambda': 100, 'reg_alpha...",0.0,0.0,0.0,0.0,0.0,38,0.0,0.0,0.0,0.0,0.0
2,1.883454,1.706491,0.031899,0.005132,0.5,100.0,10.0,20,10.0,20,0.6,"{'subsample': 0.5, 'reg_lambda': 100, 'reg_alp...",0.877902,0.897338,0.885496,0.886912,0.007998,12,0.912139,0.899132,0.901908,0.904393,0.005593
3,0.379301,0.267156,0.012888,0.015486,0.5,0.1,1.0,8,0.001,300,0.6,"{'subsample': 0.5, 'reg_lambda': 0.1, 'reg_alp...",0.834344,0.862186,0.862643,0.853058,0.013234,18,0.876629,0.854859,0.862838,0.864775,0.008992
4,4.706204,3.886069,0.026017,0.017039,0.6,0.0,50.0,8,1e-05,20,0.6,"{'subsample': 0.6, 'reg_lambda': 0, 'reg_alpha...",0.871668,0.892608,0.874589,0.879622,0.00926,13,0.891075,0.884868,0.881381,0.885775,0.004009
5,0.002092,4.9e-05,0.0,0.0,0.0,20.0,10.0,15,0.001,1,1.0,"{'subsample': 0, 'reg_lambda': 20, 'reg_alpha'...",0.0,0.0,0.0,0.0,0.0,38,0.0,0.0,0.0,0.0,0.0
6,0.002062,3e-05,0.0,0.0,0.6,5.0,5.0,1,1000.0,20,0.6,"{'subsample': 0.6, 'reg_lambda': 5, 'reg_alpha...",0.0,0.0,0.0,0.0,0.0,38,0.0,0.0,0.0,0.0,0.0
7,0.569383,0.481528,0.056012,0.01504,0.5,0.1,100.0,40,1.0,300,1.0,"{'subsample': 0.5, 'reg_lambda': 0.1, 'reg_alp...",0.809748,0.5,0.5,0.603249,0.146017,24,0.845648,0.5,0.5,0.615216,0.16294
8,2.685243,1.556988,0.031929,0.01798,0.8,0.1,0.0,10,0.001,100,1.0,"{'subsample': 0.8, 'reg_lambda': 0.1, 'reg_alp...",0.875989,0.907551,0.918109,0.90055,0.017894,1,0.949358,0.946134,0.939425,0.944972,0.004138
9,0.983864,0.495458,0.035947,0.007379,0.2,20.0,7.0,10,1000.0,50,0.6,"{'subsample': 0.2, 'reg_lambda': 20, 'reg_alph...",0.5,0.5,0.5,0.5,0.0,26,0.5,0.5,0.5,0.5,0.0


In [None]:
shap_values = explainer.shap_values(dataset.drop(columns=['expired']))
shap.summary_plot(shap_values[1],dataset.drop(columns=['expired']) ,feature_names=feature_names, show=False)
#plt.savefig("shap_summary.svg", format='svg', dpi=300, bbox_inches='tight')
for predictor in feature_names:
    if predictor != 'age':
        save = shap.dependence_plot(predictor, shap_values[1],dataset.drop(columns=['expired']) ,feature_names=feature_names,interaction_index='age', show=False)
    else:
        save = shap.dependence_plot(predictor, shap_values[1],dataset.drop(columns=['expired']) ,feature_names=feature_names, interaction_index='gender', show=False)
        
  #  plt.savefig(predictor+".svg", format='svg', dpi=300, bbox_inches='tight')