In [None]:
# Data stuff
import numpy as np
import pandas as pd

# Visual stuff
from IPython.display import display
# Display configuration: with both limits set to None pandas renders every
# column and every row of a frame instead of truncating the output.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
import matplotlib.pyplot as plt

# Seed handed as random_state to the CV splitter and the randomized search
rand_state=42

# ML stuff
import sklearn
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV,StratifiedKFold
import lightgbm as lgb
import xgboost as xgb
from xgboost import XGBClassifier
import shap
from sklearn.metrics import confusion_matrix


In [None]:
# Read the generated data set and coerce the listed columns to fixed dtypes.
# The diagnosis-code columns (I109, E119, ...) are not cast here.
# NOTE(review): they are presumably parsed as integers by read_csv, since the
# later select_dtypes(include='int') calls treat them as categorical - confirm.
data = pd.read_csv('data_gen/dataset_missing_entire.csv').astype({
    'age': float,
    'gender': int,
    'expired': int,
    'P-glucose': float,
    'blood_pressure_systoliskt': float,
    'blood_pressure_diastoliskt': float,
    'BMI': float,
})

# Column order: integer (categorical) variables first, float (numeric) variables last
data = data.loc[:, [
    'gender',
    'I109',
    'E119',
    'E669',
    'I259',
    'I252',
    'I209',
    'E660',
    'E118',
    'I639',
    'E113',
    'expired',
    'age',
    'P-glucose',
    'blood_pressure_systoliskt',
    'blood_pressure_diastoliskt',
    'BMI',
]]

In [None]:
def plot_age_vs_expired_for(predictor, dataset):
    """Draw a stacked age histogram split by outcome and by a binary predictor.

    Args:
        predictor: Name of a column in ``dataset`` that is compared against
            1 ("with") and 0 ("without"). For ``'gender'`` the legend uses
            men/women labels instead.
        dataset: DataFrame providing the ``age`` and ``expired`` columns as
            well as the ``predictor`` column.

    The stack order is: died with, died without, lived with, lived without.
    Rows whose predictor value is neither 0 nor 1 do not appear in the plot.
    """
    died = dataset.expired == 1
    lived = dataset.expired == 0
    with_predictor = dataset[predictor] == 1
    without_predictor = dataset[predictor] == 0

    ages_by_group = [
        dataset[died & with_predictor].age.tolist(),
        dataset[died & without_predictor].age.tolist(),
        dataset[lived & with_predictor].age.tolist(),
        dataset[lived & without_predictor].age.tolist(),
    ]

    # Roughly one bin per unit of age. The floor of 1 keeps the histogram
    # drawable when every row has the same age (a span of 0 would ask for 0 bins).
    bins = max(1, int(dataset.age.max() - dataset.age.min()))

    plt.figure(figsize=(7, 5), dpi=300)

    plt.hist(ages_by_group, stacked=True,
             color=['darkred', 'red', 'green', 'lightgreen'], bins=bins)
    if predictor == 'gender':
        plt.legend(['Died(men)','Died(women) ', 'Lived(men)', 'Lived(women)'])
    else:
        plt.legend(['Died with '+ predictor,'Died without '+ predictor,'Lived with '+ predictor, 'Lived without '+ predictor ])

    plt.ylabel('Numbers of patients')
    plt.xlabel('Age')
    plt.title('Age vs expired vs '+predictor)

    plt.show()
    
def plot_predictor_vs_expired(predictor, dataset, bins=None):
    """Draw a stacked histogram of one predictor, split into died and survived.

    Args:
        predictor: Name of the column in ``dataset`` to plot on the x axis.
        dataset: DataFrame providing ``expired`` and the ``predictor`` column.
        bins: Optional bin specification forwarded to ``plt.hist``. When left
            as ``None`` the bin count is derived from the span of the ``age``
            column (the original behaviour), which requires ``dataset`` to
            still contain ``age``.
    """
    if bins is None:
        # At least one bin, so a frame with a single distinct age still plots
        bins = max(1, int(dataset.age.max() - dataset.age.min()))

    values_died = dataset[dataset.expired == 1][predictor].tolist()
    values_survived = dataset[dataset.expired == 0][predictor].tolist()

    plt.figure(figsize=(7, 5), dpi=300)

    plt.hist([values_died, values_survived],
             stacked=True, color=['red', 'lightgreen'], bins=bins)
    plt.legend(['Died','Survived'])
    plt.ylabel('Numbers of patients')
    plt.xlabel(predictor)
    plt.title('Mortality amongst COVID-19 patients(confirmed cases)')

    plt.show()


In [None]:
# Work on an independent copy so the loaded frame stays untouched
dataset = data.copy(deep=True)

# Predictors only: the target column 'expired' is removed
dataset_no_target = dataset.drop(columns=['expired'])

# Integer-typed (categorical) predictors and the remaining predictors, kept apart
int_predictors = dataset_no_target.select_dtypes(include='int')
other_predictors = dataset_no_target.select_dtypes(exclude='int')

# Feature names in the same order as the columns of X: integer columns first
feature_names = int_predictors.columns.values.tolist() + other_predictors.columns.values.tolist()

# Row-wise value lists; converting the two dtype groups separately keeps the
# integer values as Python ints instead of upcasting them to float
dataframe_int_list = int_predictors.values.tolist()
dataframe_no_int_list = other_predictors.values.tolist()
Y = dataset.expired.values.tolist()

# One combined row per sample, built in a single pass
X = [int_row + other_row
     for int_row, other_row in zip(dataframe_int_list, dataframe_no_int_list)]

In [None]:
def strat_cv_it(classifier, params, param_comb, features=None, labels=None, folds=3, n_jobs=4):
    """Tune ``classifier`` with a randomized search under stratified k-fold CV.

    Args:
        classifier: Estimator instance to tune.
        params: Mapping of parameter names to candidate values, passed as
            ``param_distributions``.
        param_comb: Number of parameter settings to sample (``n_iter``).
        features: Training samples. Defaults to the notebook-level ``X``.
        labels: Training targets. Defaults to the notebook-level ``Y``.
        folds: Number of stratified folds.
        n_jobs: Number of parallel jobs used by the search.

    Returns:
        The best estimator found, refit on all supplied samples (``refit=True``).

    The best ROC AUC, the best parameters and the full ``cv_results_`` table
    are displayed as a side effect.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working
    if features is None:
        features = X
    if labels is None:
        labels = Y

    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=rand_state)

    # error_score=0.0: a parameter setting whose fit raises is scored 0.0
    # instead of aborting the search, so invalid settings only show up as
    # zero-score rows in cv_results_.
    random_search = RandomizedSearchCV(
        classifier,
        param_distributions=params,
        n_iter=param_comb,
        scoring='roc_auc',
        cv=skf.split(features, labels),
        n_jobs=n_jobs,
        random_state=rand_state,
        refit=True,
        return_train_score=True,
        verbose=3,
        error_score=0.0,
    )
    random_search.fit(features, labels)

    # Display results and return best model
    display(random_search.best_score_)
    display(random_search.best_params_)
    display(pd.DataFrame(random_search.cv_results_))
    return random_search.best_estimator_

# Scale of negative class to the positive class (#survived / #died), truncated
# to an integer. Floored at 1: a ratio below 1 would truncate to 0 and leave
# range(scale_pos_weight_min, 3*scale_pos_weight_min) without any candidate.
n_survived = int((dataset.expired == 0).sum())
n_died = int((dataset.expired == 1).sum())
scale_pos_weight_min = max(1, int(n_survived / n_died))

# LightGBM

In [None]:
LGB_classifier = lgb.LGBMClassifier()

# Candidate values sampled by the randomized search in strat_cv_it
LGB_params = {
    # LightGBM requires num_leaves > 1, so the smallest candidate is 2
    'num_leaves': [2, 5, 8, 10, 15, 20, 35, 40],
    'min_child_samples': [1, 5, 10, 20, 50, 100, 200, 300, 400, 500],
    'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    # Row-sampling fractions; valid values lie in (0, 1].
    # NOTE(review): LightGBM only applies row sampling when subsample_freq > 0,
    # and the estimator above is created with defaults - confirm this
    # parameter actually takes effect.
    'subsample': [0.2, 0.4, 0.5, 0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
    # From the class ratio up to (but excluding) three times the class ratio
    'scale_pos_weight': list(range(scale_pos_weight_min, 3 * scale_pos_weight_min)),
}

# Sample 50 parameter settings and keep the best estimator
model = strat_cv_it(LGB_classifier, LGB_params, 50)
explainer = shap.TreeExplainer(model)

In [None]:
# Confusion matrix of the tuned model on the same samples it was fitted on
# (an in-sample check, not a held-out estimate)
y_pred = model.predict(X)
y_true = Y
confusion_matrix(y_true, y_pred)

In [None]:
# Predictor frame handed to SHAP, computed once: everything except the target
shap_features = dataset.drop(columns=['expired'])
shap_values = explainer.shap_values(shap_features)

# NOTE(review): indexing with [1] assumes shap_values holds one entry per
# class with the positive class second - confirm for the installed shap version.
shap.summary_plot(shap_values[1], shap_features, feature_names=feature_names, show=False)
#plt.savefig("shap_summary.svg", format='svg', dpi=300, bbox_inches='tight')

for predictor in feature_names:
    # 'age' is explicitly coloured by 'gender'; every other predictor keeps
    # the interaction feature that shap picks by default
    interaction_kwargs = {'interaction_index': 'gender'} if predictor == 'age' else {}
    shap.dependence_plot(predictor, shap_values[1], shap_features,
                         feature_names=feature_names, show=False, **interaction_kwargs)
    # plt.savefig(predictor+".svg", format='svg', dpi=300, bbox_inches='tight')

In [None]:
# Plot some interesting stuff: each entry pairs a plotting helper with the
# column it is called for, in display order
for plot_fn, column in [
    (plot_predictor_vs_expired, 'age'),
    (plot_age_vs_expired_for, 'gender'),
    (plot_predictor_vs_expired, 'BMI'),
    (plot_age_vs_expired_for, 'I109'),
    (plot_age_vs_expired_for, 'E119'),
]:
    plot_fn(column, dataset)

# LightGBM without age predictor

In [None]:
# Start again from the loaded data and leave out 'age'. Building a new frame
# (instead of dropping in place) lets this cell be re-run without failing on
# an already removed column.
dataset = data.drop(columns=['age'])

# Predictors only: the target column 'expired' is removed
dataset_no_target = dataset.drop(columns=['expired'])

# Integer-typed (categorical) predictors and the remaining predictors, kept apart
int_predictors = dataset_no_target.select_dtypes(include='int')
other_predictors = dataset_no_target.select_dtypes(exclude='int')

# Feature names in the same order as the columns of X: integer columns first
feature_names = int_predictors.columns.values.tolist() + other_predictors.columns.values.tolist()

# Row-wise value lists; converting the two dtype groups separately keeps the
# integer values as Python ints instead of upcasting them to float
dataframe_int_list = int_predictors.values.tolist()
dataframe_no_int_list = other_predictors.values.tolist()
Y = dataset.expired.values.tolist()

# One combined row per sample, built in a single pass
X = [int_row + other_row
     for int_row, other_row in zip(dataframe_int_list, dataframe_no_int_list)]


In [None]:
LGB_classifier = lgb.LGBMClassifier()

# Candidate values sampled by the randomized search in strat_cv_it
LGB_params = {
    # LightGBM requires num_leaves > 1, so the smallest candidate is 2
    'num_leaves': [2, 5, 8, 10, 15, 20, 35, 40],
    'min_child_samples': [1, 5, 10, 20, 50, 100, 200, 300, 400, 500],
    'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    # Row-sampling fractions; valid values lie in (0, 1].
    # NOTE(review): LightGBM only applies row sampling when subsample_freq > 0,
    # and the estimator above is created with defaults - confirm this
    # parameter actually takes effect.
    'subsample': [0.2, 0.4, 0.5, 0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
    # From the class ratio up to (but excluding) three times the class ratio
    'scale_pos_weight': list(range(scale_pos_weight_min, 3 * scale_pos_weight_min)),
}

# Sample 50 parameter settings and keep the best estimator
model = strat_cv_it(LGB_classifier, LGB_params, 50)
explainer = shap.TreeExplainer(model)

In [None]:
# Confusion matrix of the tuned model on the same samples it was fitted on
# (an in-sample check, not a held-out estimate)
y_pred = model.predict(X)
y_true = Y
confusion_matrix(y_true, y_pred)

In [None]:
# Predictor frame handed to SHAP, computed once: everything except the target
shap_features = dataset.drop(columns=['expired'])
shap_values = explainer.shap_values(shap_features)

# NOTE(review): indexing with [1] assumes shap_values holds one entry per
# class with the positive class second - confirm for the installed shap version.
shap.summary_plot(shap_values[1], shap_features, feature_names=feature_names, show=False)
#plt.savefig("shap_summary.svg", format='svg', dpi=300, bbox_inches='tight')

for predictor in feature_names:
    # 'age' would be coloured by 'gender'; every other predictor keeps the
    # interaction feature that shap picks by default
    interaction_kwargs = {'interaction_index': 'gender'} if predictor == 'age' else {}
    shap.dependence_plot(predictor, shap_values[1], shap_features,
                         feature_names=feature_names, show=False, **interaction_kwargs)
    # plt.savefig(predictor+".svg", format='svg', dpi=300, bbox_inches='tight')

In [None]:
# Work on an independent copy so the loaded frame stays untouched
dataset = data.copy(deep=True)

# Predictors only: the target column 'expired' is removed
dataset_no_target = dataset.drop(columns=['expired'])

# Integer-typed (categorical) predictors and the remaining predictors, kept apart
int_predictors = dataset_no_target.select_dtypes(include='int')
other_predictors = dataset_no_target.select_dtypes(exclude='int')

# Feature names in the same order as the columns of X: integer columns first
feature_names = int_predictors.columns.values.tolist() + other_predictors.columns.values.tolist()

# Row-wise value lists; converting the two dtype groups separately keeps the
# integer values as Python ints instead of upcasting them to float
dataframe_int_list = int_predictors.values.tolist()
dataframe_no_int_list = other_predictors.values.tolist()
Y = dataset.expired.values.tolist()

# One combined row per sample, built in a single pass
X = [int_row + other_row
     for int_row, other_row in zip(dataframe_int_list, dataframe_no_int_list)]

# Outcome counts on either side of the age split ("-70" includes age 70 itself)
aged_70_or_less = dataset.age <= 70
aged_over_70 = dataset.age > 70
has_died = dataset.expired == 1
has_lived = dataset.expired == 0

print('Split age groups at 70 \n')
print('Age Died')
print('-70: ', dataset[aged_70_or_less & has_died].shape[0])
print('70 +: ', dataset[aged_over_70 & has_died].shape[0])
print('Age Lived')
print('-70: ', dataset[aged_70_or_less & has_lived].shape[0])
print('70 +: ', dataset[aged_over_70 & has_lived].shape[0])

In [None]:
# Plot some interesting stuff: each entry pairs a plotting helper with the
# column it is called for, in display order
for plot_fn, column in [
    (plot_age_vs_expired_for, 'I109'),
    (plot_age_vs_expired_for, 'gender'),
    (plot_predictor_vs_expired, 'BMI'),
    (plot_age_vs_expired_for, 'I259'),
    (plot_predictor_vs_expired, 'blood_pressure_diastoliskt'),
    (plot_predictor_vs_expired, 'blood_pressure_systoliskt'),
    (plot_predictor_vs_expired, 'P-glucose'),
]:
    plot_fn(column, dataset)

# Age group 70 and below 

In [None]:
# Patients aged 70 and below, as an independent copy of the loaded data
dataset = data[data.age <= 70].copy()

# Plot some interesting stuff (these helpers read 'age', so they run before
# the column is removed below)
plot_predictor_vs_expired('age',dataset)
plot_age_vs_expired_for('I109', dataset)
plot_age_vs_expired_for('gender', dataset)
plot_predictor_vs_expired('blood_pressure_systoliskt',dataset)
plot_predictor_vs_expired('BMI', dataset)
plot_predictor_vs_expired('blood_pressure_diastoliskt',dataset)

# Drop age by building a new frame rather than mutating the filtered one in place
dataset = dataset.drop(columns=['age'])

# Predictors only: the target column 'expired' is removed
dataset_no_target = dataset.drop(columns=['expired'])

# Integer-typed (categorical) predictors and the remaining predictors, kept apart
int_predictors = dataset_no_target.select_dtypes(include='int')
other_predictors = dataset_no_target.select_dtypes(exclude='int')

# Feature names in the same order as the columns of X: integer columns first
feature_names = int_predictors.columns.values.tolist() + other_predictors.columns.values.tolist()

# Row-wise value lists; converting the two dtype groups separately keeps the
# integer values as Python ints instead of upcasting them to float
dataframe_int_list = int_predictors.values.tolist()
dataframe_no_int_list = other_predictors.values.tolist()
Y = dataset.expired.values.tolist()

# One combined row per sample, built in a single pass
X = [int_row + other_row
     for int_row, other_row in zip(dataframe_int_list, dataframe_no_int_list)]

# Scale of negative class to the positive class (#survived / #died), truncated
# to an integer. Floored at 1: a ratio below 1 would truncate to 0 and leave
# range(scale_pos_weight_min, 3*scale_pos_weight_min) without any candidate.
n_survived = int((dataset.expired == 0).sum())
n_died = int((dataset.expired == 1).sum())
scale_pos_weight_min = max(1, int(n_survived / n_died))

In [None]:
LGB_classifier = lgb.LGBMClassifier()

# Candidate values sampled by the randomized search in strat_cv_it
LGB_params = {
    # LightGBM requires num_leaves > 1, so the smallest candidate is 2
    'num_leaves': [2, 5, 8, 10, 15, 20, 35, 40],
    'min_child_samples': [1, 5, 10, 20, 50, 100, 200, 300, 400, 500],
    'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    # Row-sampling fractions; valid values lie in (0, 1].
    # NOTE(review): LightGBM only applies row sampling when subsample_freq > 0,
    # and the estimator above is created with defaults - confirm this
    # parameter actually takes effect.
    'subsample': [0.2, 0.4, 0.5, 0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
    # From the class ratio up to (but excluding) three times the class ratio
    'scale_pos_weight': list(range(scale_pos_weight_min, 3 * scale_pos_weight_min)),
}

# Sample 50 parameter settings and keep the best estimator
model = strat_cv_it(LGB_classifier, LGB_params, 50)
explainer = shap.TreeExplainer(model)

In [None]:
# Confusion matrix of the tuned model on the same samples it was fitted on
# (an in-sample check, not a held-out estimate)
y_pred = model.predict(X)
y_true = Y
confusion_matrix(y_true, y_pred)

In [None]:
# Predictor frame handed to SHAP, computed once: everything except the target
shap_features = dataset.drop(columns=['expired'])
shap_values = explainer.shap_values(shap_features)

# NOTE(review): indexing with [1] assumes shap_values holds one entry per
# class with the positive class second - confirm for the installed shap version.
shap.summary_plot(shap_values[1], shap_features, feature_names=feature_names, show=False)
#plt.savefig("shap_summary.svg", format='svg', dpi=300, bbox_inches='tight')

for predictor in feature_names:
    # 'age' would be coloured by 'gender'; every other predictor keeps the
    # interaction feature that shap picks by default
    interaction_kwargs = {'interaction_index': 'gender'} if predictor == 'age' else {}
    shap.dependence_plot(predictor, shap_values[1], shap_features,
                         feature_names=feature_names, show=False, **interaction_kwargs)
    # plt.savefig(predictor+".svg", format='svg', dpi=300, bbox_inches='tight')

# Age group above 70

In [None]:
# Patients older than 70, with the age column left out. The chained form
# builds a new frame instead of dropping in place on a filtered frame.
dataset = data[data.age > 70].drop(columns=['age'])

# Predictors only: the target column 'expired' is removed
dataset_no_target = dataset.drop(columns=['expired'])

# Integer-typed (categorical) predictors and the remaining predictors, kept apart
int_predictors = dataset_no_target.select_dtypes(include='int')
other_predictors = dataset_no_target.select_dtypes(exclude='int')

# Feature names in the same order as the columns of X: integer columns first
feature_names = int_predictors.columns.values.tolist() + other_predictors.columns.values.tolist()

# Row-wise value lists; converting the two dtype groups separately keeps the
# integer values as Python ints instead of upcasting them to float
dataframe_int_list = int_predictors.values.tolist()
dataframe_no_int_list = other_predictors.values.tolist()
Y = dataset.expired.values.tolist()

# One combined row per sample, built in a single pass
X = [int_row + other_row
     for int_row, other_row in zip(dataframe_int_list, dataframe_no_int_list)]

# Scale of negative class to the positive class (#survived / #died), truncated
# to an integer. Floored at 1: a ratio below 1 would truncate to 0 and leave
# range(scale_pos_weight_min, 3*scale_pos_weight_min) without any candidate.
n_survived = int((dataset.expired == 0).sum())
n_died = int((dataset.expired == 1).sum())
scale_pos_weight_min = max(1, int(n_survived / n_died))

In [None]:
LGB_classifier = lgb.LGBMClassifier()

# Candidate values sampled by the randomized search in strat_cv_it
LGB_params = {
    # LightGBM requires num_leaves > 1, so the smallest candidate is 2
    'num_leaves': [2, 5, 8, 10, 15, 20, 35, 40],
    'min_child_samples': [1, 5, 10, 20, 50, 100, 200, 300, 400, 500],
    'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    # Row-sampling fractions; valid values lie in (0, 1].
    # NOTE(review): LightGBM only applies row sampling when subsample_freq > 0,
    # and the estimator above is created with defaults - confirm this
    # parameter actually takes effect.
    'subsample': [0.2, 0.4, 0.5, 0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
    # From the class ratio up to (but excluding) three times the class ratio
    'scale_pos_weight': list(range(scale_pos_weight_min, 3 * scale_pos_weight_min)),
}

# Sample 50 parameter settings and keep the best estimator
model = strat_cv_it(LGB_classifier, LGB_params, 50)
explainer = shap.TreeExplainer(model)

In [None]:
# Confusion matrix of the tuned model on the same samples it was fitted on
# (an in-sample check, not a held-out estimate)
y_pred = model.predict(X)
y_true = Y
confusion_matrix(y_true, y_pred)

In [None]:
# Predictor frame handed to SHAP, computed once: everything except the target
shap_features = dataset.drop(columns=['expired'])
shap_values = explainer.shap_values(shap_features)

# NOTE(review): indexing with [1] assumes shap_values holds one entry per
# class with the positive class second - confirm for the installed shap version.
shap.summary_plot(shap_values[1], shap_features, feature_names=feature_names, show=False)
#plt.savefig("shap_summary.svg", format='svg', dpi=300, bbox_inches='tight')

for predictor in feature_names:
    # 'age' would be coloured by 'gender'; every other predictor keeps the
    # interaction feature that shap picks by default
    interaction_kwargs = {'interaction_index': 'gender'} if predictor == 'age' else {}
    shap.dependence_plot(predictor, shap_values[1], shap_features,
                         feature_names=feature_names, show=False, **interaction_kwargs)
    # plt.savefig(predictor+".svg", format='svg', dpi=300, bbox_inches='tight')