In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, SGDRegressor, RidgeCV, LassoCV, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTEN


smoten_knn_contin = pd.read_csv('smoten_knn_contin.csv', index_col=False)
smoten_median_imputed_contin = pd.read_csv('smoten_median_imputed_contin.csv', index_col=False)

pd.set_option("display.max_columns", None)


def ensembleScale_coef(scaleFunc1, estimatorFunc2, k_fold_int, x_array, y_array):
    k = k_fold_int
    kf = KFold(n_splits=k, shuffle=True, random_state=None)    
    clf = make_pipeline(scaleFunc1, estimatorFunc2)

    acc_score = []
    for train_index , test_index in kf.split(X):
        X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
        y_train , y_test = y[train_index] , y[test_index]
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
     
    acc = accuracy_score(y_pred , y_test)
    acc_score.append(acc)
     
    avg_acc_score = sum(acc_score)/k
    print('accuracy of each fold - {}'.format(acc_score))
    print('Avg accuracy : {}'.format(avg_acc_score))

    outcome_labels = ['Intubation False', 'Intubation True']

    #print('parameters \n', clf.get_params())
    print('score \n', clf.score(X_train, y_train))
    print('feature_names_in_')
    print(clf.feature_names_in_)
    print()
    print('feature_importances_')
    print(clf[1].feature_importances_)
    print()
    for i in range(len(clf.feature_names_in_)):
        print(clf.feature_names_in_[i], ":" ,clf[1].feature_importances_[i])
    print()
    print('fit')
    print(clf.fit(X_train, y_train).n_features_in_, clf.fit(X_train, y_train).feature_names_in_)
    print()
    print('classification report \n', classification_report(y_test, y_pred, target_names=outcome_labels))
    micro_roc_auc_ovr = roc_auc_score(y_test, y_pred, multi_class="ovr", average="micro")
    print()
    print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.2f}")
    print()
    print('\n')
    
print('AdaBoost Ensemble')

addidas = AdaBoostClassifier(estimator=None, n_estimators=50, 
                             learning_rate=0.201, algorithm='SAMME.R', 
                             random_state=None)


print('### SMOTEN knn contin ###')
print()

print('StandardScaler')
print()
X = smoten_knn_contin.loc[:, ~smoten_knn_contin.columns.isin(['heart_rate_min', 'heart_rate_mean', 'sbp_max', 'temperature_min', 'temperature_max', 'wbc_min', 'pt_min', 'gender','outcome'])]
y = smoten_knn_contin['outcome']
ensembleScale_coef(StandardScaler(), addidas, 5, X, y)

print('RobustScaler')
print()
X = smoten_knn_contin.loc[:, ~smoten_knn_contin.columns.isin(['heart_rate_min', 'heart_rate_max', 'mbp_min', 'sbp_min', 'temperature_min', 'wbc_max', 'pt_max','outcome'])]
y = smoten_median_imputed_contin['outcome']
ensembleScale_coef(RobustScaler(), addidas, 5, X, y)

print('### SMOTEN median impute ###')
print()



print('StandardScaler')
print()
X = smoten_median_imputed_contin.loc[:, ~smoten_median_imputed_contin.columns.isin(['heart_rate_min', 'sbp_mean', 'dbp_min', 'temperature_min', 'creatinine_max','outcome'])]
y = smoten_median_imputed_contin['outcome']
ensembleScale_coef(StandardScaler(), addidas, 5, X, y)

print('RobustScaler')
print()
X = smoten_median_imputed_contin.loc[:, ~smoten_median_imputed_contin.columns.isin(['heart_rate_max', 'mbp_min', 'mbp_mean', 'sbp_max', 'temperature_max', 'wbc_min', 'pt_in','outcome'])]
y = smoten_median_imputed_contin['outcome']
ensembleScale_coef(StandardScaler(), addidas, 5, X, y)
ensembleScale_coef(RobustScaler(), addidas, 5, X, y)


AdaBoost Ensemble
### SMOTEN knn contin ###

StandardScaler

accuracy of each fold - [0.9581200227531286]
Avg accuracy : 0.19162400455062573
score 
 0.9575867463026166
feature_names_in_
['age' 'heart_rate_max' 'mbp_min' 'mbp_max' 'mbp_mean' 'sbp_min'
 'sbp_mean' 'dbp_min' 'dbp_max' 'dbp_mean' 'temperature_mean'
 'glucose_min' 'glucose_max' 'wbc_max' 'creatinine_min' 'creatinine_max'
 'hemoglobin_min' 'hemoglobin_max' 'pt_max' 'urineoutput']

feature_importances_
[0.16 0.04 0.   0.   0.   0.04 0.04 0.02 0.   0.08 0.1  0.04 0.02 0.04
 0.06 0.04 0.04 0.06 0.1  0.12]

age : 0.16
heart_rate_max : 0.04
mbp_min : 0.0
mbp_max : 0.0
mbp_mean : 0.0
sbp_min : 0.04
sbp_mean : 0.04
dbp_min : 0.02
dbp_max : 0.0
dbp_mean : 0.08
temperature_mean : 0.1
glucose_min : 0.04
glucose_max : 0.02
wbc_max : 0.04
creatinine_min : 0.06
creatinine_max : 0.04
hemoglobin_min : 0.04
hemoglobin_max : 0.06
pt_max : 0.1
urineoutput : 0.12

fit
20 ['age' 'heart_rate_max' 'mbp_min' 'mbp_max' 'mbp_mean' 'sbp_min'
 'sbp_me

In [2]:
print('##SFS optimised###')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, SGDRegressor, RidgeCV, LassoCV, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTEN


smoten_knn_contin = pd.read_csv('smoten_knn_contin.csv', index_col=False)
smoten_median_imputed_contin = pd.read_csv('smoten_median_imputed_contin.csv', index_col=False)

pd.set_option("display.max_columns", None)


def ensembleScale_coef(scaleFunc1, estimatorFunc2, k_fold_int, x_array, y_array):
    k = k_fold_int
    kf = KFold(n_splits=k, shuffle=True, random_state=None)    
    clf = make_pipeline(scaleFunc1, estimatorFunc2)

    acc_score = []
    for train_index , test_index in kf.split(X):
        X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
        y_train , y_test = y[train_index] , y[test_index]
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
     
    acc = accuracy_score(y_pred , y_test)
    acc_score.append(acc)
     
    avg_acc_score = sum(acc_score)/k
    print('accuracy of each fold - {}'.format(acc_score))
    print('Avg accuracy : {}'.format(avg_acc_score))

    outcome_labels = ['Intubation False', 'Intubation True']

    #print('parameters \n', clf.get_params())
    print('score \n', clf.score(X_train, y_train))
    print('feature_names_in_')
    print(clf.feature_names_in_)
    print()
    print('feature_importances_')
    print(clf[1].feature_importances_)
    print()
    for i in range(len(clf.feature_names_in_)):
        print(clf.feature_names_in_[i], ":" ,clf[1].feature_importances_[i])
    print()
    print('fit')
    print(clf.fit(X_train, y_train).n_features_in_, clf.fit(X_train, y_train).feature_names_in_)
    print()
    print('classification report \n', classification_report(y_test, y_pred, target_names=outcome_labels))
    micro_roc_auc_ovr = roc_auc_score(y_test, y_pred, multi_class="ovr", average="micro")
    print()
    print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.2f}")
    print()
    print('\n')
    
print('AdaBoost Ensemble')

addidas = AdaBoostClassifier(estimator=None, n_estimators=50, 
                             learning_rate=0.201, algorithm='SAMME.R', 
                             random_state=None)


print('### SMOTEN knn contin ###')
print()

print('StandardScaler')
print()
X = smoten_knn_contin[['age', 'mbp_mean', 'dbp_mean', 'temperature_mean', 'wbc_max']]

y = smoten_knn_contin['outcome']
ensembleScale_coef(StandardScaler(), addidas, 5, X, y)

print('RobustScaler')
print()
X = smoten_knn_contin[['age', 'mbp_mean', 'dbp_mean', 'temperature_mean', 'wbc_max']]
y = smoten_median_imputed_contin['outcome']
ensembleScale_coef(RobustScaler(), addidas, 5, X, y)

print('### SMOTEN median impute ###')
print()



print('StandardScaler')
print()
X = smoten_median_imputed_contin[['age', 'mbp_mean', 'temperature_mean', 'wbc_max', 'hemoglobin_max']]
y = smoten_median_imputed_contin['outcome']
ensembleScale_coef(StandardScaler(), addidas, 5, X, y)

print('RobustScaler')
print()
X = smoten_median_imputed_contin[['age', 'mbp_mean', 'temperature_mean', 'wbc_max', 'hemoglobin_max']]
y = smoten_median_imputed_contin['outcome']
ensembleScale_coef(StandardScaler(), addidas, 5, X, y)
ensembleScale_coef(RobustScaler(), addidas, 5, X, y)


##SFS optimised###
AdaBoost Ensemble
### SMOTEN knn contin ###

StandardScaler

accuracy of each fold - [0.9288253697383391]
Avg accuracy : 0.1857650739476678
score 
 0.928629835039818
feature_names_in_
['age' 'mbp_mean' 'dbp_mean' 'temperature_mean' 'wbc_max']

feature_importances_
[0.32 0.04 0.28 0.2  0.16]

age : 0.32
mbp_mean : 0.04
dbp_mean : 0.28
temperature_mean : 0.2
wbc_max : 0.16

fit
5 ['age' 'mbp_mean' 'dbp_mean' 'temperature_mean' 'wbc_max']

classification report 
                   precision    recall  f1-score   support

Intubation False       0.95      0.91      0.93      7066
 Intubation True       0.91      0.95      0.93      6998

        accuracy                           0.93     14064
       macro avg       0.93      0.93      0.93     14064
    weighted avg       0.93      0.93      0.93     14064


Micro-averaged One-vs-Rest ROC AUC score:
0.93



RobustScaler

accuracy of each fold - [0.9301763367463026]
Avg accuracy : 0.1860352673492605
score 
 0.92891424914