In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, SGDRegressor, RidgeCV, LassoCV, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTEN


smoten_knn_contin = pd.read_csv('smoten_knn_contin.csv', index_col=False)
smoten_median_imputed_contin = pd.read_csv('smoten_median_imputed_contin.csv', index_col=False)

pd.set_option("display.max_columns", None)


def treeScale_coef(scaleFunc1, estimatorFunc2, k_fold_int, x_array, y_array):
    k = k_fold_int
    kf = KFold(n_splits=k, shuffle=True, random_state=None)    
    clf = make_pipeline(scaleFunc1, estimatorFunc2)

    acc_score = []
    for train_index , test_index in kf.split(X):
        X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
        y_train , y_test = y[train_index] , y[test_index]
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
     
    acc = accuracy_score(y_pred , y_test)
    acc_score.append(acc)
     
    avg_acc_score = sum(acc_score)/k
    print('accuracy of each fold - {}'.format(acc_score))
    print('Avg accuracy : {}'.format(avg_acc_score))

    outcome_labels = ['Intubation False', 'Intubation True']

    #print('parameters \n', clf.get_params())
    print('score \n', clf.score(X_train, y_train))
    print('feature_names_in_')
    print(clf.feature_names_in_)
    print()
    print('feature_importances_')
    print(clf[1].feature_importances_)
    print()
    for i in range(len(clf.feature_names_in_)):
        print(clf.feature_names_in_[i], ":" ,clf[1].feature_importances_[i])
    print()
    print('fit')
    print(clf.fit(X_train, y_train).n_features_in_, clf.fit(X_train, y_train).feature_names_in_)
    print()
    print('classification report \n', classification_report(y_test, y_pred, target_names=outcome_labels))
    micro_roc_auc_ovr = roc_auc_score(y_test, y_pred, multi_class="ovr", average="micro")
    print()
    print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.2f}")
    print()
    print('Decision Path')
    print(clf[1].decision_path(X))
    print('\n')
    
    
print('### SMOTEN knn contin ###')
print()

print('Random Forest')


print('Random Forest StandardScaler')
print()

X = smoten_knn_contin.loc[:, ~smoten_knn_contin.columns.isin(['heart_rate_min', 'heart_rate_max', 'mbp_min', 'mbp_mean', 'temperature_max', 'wbc_max', 'hemoglobin_min', 'pt_min','outcome'])]
y = smoten_knn_contin['outcome']

rfmodel = RandomForestClassifier(
                        n_estimators=150, criterion='entropy', 
                        max_depth=None, min_samples_split=2, 
                        min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                        max_features='sqrt', max_leaf_nodes=None, 
                        min_impurity_decrease=0.0, bootstrap=True, oob_score=True, 
                        n_jobs=4, 
                        random_state=None, verbose=0, warm_start=False, 
                        class_weight=None, ccp_alpha=0.0, max_samples=None)


treeScale_coef(StandardScaler(), rfmodel, 5, X, y)
                          
                          
print('Random Forest RobustScaler')

X = smoten_knn_contin.loc[:, ~smoten_knn_contin.columns.isin(['heart_rate_min', 'mbp_max', 'sbp_max', 'sbp_mean', 'dbp_max', 'temperature_max', 'pt_max', 'gender','outcome'])]
y = smoten_knn_contin['outcome']
                          
rfmodel = RandomForestClassifier(
                        n_estimators=100, criterion='entropy', 
                        max_depth=None, min_samples_split=2, 
                        min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                        max_features='sqrt', max_leaf_nodes=None, 
                        min_impurity_decrease=0.0, bootstrap=True, oob_score=True, 
                        n_jobs=4, 
                        random_state=None, verbose=0, warm_start=False, 
                        class_weight=None, ccp_alpha=0.0, max_samples=None)

                          
treeScale_coef(RobustScaler(), rfmodel, 5, X, y)


print('### SMOTEN median impute ###')
print()

print('Random Forest StandardScaler')
print()
                          
X = smoten_median_imputed_contin.loc[:, ~smoten_median_imputed_contin.columns.isin(['heart_rate_min', 'mbp_max', 'sbp_min', 'sbp_mean', 'dbp_min', 'wbc_min', 'gender','outcome'])]
y = smoten_median_imputed_contin['outcome']

rfmodel = RandomForestClassifier(
                        n_estimators=100, criterion='entropy', 
                        max_depth=None, min_samples_split=2, 
                        min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                        max_features='sqrt', max_leaf_nodes=None, 
                        min_impurity_decrease=0.0, bootstrap=True, oob_score=True, 
                        n_jobs=4, 
                        random_state=None, verbose=0, warm_start=False, 
                        class_weight=None, ccp_alpha=0.0, max_samples=None)
                                     
treeScale_coef(StandardScaler(), rfmodel, 5, X, y)

print('Random Forest RobustScaler')

X = smoten_median_imputed_contin.loc[:, ~smoten_median_imputed_contin.columns.isin(['heart_rate_min', 'heart_rate_mean', 'mbp_mean', 'sbp_mean', 'dbp_max', 'glucose_min','outcome'])]
y = smoten_median_imputed_contin['outcome']

rfmodel = RandomForestClassifier(
                        n_estimators=150, criterion='entropy', 
                        max_depth=None, min_samples_split=2, 
                        min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                        max_features='sqrt', max_leaf_nodes=None, 
                        min_impurity_decrease=0.0, bootstrap=True, oob_score=True, 
                        n_jobs=4, 
                        random_state=None, verbose=0, warm_start=False, 
                        class_weight=None, ccp_alpha=0.0, max_samples=None)

treeScale_coef(RobustScaler(), rfmodel, 5, X, y)




### SMOTEN knn contin ###

Random Forest
Random Forest StandardScaler

accuracy of each fold - [0.9805887372013652]
Avg accuracy : 0.19611774744027305
score 
 1.0
feature_names_in_
['age' 'heart_rate_mean' 'mbp_max' 'sbp_min' 'sbp_max' 'sbp_mean'
 'dbp_min' 'dbp_max' 'dbp_mean' 'temperature_min' 'temperature_mean'
 'glucose_min' 'glucose_max' 'wbc_min' 'creatinine_min' 'creatinine_max'
 'hemoglobin_max' 'pt_max' 'urineoutput' 'gender']

feature_importances_
[0.16540723 0.05890776 0.01407608 0.02184445 0.01344961 0.10355974
 0.01386493 0.0176814  0.09063086 0.00981978 0.07411015 0.02425202
 0.04030408 0.05036573 0.0163169  0.02190498 0.05344434 0.03761368
 0.17085348 0.00159282]

age : 0.16540722523575355
heart_rate_mean : 0.05890776430736215
mbp_max : 0.014076084704521869
sbp_min : 0.021844447305572532
sbp_max : 0.013449609741700223
sbp_mean : 0.10355973647610775
dbp_min : 0.013864930481909083
dbp_max : 0.01768140404669224
dbp_mean : 0.09063085891312572
temperature_min : 0.009819781057



(<70320x398882 sparse matrix of type '<class 'numpy.int64'>'
	with 110136887 stored elements in Compressed Sparse Row format>, array([     0,   2649,   5220,   7785,  10364,  12941,  15504,  18163,
        20826,  23615,  26246,  29079,  31648,  34273,  36900,  39627,
        42188,  44895,  47582,  50285,  52934,  55689,  58266,  61033,
        63742,  66481,  69026,  71737,  74414,  77079,  79704,  82413,
        84986,  87619,  90296,  92953,  95642,  98329, 100958, 103557,
       106254, 108913, 111598, 114395, 117002, 119701, 122334, 125053,
       127572, 130117, 132766, 135415, 138052, 140811, 143484, 146153,
       148762, 151429, 154014, 156763, 159552, 162241, 164792, 167407,
       170102, 172755, 175442, 178071, 180756, 183409, 186088, 188873,
       191568, 194247, 196828, 199409, 201974, 204709, 207454, 210063,
       212828, 215651, 218344, 221019, 223728, 226415, 229000, 231553,
       234218, 236925, 239560, 242223, 244902, 247501, 250118, 252851,
       255476, 258149



(<70320x266926 sparse matrix of type '<class 'numpy.int64'>'
	with 78846054 stored elements in Compressed Sparse Row format>, array([     0,   2689,   5360,   8179,  10922,  13635,  16420,  19199,
        21848,  24433,  27090,  29691,  32434,  35103,  37772,  40315,
        43056,  45659,  48226,  50845,  53586,  56301,  59040,  61811,
        64482,  67197,  69912,  72471,  75138,  77847,  80502,  83241,
        85922,  88609,  91428,  94101,  96830,  99381, 101990, 104687,
       107356, 110027, 112764, 115357, 118148, 120913, 123624, 126371,
       129072, 131717, 134366, 136939, 139622, 142177, 144896, 147625,
       150374, 153089, 155698, 158487, 161112, 163745, 166424, 169149,
       171738, 174263, 176926, 179471, 182160, 184811, 187438, 190049,
       192646, 195317, 198002, 200621, 203258, 205825, 208482, 211213,
       213938, 216609, 219286, 222007, 224714, 227333, 229958, 232571,
       235158, 237861, 240316, 242941, 245560, 248221, 250902, 253591,
       256370, 258965,



(<70320x259802 sparse matrix of type '<class 'numpy.int64'>'
	with 71717250 stored elements in Compressed Sparse Row format>, array([     0,   2525,   5168,   7803,  10450,  13139,  15676,  18341,
        21038,  23509,  26134,  28783,  31274,  33967,  36650,  39217,
        41756,  44397,  47030,  49575,  52220,  54807,  57436,  60017,
        62576,  65177,  67784,  70271,  72888,  75531,  78158,  80813,
        83394,  86029,  88626,  91327,  93884,  96469,  99066, 101641,
       104142, 106747, 109360, 112015, 114672, 117245, 119768, 122371,
       124996, 127643, 130232, 132769, 135236, 137745, 140238, 142823,
       145432, 148065, 150634, 153207, 155792, 158353, 160890, 163421,
       166048, 168645, 171298, 173843, 176446, 178983, 181524, 184177,
       186818, 189329, 191998, 194661, 197208, 199903, 202492, 205025,
       207540, 210209, 212886, 215553, 218090, 220601, 223166, 225759,
       228384, 230973, 233624, 236235, 238852, 241479, 244030, 246693,
       249320, 252027,



(<70320x405920 sparse matrix of type '<class 'numpy.int64'>'
	with 115380107 stored elements in Compressed Sparse Row format>, array([     0,   2601,   5384,   8103,  10810,  13533,  16292,  18931,
        21702,  24323,  27078,  29849,  32576,  35281,  37996,  40727,
        43428,  46219,  49140,  51895,  54658,  57457,  60062,  62717,
        65444,  68035,  70738,  73431,  76206,  79013,  81632,  84231,
        86966,  89533,  92292,  94991,  97634, 100401, 103056, 105839,
       108478, 111061, 113760, 116511, 119130, 122001, 124828, 127377,
       129974, 132735, 135446, 138095, 140740, 143481, 146138, 148851,
       151550, 154289, 156970, 159747, 162448, 165191, 167870, 170485,
       173110, 175699, 178402, 181053, 183846, 186527, 189378, 192069,
       194838, 197549, 200116, 202799, 205420, 208127, 210904, 213663,
       216346, 218983, 221618, 224185, 226992, 229701, 232442, 235183,
       237770, 240525, 243188, 246007, 248596, 251187, 254008, 256749,
       259434, 262083

In [1]:
print('##SFS otpimised###')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, SGDRegressor, RidgeCV, LassoCV, LogisticRegression

from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTEN


smoten_knn_contin = pd.read_csv('smoten_knn_contin.csv', index_col=False)
smoten_median_imputed_contin = pd.read_csv('smoten_median_imputed_contin.csv', index_col=False)

pd.set_option("display.max_columns", None)


def treeScale_coef(scaleFunc1, estimatorFunc2, k_fold_int, x_array, y_array):
    k = k_fold_int
    kf = KFold(n_splits=k, shuffle=True, random_state=None)    
    clf = make_pipeline(scaleFunc1, estimatorFunc2)

    acc_score = []
    for train_index , test_index in kf.split(X):
        X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
        y_train , y_test = y[train_index] , y[test_index]
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
     
    acc = accuracy_score(y_pred , y_test)
    acc_score.append(acc)
     
    avg_acc_score = sum(acc_score)/k
    print('accuracy of each fold - {}'.format(acc_score))
    print('Avg accuracy : {}'.format(avg_acc_score))

    outcome_labels = ['Intubation False', 'Intubation True']

    #print('parameters \n', clf.get_params())
    print('score \n', clf.score(X_train, y_train))
    print('feature_names_in_')
    print(clf.feature_names_in_)
    print()
    print('feature_importances_')
    print(clf[1].feature_importances_)
    print()
    for i in range(len(clf.feature_names_in_)):
        print(clf.feature_names_in_[i], ":" ,clf[1].feature_importances_[i])
    print()
    print('fit')
    print(clf.fit(X_train, y_train).n_features_in_, clf.fit(X_train, y_train).feature_names_in_)
    print()
    print('classification report \n', classification_report(y_test, y_pred, target_names=outcome_labels))
    micro_roc_auc_ovr = roc_auc_score(y_test, y_pred, multi_class="ovr", average="micro")
    print()
    print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.2f}")
    print()
    print('Decision Path')
    print(clf[1].decision_path(X))
    print('\n')
    
    
print('### SMOTEN knn contin ###')
print()

print('Random Forest')


print('Random Forest StandardScaler')
print()

X = smoten_knn_contin[['age', 'dbp_mean', 'temperature_mean', 'wbc_max', 'creatinine_min']]
y = smoten_knn_contin['outcome']

rfmodel = RandomForestClassifier(
                        n_estimators=150, criterion='entropy', 
                        max_depth=None, min_samples_split=2, 
                        min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                        max_features='sqrt', max_leaf_nodes=None, 
                        min_impurity_decrease=0.0, bootstrap=True, oob_score=True, 
                        n_jobs=4, 
                        random_state=None, verbose=0, warm_start=False, 
                        class_weight=None, ccp_alpha=0.0, max_samples=None)


treeScale_coef(StandardScaler(), rfmodel, 5, X, y)
                          
                          
print('Random Forest RobustScaler')

X = smoten_knn_contin[['age', 'dbp_mean', 'temperature_mean', 'wbc_max', 'creatinine_min']]
y = smoten_knn_contin['outcome']
                          
rfmodel = RandomForestClassifier(
                        n_estimators=100, criterion='entropy', 
                        max_depth=None, min_samples_split=2, 
                        min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                        max_features='sqrt', max_leaf_nodes=None, 
                        min_impurity_decrease=0.0, bootstrap=True, oob_score=True, 
                        n_jobs=4, 
                        random_state=None, verbose=0, warm_start=False, 
                        class_weight=None, ccp_alpha=0.0, max_samples=None)

                          
treeScale_coef(RobustScaler(), rfmodel, 5, X, y)


print('### SMOTEN median impute ###')
print()

print('Random Forest StandardScaler')
print()
                          
X = smoten_median_imputed_contin[['age', 'dbp_mean', 'temperature_mean', 'wbc_max', 'creatinine_min']]
y = smoten_median_imputed_contin['outcome']

rfmodel = RandomForestClassifier(
                        n_estimators=100, criterion='entropy', 
                        max_depth=None, min_samples_split=2, 
                        min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                        max_features='sqrt', max_leaf_nodes=None, 
                        min_impurity_decrease=0.0, bootstrap=True, oob_score=True, 
                        n_jobs=4, 
                        random_state=None, verbose=0, warm_start=False, 
                        class_weight=None, ccp_alpha=0.0, max_samples=None)
                                     
treeScale_coef(StandardScaler(), rfmodel, 5, X, y)

print('Random Forest RobustScaler')

X = smoten_median_imputed_contin[['age', 'dbp_mean', 'temperature_mean', 'wbc_max', 'creatinine_min']]
y = smoten_median_imputed_contin['outcome']

rfmodel = RandomForestClassifier(
                        n_estimators=150, criterion='entropy', 
                        max_depth=None, min_samples_split=2, 
                        min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                        max_features='sqrt', max_leaf_nodes=None, 
                        min_impurity_decrease=0.0, bootstrap=True, oob_score=True, 
                        n_jobs=4, 
                        random_state=None, verbose=0, warm_start=False, 
                        class_weight=None, ccp_alpha=0.0, max_samples=None)

treeScale_coef(RobustScaler(), rfmodel, 5, X, y)




##SFS otpimised###
### SMOTEN knn contin ###

Random Forest
Random Forest StandardScaler

accuracy of each fold - [0.9754692832764505]
Avg accuracy : 0.1950938566552901
score 
 1.0
feature_names_in_
['age' 'dbp_mean' 'temperature_mean' 'wbc_max' 'creatinine_min']

feature_importances_
[0.31292545 0.296438   0.19327205 0.13504617 0.06231833]

age : 0.31292544933057515
dbp_mean : 0.2964379981969761
temperature_mean : 0.1932720494185489
wbc_max : 0.13504617372025804
creatinine_min : 0.06231832933364189

fit
5 ['age' 'dbp_mean' 'temperature_mean' 'wbc_max' 'creatinine_min']

classification report 
                   precision    recall  f1-score   support

Intubation False       0.96      0.99      0.98      7018
 Intubation True       0.99      0.96      0.98      7046

        accuracy                           0.98     14064
       macro avg       0.98      0.98      0.98     14064
    weighted avg       0.98      0.98      0.98     14064


Micro-averaged One-vs-Rest ROC AUC score:
0.98



(<70320x519374 sparse matrix of type '<class 'numpy.int64'>'
	with 112401751 stored elements in Compressed Sparse Row format>, array([     0,   3447,   6966,  10517,  13948,  17463,  20954,  24441,
        27936,  31499,  34996,  38285,  41882,  45177,  48628,  52069,
        55498,  58905,  62410,  65891,  69338,  72771,  76384,  79755,
        83256,  86687,  90024,  93429,  96812, 100273, 103706, 107327,
       110914, 114341, 117750, 121211, 124826, 128179, 131690, 135137,
       138558, 141951, 145296, 148779, 152244, 155617, 159086, 162445,
       165978, 169551, 172956, 176523, 179894, 183297, 186734, 190245,
       193774, 197167, 200482, 203907, 207384, 210751, 214160, 217549,
       221012, 224387, 227728, 231215, 234674, 238135, 241480, 244925,
       248388, 251865, 255368, 258757, 262368, 265849, 269324, 272713,
       276200, 279663, 283086, 286573, 289986, 293393, 296856, 300317,
       303754, 307209, 310806, 314269, 317854, 321347, 324756, 328291,
       331772, 335193



(<70320x351108 sparse matrix of type '<class 'numpy.int64'>'
	with 78404598 stored elements in Compressed Sparse Row format>, array([     0,   3567,   7142,  10563,  14046,  17505,  21114,  24601,
        28196,  31629,  35178,  38797,  42246,  45661,  49254,  52865,
        56520,  60003,  63648,  67177,  70604,  74123,  77712,  81277,
        84808,  88305,  91746,  95215,  98822, 102411, 106004, 109443,
       112892, 116269, 119958, 123367, 126818, 130385, 133898, 137275,
       140802, 144337, 147864, 151313, 154930, 158429, 161826, 165337,
       168880, 172295, 175840, 179419, 182826, 186443, 190042, 193611,
       197208, 200755, 204346, 207835, 211454, 214877, 218412, 221847,
       225550, 229121, 232504, 235935, 239360, 242993, 246452, 249881,
       253314, 256857, 260460, 263989, 267486, 271067, 274586, 278037,
       281556, 285087, 288408, 292019, 295576, 299125, 302578, 306029,
       309528, 312939, 316374, 319823, 323314, 326781, 330226, 333781,
       337198, 340717,



(<70320x350870 sparse matrix of type '<class 'numpy.int64'>'
	with 72843364 stored elements in Compressed Sparse Row format>, array([     0,   3447,   6906,  10355,  13990,  17515,  21092,  24541,
        28048,  31571,  35064,  38555,  42090,  45515,  49162,  52517,
        55914,  59495,  62942,  66449,  69970,  73383,  76744,  80279,
        83952,  87555,  90996,  94597,  98248, 101893, 105456, 108909,
       112468, 116021, 119460, 123101, 126632, 130157, 133698, 137125,
       140754, 144321, 147782, 151311, 154776, 158237, 161804, 165301,
       168782, 172227, 175792, 179269, 182798, 186395, 189782, 193223,
       196774, 200213, 203744, 207327, 210838, 214459, 217964, 221529,
       225116, 228511, 231950, 235589, 239056, 242593, 246190, 249687,
       253192, 256783, 260258, 263779, 267232, 270607, 274008, 277473,
       280986, 284427, 287844, 291341, 294890, 298297, 301886, 305499,
       309028, 312439, 315928, 319419, 322854, 326381, 329876, 333427,
       336920, 340353,



(<70320x531950 sparse matrix of type '<class 'numpy.int64'>'
	with 108202515 stored elements in Compressed Sparse Row format>, array([     0,   3453,   7088,  10671,  14140,  17677,  21230,  24743,
        28202,  31891,  35352,  38997,  42418,  45867,  49436,  52887,
        56482,  59997,  63448,  67061,  70572,  74189,  77636,  81217,
        84810,  88311,  91872,  95379,  98932, 102393, 105908, 109441,
       113024, 116767, 120366, 123927, 127512, 131145, 134582, 138135,
       141776, 145447, 149024, 152611, 156282, 159889, 163374, 166979,
       170546, 174167, 177706, 181255, 184664, 188213, 191756, 195393,
       199012, 202509, 206022, 209563, 213132, 216759, 220410, 223987,
       227526, 231113, 234622, 238219, 241640, 245191, 248860, 252351,
       255950, 259423, 263030, 266481, 270142, 273561, 277186, 280671,
       284128, 287517, 291098, 294671, 298222, 301691, 305210, 308687,
       312328, 315869, 319446, 323007, 326530, 330111, 333574, 337087,
       340658, 344229