In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, SGDRegressor, RidgeCV, LassoCV, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTEN


smoten_median_imputed_less_40 = pd.read_csv('smoten_median_imputed_less_40.csv', index_col=False)

pd.set_option("display.max_columns", None)


def treeScale_coef(scaleFunc1, estimatorFunc2, k_fold_int, x_array, y_array):
    k = k_fold_int
    kf = KFold(n_splits=k, shuffle=True, random_state=None)    
    clf = make_pipeline(scaleFunc1, estimatorFunc2)

    acc_score = []
    for train_index , test_index in kf.split(X):
        X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
        y_train , y_test = y[train_index] , y[test_index]
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
     
    acc = accuracy_score(y_pred , y_test)
    acc_score.append(acc)
     
    avg_acc_score = sum(acc_score)/k
    print('accuracy of each fold - {}'.format(acc_score))
    print('Avg accuracy : {}'.format(avg_acc_score))

    outcome_labels = ['Intubation False', 'Intubation True']

    #print('parameters \n', clf.get_params())
    print('score \n', clf.score(X_train, y_train))
    print('feature_names_in_')
    print(clf.feature_names_in_)
    print()
    print('feature_importances_')
    print(clf[1].feature_importances_)
    print()
    for i in range(len(clf.feature_names_in_)):
        print(clf.feature_names_in_[i], ":" ,clf[1].feature_importances_[i])
    print()
    print('fit')
    print(clf.fit(X_train, y_train).n_features_in_, clf.fit(X_train, y_train).feature_names_in_)
    print()
    print('classification report \n', classification_report(y_test, y_pred, target_names=outcome_labels))
    micro_roc_auc_ovr = roc_auc_score(y_test, y_pred, multi_class="ovr", average="micro")
    print()
    print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.2f}")
    print()
    print('Decision Path')
    print(clf[1].decision_path(X))
    print('\n')
    

print('Random Forest')

rfmodel = RandomForestClassifier(
                        n_estimators=100, criterion='entropy', 
                        max_depth=None, min_samples_split=2, 
                        min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                        max_features='sqrt', max_leaf_nodes=None, 
                        min_impurity_decrease=0.0, bootstrap=True, oob_score=True, 
                        n_jobs=4, 
                        random_state=None, verbose=0, warm_start=False, 
                        class_weight=None, ccp_alpha=0.0, max_samples=None)

print('### SMOTEN noncontin median impute ###')
print()

X = smoten_median_imputed_less_40.drop('outcome',axis= 1)
y = smoten_median_imputed_less_40['outcome']

print('Random Forest StandardScaler')
print()
treeScale_coef(StandardScaler(), rfmodel, 5, X, y)

print('Random Forest RobustScaler')
treeScale_coef(RobustScaler(), rfmodel, 5, X, y)




Random Forest
### SMOTEN noncontin median impute ###

Random Forest StandardScaler

accuracy of each fold - [0.9788822525597269]
Avg accuracy : 0.1957764505119454
score 
 0.9999111205915814
feature_names_in_
['gender' 'age' 'heart_rate_min' 'heart_rate_max' 'heart_rate_mean'
 'mbp_min' 'mbp_max' 'mbp_mean' 'sbp_min' 'sbp_max' 'sbp_mean' 'dbp_min'
 'dbp_max' 'dbp_mean' 'temperature_min' 'temperature_max'
 'temperature_mean' 'glucose_min' 'glucose_max' 'wbc_min' 'wbc_max'
 'creatinine_min' 'creatinine_max' 'hemoglobin_min' 'hemoglobin_max'
 'pt_min' 'pt_max' 'urineoutput' 'sofa_coagulation' 'sofa_cardiovascular'
 'sofa_cns' 'sofa_renal' 'charlson_comorbidity_index']

feature_importances_
[0.00108559 0.13061477 0.00843732 0.01075036 0.04188375 0.01329581
 0.01258082 0.0399405  0.02121851 0.01163083 0.06817762 0.00947629
 0.01045165 0.07193598 0.00633301 0.00813107 0.05629156 0.02126089
 0.03564309 0.04280652 0.0274827  0.01322908 0.01555167 0.03154462
 0.05414637 0.02077209 0.03296783 0.1



(<70320x246274 sparse matrix of type '<class 'numpy.int64'>'
	with 73999084 stored elements in Compressed Sparse Row format>, array([     0,   2537,   4982,   7327,   9788,  12255,  14668,  17145,
        19554,  22007,  24558,  27119,  29566,  32101,  34462,  36989,
        39492,  41869,  44332,  46759,  49264,  51729,  54152,  56505,
        59056,  61449,  63892,  66277,  68706,  71197,  73634,  76105,
        78492,  80927,  83382,  85863,  88260,  90677,  93208,  95661,
        98032, 100523, 103018, 105479, 108074, 110479, 112866, 115375,
       117940, 120271, 122826, 125373, 127878, 130369, 132956, 135407,
       137778, 140217, 142710, 145115, 147690, 150211, 152688, 155139,
       157542, 160053, 162516, 164955, 167406, 169859, 172364, 174747,
       177192, 179673, 182068, 184751, 187254, 189685, 192142, 194591,
       196952, 199409, 201830, 204233, 206684, 209061, 211492, 213915,
       216302, 218735, 221258, 223799, 226204, 228743, 231250, 233665,
       236226, 238803,



(<70320x248606 sparse matrix of type '<class 'numpy.int64'>'
	with 77295401 stored elements in Compressed Sparse Row format>, array([     0,   2573,   4950,   7455,   9960,  12509,  14952,  17439,
        19916,  22421,  24864,  27291,  29804,  32245,  34790,  37291,
        39714,  42319,  44764,  47185,  49748,  52211,  54720,  57297,
        59858,  62427,  64888,  67485,  69876,  72499,  74836,  77429,
        80030,  82505,  84940,  87471,  89782,  92271,  94690,  97183,
        99580, 102029, 104562, 107041, 109474, 111909, 114340, 116827,
       119328, 121907, 124476, 127005, 129466, 131967, 134392, 136909,
       139456, 141923, 144340, 146803, 149190, 151627, 154098, 156665,
       159048, 161599, 164064, 166505, 169054, 171541, 174044, 176513,
       179054, 181607, 184088, 186549, 188934, 191369, 193888, 196353,
       198858, 201293, 203730, 206207, 208678, 211187, 213700, 216173,
       218598, 221091, 223546, 226041, 228428, 230985, 233438, 235981,
       238478, 240959,