In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, SGDRegressor, RidgeCV, LassoCV, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTEN


smoten_noncontin = pd.read_csv('smoten_noncontin.csv', index_col=False)

pd.set_option("display.max_columns", None)


def treeScale_coef(scaleFunc1, estimatorFunc2, k_fold_int, x_array, y_array):
    k = k_fold_int
    kf = KFold(n_splits=k, shuffle=True, random_state=None)    
    clf = make_pipeline(scaleFunc1, estimatorFunc2)

    acc_score = []
    for train_index , test_index in kf.split(X):
        X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
        y_train , y_test = y[train_index] , y[test_index]
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
     
    acc = accuracy_score(y_pred , y_test)
    acc_score.append(acc)
     
    avg_acc_score = sum(acc_score)/k
    print('accuracy of each fold - {}'.format(acc_score))
    print('Avg accuracy : {}'.format(avg_acc_score))

    outcome_labels = ['Intubation False', 'Intubation True']

    #print('parameters \n', clf.get_params())
    print('score \n', clf.score(X_train, y_train))
    print('feature_names_in_')
    print(clf.feature_names_in_)
    print()
    print('feature_importances_')
    print(clf[1].feature_importances_)
    print()
    for i in range(len(clf.feature_names_in_)):
        print(clf.feature_names_in_[i], ":" ,clf[1].feature_importances_[i])
    print()
    print('fit')
    print(clf.fit(X_train, y_train).n_features_in_, clf.fit(X_train, y_train).feature_names_in_)
    print()
    print('classification report \n', classification_report(y_test, y_pred, target_names=outcome_labels))
    micro_roc_auc_ovr = roc_auc_score(y_test, y_pred, multi_class="ovr", average="micro")
    print()
    print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.2f}")
    print()
    print('Decision Path')
    print(clf[1].decision_path(X))
    print('\n')
    

print('Random Forest')

rfmodel = RandomForestClassifier(
                        n_estimators=100, criterion='entropy', 
                        max_depth=None, min_samples_split=2, 
                        min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                        max_features='sqrt', max_leaf_nodes=None, 
                        min_impurity_decrease=0.0, bootstrap=True, oob_score=True, 
                        n_jobs=4, 
                        random_state=None, verbose=0, warm_start=False, 
                        class_weight=None, ccp_alpha=0.0, max_samples=None)

print('### SMOTEN noncontin median impute ###')
print()

X = smoten_noncontin.drop('outcome',axis= 1)
y = smoten_noncontin['outcome']

print('Random Forest StandardScaler')
print()
treeScale_coef(StandardScaler(), rfmodel, 5, X, y)

print('Random Forest RobustScaler')
treeScale_coef(RobustScaler(), rfmodel, 5, X, y)




Random Forest
### SMOTEN noncontin median impute ###

Random Forest StandardScaler

accuracy of each fold - [0.8482650739476678]
Avg accuracy : 0.16965301478953357
score 
 0.8539889078498294
feature_names_in_
['sofa_coagulation' 'sofa_cardiovascular' 'sofa_cns' 'sofa_renal' 'gender'
 'charlson_comorbidity_index']

feature_importances_
[0.11091011 0.08921814 0.3042896  0.13409941 0.0496777  0.31180504]

sofa_coagulation : 0.11091011297187559
sofa_cardiovascular : 0.08921814224414845
sofa_cns : 0.30428960014647727
sofa_renal : 0.1340994052552457
gender : 0.049677697141276095
charlson_comorbidity_index : 0.31180504224097694

fit
6 ['sofa_coagulation' 'sofa_cardiovascular' 'sofa_cns' 'sofa_renal' 'gender'
 'charlson_comorbidity_index']

classification report 
                   precision    recall  f1-score   support

Intubation False       0.87      0.82      0.84      7014
 Intubation True       0.83      0.88      0.85      7050

        accuracy                           0.85     14064



(<70320x320038 sparse matrix of type '<class 'numpy.int64'>'
	with 75558007 stored elements in Compressed Sparse Row format>, array([     0,   3179,   6390,   9527,  12654,  15963,  19138,  22379,
        25646,  28925,  32176,  35347,  38562,  41787,  45040,  48335,
        51536,  54777,  57936,  61165,  64260,  67517,  70670,  73995,
        77214,  80593,  83766,  87085,  90240,  93335,  96536,  99785,
       102962, 106169, 109298, 112429, 115660, 118907, 122066, 125251,
       128494, 131617, 134820, 138001, 141108, 144289, 147476, 150769,
       153934, 157113, 160440, 163545, 166766, 169909, 173126, 176451,
       179586, 182871, 186056, 189277, 192276, 195491, 198810, 201897,
       205132, 208225, 211472, 214655, 217748, 221015, 224194, 227341,
       230568, 233635, 236830, 240021, 243378, 246551, 249756, 252841,
       255946, 259251, 262452, 265597, 268792, 271915, 275054, 278283,
       281502, 284745, 287954, 291139, 294258, 297481, 300604, 303861,
       307078, 310377,



(<70320x315684 sparse matrix of type '<class 'numpy.int64'>'
	with 71444614 stored elements in Compressed Sparse Row format>, array([     0,   3067,   6236,   9349,  12592,  15687,  18888,  22087,
        25202,  28443,  31652,  34797,  37896,  41147,  44266,  47403,
        50606,  53749,  56848,  59973,  63170,  66379,  69464,  72777,
        75944,  78973,  82144,  85233,  88510,  91563,  94760,  97925,
       101020, 104163, 107348, 110405, 113574, 116725, 119832, 123053,
       126130, 129277, 132458, 135719, 138864, 141871, 144858, 147983,
       151182, 154377, 157480, 160655, 163820, 166931, 170050, 173091,
       176268, 179521, 182656, 185847, 188954, 192103, 195290, 198561,
       201592, 204653, 207838, 211001, 214300, 217509, 220684, 223863,
       227074, 230269, 233452, 236627, 239812, 242965, 246236, 249315,
       252502, 255705, 258780, 261929, 265192, 268231, 271324, 274489,
       277512, 280705, 284026, 287085, 290172, 293289, 296416, 299595,
       302932, 306135,