In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, SGDRegressor, RidgeCV, LassoCV, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTEN


smoten_noncontin = pd.read_csv('smoten_noncontin.csv', index_col=False)

pd.set_option("display.max_columns", None)


def treeScale_coef(scaleFunc1, estimatorFunc2, k_fold_int, x_array, y_array):
    k = k_fold_int
    kf = KFold(n_splits=k, shuffle=True, random_state=None)    
    clf = make_pipeline(scaleFunc1, estimatorFunc2)

    acc_score = []
    for train_index , test_index in kf.split(X):
        X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
        y_train , y_test = y[train_index] , y[test_index]
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
     
    acc = accuracy_score(y_pred , y_test)
    acc_score.append(acc)
     
    avg_acc_score = sum(acc_score)/k
    print('accuracy of each fold - {}'.format(acc_score))
    print('Avg accuracy : {}'.format(avg_acc_score))

    outcome_labels = ['Intubation False', 'Intubation True']

    #print('parameters \n', clf.get_params())
    print('score \n', clf.score(X_train, y_train))
    print('feature_names_in_')
    print(clf.feature_names_in_)
    print()
    print('feature_importances_')
    print(clf[1].feature_importances_)
    print()
    for i in range(len(clf.feature_names_in_)):
        print(clf.feature_names_in_[i], ":" ,clf[1].feature_importances_[i])
    print()
    print('fit')
    print(clf.fit(X_train, y_train).n_features_in_, clf.fit(X_train, y_train).feature_names_in_)
    print()
    print('classification report \n', classification_report(y_test, y_pred, target_names=outcome_labels))
    micro_roc_auc_ovr = roc_auc_score(y_test, y_pred, multi_class="ovr", average="micro")
    print()
    print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.2f}")
    print()
    print('Decision Path')
    print(clf[1].decision_path(X))
    print('\n')
    

print('Random Forest')

rfmodel = RandomForestClassifier(
                        n_estimators=150, criterion='entropy', 
                        max_depth=None, min_samples_split=2, 
                        min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                        max_features='sqrt', max_leaf_nodes=None, 
                        min_impurity_decrease=0.0, bootstrap=True, oob_score=True, 
                        n_jobs=4, 
                        random_state=None, verbose=0, warm_start=False, 
                        class_weight=None, ccp_alpha=0.0, max_samples=None)

print('### SMOTEN noncontin median impute ###')
print()

X = smoten_noncontin.drop('outcome',axis= 1)
y = smoten_noncontin['outcome']

print('Random Forest StandardScaler')
print()
treeScale_coef(StandardScaler(), rfmodel, 5, X, y)

print('Random Forest RobustScaler')

rfmodel = RandomForestClassifier(
                        n_estimators=50, criterion='entropy', 
                        max_depth=None, min_samples_split=2, 
                        min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                        max_features='sqrt', max_leaf_nodes=None, 
                        min_impurity_decrease=0.0, bootstrap=True, oob_score=True, 
                        n_jobs=4, 
                        random_state=None, verbose=0, warm_start=False, 
                        class_weight=None, ccp_alpha=0.0, max_samples=None)

treeScale_coef(RobustScaler(), rfmodel, 5, X, y)




Random Forest
### SMOTEN noncontin median impute ###

Random Forest StandardScaler

accuracy of each fold - [0.8459186575654153]
Avg accuracy : 0.16918373151308305
score 
 0.854077787258248
feature_names_in_
['sofa_coagulation' 'sofa_cardiovascular' 'sofa_cns' 'sofa_renal' 'gender'
 'charlson_comorbidity_index']

feature_importances_
[0.10390676 0.09400234 0.29965996 0.12966577 0.05046214 0.32230304]

sofa_coagulation : 0.10390675517805431
sofa_cardiovascular : 0.09400233927922674
sofa_cns : 0.29965996066851647
sofa_renal : 0.12966576793524928
gender : 0.05046214190415645
charlson_comorbidity_index : 0.32230303503479674

fit
6 ['sofa_coagulation' 'sofa_cardiovascular' 'sofa_cns' 'sofa_renal' 'gender'
 'charlson_comorbidity_index']

classification report 
                   precision    recall  f1-score   support

Intubation False       0.86      0.83      0.84      7050
 Intubation True       0.83      0.86      0.85      7014

        accuracy                           0.85     14064




(<70320x476062 sparse matrix of type '<class 'numpy.int64'>'
	with 111265821 stored elements in Compressed Sparse Row format>, array([     0,   3229,   6408,   9519,  12682,  15845,  18982,  22181,
        25466,  28551,  31712,  34945,  38082,  41423,  44580,  47833,
        50998,  54301,  57480,  60759,  63874,  67201,  70524,  73793,
        76956,  80117,  83304,  86397,  89440,  92643,  95834,  98977,
       102138, 105361, 108460, 111615, 114736, 117967, 121056, 124095,
       127144, 130413, 133622, 136859, 139982, 143089, 146216, 149397,
       152632, 155765, 158878, 162001, 165300, 168505, 171702, 174839,
       178102, 181341, 184472, 187683, 190810, 194033, 197212, 200367,
       203396, 206663, 209834, 213025, 216294, 219473, 222726, 225893,
       229032, 232173, 235364, 238611, 241654, 244919, 248002, 251137,
       254450, 257615, 260654, 263797, 266936, 270069, 273248, 276439,
       279616, 282765, 285934, 289025, 292198, 295455, 298496, 301649,
       304842, 307897



(<70320x159790 sparse matrix of type '<class 'numpy.int64'>'
	with 36428440 stored elements in Compressed Sparse Row format>, array([     0,   3255,   6504,   9655,  12896,  16153,  19344,  22659,
        25858,  29091,  32302,  35593,  38820,  41891,  44962,  48091,
        51238,  54405,  57550,  60803,  64022,  67207,  70250,  73395,
        76682,  79827,  82904,  86117,  89284,  92459,  95750,  98993,
       102140, 105253, 108590, 111709, 114820, 118015, 121366, 124527,
       127662, 130799, 134026, 137213, 140572, 143831, 146970, 150109,
       153528, 156709, 159790], dtype=int32))


