In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, SGDRegressor, RidgeCV, LassoCV, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTEN


smoten_median_imputed_less_40 = pd.read_csv('smoten_median_imputed_less_40.csv', index_col=False)

pd.set_option("display.max_columns", None)


def treeScale_coef(scaleFunc1, estimatorFunc2, k_fold_int, x_array, y_array):
    k = k_fold_int
    kf = KFold(n_splits=k, shuffle=True, random_state=None)    
    clf = make_pipeline(scaleFunc1, estimatorFunc2)

    acc_score = []
    for train_index , test_index in kf.split(X):
        X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
        y_train , y_test = y[train_index] , y[test_index]
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
     
    acc = accuracy_score(y_pred , y_test)
    acc_score.append(acc)
     
    avg_acc_score = sum(acc_score)/k
    print('accuracy of each fold - {}'.format(acc_score))
    print('Avg accuracy : {}'.format(avg_acc_score))

    outcome_labels = ['Intubation False', 'Intubation True']

    #print('parameters \n', clf.get_params())
    print('score \n', clf.score(X_train, y_train))
    print('feature_names_in_')
    print(clf.feature_names_in_)
    print()
    print('feature_importances_')
    print(clf[1].feature_importances_)
    print()
    for i in range(len(clf.feature_names_in_)):
        print(clf.feature_names_in_[i], ":" ,clf[1].feature_importances_[i])
    print()
    print('fit')
    print(clf.fit(X_train, y_train).n_features_in_, clf.fit(X_train, y_train).feature_names_in_)
    print()
    print('classification report \n', classification_report(y_test, y_pred, target_names=outcome_labels))
    micro_roc_auc_ovr = roc_auc_score(y_test, y_pred, multi_class="ovr", average="micro")
    print()
    print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.2f}")
    print()
    print('Decision Path')
    print(clf[1].decision_path(X))
    print('\n')
    

print('Random Forest')



print('### SMOTEN noncontin median impute ###')
print()

print('Random Forest StandardScaler')
print()

X = smoten_median_imputed_less_40.loc[:, ~smoten_median_imputed_less_40.columns.isin(['heart_rate_max', 'heart_rate_mean', 'sbp_mean', 'dbp_min', 'dbp_max', 'temperature_max', 'glucose_min', 'pt_max', 'sofa_cardiovascular','outcome'])]
y = smoten_median_imputed_less_40['outcome']

                          
rfmodel = RandomForestClassifier(
                        n_estimators=150, criterion='entropy', 
                        max_depth=None, min_samples_split=2, 
                        min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                        max_features='sqrt', max_leaf_nodes=None, 
                        min_impurity_decrease=0.0, bootstrap=True, oob_score=True, 
                        n_jobs=4, 
                        random_state=None, verbose=0, warm_start=False, 
                        class_weight=None, ccp_alpha=0.0, max_samples=None)

treeScale_coef(StandardScaler(), rfmodel, 5, X, y)

print('Random Forest RobustScaler')
print()

X = smoten_median_imputed_less_40.loc[:, ~smoten_median_imputed_less_40.columns.isin(['mbp_min', 'mbp_mean', 'sbp_min', 'sbp_max', 'temperature_max', 'glucose_min', 'pt_min', 'sofa_cardiovascular', 'charlson_comorbidity_index', 'outcome'])]
y = smoten_median_imputed_less_40['outcome']

rfmodel = RandomForestClassifier(
                        n_estimators=50, criterion='entropy', 
                        max_depth=None, min_samples_split=2, 
                        min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                        max_features='sqrt', max_leaf_nodes=None, 
                        min_impurity_decrease=0.0, bootstrap=True, oob_score=True, 
                        n_jobs=4, 
                        random_state=None, verbose=0, warm_start=False, 
                        class_weight=None, ccp_alpha=0.0, max_samples=None)


treeScale_coef(RobustScaler(), rfmodel, 5, X, y)




Random Forest
### SMOTEN noncontin median impute ###

Random Forest StandardScaler

accuracy of each fold - [0.9793799772468714]
Avg accuracy : 0.19587599544937428
score 
 1.0
feature_names_in_
['gender' 'age' 'heart_rate_min' 'mbp_min' 'mbp_max' 'mbp_mean' 'sbp_min'
 'sbp_max' 'dbp_mean' 'temperature_min' 'temperature_mean' 'glucose_max'
 'wbc_min' 'wbc_max' 'creatinine_min' 'creatinine_max' 'hemoglobin_min'
 'hemoglobin_max' 'pt_min' 'urineoutput' 'sofa_coagulation' 'sofa_cns'
 'sofa_renal' 'charlson_comorbidity_index']

feature_importances_
[0.00186373 0.15356023 0.01586334 0.01307661 0.01896807 0.0742692
 0.02346606 0.01543227 0.09499298 0.00892302 0.07201718 0.04473275
 0.04776118 0.04080437 0.01672386 0.02282074 0.03609848 0.06077533
 0.03260691 0.16068364 0.01156911 0.01586403 0.00917705 0.00794987]

gender : 0.001863726020990093
age : 0.15356022761030252
heart_rate_min : 0.0158633401413189
mbp_min : 0.013076606169203373
mbp_max : 0.018968066767957332
mbp_mean : 0.07426920057948



(<70320x398706 sparse matrix of type '<class 'numpy.int64'>'
	with 112437649 stored elements in Compressed Sparse Row format>, array([     0,   2613,   5232,   7847,  10460,  13187,  15692,  18415,
        21092,  23853,  26498,  29195,  31900,  34537,  37158,  39759,
        42418,  44985,  47696,  50421,  53148,  55861,  58468,  60995,
        63632,  66269,  68964,  71605,  74334,  76975,  79644,  82307,
        85004,  87635,  90262,  93053,  95740,  98327, 101026, 103601,
       106320, 108887, 111574, 114211, 116820, 119473, 122188, 124829,
       127458, 130155, 132772, 135397, 138040, 140683, 143272, 145943,
       148562, 151021, 153726, 156427, 158992, 161629, 164192, 166887,
       169512, 172369, 175024, 177673, 180354, 182909, 185648, 188195,
       190828, 193497, 196262, 198923, 201592, 204347, 207178, 209817,
       212484, 215143, 217822, 220563, 223254, 225889, 228480, 231159,
       233780, 236443, 239108, 241803, 244366, 247083, 249830, 252425,
       255114, 257775



(<70320x130656 sparse matrix of type '<class 'numpy.int64'>'
	with 38309236 stored elements in Compressed Sparse Row format>, array([     0,   2557,   5198,   7769,  10378,  12989,  15562,  18107,
        20710,  23329,  25938,  28517,  31148,  33729,  36442,  38893,
        41406,  44095,  46642,  49231,  51828,  54373,  56994,  59499,
        62030,  64647,  67290,  69937,  72832,  75457,  78052,  80707,
        83348,  85995,  88698,  91385,  94072,  96583,  99186, 101803,
       104346, 106897, 109590, 112241, 114962, 117667, 120350, 122951,
       125534, 128187, 130656], dtype=int32))


