In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, SGDRegressor, RidgeCV, LassoCV, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTEN


smoten_knn_contin = pd.read_csv('smoten_knn_contin.csv', index_col=False)
smoten_median_imputed_contin = pd.read_csv('smoten_median_imputed_contin.csv', index_col=False)

pd.set_option("display.max_columns", None)


def ensembleScale_coef(scaleFunc1, estimatorFunc2, k_fold_int, x_array, y_array):
    k = k_fold_int
    kf = KFold(n_splits=k, shuffle=True, random_state=None)    
    clf = make_pipeline(scaleFunc1, estimatorFunc2)

    acc_score = []
    for train_index , test_index in kf.split(X):
        X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
        y_train , y_test = y[train_index] , y[test_index]
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
     
    acc = accuracy_score(y_pred , y_test)
    acc_score.append(acc)
     
    avg_acc_score = sum(acc_score)/k
    print('accuracy of each fold - {}'.format(acc_score))
    print('Avg accuracy : {}'.format(avg_acc_score))

    outcome_labels = ['Intubation False', 'Intubation True']

    #print('parameters \n', clf.get_params())
    print('score \n', clf.score(X_train, y_train))
    print('feature_names_in_')
    print(clf.feature_names_in_)
    print()
    print('feature_importances_')
    print(clf[1].feature_importances_)
    print()
    for i in range(len(clf.feature_names_in_)):
        print(clf.feature_names_in_[i], ":" ,clf[1].feature_importances_[i])
    print()
    print('fit')
    print(clf.fit(X_train, y_train).n_features_in_, clf.fit(X_train, y_train).feature_names_in_)
    print()
    print('classification report \n', classification_report(y_test, y_pred, target_names=outcome_labels))
    micro_roc_auc_ovr = roc_auc_score(y_test, y_pred, multi_class="ovr", average="micro")
    print()
    print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.2f}")
    print()
    print('\n')
    

print('Gradient Boost Ensemble')

estimate = GradientBoostingClassifier(loss='log_loss', learning_rate=0.1, 
                                      n_estimators=100, subsample=1.0, 
                                      criterion='friedman_mse', min_samples_split=2, 
                                      min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                                      max_depth=3, min_impurity_decrease=0.0, init=None, 
                                      random_state=None, max_features=None, verbose=0, 
                                      max_leaf_nodes=None, warm_start=False, 
                                      validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)


print('### SMOTEN knn contin ###')
print()

X = smoten_knn_contin.drop('outcome',axis= 1)
y = smoten_knn_contin['outcome']

print('StandardScaler')
print()
ensembleScale_coef(StandardScaler(), estimate, 5, X, y)

print('RobustScaler')
ensembleScale_coef(RobustScaler(), estimate, 5, X, y)

print('### SMOTEN median impute ###')
print()

X = smoten_median_imputed_contin.drop('outcome',axis= 1)
y = smoten_median_imputed_contin['outcome']

print('StandardScaler')
print()
ensembleScale_coef(StandardScaler(), estimate, 5, X, y)

print('RobustScaler')
ensembleScale_coef(RobustScaler(), estimate, 5, X, y)


Gradient Boost Ensemble
### SMOTEN knn contin ###

StandardScaler

accuracy of each fold - [0.975042662116041]
Avg accuracy : 0.1950085324232082
score 
 0.9769980091012515
feature_names_in_
['age' 'heart_rate_min' 'heart_rate_max' 'heart_rate_mean' 'mbp_min'
 'mbp_max' 'mbp_mean' 'sbp_min' 'sbp_max' 'sbp_mean' 'dbp_min' 'dbp_max'
 'dbp_mean' 'temperature_min' 'temperature_max' 'temperature_mean'
 'glucose_min' 'glucose_max' 'wbc_min' 'wbc_max' 'creatinine_min'
 'creatinine_max' 'hemoglobin_min' 'hemoglobin_max' 'pt_min' 'pt_max'
 'urineoutput' 'gender']

feature_importances_
[2.83236967e-01 1.16107721e-04 6.37376361e-04 2.77670033e-02
 8.13227098e-04 1.68241177e-04 8.82018944e-03 2.07700914e-03
 7.57986409e-04 4.73533298e-02 1.12624144e-03 1.37171188e-03
 1.01367565e-01 1.37957685e-04 1.26322615e-04 5.00690748e-02
 1.72411123e-03 8.20371047e-03 6.41195148e-03 6.37083116e-03
 4.18799512e-03 1.29335441e-02 5.14291790e-03 2.65630248e-02
 7.76993085e-03 1.07422826e-02 3.83652275e-01 3.5111

accuracy of each fold - [0.9768202502844141]
Avg accuracy : 0.19536405005688282
score 
 0.9762691979522184
feature_names_in_
['age' 'heart_rate_min' 'heart_rate_max' 'heart_rate_mean' 'mbp_min'
 'mbp_max' 'mbp_mean' 'sbp_min' 'sbp_max' 'sbp_mean' 'dbp_min' 'dbp_max'
 'dbp_mean' 'temperature_min' 'temperature_max' 'temperature_mean'
 'glucose_min' 'glucose_max' 'wbc_min' 'wbc_max' 'creatinine_min'
 'creatinine_max' 'hemoglobin_min' 'hemoglobin_max' 'pt_min' 'pt_max'
 'urineoutput' 'gender']

feature_importances_
[2.85710227e-01 3.70125351e-04 1.49555580e-04 3.39077304e-02
 1.40234700e-03 2.59380273e-04 7.27856220e-03 3.16967439e-03
 6.19656584e-04 3.26896132e-02 1.62592613e-03 1.91826870e-03
 1.11725157e-01 1.51195408e-04 9.04825004e-05 4.46401411e-02
 9.05117660e-04 7.60160287e-03 4.88654474e-03 6.36473744e-03
 5.64133421e-03 9.86040136e-03 5.10696901e-03 3.02816256e-02
 7.35270739e-03 1.15413960e-02 3.84343196e-01 4.06324704e-04]

age : 0.2857102273674618
heart_rate_min : 0.0003701253

In [None]:
Non scale

In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, SGDRegressor, RidgeCV, LassoCV, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTEN


smoten_knn_contin = pd.read_csv('smoten_knn_contin.csv', index_col=False)
smoten_median_imputed_contin = pd.read_csv('smoten_median_imputed_contin.csv', index_col=False)

pd.set_option("display.max_columns", None)


def ensembleScale_coef(estimatorFunc2, k_fold_int, x_array, y_array):
    k = k_fold_int
    kf = KFold(n_splits=k, shuffle=True, random_state=None)    
    clf = make_pipeline(estimatorFunc2)

    acc_score = []
    for train_index , test_index in kf.split(X):
        X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
        y_train , y_test = y[train_index] , y[test_index]
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
     
    acc = accuracy_score(y_pred , y_test)
    acc_score.append(acc)
     
    avg_acc_score = sum(acc_score)/k
    print('accuracy of each fold - {}'.format(acc_score))
    print('Avg accuracy : {}'.format(avg_acc_score))

    outcome_labels = ['Intubation False', 'Intubation True']

    #print('parameters \n', clf.get_params())
    print('score \n', clf.score(X_train, y_train))
    print('feature_names_in_')
    print(clf.feature_names_in_)
    print()
    print('feature_importances_')
    print(clf.feature_importances_)
    print()
    for i in range(len(clf.feature_names_in_)):
        print(clf.feature_names_in_[i], ":" ,clf.feature_importances_[i])
    print()
    print('fit')
    print(clf.fit(X_train, y_train).n_features_in_, clf.fit(X_train, y_train).feature_names_in_)
    print()
    print('classification report \n', classification_report(y_test, y_pred, target_names=outcome_labels))
    micro_roc_auc_ovr = roc_auc_score(y_test, y_pred, multi_class="ovr", average="micro")
    print()
    print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.2f}")
    print()
    print('\n')
    

print('Gradient Boost Ensemble')

estimate = GradientBoostingClassifier(loss='log_loss', learning_rate=0.1, 
                                      n_estimators=100, subsample=1.0, 
                                      criterion='friedman_mse', min_samples_split=2, 
                                      min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                                      max_depth=3, min_impurity_decrease=0.0, init=None, 
                                      random_state=None, max_features=None, verbose=0, 
                                      max_leaf_nodes=None, warm_start=False, 
                                      validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)


print('### SMOTEN knn contin ###')
print()

X = smoten_knn_contin.drop('outcome',axis= 1)
y = smoten_knn_contin['outcome']

print('non Scale')
print()
ensembleScale_coef(estimate, 5, X, y)


print('### SMOTEN median impute ###')
print()

X = smoten_median_imputed_contin.drop('outcome',axis= 1)
y = smoten_median_imputed_contin['outcome']

print('non Scale')
print()
ensembleScale_coef(estimate, 5, X, y)


Gradient Boost Ensemble
### SMOTEN knn contin ###

non Scale

accuracy of each fold - [0.9749004550625711]
Avg accuracy : 0.1949800910125142
score 
 0.9764647326507395
feature_names_in_
['age' 'heart_rate_min' 'heart_rate_max' 'heart_rate_mean' 'mbp_min'
 'mbp_max' 'mbp_mean' 'sbp_min' 'sbp_max' 'sbp_mean' 'dbp_min' 'dbp_max'
 'dbp_mean' 'temperature_min' 'temperature_max' 'temperature_mean'
 'glucose_min' 'glucose_max' 'wbc_min' 'wbc_max' 'creatinine_min'
 'creatinine_max' 'hemoglobin_min' 'hemoglobin_max' 'pt_min' 'pt_max'
 'urineoutput' 'gender']

feature_importances_


AttributeError: 'Pipeline' object has no attribute 'feature_importances_'