In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, SGDRegressor, RidgeCV, LassoCV
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTEN

smoten_median_imputed_less_40 = pd.read_csv('smoten_median_imputed_less_40.csv', index_col=False)

pd.set_option("display.max_columns", None)

def classifyScale_coef(scaleFunc1, estimatorFunc2, k_fold_int, x_array, y_array):
    k = k_fold_int
    kf = KFold(n_splits=k, shuffle=True, random_state=None)    
    clf = make_pipeline(scaleFunc1, estimatorFunc2)

    acc_score = []
    for train_index , test_index in kf.split(X):
        X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
        y_train , y_test = y[train_index] , y[test_index]
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
     
    acc = accuracy_score(y_pred , y_test)
    acc_score.append(acc)
     
    avg_acc_score = sum(acc_score)/k
    print('accuracy of each fold - {}'.format(acc_score))
    print('Avg accuracy : {}'.format(avg_acc_score))

    outcome_labels = ['Intubation False', 'Intubation True']

    #print('parameters \n', clf.get_params())
    print('score \n', clf.score(X_train, y_train))
    print('coef_')
    print(clf[1].coef_)
    print('intercept_')
    print(clf[1].intercept_)
    print('fit')
    print(clf.fit(X_train, y_train).n_features_in_, clf.fit(X_train, y_train).feature_names_in_)
    print('decision function \n', clf[1].decision_function(X_test))
    print('classification report \n', classification_report(y_test, y_pred, target_names=outcome_labels))
    micro_roc_auc_ovr = roc_auc_score(y_test, y_pred, multi_class="ovr", average="micro")
    print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.2f}")
    print('\n')

    
print('### SMOTEN median all undr 40 missing ###')
print()
sgdClass = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=0.1, n_jobs=None, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, class_weight=None, warm_start=False, average=False)
X = smoten_median_imputed_less_40.drop('outcome',axis= 1)
y = smoten_median_imputed_less_40['outcome']

print('linear SGDClassifier SVM StandardScaler')
print()
classifyScale_coef(StandardScaler(), sgdClass, 5, X, y)

print('linear SGDClassifier SVM RobustScaler')
classifyScale_coef(RobustScaler(), sgdClass, 5, X, y)




### SMOTEN median all undr 40 missing ###

linear SGDClassifier SVM StandardScaler

accuracy of each fold - [0.9670079635949943]
Avg accuracy : 0.19340159271899887
score 
 0.9700120875995449
coef_
[[ 0.11073139 -0.84075141  0.22471615  0.15481951 -0.66664863 -0.02761057
   0.00162523  0.59496451 -0.16964939  0.04504454 -0.31697079 -0.10405995
   0.05985317 -0.83952648 -0.00438285  0.12605004 -0.47412577 -0.11406773
  -0.21376324 -0.06129465 -0.31424956 -0.17436553 -0.14195291 -0.3292039
  -0.02641773 -0.26444106  0.04853392 -0.83118528 -0.21647953 -0.20945222
   0.35113945 -0.21226596  0.15577987]]
intercept_
[-0.23753962]
fit
33 ['gender' 'age' 'heart_rate_min' 'heart_rate_max' 'heart_rate_mean'
 'mbp_min' 'mbp_max' 'mbp_mean' 'sbp_min' 'sbp_max' 'sbp_mean' 'dbp_min'
 'dbp_max' 'dbp_mean' 'temperature_min' 'temperature_max'
 'temperature_mean' 'glucose_min' 'glucose_max' 'wbc_min' 'wbc_max'
 'creatinine_min' 'creatinine_max' 'hemoglobin_min' 'hemoglobin_max'
 'pt_min' 'pt_max' 'urineo



accuracy of each fold - [0.9623151308304891]
Avg accuracy : 0.19246302616609784
score 
 0.960537542662116
coef_
[[ 0.23195959 -1.22952377  0.19481488  0.1966911  -0.73240838 -0.19083007
  -0.09563058  0.688949   -0.1743874   0.00178692 -0.51328546 -0.03655214
   0.04997002 -0.9638687  -0.04462061  0.13374691 -0.47577527 -0.1060344
  -0.09947392 -0.01561448 -0.33444954 -0.09747238 -0.09870621 -0.57322179
  -0.11829322 -0.14850798  0.20491409 -0.79852546 -0.39976014 -0.32573049
   0.73782891 -0.15792993  0.20728304]]
intercept_
[0.77620848]
fit
33 ['gender' 'age' 'heart_rate_min' 'heart_rate_max' 'heart_rate_mean'
 'mbp_min' 'mbp_max' 'mbp_mean' 'sbp_min' 'sbp_max' 'sbp_mean' 'dbp_min'
 'dbp_max' 'dbp_mean' 'temperature_min' 'temperature_max'
 'temperature_mean' 'glucose_min' 'glucose_max' 'wbc_min' 'wbc_max'
 'creatinine_min' 'creatinine_max' 'hemoglobin_min' 'hemoglobin_max'
 'pt_min' 'pt_max' 'urineoutput' 'sofa_coagulation' 'sofa_cardiovascular'
 'sofa_cns' 'sofa_renal' 'charlson_com



no smoting only imputed

In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, SGDRegressor, RidgeCV, LassoCV
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTEN

median_imputed_less_40 = pd.read_csv('median_imputed_contin.csv', index_col=False)

pd.set_option("display.max_columns", None)

def classifyScale_coef(scaleFunc1, estimatorFunc2, k_fold_int, x_array, y_array):
    k = k_fold_int
    kf = KFold(n_splits=k, shuffle=True, random_state=None)    
    clf = make_pipeline(scaleFunc1, estimatorFunc2)

    acc_score = []
    for train_index , test_index in kf.split(X):
        X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
        y_train , y_test = y[train_index] , y[test_index]
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
     
    acc = accuracy_score(y_pred , y_test)
    acc_score.append(acc)
     
    avg_acc_score = sum(acc_score)/k
    print('accuracy of each fold - {}'.format(acc_score))
    print('Avg accuracy : {}'.format(avg_acc_score))

    outcome_labels = ['Intubation False', 'Intubation True']

    #print('parameters \n', clf.get_params())
    print('score \n', clf.score(X_train, y_train))
    print('coef_')
    print(clf[1].coef_)
    print('intercept_')
    print(clf[1].intercept_)
    print('fit')
    print(clf.fit(X_train, y_train).n_features_in_, clf.fit(X_train, y_train).feature_names_in_)
    print('decision function \n', clf[1].decision_function(X_test))
    print('classification report \n', classification_report(y_test, y_pred, target_names=outcome_labels))
    micro_roc_auc_ovr = roc_auc_score(y_test, y_pred, multi_class="ovr", average="micro")
    print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.2f}")
    print('\n')

    
print('### imputed only median all undr 40 missing ###')
print()
sgdClass = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=0.1, n_jobs=None, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, class_weight=None, warm_start=False, average=False)
X = median_imputed_less_40.drop('outcome',axis= 1)
y = median_imputed_less_40['outcome']

print('linear SGDClassifier SVM StandardScaler')
print()
classifyScale_coef(StandardScaler(), sgdClass, 5, X, y)

print('linear SGDClassifier SVM RobustScaler')
classifyScale_coef(RobustScaler(), sgdClass, 5, X, y)





### imputed only median all undr 40 missing ###

linear SGDClassifier SVM StandardScaler

accuracy of each fold - [0.9650541318350007]
Avg accuracy : 0.19301082636700012
score 
 0.9632090983831186
coef_
[[ 0.00473836 -0.01011403 -0.00340882  0.01012744 -0.01159386 -0.03837535
   0.10178915 -0.05235171  0.0212784   0.02275769 -0.05525507 -0.04982751
  -0.03668853  0.02045423  0.01131414  0.02013553 -0.03295787 -0.01991526
   0.09026786 -0.13134141 -0.02358831 -0.01866026  0.00452611  0.06715376
  -0.03410179  0.07735182 -0.07811808  0.02379025]]
intercept_
[-1.3607287]
fit
28 ['age' 'heart_rate_min' 'heart_rate_max' 'heart_rate_mean' 'mbp_min'
 'mbp_max' 'mbp_mean' 'sbp_min' 'sbp_max' 'sbp_mean' 'dbp_min' 'dbp_max'
 'dbp_mean' 'temperature_min' 'temperature_max' 'temperature_mean'
 'glucose_min' 'glucose_max' 'wbc_min' 'wbc_max' 'creatinine_min'
 'creatinine_max' 'hemoglobin_min' 'hemoglobin_max' 'pt_min' 'pt_max'
 'urineoutput' 'gender']


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


decision function 
 [-128.43254895  -41.30561254  -90.91519748 ... -124.38657151  -74.45628082
 -179.67134524]
classification report 
                   precision    recall  f1-score   support

Intubation False       0.97      1.00      0.98      7042
 Intubation True       0.00      0.00      0.00       255

        accuracy                           0.97      7297
       macro avg       0.48      0.50      0.49      7297
    weighted avg       0.93      0.97      0.95      7297

Micro-averaged One-vs-Rest ROC AUC score:
0.50


linear SGDClassifier SVM RobustScaler
accuracy of each fold - [0.9666986432780594]
Avg accuracy : 0.1933397286556119
score 
 0.962798026856673
coef_
[[-0.04016532  0.0109248   0.03115591  0.02729654  0.02322855 -0.02675564
   0.06652577 -0.01083959  0.05236369 -0.01212154  0.01846811 -0.03160908
  -0.13791687 -0.00043699 -0.0245808   0.00632437  0.0379493  -0.02969304
  -0.02351433  0.01845405 -0.018208   -0.0282224  -0.07702535  0.01926524
   0.01294791 -0.132

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
