In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, SGDRegressor, RidgeCV, LassoCV, LogisticRegression
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTEN

smoten_median_imputed_less_40 = pd.read_csv('smoten_median_imputed_less_40.csv', index_col=False)

pd.set_option("display.max_columns", None)


def classifyScale_coef(scaleFunc1, estimatorFunc2, k_fold_int, x_array, y_array):
    k = k_fold_int
    kf = KFold(n_splits=k, shuffle=True, random_state=None)    
    clf = make_pipeline(scaleFunc1, estimatorFunc2)

    acc_score = []
    for train_index , test_index in kf.split(X):
        X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
        y_train , y_test = y[train_index] , y[test_index]
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
     
    acc = accuracy_score(y_pred , y_test)
    acc_score.append(acc)
     
    avg_acc_score = sum(acc_score)/k
    print('accuracy of each fold - {}'.format(acc_score))
    print('Avg accuracy : {}'.format(avg_acc_score))

    outcome_labels = ['Intubation False', 'Intubation True']

    #print('parameters \n', clf.get_params())
    print('score \n', clf.score(X_train, y_train))
    print('coef_')
    print(clf[1].coef_)
    print('intercept_')
    print(clf[1].intercept_)
    print('fit')
    print(clf.fit(X_train, y_train).n_features_in_, clf.fit(X_train, y_train).feature_names_in_)
    print('decision function \n', clf[1].decision_function(X_test))
    print('classification report \n', classification_report(y_test, y_pred, target_names=outcome_labels))
    micro_roc_auc_ovr = roc_auc_score(y_test, y_pred, multi_class="ovr", average="micro")
    print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.2f}")
    print('\n')
    
    
print('### SMOTEN median impute ###')
print()


print('Logistic Regression lbfgs')
print()


print('lbfgs StandardScaler')
print()
X = smoten_median_imputed_less_40.loc[:, ~smoten_median_imputed_less_40.columns.isin(['heart_rate_max', 'mbp_max', 'sbp_max', 'temperature_max', 'creatinine_max', 'hemoglobin_max', 'pt_max','outcome'])]
y = smoten_median_imputed_less_40['outcome']

logRegress = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=3.0, 
                   fit_intercept=True, intercept_scaling=1, class_weight=None, 
                   random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', 
                   verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)



classifyScale_coef(StandardScaler(), logRegress, 5, X, y)

print('lbfgs RobustScaler')
print()
X = smoten_median_imputed_less_40.loc[:, ~smoten_median_imputed_less_40.columns.isin(['gender', 'mbp_min', 'mbp_mean', 'dbp_min', 'dbp_max', 'temperature_min', 'pt_max','outcome'])]
y = smoten_median_imputed_less_40['outcome']

logRegress = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=2.0, 
                   fit_intercept=True, intercept_scaling=1, class_weight=None, 
                   random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', 
                   verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)

classifyScale_coef(RobustScaler(), logRegress, 5, X, y)


### SMOTEN median impute ###

Logistic Regression lbfgs

lbfgs StandardScaler

accuracy of each fold - [0.9645904436860068]
Avg accuracy : 0.19291808873720134
score 
 0.9664035836177475
coef_
[[ 0.30226048 -1.61264608  0.34877197 -0.79807241 -0.28800453  1.11750143
  -0.34603905 -0.60375768 -0.03175092  0.0203619  -1.44580098  0.05667862
  -0.54483371 -0.17640838 -0.2386214  -0.04858083 -0.62742459 -0.27829085
  -0.85221643 -0.29226715 -1.32780637 -0.51290889 -0.33012891  0.90011897
  -0.4230167   0.37429058]]
intercept_
[-0.00253396]
fit
26 ['gender' 'age' 'heart_rate_min' 'heart_rate_mean' 'mbp_min' 'mbp_mean'
 'sbp_min' 'sbp_mean' 'dbp_min' 'dbp_max' 'dbp_mean' 'temperature_min'
 'temperature_mean' 'glucose_min' 'glucose_max' 'wbc_min' 'wbc_max'
 'creatinine_min' 'hemoglobin_min' 'pt_min' 'urineoutput'
 'sofa_coagulation' 'sofa_cardiovascular' 'sofa_cns' 'sofa_renal'
 'charlson_comorbidity_index']
decision function 
 [-2090.60312162 -1046.83740221 -2411.90211651 ... -1498.99099492
 



accuracy of each fold - [0.9638083048919226]
Avg accuracy : 0.1927616609783845
score 
 0.9660125142207053
coef_
[[-2.2232045   0.56922775  0.31196871 -1.47288567 -0.03571811 -0.49814981
   0.16426746 -0.39712084 -0.92533985  0.16124104 -0.69299434 -0.15379948
  -0.13292265 -0.10866621 -0.32805225 -0.08184089 -0.01912077 -1.02438703
  -0.33926028 -0.16714654 -1.2890196  -0.75446942 -0.48625086  1.49710891
  -0.3699564   0.39891604]]
intercept_
[1.32362006]
fit
26 ['age' 'heart_rate_min' 'heart_rate_max' 'heart_rate_mean' 'mbp_max'
 'sbp_min' 'sbp_max' 'sbp_mean' 'dbp_mean' 'temperature_max'
 'temperature_mean' 'glucose_min' 'glucose_max' 'wbc_min' 'wbc_max'
 'creatinine_min' 'creatinine_max' 'hemoglobin_min' 'hemoglobin_max'
 'pt_min' 'urineoutput' 'sofa_coagulation' 'sofa_cardiovascular'
 'sofa_cns' 'sofa_renal' 'charlson_comorbidity_index']
decision function 
 [-3694.57285635 -2073.88792175 -2707.56292127 ...  -499.09358713
 -1498.67930354 -1740.18441544]
classification report 
      

