## <b>DATA</b>

In [1]:
from ucimlrepo import fetch_ucirepo 
from joblib import dump, load
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# fetch dataset 
sepsis_survival_minimal_clinical_records = fetch_ucirepo(id=827) 
  
# data (as pandas dataframes) 
X = sepsis_survival_minimal_clinical_records.data.features 
y = sepsis_survival_minimal_clinical_records.data.targets 


In [3]:
# sepsis features
X

Unnamed: 0,age_years,sex_0male_1female,episode_number
0,21,1,1
1,20,1,1
2,21,1,1
3,77,0,1
4,72,0,1
...,...,...,...
110336,47,0,1
110337,50,0,1
110338,62,0,1
110339,58,0,1


In [4]:
# sepsis targets
y.head(5)

Unnamed: 0,hospital_outcome_1alive_0dead
0,1
1,1
2,1
3,1
4,1


In [5]:
# Separating Norway data and Korean data - train on Norway, test on Korea for generalization

X_n = X[:-137] 
y_n = y[:-137]

X_k = X[-137:].reset_index(drop=True)
y_k = y[-137:].reset_index(drop=True)

In [6]:
# normalizing
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_n[['age_years', 'episode_number']])

X_normed = X_n.copy()
X_normed['age_years'] = X_num_scaled[:,0]
X_normed['episode_number'] = X_num_scaled[:,1]

X_normed.head()

Unnamed: 0,age_years,sex_0male_1female,episode_number
0,-1.729837,1,-0.464727
1,-1.771285,1,-0.464727
2,-1.729837,1,-0.464727
3,0.591243,0,-0.464727
4,0.384004,0,-0.464727


## <b>Gradient Boosting (XGBoost)</b>


In [7]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE

In [8]:
y_ravel = y_n.values.ravel()

X_train, X_test, y_train, y_test = train_test_split(X_normed, y_ravel, test_size=0.2, random_state=42,shuffle=True)

In [9]:
# SMOTE - oversample minority class
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

#base model
xgb_base = XGBClassifier(eval_metric='logloss',
                         objective='binary:logistic',
                         reg_alpha = 0.5,
                         reg_lambda=5,
                         random_state=42)

# tune hyperparameters with gridsearch
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3,7, 10]
}

grid_search = GridSearchCV(
    estimator= xgb_base,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    verbose=1,
    n_jobs=-1 #parallel process
) 

grid_search.fit(X_train_smote, y_train_smote)

print("Best Parameters:", grid_search.best_params_)
print("Best ROC-AUC:", grid_search.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200}
Best ROC-AUC: 0.7328670227896773


In [10]:
# train on 3 splits
for test_size in [0.2,0.5, 0.8]:
    X_train, X_test, y_train, y_test = train_test_split(X_normed, y_ravel, test_size=test_size, random_state=42,shuffle=True)
    
    # create model with optimal params
    xgb_opt = XGBClassifier(
        eval_metric='mlogloss',
        **grid_search.best_params_,
        random_state=42
    )
    
    xgb_opt.fit(X_train, y_train)
    
    y_pred_proba = xgb_opt.predict_proba(X_test)[:,1]
    
    y_pred_class = xgb_opt.predict(X_test)
    
    print("Train="+str(round(1-test_size,2)))
    print(classification_report(y_test, y_pred_class,zero_division=0))
    

    try:
        roc_auc = roc_auc_score(y_test, y_pred_proba)  # One-vs-rest
        print(f"ROC-AUC: {roc_auc:.4f}")
    except ValueError as e:
        print(f"Could not calculate ROC-AUC: {e}")
        
    print(" ")
    print(" ")
    


Train=0.8
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1613
           1       0.93      1.00      0.96     20428

    accuracy                           0.93     22041
   macro avg       0.46      0.50      0.48     22041
weighted avg       0.86      0.93      0.89     22041

ROC-AUC: 0.6931
 
 
Train=0.5
              precision    recall  f1-score   support

           0       0.33      0.00      0.00      4062
           1       0.93      1.00      0.96     51040

    accuracy                           0.93     55102
   macro avg       0.63      0.50      0.48     55102
weighted avg       0.88      0.93      0.89     55102

ROC-AUC: 0.6903
 
 
Train=0.2
              precision    recall  f1-score   support

           0       0.14      0.00      0.00      6533
           1       0.93      1.00      0.96     81631

    accuracy                           0.93     88164
   macro avg       0.53      0.50      0.48     88164
weig

In [11]:
# save the model

dump(xgb_opt, "trained/xgb_opt_SEPSIS.joblib")

['trained/xgb_opt_SEPSIS.joblib']

## <b>NEURALNET</b>

In [12]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, BatchNormalization, Input
from sklearn.model_selection import KFold
from tensorflow.keras.callbacks import EarlyStopping
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, roc_auc_score, roc_curve

In [13]:
# Split 80/20 train/test

y_ravel = y_n.values.ravel()

X_normed_nn = scaler.fit_transform(X_n)
X_normed_nn

# splitting NORMED data
X_train, X_test, y_train, y_test = train_test_split(X_normed, y_ravel, test_size=0.2, random_state=42,shuffle=True)

# Oversample the minority class (dead patients)
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [14]:
def create_nn(learning_rate=0.001):
    model = Sequential([Input(shape=(3,)),
                        Dense(64, activation="relu"),
                        Dense(32, activation="relu"), 
                        Dense(1, activation="sigmoid")]) 
    
    model.compile(optimizer = Adam(learning_rate=learning_rate),
                  loss = "binary_crossentropy",
                  metrics = ["accuracy"])
                  
    return model

In [15]:
# Keras wrapper -> use gridsearchcv with neuralnet

nn = KerasClassifier(model=create_nn)

In [16]:
# Gridsearchcv

param_grid = {
    'model__learning_rate': [0.001, 0.01],
    'batch_size': [64, 128, 256]
}

grid_search = GridSearchCV(
    estimator=nn,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
) 

# early stop
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

grid_search.fit(
    X_train_smote,
    y_train_smote,
    epochs=200,
    callbacks=[early_stop]
)

print("Best Parameters:", grid_search.best_params_)
print("Best ROC-AUC:", grid_search.best_score_)



Epoch 1/200
[1m1277/1277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 785us/step - accuracy: 0.6412 - loss: 0.6213
Epoch 2/200
[1m1277/1277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 784us/step - accuracy: 0.6497 - loss: 0.6116
Epoch 3/200
[1m1277/1277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 779us/step - accuracy: 0.6520 - loss: 0.6102
Epoch 4/200
[1m1277/1277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 772us/step - accuracy: 0.6537 - loss: 0.6095
Epoch 5/200
[1m1277/1277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 791us/step - accuracy: 0.6521 - loss: 0.6113
Epoch 6/200
[1m1277/1277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 768us/step - accuracy: 0.6519 - loss: 0.6107
Epoch 7/200
[1m1277/1277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 779us/step - accuracy: 0.6520 - loss: 0.6094
Epoch 8/200
[1m1277/1277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 783us/step - accuracy: 0.6519 - loss: 0.6091


In [17]:

# train on 3 splits
for test_size in [0.2,0.5, 0.8]:
    X_train, X_test, y_train, y_test = train_test_split(X_normed, y_ravel, test_size=test_size, random_state=42,shuffle=True)
    
    # create model with optimal learning rate
    nn_opt = create_nn(
        learning_rate=grid_search.best_params_['model__learning_rate']
    )
    
    nn_opt.fit(X_train, y_train,
               batch_size=grid_search.best_params_['batch_size'],
               epochs=200,
               validation_split=0.2,
               callbacks=[early_stop])
    
    y_pred = nn_opt.predict(X_test)
    y_pred_proba = nn_opt.predict(X_test).ravel()
    
    print("Train="+str(round(1-test_size)))
    print(classification_report(y_test, (nn_opt.predict(X_test) > 0.5).astype(int)))
    print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
    print(" ")
    print(" ")

           

Epoch 1/200
[1m552/552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9095 - loss: 0.3221 - val_accuracy: 0.9284 - val_loss: 0.2396
Epoch 2/200
[1m552/552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 979us/step - accuracy: 0.9234 - loss: 0.2524 - val_accuracy: 0.9284 - val_loss: 0.2382
Epoch 3/200
[1m552/552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 978us/step - accuracy: 0.9271 - loss: 0.2419 - val_accuracy: 0.9284 - val_loss: 0.2381
Epoch 4/200
[1m552/552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 979us/step - accuracy: 0.9248 - loss: 0.2467 - val_accuracy: 0.9284 - val_loss: 0.2379
Epoch 5/200
[1m552/552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 964us/step - accuracy: 0.9258 - loss: 0.2454 - val_accuracy: 0.9284 - val_loss: 0.2377
Epoch 6/200
[1m552/552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 981us/step - accuracy: 0.9251 - loss: 0.2465 - val_accuracy: 0.9284 - val_loss: 0.2374
Epoch 7/200


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8647 - loss: 0.3631 - val_accuracy: 0.9298 - val_loss: 0.2390
Epoch 2/200
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9267 - loss: 0.2455 - val_accuracy: 0.9298 - val_loss: 0.2352
Epoch 3/200
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9264 - loss: 0.2448 - val_accuracy: 0.9298 - val_loss: 0.2356
Epoch 4/200
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9262 - loss: 0.2446 - val_accuracy: 0.9298 - val_loss: 0.2342
Epoch 5/200
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 992us/step - accuracy: 0.9259 - loss: 0.2441 - val_accuracy: 0.9298 - val_loss: 0.2340
Epoch 6/200
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9274 - loss: 0.2421 - val_accuracy: 0.9298 - val_loss: 0.2345
Epoch 7/200
[1m345/345[0m [32

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8712 - loss: 0.4174 - val_accuracy: 0.9285 - val_loss: 0.2475
Epoch 2/200
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9294 - loss: 0.2399 - val_accuracy: 0.9285 - val_loss: 0.2423
Epoch 3/200
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9319 - loss: 0.2329 - val_accuracy: 0.9285 - val_loss: 0.2400
Epoch 4/200
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9325 - loss: 0.2297 - val_accuracy: 0.9285 - val_loss: 0.2381
Epoch 5/200
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9298 - loss: 0.2357 - val_accuracy: 0.9285 - val_loss: 0.2374
Epoch 6/200
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9276 - loss: 0.2384 - val_accuracy: 0.9285 - val_loss: 0.2375
Epoch 7/200
[1m138/138[0m [32m━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
# save the model

dump(nn_opt, "trained/nn_opt_SEPSIS.joblib")

['trained/nn_opt_SEPSIS.joblib']

## <b>RandomForest</b>

In [19]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE

In [20]:
# Split 80/20 train/test

y_ravel = y_n.values.ravel()

X_train, X_test, y_train, y_test = train_test_split(X_normed, y_ravel, test_size=0.2, random_state=42,shuffle=True)

# Oversample the minority class (dead patients)
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [21]:
# gridsearchcv
param_grid = {
    'n_estimators': [50,100,200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5]
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42), 
    param_grid, 
    cv=5, 
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train_smote, y_train_smote)
print("Best Parameters:", grid_search.best_params_)
print("Best ROC-AUC:", grid_search.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Best ROC-AUC: 0.7479761980936049


In [22]:
# Check most 'influential' feature to predict the target
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([0.97542572, 0.00737341, 0.01720087])

In [23]:
#Train model with best parameters on 3 different splits

for test_size in [0.2,0.5, 0.8]:
    X_train, X_test, y_train, y_test = train_test_split(X_normed, y_ravel, test_size=test_size, random_state=42,shuffle=True)    
    
    rf_opt = RandomForestClassifier(**grid_search.best_params_, random_state=42)
    
    rf_opt.fit(X_train, y_train)
    
    y_pred = rf_opt.predict(X_test)
    y_pred_proba = rf_opt.predict_proba(X_test)[:,1]
    
    print("Train="+str(round(1-test_size,2)))
    print(classification_report(y_test, y_pred))
    print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
    print(" ")
    print(" ")

Train=0.8
              precision    recall  f1-score   support

           0       0.20      0.00      0.00      1613
           1       0.93      1.00      0.96     20428

    accuracy                           0.93     22041
   macro avg       0.56      0.50      0.48     22041
weighted avg       0.87      0.93      0.89     22041

ROC-AUC: 0.6901
 
 
Train=0.5
              precision    recall  f1-score   support

           0       0.20      0.00      0.00      4062
           1       0.93      1.00      0.96     51040

    accuracy                           0.93     55102
   macro avg       0.56      0.50      0.48     55102
weighted avg       0.87      0.93      0.89     55102

ROC-AUC: 0.6848
 
 
Train=0.2
              precision    recall  f1-score   support

           0       0.07      0.00      0.00      6533
           1       0.93      1.00      0.96     81631

    accuracy                           0.92     88164
   macro avg       0.50      0.50      0.48     88164
weig

In [24]:
# save the model

dump(rf_opt, "trained/rf_opt_SEPSIS.joblib")

['trained/rf_opt_SEPSIS.joblib']

## <b>SGDClassifier (SVM approximation)</b>

In [38]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, classification_report


In [39]:
# Split 80/20 train/test

y_ravel = y_n.values.ravel()

# splitting NORMED data
X_train, X_test, y_train, y_test = train_test_split(X_normed, y_ravel, test_size=0.2, random_state=42,shuffle=True)

# Oversample the minority class (dead patients)
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [40]:
# base model
sgd = SGDClassifier(
    learning_rate='optimal',
    penalty='elasticnet',
    max_iter=1000,
    random_state=42
)

#gridsearch
param_grid = {
    'alpha' : [1e-5, 1e-3, 1e-1],
    'l1_ratio': [0.15,0.5, 0.85],
    'tol' : [1e-5, 1e-6],
    'loss': ['log_loss']
}

grid_search = GridSearchCV(
    estimator=sgd,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train_smote, y_train_smote)

# Best parameters and evaluation
print("Best Parameters:", grid_search.best_params_)
print("Best ROC-AUC:", grid_search.best_score_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Parameters: {'alpha': 0.001, 'l1_ratio': 0.15, 'loss': 'log_loss', 'tol': 1e-05}
Best ROC-AUC: 0.7076167741122281


In [41]:
for test_size in [0.2,0.5, 0.8]:
    X_train, X_test, y_train, y_test = train_test_split(X_normed, y_ravel, test_size=test_size, random_state=42,shuffle=True)    
    
    sgd_opt = SGDClassifier(
        **grid_search.best_params_, 
        learning_rate='optimal',
        penalty='elasticnet',
        max_iter=1000,
        random_state=42
    )
    
    sgd_opt.fit(X_train, y_train)
    
    y_pred = sgd_opt.predict(X_test)
    y_pred_proba = sgd_opt.predict_proba(X_test)[:,1]
    
    print("Train="+str(round(1 - test_size, 2)))
    print(classification_report(y_test, y_pred, zero_division=0))
    print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
    print(" ")
    print(" ")

Train=0.8
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1613
           1       0.93      1.00      0.96     20428

    accuracy                           0.93     22041
   macro avg       0.46      0.50      0.48     22041
weighted avg       0.86      0.93      0.89     22041

ROC-AUC: 0.6996
 
 
Train=0.5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      4062
           1       0.93      1.00      0.96     51040

    accuracy                           0.93     55102
   macro avg       0.46      0.50      0.48     55102
weighted avg       0.86      0.93      0.89     55102

ROC-AUC: 0.7039
 
 
Train=0.2
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      6533
           1       0.93      1.00      0.96     81631

    accuracy                           0.93     88164
   macro avg       0.46      0.50      0.48     88164
weig

In [42]:
# save the model

dump(sgd_opt, "trained/sgd_opt_SEPSIS.joblib")

['trained/sgd_opt_SEPSIS.joblib']

## <b>Ensemble (Meta-model = SVM)</b>

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score

In [44]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X_normed, y_ravel, test_size=0.2, random_state=42,shuffle=True)    

# all pretrained models
base_models = [sgd_opt, rf_opt, nn_opt, xgb_opt]

# make meta features
meta_trainl = []
meta_testl = []

for model in base_models:
    if hasattr(model, "predict_proba"):
        meta_trainl.append(model.predict_proba(X_train)[:,1])
        meta_testl.append(model.predict_proba(X_test)[:,1])
    else:
        meta_trainl.append(model.predict(X_train).ravel())
        meta_testl.append(model.predict(X_test).ravel())    #NN doesnt have predict proba

meta_train = np.array(meta_trainl).T
meta_test = np.array(meta_testl).T

[1m2756/2756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 406us/step
[1m689/689[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 430us/step


In [45]:
svm_meta = SVC(kernel='rbf', probability=True, random_state=42)

svm_meta.fit(meta_train, y_train)

param_grid = {
    'C' : [0.1, 1, 10],
    'gamma': [0.01, 0.1, 1]
}

grid_search = GridSearchCV(
    estimator=svm_meta,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,  # Inner cross-validation
    verbose=2,
    n_jobs=-1
)

# Fit the meta-model using GridSearchCV
grid_search.fit(meta_train, y_train)

# Best parameters and evaluation
print("Best Parameters:", grid_search.best_params_)
print("Best ROC-AUC:", grid_search.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Parameters: {'C': 0.1, 'gamma': 0.1}
Best ROC-AUC: 0.7053380984089033


In [53]:
for test_size in [0.2,0.5, 0.8]:
    # split data
    X_train, X_test, y_train, y_test = train_test_split(X_normed, y_ravel, test_size=test_size, random_state=42,shuffle=True)    

    meta_trainl=[]
    meta_testl=[]

    for model in base_models:
        if hasattr(model, "predict_proba"):
            meta_trainl.append(model.predict_proba(X_train)[:,1])
            meta_testl.append(model.predict_proba(X_test)[:,1])
        else:
            meta_trainl.append(model.predict(X_train).ravel())
            meta_testl.append(model.predict(X_test).ravel())    #NN doesnt have predict proba
    
    meta_train = np.array(meta_trainl).T
    meta_test = np.array(meta_testl).T
    
    svm_meta_opt = SVC(
        **grid_search.best_params_, 
        probability=True,
        random_state=42
    )
    
    svm_meta_opt.fit(meta_train, y_train)
    
    y_pred = svm_meta_opt.predict(meta_test)
    y_pred_proba = svm_meta_opt.predict_proba(meta_test)[:,1]
    
    print("Train="+str(round(1 - test_size, 2)))
    print(classification_report(y_test, y_pred, zero_division=0))
    print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
    print(" ")
    print(" ")

[1m2756/2756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
[1m689/689[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 932us/step
Train=0.8
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1613
           1       0.93      1.00      0.96     20428

    accuracy                           0.93     22041
   macro avg       0.46      0.50      0.48     22041
weighted avg       0.86      0.93      0.89     22041

ROC-AUC: 0.6997
 
 
[1m1722/1722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 819us/step
[1m1722/1722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 837us/step
Train=0.5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      4062
           1       0.93      1.00      0.96     51040

    accuracy                           0.93     55102
   macro avg       0.46      0.50      0.48     55102
weighted avg       0.86      0.93      0.89     5

In [54]:
# save the model

dump(svm_meta_opt, "trained/svm_meta_opt_SEPSIS.joblib")

['trained/svm_meta_opt_SEPSIS.joblib']