# Model training and prediction - `s30d`

In [1]:
import numpy as np
import pandas as pd

from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingClassifier, RandomForestClassifier

In [2]:
X_train = pd.read_csv("../data/processed/s30d/X_train.csv", index_col = 0)
y_train = pd.read_csv("../data/processed/s30d/y_train.csv", index_col = 0).s30d
X_test = pd.read_csv("../data/processed/s30d/X_test.csv", index_col = 0)
y_test = pd.read_csv("../data/processed/s30d/y_test.csv", index_col = 0).s30d

In [3]:
y_train.value_counts() / len(y_train.index) * 100

0.0    94.197074
1.0     5.802926
Name: s30d, dtype: float64

In [4]:
cont_features = ["age", "hr", "sbp", "dbp", "spo2", "rr", "delay"]
cat_features = list(X_train.loc[:, ~X_train.columns.isin(cont_features)].columns)

### Lightgbm

In [5]:
from lightgbm import LGBMClassifier

In [6]:
continous_transformer = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ('cont', continous_transformer, cont_features)]
)

In [7]:
ss = StandardScaler()
X_train.loc[:, cont_features] = ss.fit_transform(X_train.loc[:, cont_features])
X_test.loc[:, cont_features] = ss.fit_transform(X_test.loc[:, cont_features])

In [8]:
clf = LGBMClassifier()

In [9]:
clf.fit(
    X = X_train,
    y = y_train,
    categorical_feature = cat_features
)



LGBMClassifier()

In [67]:
y_pred_train_prob = clf.predict_proba(X = X_train)
y_pred_test_prob = clf.predict_proba(X = X_test)
y_pred_train = clf.predict(X = X_train)
y_pred_test = clf.predict(X = X_test)

Report for continous scores

In [68]:
print(classification_report(y_true = y_train, y_pred = y_pred_train))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      5860
         1.0       1.00      0.99      1.00       361

    accuracy                           1.00      6221
   macro avg       1.00      1.00      1.00      6221
weighted avg       1.00      1.00      1.00      6221



In [69]:
print(classification_report(y_true = y_test, y_pred = y_pred_test))

              precision    recall  f1-score   support

         0.0       0.98      0.99      0.98      1954
         1.0       0.77      0.62      0.69       120

    accuracy                           0.97      2074
   macro avg       0.87      0.80      0.83      2074
weighted avg       0.96      0.97      0.97      2074



Gridsearch breaks for the continous score (to enable comparison with clinicians)

In [96]:
import itertools as it
combinations = [(0, ) + x + (np.inf,) for x in it.combinations(np.arange(0.01, 1, 0.01), r=3)]

In [120]:
from sklearn.model_selection import KFold

In [146]:
for train_index, test_index in KFold(4).split(X_train):
     print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [1556 1557 1558 ... 6218 6219 6220] TEST: [   0    1    2 ... 1553 1554 1555]
TRAIN: [   0    1    2 ... 6218 6219 6220] TEST: [1556 1557 1558 ... 3108 3109 3110]
TRAIN: [   0    1    2 ... 6218 6219 6220] TEST: [3111 3112 3113 ... 4663 4664 4665]
TRAIN: [   0    1    2 ... 4663 4664 4665] TEST: [4666 4667 4668 ... 6218 6219 6220]


In [140]:
import random

def gridsearch_breaks(combinations, y_pred_prob, y_true, sample_size = None) -> tuple([tuple, list, pd.DataFrame]):
    """Gridsearch breaks for continous probabilites.
    
    Breaks are chosen based on validation set performance.
    """
    ## Sample combinations
    if sample_size:
        combinations = random.sample(combinations, round(sample_size * len(combinations)))
    
    ## Compute performance of of each combination
    predictions = np.empty((len(y_pred_prob), len(combinations)))
    roc_aucs = np.empty((len(combinations), ))
    for i, combination in enumerate(combinations):
        binned_predictions = pd.cut(y_pred_prob, combination, labels = [0, 1, 2, 3], right = True, include_lowest = False)
        roc_aucs[i] = roc_auc_score(y_true = y_true, y_score = binned_predictions)
        predictions[:, i] = binned_predictions
        
    ## Select the combination of breaks that results in best performance
    best_breaks = combinations[np.argmax(roc_aucs)]
    
    return best_breaks, roc_aucs, predictions

In [142]:
best_breaks, roc_aucs, predictions = gridsearch_breaks(
    combinations = combinations, 
    y_pred_prob = y_pred_train_prob[:, 1], ## Predicted probabilities of 1s
    y_true = y_train, 
    sample_size = 0.05
)