# Model training and prediction - `s30d`

In [5]:
os.getcwd()

'/home/ludvigwgerdin/projects/Python/pemett/pemett'

In [1]:
import os
os.chdir("..")

In [2]:
import numpy as np
import pandas as pd

from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingClassifier, RandomForestClassifier

In [8]:
X_train = pd.read_csv("./data/processed/s30d/X_train.csv", index_col = 0)
y_train = pd.read_csv("./data/processed/s30d/y_train.csv", index_col = 0).s30d
X_test = pd.read_csv("./data/processed/s30d/X_test.csv", index_col = 0)
y_test = pd.read_csv("./data/processed/s30d/y_test.csv", index_col = 0).s30d

In [9]:
y_train.value_counts() / len(y_train.index) * 100

0.0    94.197074
1.0     5.802926
Name: s30d, dtype: float64

In [10]:
cont_features = ["age", "hr", "sbp", "dbp", "spo2", "rr", "delay"]
cat_features = list(X_train.loc[:, ~X_train.columns.isin(cont_features)].columns)

### Lightgbm

In [11]:
from lightgbm import LGBMClassifier

In [12]:
continous_transformer = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ('cont', continous_transformer, cont_features)]
)

In [13]:
ss = StandardScaler()
X_train.loc[:, cont_features] = ss.fit_transform(X_train.loc[:, cont_features])
X_test.loc[:, cont_features] = ss.fit_transform(X_test.loc[:, cont_features])

In [14]:
clf = LGBMClassifier()

In [15]:
clf.fit(
    X = X_train,
    y = y_train,
    categorical_feature = cat_features
)



LGBMClassifier()

In [16]:
y_pred_prob_train = clf.predict_proba(X = X_train)
y_pred_prob_test = clf.predict_proba(X = X_test)
y_pred_train = clf.predict(X = X_train)
y_pred_test = clf.predict(X = X_test)

Report for continous scores

In [17]:
print(classification_report(y_true = y_train, y_pred = y_pred_train))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      5860
         1.0       1.00      0.99      1.00       361

    accuracy                           1.00      6221
   macro avg       1.00      1.00      1.00      6221
weighted avg       1.00      1.00      1.00      6221



In [18]:
print(classification_report(y_true = y_test, y_pred = y_pred_test))

              precision    recall  f1-score   support

         0.0       0.98      0.99      0.98      1954
         1.0       0.77      0.62      0.69       120

    accuracy                           0.97      2074
   macro avg       0.87      0.80      0.83      2074
weighted avg       0.96      0.97      0.97      2074



Gridsearch breaks for the continous score (to enable comparison with clinicians)

In [226]:
import copy
import random
from src.models.train_model import generate_all_combinations

from sklearn.model_selection import StratifiedKFold

def gridsearch_breaks(clf, hyper_parameters, all_breaks,
                      X_train, y_pred_prob_train, y_train, 
                      sample_size = None) -> tuple([tuple, list, pd.DataFrame]):
    """Gridsearch breaks for continous probabilites.
    
    Breaks are chosen based on validation set performance.
    """
    ## Sample combinations
    if sample_size:
        all_breaks = random.sample(all_breaks, round(sample_size * len(all_breaks)))
        
    ## Merge breaks and model hyper parameters
    d = {**hyper_parameters, 'breaks': all_breaks}
    hyper_parameters = generate_all_combinations(d)
    
    ## Compute performance of each combination over five folds
    n_folds = 5
    roc_aucs = pd.DataFrame(
        data = np.zeros((len(hyper_parameters), n_folds)),
        columns = range(1, n_folds + 1),
        index = [str(hp) for hp in hyper_parameters]
    )

    for i, (train_index, val_index) in enumerate(StratifiedKFold(n_splits = n_folds).split(X_train, y_train)):

        X_train_ = X_train.iloc[train_index]
        y_train_ = y_train.iloc[train_index]

        X_val = X_train.iloc[val_index]
        y_val = y_train.iloc[val_index]

        for j, hp in enumerate(hyper_parameters):

            hp_ = copy.deepcopy(hp)
            breaks = hp_.pop("breaks")

            clf = clf.set_params(**hp_)
            clf.fit(X_train_, y_train_)

            y_pred_val = clf.predict(X_val)
            y_pred_prob_val = clf.predict_proba(X_val)

            binned_predictions = pd.cut(y_pred_prob_val[:, 1], breaks, labels = [0, 1, 2, 3], right = True, include_lowest = False)
            roc_aucs.iloc[j, i] = roc_auc_score(y_true = y_val, y_score = binned_predictions)

    return roc_aucs.mean(axis = 1)

In [231]:
import itertools as it
all_breaks = [(0, ) + x + (np.inf,) for x in it.combinations(np.arange(0.01, 1, 0.01), r=3)]

In [232]:
hyper_parameters = {'max_depth': [5, 10]}

In [234]:
roc_aucs = gridsearch_breaks(
    clf = LGBMClassifier(),
    hyper_parameters = hyper_parameters,
    all_breaks = all_breaks,
    X_train = X_train,
    y_pred_prob_train = y_pred_prob_train[:, 1], ## Predicted probabilities of 1s, i.e. dead within 30 days
    y_train = y_train, 
    sample_size = 0.0001
)