# Model training and prediction - `s30d`

In [1]:
import os
os.chdir("..")

In [2]:
os.getcwd()

'/Users/ludvigwarnberggerdin/projects/ttris/pemett'

In [3]:
import numpy as np
import pandas as pd

from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingClassifier, RandomForestClassifier

In [4]:
X_train = pd.read_csv("./data/processed/s30d/X_train.csv", index_col = 0)
y_train = pd.read_csv("./data/processed/s30d/y_train.csv", index_col = 0).s30d
X_test = pd.read_csv("./data/processed/s30d/X_test.csv", index_col = 0)
y_test = pd.read_csv("./data/processed/s30d/y_test.csv", index_col = 0).s30d

In [5]:
y_train.value_counts() / len(y_train.index) * 100

0.0    94.197074
1.0     5.802926
Name: s30d, dtype: float64

In [6]:
cont_features = ["age", "hr", "sbp", "dbp", "spo2", "rr", "delay"]
cat_features = list(X_train.loc[:, ~X_train.columns.isin(cont_features)].columns)

### Lightgbm

In [7]:
from lightgbm import LGBMClassifier

In [8]:
continous_transformer = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ('cont', continous_transformer, cont_features)]
)

In [9]:
ss = StandardScaler()
X_train.loc[:, cont_features] = ss.fit_transform(X_train.loc[:, cont_features])
X_test.loc[:, cont_features] = ss.fit_transform(X_test.loc[:, cont_features])

In [10]:
clf = LGBMClassifier()

In [11]:
clf.fit(
    X = X_train,
    y = y_train,
    categorical_feature = cat_features
)



LGBMClassifier()

In [12]:
y_pred_prob_train = clf.predict_proba(X = X_train)
y_pred_prob_test = clf.predict_proba(X = X_test)
y_pred_train = clf.predict(X = X_train)
y_pred_test = clf.predict(X = X_test)

Report for continous scores

In [13]:
print(classification_report(y_true = y_train, y_pred = y_pred_train))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      5860
         1.0       1.00      0.99      1.00       361

    accuracy                           1.00      6221
   macro avg       1.00      1.00      1.00      6221
weighted avg       1.00      1.00      1.00      6221



In [14]:
print(classification_report(y_true = y_test, y_pred = y_pred_test))

              precision    recall  f1-score   support

         0.0       0.98      0.99      0.98      1954
         1.0       0.77      0.62      0.69       120

    accuracy                           0.97      2074
   macro avg       0.87      0.80      0.83      2074
weighted avg       0.96      0.97      0.97      2074



Gridsearch breaks for the continous score (to enable comparison with clinicians)

In [15]:
import copy
import random
from src.models.train_model import generate_all_combinations
from tqdm.notebook import tqdm

from typing import Callable, Optional

from sklearn.model_selection import cross_val_predict, StratifiedKFold

In [16]:
def cv_inner_loop(base_clfs: list,
                  inner_loop: Callable, 
                  X: pd.DataFrame, 
                  y: pd.Series,
                  verbose: bool) -> np.ndarray:
    """Run inner loop of k-fold cross-validation.
    
    Uses sklearn's cross_val_predict.
    
    That is,
    1. Fit classifier to the training folds.
    2. Make prediction on the validation fold.
    3. Use all folds as validation fold, one time each.

    Args:
      base_clfs: List of classifiers. E.g. [LGBMClassifier, LogisticRegression]
      inner_loop: scikit-learn callable to split into folds
      X: Features
      y: Targets

    Returns:
      Each column represent predictions by each respective classifier
    """
    predictions = np.zeros((len(X_train.index), ))
    for clf in base_clfs:
        if verbose: print("Running predictions for " + str(clf))
        preds = cross_val_predict(
            estimator=clf,
            X=X,
            y=y,
            cv=inner_loop
        )
        if predictions.any():
            predictions = np.hstack([predictions, preds[:, np.newaxis]])
        else:
            predictions = preds[:, np.newaxis]

    return predictions

In [78]:
from copy import deepcopy

def cv_outer_loop(base_clfs: list, meta_clf: Callable,
                  all_hyper_parameters: list,
                  X: pd.DataFrame, y: pd.Series,
                  use_meta_features: bool = False, 
                  verbose: bool = False, refit: bool = False,
                  **kwargs):
    """Run outer cross-validation.
    
    That is, find the best combination cut-points for the classifier.
    "Best" is defined by the highest AUC of ROC.
    
    Inspired by:
        https://github.com/rasbt/mlxtend/blob/master/mlxtend/classifier/stacking_cv_classification.py
        
    Args:
        base_clfs: Base classifiers
        meta_clf: Meta classifier
        all_hyper_parameters: Model hyper parameters and breaks for continous probabilities
        X: Features
        y: Targets
        use_meta_features: If True, the feature set for meta classifier is predicted probabilities
                           of positive labels from base classifiers + features used to train
                           base classifiers
        verbose: If True, logging is used

    Returns:
        Hyper parameters yielding highest average auc of roc across outer folds
    """
    
    def fit(**kwargs):
        base_clfs_ = []
        # Set the hyper hyper parameters of the base classifiers
        for clfk in base_clfs.keys():
            ks = [s for s in hyper_parameters.keys() if clfk in s]
            clf_params = {k.split("__")[1]: hyper_parameters.get(k) for k in ks}
            clf = base_clfs[clfk]
            clf.set_params(**clf_params)
            base_clfs_.append(clf)
        
        # Get meta features of training set
        meta_features_train = cv_inner_loop(
            base_clfs=base_clfs_,
            inner_loop=inner_loop,
            X=X_train,
            y=y_train,
            verbose=verbose
        )

        # Fit meta classifier to meta features of train
        meta_clf.fit(meta_features_train, y_train)
        
        return base_clfs_, meta_clf, meta_features_train
        
    def predict_and_score(**kwargs):
        
        base_clfs_, meta_clf, meta_features_train = fit(
            base_clfs=base_clfs,
            meta_clf=meta_clf,
            hyper_parameters=hyper_parameters,
            X_train=X_train,
            y_train=y_train,
            verbose=verbose
        )

        # Get meta features of validation set
        per_model_preds = []
        for clf in base_clfs_:
            clf.fit(X_train, y_train)
            prediction = clf.predict_proba(X_val)[:, :-1]
            per_model_preds.append(prediction)
        meta_features_val = np.hstack(per_model_preds)
        
        # Fit meta classifier to meta features of train
        meta_clf.fit(meta_features_train, y_train)
        # Predict using validation meta features
        y_pred_prob_meta = meta_clf.predict_proba(meta_features_val)
        # Calculate AUC of ROC for cut predictions
        binned_predictions = pd.cut(
            x=y_pred_prob_meta[:, 1],
            bins=hyper_parameters["breaks"],
            labels=[0, 1, 2, 3],
            right=True,
            include_lowest=False
        )
        return roc_auc_score(
            y_true=y_val,
            y_score=binned_predictions
        )
                
    ## Setup splitting
    inner_folds = 3
    outer_folds = 2
    inner_loop = StratifiedKFold(n_splits = inner_folds)
    outer_loop = StratifiedKFold(n_splits = outer_folds)
    
    ## Setup for recording auc from each combination of hps
    roc_aucs = pd.DataFrame(
        data = np.zeros((len(all_hyper_parameters), outer_folds)),
        columns = range(1, outer_folds + 1)
    )
    
    for i, hyper_parameters in enumerate(all_hyper_parameters):

        for j, (train_index, val_index) in enumerate(outer_loop.split(X, y)):
        
            X_train = X.iloc[train_index]
            y_train = y.iloc[train_index]
            X_val = X.iloc[val_index]
            y_val = y.iloc[val_index]
            
            auc = predict_and_score(
                base_clfs=base_clfs,
                meta_clf=meta_clf,
                hyper_parameters=hyper_parameters,
                inner_loop=inner_loop,
                X_train=X_train,
                y_train=y_train,
                X_val=X_val,
                y_val=y_val,
                verbose=verbose
            )
            
            roc_aucs.iloc[i, j] = 1 - auc
    
    max_row = roc_aucs.mean(axis=1).idxmax()
    best_hyper_parameters = all_hyper_parameters[max_row]
    
    if refit: 
        base_clfs_, meta_clf, _ = fit(
            base_clfs=base_clfs,
            meta_clf=meta_clf,
            hyper_parameters=hyper_parameters,
            X_train=X_train,
            y_train=y_train,
            verbose=verbose
        )
    
    return base_clfs_, meta_clf

In [79]:
import itertools as it
all_breaks = [(0, ) + x + (np.inf,) for x in it.combinations(np.arange(0.01, 1, 0.01), r=3)]

In [80]:
hyper_parameters = {
    "lgbm1__max_depth": [100, 500],
    "lgbm1__num_leaves": [200, 100],
    "lgbm2__max_depth": [100, 200],
    "breaks": all_breaks[:1]
}

In [81]:
base_clfs = {
    "lgbm1": LGBMClassifier(),
    "lgbm2": LGBMClassifier()
}

In [82]:
all_hyper_parameters = generate_all_combinations(hyper_parameters)

In [84]:
best_hyper_parameters = cv_outer_loop(
    base_clfs=base_clfs,
    meta_clf=LogisticRegression(),
    all_hyper_parameters=all_hyper_parameters,
    all_breaks=all_breaks,
    X=X_train,
    y=y_train,
    refit=True
)

UnboundLocalError: local variable 'meta_clf' referenced before assignment

In [None]:
best_hyper_parameters

Refit each classifier to the training set