# Model training and prediction - `s30d`

In [1]:
import os
os.chdir("..")

In [2]:
os.getcwd()

'/Users/ludvigwarnberggerdin/projects/ttris/pemett'

In [3]:
import numpy as np
import pandas as pd

from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingClassifier, RandomForestClassifier

In [4]:
X_train = pd.read_csv("./data/processed/s30d/X_train.csv", index_col = 0)
y_train = pd.read_csv("./data/processed/s30d/y_train.csv", index_col = 0).s30d
X_test = pd.read_csv("./data/processed/s30d/X_test.csv", index_col = 0)
y_test = pd.read_csv("./data/processed/s30d/y_test.csv", index_col = 0).s30d

In [5]:
y_train.value_counts() / len(y_train.index) * 100

0.0    94.197074
1.0     5.802926
Name: s30d, dtype: float64

In [6]:
cont_features = ["age", "hr", "sbp", "dbp", "spo2", "rr", "delay"]
cat_features = list(X_train.loc[:, ~X_train.columns.isin(cont_features)].columns)

### Lightgbm

In [7]:
from lightgbm import LGBMClassifier

In [8]:
continous_transformer = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ('cont', continous_transformer, cont_features)]
)

In [9]:
ss = StandardScaler()
X_train.loc[:, cont_features] = ss.fit_transform(X_train.loc[:, cont_features])
X_test.loc[:, cont_features] = ss.fit_transform(X_test.loc[:, cont_features])

In [10]:
clf = LGBMClassifier()

In [11]:
clf.fit(
    X = X_train,
    y = y_train,
    categorical_feature = cat_features
)



LGBMClassifier()

In [12]:
y_pred_prob_train = clf.predict_proba(X = X_train)
y_pred_prob_test = clf.predict_proba(X = X_test)
y_pred_train = clf.predict(X = X_train)
y_pred_test = clf.predict(X = X_test)

Report for continous scores

In [13]:
print(classification_report(y_true = y_train, y_pred = y_pred_train))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      5860
         1.0       1.00      0.99      1.00       361

    accuracy                           1.00      6221
   macro avg       1.00      1.00      1.00      6221
weighted avg       1.00      1.00      1.00      6221



In [14]:
print(classification_report(y_true = y_test, y_pred = y_pred_test))

              precision    recall  f1-score   support

         0.0       0.98      0.99      0.98      1954
         1.0       0.77      0.62      0.69       120

    accuracy                           0.97      2074
   macro avg       0.87      0.80      0.83      2074
weighted avg       0.96      0.97      0.97      2074



Gridsearch breaks for the continous score (to enable comparison with clinicians)

In [98]:
import copy
import random
from src.models.train_model import generate_all_combinations
from tqdm.notebook import tqdm

from typing import Callable, Optional

from sklearn.model_selection import cross_val_predict, StratifiedKFold

def cv_inner_loop(base_clfs: list, hyper_parameters: list, 
                  inner_loop: Callable, X: pd.DataFrame, 
                  y: pd.Series) -> pd.DataFrame:
    """Run inner loop of k-fold cross-validation.
    
    Uses sklearn's cross_val_predict
    
    That is,
    1. Fit classifier to the training folds.
    2. Make prediction on the validation fold.
    3. Repeat for all k-folds.

    Args:
      clf: Classifier 
      hyper_parameters: 

    Returns:
      Predictions
    """
    for clflk in base_clfs.keys()
        d = clfs[clfk]
        clf = d['clf']
        hp = d['hp']
        clf = clf.set_params(**hp)
        print("Running predictions for " + str(clf))
        preds = cross_val_predict(
            estimator=clf,
            X=X,
            y=y,
            cv=inner_loop
        )
        if predictions.any():
            predictions = np.hstack([predictions, preds[:, np.newaxis]])
        else:
            predictions = preds[:, np.newaxis]
            
    preds = np.zeros((len(X_train.index), ))
    for i, (train_index, val_index) in enumerate(inner_loop.split(X_train, y_train)):

        X_train_ = X_train.iloc[train_index]
        y_train_ = y_train.iloc[train_index]

        X_val = X_train.iloc[val_index]
        y_val = y_train.iloc[val_index]

        for j, hp in enumerate(tqdm(hyper_parameters)):

            clf = clf.set_params(**hp)
            clf.fit(X_train_, y_train_)

            y_pred_val = clf.predict(X_val)
            y_pred_prob_val = clf.predict_proba(X_val)
            preds[val_index] = y_pred_prob_val[:, 1]

    return preds

SyntaxError: invalid syntax (2948371950.py, line 28)

In [96]:
def cv_outer_loop(base_clfs: list, meta_clf: Callable,
                  X_train: pd.DataFrame, y_train: pd.Series,
                  use_meta_features: bool = False, 
                  **kwargs):
    """Conduct outer cross-validation.
    
    That is, find the best combination cut-points for the classifier.
    
    Inspired by:
        https://github.com/rasbt/mlxtend/blob/master/mlxtend/classifier/stacking_cv_classification.py
        
    Args:
        base_clfs: Base classifiers.
        meta_clf:
        X_train:
        y_train
        use_meta_features
    """
    ## Fit each classifier to the original training set
    meta_predictions = np.array([])
    clf_keys = base_clfs.keys()
    ## Setup splitting
    inner_folds = 5
    outer_folds = 2
    inner_loop = StratifiedKFold(n_splits = inner_folds)
    outer_loop = StratifiedKFold(n_splits = outer_folds)
    
    ## Setup for recording auc from each combination of hps
    roc_aucs = pd.DataFrame(
        data = np.zeros((len(hyper_parameters), n_folds)),
        columns = range(1, n_folds + 1),
        index = [str(hp) for hp in hyper_parameters]
    )

    for i, (train_index, val_index) in enumerate(outer_loop.split(X_train, y_train)):
        
        X_train_ = X_train.iloc[train_index]
        y_train_ = y_train.iloc[train_index]

        X_val = X_train.iloc[val_index]
        y_val = y_train.iloc[val_index]
        
        for j, hp in enumerate(hyper_parameters):
            
            hp = 
            breaks = hp['breaks']
            
            ## Fit each classifier to the predicted probabilities
            base_predictions = inner_loop(
                base_clfs=base_clfs
                hyper_parameters=hyper_parameters,
                inner_loop=inner_loop,
                X=X_train_,
                y=y_train_
            )
    
            ## Fit the meta clf to predictions, and predict on validation set
            meta_clf.fit(predictions, y_train)
            y_pred_prob_meta = meta_clf.predict_proba(predictions)
            binned_predictions = pd.cut(y_pred_prob_val[:, 1], breaks, labels = [0, 1, 2, 3], right = True, include_lowest = False)
            roc_aucs.iloc[j, i] = roc_auc_score(y_true = y_val, y_score = binned_predictions)
    
    return base_clfs, meta_clf

In [76]:
from sklearn.ensemble import RandomForestClassifier

In [99]:
import itertools as it
all_breaks = [(0, ) + x + (np.inf,) for x in it.combinations(np.arange(0.01, 1, 0.01), r=3)]

In [101]:
hyper_parameters = {'max_depth': [5]}

In [95]:
roc_aucs, preds = cv_inner_loop(
    clf = LGBMClassifier(),
    hyper_parameters = hyper_parameters,
    all_breaks = all_breaks,
    X_train = X_train,
    y_train = y_train, 
    sample_size = 0.0001
)

  0%|          | 0/1 [00:00<?, ?it/s]

TypeError: lightgbm.sklearn.LGBMModel.set_params() argument after ** must be a mapping, not str

In [102]:
lgbm1 = dict(
    clf=LGBMClassifier(),
    hp=dict(
        max_depth=[5]
    )
)
lgbm2 = dict(
    clf=LGBMClassifier(),
    hp=dict(
        max_depth=[100, 200]
    )
)
clfs = dict(
    lgbm1=lgbm1,
    lgbm2=lgbm2
)

In [103]:
clfs

{'lgbm1': {'clf': LGBMClassifier(), 'hp': {'max_depth': [5]}},
 'lgbm2': {'clf': LGBMClassifier(), 'hp': {'max_depth': [100, 200]}}}

In [78]:
clfs, meta_clf = ensemble_learning(
    base_clfs=clfs,
    meta_clf=LGBMClassifier(),
    all_breaks=all_breaks,
    X_train=X_train, 
    y_pred_prob_train=y_pred_prob_train,
    y_train = y_train,
    sample_size=0.0001
)

Running predictions for LGBMClassifier()


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

TypeError: tuple indices must be integers or slices, not tuple