# SI LSx: ONTRAM 3D CNN
## Outcome: mRS binary

### Load dependencies

In [None]:
!python -V

In [None]:
# !python -m pip install -U scikit-image

In [None]:
import os
import h5py
import pandas as pd
import numpy as np
import random
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from scipy import ndimage
from sklearn import metrics
from sklearn import linear_model

# Tensorflow/Keras
import tensorflow as tf
print(tf.__version__)
from tensorflow import keras
print(keras.__version__)
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from keras.utils import to_categorical

# Own functions
from functions.plot_slices import plot_slices
from functions.ontram import ontram
from functions.fit_ontram import fit_ontram
from functions.fit_ontram_batches import fit_ontram_batches
from functions.plot_results import plot_results
from functions.methods import predict

### Config Variables

In [None]:
# Outcome to model: "mrs" selects the binary mRS outcome, "stroke" the alternative outcome.
# OUTPUT_VARIABLE = "stroke"
OUTPUT_VARIABLE = "mrs"
# Number of ensemble members (runs) trained per fold.
N_ENSEMBLES = 5
# Number of cross-validation folds.
N_FOLDS = 5
# NOTE: the training loop further down reassigns MODEL_SELECTION for each training stage.
MODEL_SELECTION = "train" # train, test or last

In [None]:
# Project root; every other path is built relative to it.
DIR = "/tf/notebooks/katrin/"
# Where models, histories and predictions of the ensemble are written.
OUTPUT_DIR = DIR + "results/mrs_binary/ensemble/"
# Image data (HDF5) and tabular baseline data (CSV).
INPUT_IMG = DIR + "data/dicom_3d_128x128x30.h5"
INPUT_TAB = DIR + "data/baseline_data_DWI_imputed.csv"

### Import tabular data

In [None]:
# Display the path of the tabular input file that is loaded next.
INPUT_TAB

In [None]:
# Load the tabular baseline data (comma is read_csv's default separator)
# and preview the last rows.
dat = pd.read_csv(INPUT_TAB)
dat.tail(3)

In [None]:
# Recode categorical text values as numbers.
# yes/no answers anywhere in the table -> 1/0
dat = dat.replace('no', 0).replace('yes', 1)
# sex: female -> 1, male -> 0
dat["sex"] = dat["sex"].replace('female', 1).replace('male', 0)
# event: Stroke -> 1, TIA -> 0
dat["event"] = dat["event"].replace('Stroke', 1).replace('TIA', 0)
# patient ids as zero-padded strings with at least three digits
dat["p_id"] = [format(pid, '03d') for pid in dat["p_id"]]
dat.head(3)

In [None]:
# Flag patients without a 3-month mRS outcome, report how many there are
# and remember their ids.
outcome_missing = dat.mrs_3months.isna()
print("number of missing outcomes: {}".format(outcome_missing.sum()))
missing_ids = dat.p_id[outcome_missing]

In [None]:
# Remove all patients with a missing outcome.
# `keeps` is a boolean mask (True = outcome present). The explicit .copy()
# makes `dat` an independent frame, so columns added to it later do not
# trigger pandas' SettingWithCopyWarning.
keeps = dat.mrs_3months.notna()
dat = dat[keeps].copy()

In [None]:
# Count the outcomes that are still missing (expected to be 0 once the rows
# without outcome have been dropped).
sum(dat.mrs_3months.isna())

In [None]:
# Variables we have: list the column names of the tabular data
dat.columns

In [None]:
# Binary mRS outcome: 0 when the 3-month mRS is at most 2, otherwise 1.
dat["mrs_3months_binary"] = np.where(dat.mrs_3months <= 2, 0, 1)
# Show how many patients fall into each of the two classes.
plt.hist(dat.mrs_3months_binary, bins = 2)

In [None]:
# Assemble the tabular design matrix and the outcome vectors, one row per patient.
# NOTE: the features are copied as they are; nothing in this cell standardizes
# age (or any other variable).
tab_features = ["age", "sex", "mrs_before", "nihss_baseline",
                "stroke_before", "tia_before", "rf_hypertonia",
                "rf_diabetes", "rf_hypercholesterolemia", "rf_smoker",
                "rf_atrial_fibrillation", "rf_chd"]
n_patients = dat.shape[0]
X_tab = np.zeros((n_patients, 12))
Y_mrs = np.zeros(n_patients)
Y_mrs_bin = np.zeros(n_patients)
pat = []
for row, pid in enumerate(dat.p_id):
    # take the first record whose patient id equals pid
    first_match = np.where(dat.p_id.values == pid)[0][0]
    record = dat.iloc[first_match]
    pat.append(record.p_id)
    X_tab[row, :] = np.array([getattr(record, feature) for feature in tab_features])
    Y_mrs[row] = record.mrs_3months
    Y_mrs_bin[row] = record.mrs_3months_binary
X_tab

In [None]:
# Pick the outcome vector Y that matches OUTPUT_VARIABLE; reject unknown settings first.
if OUTPUT_VARIABLE not in ("stroke", "mrs"):
    raise ValueError("unknown OUTPUT_VARIABLE: {}".format(OUTPUT_VARIABLE))
# NOTE(review): Y_pat is not defined in the visible cells - confirm it exists
# before switching OUTPUT_VARIABLE to "stroke".
Y = Y_pat if OUTPUT_VARIABLE == "stroke" else Y_mrs_bin

### Define train validation test set

In [None]:
## get mrs = 0 and mrs = 1 indices
idx_0 = np.where(Y == 0)
idx_1 = np.where(Y == 1)
print("{} mRS 0 patients".format(len(idx_0[0])))
print("{} mRS 1 patients".format(len(idx_1[0])))

## shuffle indices (in place, fixed seed so the folds are reproducible)
np.random.seed(2021)
np.random.shuffle(idx_0[0])
np.random.shuffle(idx_1[0])

## split the indices of each class into N_FOLDS parts (stratified folds)
splits_0 = np.array_split(idx_0[0], N_FOLDS)
splits_1 = np.array_split(idx_1[0], N_FOLDS)

## define chosen splits for each fold, derived from N_FOLDS:
## fold f tests on part f, validates on the next part (wrapping around),
## and trains on everything else.
## For N_FOLDS = 5 this gives test [0, 1, 2, 3, 4], valid [1, 2, 3, 4, 0] and
## train_folds [[0, 1], [1, 2], [2, 3], [3, 4], [0, 4]].
test_folds = list(range(N_FOLDS))
valid_folds = [(f + 1) % N_FOLDS for f in test_folds]
train_folds = [sorted((t, v)) for t, v in zip(test_folds, valid_folds)] ## remove these splits for training data

### Define models for tabular data
### Simple intercept, linear shift (logistic regression)
Train with
- tabular data
- outcome = mRS binary
- Ensemble with 5 Models
- 5-Fold CV

In [None]:
# simple intercept
def simple_intercept(y_dim, run):
    """Build the intercept network of the model.

    A single bias-free linear Dense layer maps a 1-dimensional input to
    ``y_dim - 1`` outputs. Its kernel is He-normal initialised with the
    seed ``2802 + run``, so each ensemble run starts from different weights.

    y_dim: number of outcome classes.
    run: index of the ensemble run.
    Returns the keras.Model.
    """
    bl_input = keras.Input(shape = (1, ), name = "bl_in")
    bl_dense = layers.Dense(
        units = y_dim - 1,
        activation = "linear",
        use_bias = False,
        name = "bl_out",
        kernel_initializer = keras.initializers.HeNormal(seed = 2802 + run),
    )
    return keras.Model(inputs = bl_input, outputs = bl_dense(bl_input))

# linear shift
def linear_shift_x(x, run):
    """Build the linear-shift network for the tabular predictors.

    A single bias-free linear Dense layer maps one sample of ``x`` (input
    shape ``x.shape[1:]``) to one output. Its kernel is He-normal initialised
    with the seed ``2802 + run``.

    x: array of predictors; only its shape is used.
    run: index of the ensemble run.
    Returns the keras.Model.
    """
    x_input = keras.Input(shape = x.shape[1:], name = 'x_in')
    x_dense = layers.Dense(
        units = 1,
        activation = 'linear',
        use_bias = False,
        name = 'x_out',
        kernel_initializer = keras.initializers.HeNormal(seed = 2802 + run),
    )
    return keras.Model(inputs = x_input, outputs = x_dense(x_input))

### Train Models
#### First run:
- batch size = 128
- small learning rate

#### Second run:
- batch size = n (the whole training set in a single batch)
- large learning rate

In [None]:
def _select_best_epoch(history, model_selection):
    """Return the 0-based epoch index picked by ``model_selection``.

    history: DataFrame with one row per epoch and the columns
        ``train_loss`` and ``test_loss``.
    model_selection: "test" (lowest test loss), "train" (lowest train loss)
        or "last" (final epoch). Ties resolve to the earliest epoch.
    Raises ValueError for any other value.
    """
    if model_selection == "test":
        return np.where(history.test_loss == np.min(history.test_loss))[0][0]
    if model_selection == "train":
        return np.where(history.train_loss == np.min(history.train_loss))[0][0]
    if model_selection == "last":
        return history.shape[0] - 1
    raise ValueError("unknown model_selection: {}".format(model_selection))


def _history_frame(hist, fold, run):
    """Collect the per-epoch losses and accuracies of one fit into a DataFrame."""
    return pd.DataFrame({'fold': fold,
                         'run': run,
                         'train_loss': hist["train_loss"],
                         'train_acc': hist["train_acc"],
                         'test_loss': hist["test_loss"],
                         'test_acc': hist["test_acc"]})


def _write_csv(frame, path, first):
    """Write ``frame`` to ``path``: create the file with a header when ``first``, append otherwise."""
    if first:
        frame.to_csv(path, index = False)
    else:
        frame.to_csv(path, mode='a', header=False, index = False)


## coefficient names: the intercept followed by the 12 tabular predictors
names = ['intercept', 'age', 'sex', 'mrs_before', 'nihss_baseline', 'stroke_before',
         'tia_before', 'rf_hypertonia', 'rf_diabetes', 'rf_hypercholesterolemia',
         'rf_smoker', 'rf_atrial_fibrillation', 'rf_chd']

for fold in range(N_FOLDS):

    ## define train, test and validation splits
    test_idx = np.concatenate((splits_0[test_folds[fold]], splits_1[test_folds[fold]]), axis = None)
    valid_idx = np.concatenate((splits_0[valid_folds[fold]], splits_1[valid_folds[fold]]), axis = None)

    ## training data = all parts except the test and validation part of this fold.
    ## The parts are filtered one by one instead of calling np.delete on the list
    ## of parts: the parts can differ in length, and NumPy >= 1.24 refuses to turn
    ## such a ragged list into an array.
    held_out = train_folds[fold]
    train_0 = np.concatenate([part for j, part in enumerate(splits_0) if j not in held_out])
    train_1 = np.concatenate([part for j, part in enumerate(splits_1) if j not in held_out])
    train_idx = np.concatenate((train_0, train_1), axis = None)

    X_tab_train = X_tab[train_idx]
    X_tab_test = X_tab[test_idx]
    X_tab_valid = X_tab[valid_idx]

    ## one-hot encode the outcome
    Y_train = to_categorical(Y[train_idx])
    Y_valid = to_categorical(Y[valid_idx])
    Y_test = to_categorical(Y[test_idx])

    for run in range(N_ENSEMBLES):

        ## the very first model creates the result files, all later ones append
        first = run == 0 and fold == 0

        ## create output directory
        folder_name = "SI_LSx/fold_{}/run_{}/".format(fold, run)
        os.makedirs(OUTPUT_DIR + folder_name, exist_ok = True)

        print("training fold {}/{}, run {}/{}".format(fold+1, N_FOLDS, run+1, N_ENSEMBLES))

        # define model
        nn_bl = simple_intercept(Y_train.shape[1], run)
        nn_x = linear_shift_x(X_tab_train, run)

        si_ls = ontram(nn_bl = nn_bl, nn_x = nn_x, response_varying = False)

        ## first stage: mini-batches of 128, learning rate 0.01
        ## NOTE: this overrides the notebook-level MODEL_SELECTION setting
        MODEL_SELECTION = "train"
        hist = fit_ontram(si_ls,
                          x_train = X_tab_train, y_train = Y_train,
                          x_test = X_tab_valid, y_test = Y_valid,
                          batch_size = 128,
                          epochs = 200,
                          optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01),
                          balance_batches = False,
                          output_dir = OUTPUT_DIR + folder_name,
                          model_selection = MODEL_SELECTION)

        ## training loss and accuracy of the first stage (only used to pick the epoch)
        out = _history_frame(hist, fold, run)

        ## continue from the best epoch of the first stage
        ## NOTE(review): assumes fit_ontram writes one `model-<epoch>.hdf5` file per
        ## epoch (0-based) into output_dir - confirm in functions/fit_ontram
        best_model = _select_best_epoch(out, MODEL_SELECTION)
        print('model selection: {}'.format(MODEL_SELECTION))
        print('best model run {}: {}'.format(run, best_model))
        si_ls.model.load_weights('{}{}model-{:03d}.hdf5'.format(OUTPUT_DIR, folder_name, best_model))

        ## second stage: the whole training set as one batch, learning rate 0.1
        MODEL_SELECTION = "last"
        hist = fit_ontram(si_ls,
                          x_train = X_tab_train, y_train = Y_train,
                          x_test = X_tab_valid, y_test = Y_valid,
                          batch_size = X_tab_train.shape[0],
                          epochs = 1000,
                          optimizer = tf.keras.optimizers.Adam(learning_rate = 0.1),
                          balance_batches = False,
                          output_dir = OUTPUT_DIR + folder_name,
                          model_selection = MODEL_SELECTION)

        ## save training loss and accuracy of the second stage
        out = _history_frame(hist, fold, run)
        _write_csv(out, "{}SI_LSx/ensemble_history.csv".format(OUTPUT_DIR), first)

        ## save best model
        best_model = _select_best_epoch(out, MODEL_SELECTION)
        print('model selection: {}'.format(MODEL_SELECTION))
        print('best model run {}: {}'.format(run, best_model))
        si_ls.model.load_weights('{}{}model-{:03d}.hdf5'.format(OUTPUT_DIR, folder_name, best_model))
        si_ls.model.save_weights('{}SI_LSx/fold_{}/best_model_run{}.hdf5'.format(OUTPUT_DIR, fold, run))

        # predict the test part of this fold with the selected model
        pred = predict(si_ls, x = X_tab_test, y = Y_test)
        out = pd.DataFrame({'pid': np.array(pat)[test_idx],
                            'fold': fold,
                            'run': run,
                            'pred_prob_mrs0': pred["pdf"][:, 0],
                            'pred_prob_mrs1': pred["pdf"][:, 1],
                            'pred_label_mrs1': pred["response"],
                            'patient_label_mrs0': Y_test[:, 0],
                            'patient_label_mrs1': Y_test[:, 1]})
        _write_csv(out, "{}SI_LSx/ensemble_predictions.csv".format(OUTPUT_DIR), first)

        ## save model weights: first entry of theta followed by the flattened shift weights
        weights = np.concatenate((pred['theta'][0], np.array(pred['beta_w']).flatten()))
        out = pd.DataFrame({'fold': fold,
                            'run': run,
                            'names': names,
                            'coef': weights})
        _write_csv(out, "{}SI_LSx/ensemble_weights.csv".format(OUTPUT_DIR), first)

In [None]:
# Reload the ensemble predictions written by the training loop and preview them.
pred = pd.read_csv("{}SI_LSx/ensemble_predictions.csv".format(OUTPUT_DIR))
pred.head(5)

In [None]:
# Reload the saved model coefficients and preview them.
weights = pd.read_csv("{}SI_LSx/ensemble_weights.csv".format(OUTPUT_DIR))
weights.head(14)

In [None]:
# Reload the saved training history and preview it.
hist = pd.read_csv("{}SI_LSx/ensemble_history.csv".format(OUTPUT_DIR))
hist.head(5)