# Imports and functions

In [1]:
from datetime import datetime
print(datetime.now())
#data preprocessing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import collections
from collections import defaultdict
# NN
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import time
import math
from sklearn.calibration import calibration_curve
from sklearn.metrics import roc_curve, precision_recall_curve, f1_score, roc_auc_score, auc, accuracy_score
import sklearn.metrics as metrics
import matplotlib.lines as mlines
from matplotlib import pyplot as plt
import seaborn as sns
from captum.attr import IntegratedGradients


2024-09-17 18:43:35.346931


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch

if (torch.cuda.is_available()):
    print('Training on GPU')
else:
    print('Training on CPU') # On mac book GPU is not possible =() 
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

Training on GPU


In [3]:
# Set parameter as constant 

TESTING = False 
TEST_SIZE = 0.05

SPLIT_SIZE = 0.2 

NORMALIZATION = 'min-max' 

CAPPING_THRESHOLD_UPPER = 0.99
CAPPING_THRESHOLD_LOWER = 0.01

# How much time the prediction should occur (hours)
HOURS_AHEAD = 48

NORM_TYPE = 'min_max'

RANDOM = 42

# LSTM
batch_size = 5

# naming model and plot
classifier_name = "None vs. Any AKI"    ###change every time #Moderate vs. Severe #None vs. Any #Others vs. Severe
plot_name = "adult_AnyAKI_LR"    ###change every time

base_data_path=r"/home/jori152b/DIR/horse/jori152b-medinf/KP_MedInf/model_development/data"

In [4]:
# Some functions used later

def cap_data(df):
    print("Capping between the {} and {} quantile".format(CAPPING_THRESHOLD_LOWER, CAPPING_THRESHOLD_UPPER))
    cap_mask = df.columns.difference(['icustay_id', 'charttime', 'aki_stage'])
    df[cap_mask] = df[cap_mask].clip(df[cap_mask].quantile(CAPPING_THRESHOLD_LOWER),
                                     df[cap_mask].quantile(CAPPING_THRESHOLD_UPPER),
                                     axis=1)

    return df
 
    
def normalise_data(df, norm_mask):
    print("Normalizing in [0,1] with {} normalization".format(NORMALIZATION))
    
    min_values = df[norm_mask].min()
    max_values = df[norm_mask].max()
    
    # Skip normalization for constant columns
    for column in norm_mask:
        if min_values[column] != max_values[column]:
            df[column] = (df[column] - min_values[column]) / (max_values[column] - min_values[column])
    
    normalization_parameters = {column: {'min': min_values[column], 'max': max_values[column]} for column in norm_mask}
    
    return df, normalization_parameters


# impute missing value in resampleing data with most common based on each id
def fast_mode(df, key_cols, value_col):
    """ Calculate a column mode, by group, ignoring null values. 
    
    key_cols : list of str - Columns to groupby for calculation of mode.
    value_col : str - Column for which to calculate the mode. 

    Return
    pandas.DataFrame
        One row for the mode of value_col per key_cols group. If ties, returns the one which is sorted first. """
    return (df.groupby(key_cols + [value_col]).size() 
              .to_frame('counts').reset_index() 
              .sort_values('counts', ascending=False) 
              .drop_duplicates(subset=key_cols)).drop('counts',axis=1)


#get max shape of 3d array
def get_dimensions(array, level=0):   
    yield level, len(array)
    try:
        for row in array:
            yield from get_dimensions(row, level + 1)
    except TypeError: #not an iterable
        pass

def get_max_shape(array):
    dimensions = defaultdict(int)
    for level, length in get_dimensions(array):
        dimensions[level] = max(dimensions[level], length)
    return [value for _, value in sorted(dimensions.items())]

#pad the ragged 3d array to rectangular shape based on max size
def iterate_nested_array(array, index=()):
    try:
        for idx, row in enumerate(array):
            yield from iterate_nested_array(row, (*index, idx)) 
    except TypeError: # final level            
        yield (*index, slice(len(array))), array # think of the types

def pad(array, fill_value):
    dimensions = get_max_shape(array)
    result = np.full(dimensions, fill_value, dtype = np.float64)  
    for index, value in iterate_nested_array(array):
        result[index] = value 
    return result

def bin_total(y_true, y_prob, n_bins):
    bins = np.linspace(0., 1. + 1e-8, n_bins + 1)

    # In sklearn.calibration.calibration_curve,
    # the last value in the array is always 0.
    binids = np.digitize(y_prob, bins) - 1

    return np.bincount(binids, minlength=len(bins))

def missing_bin(bin_array):
    midpoint = " "    
    if bin_array[0]==0:
        midpoint = "5%, "
    if bin_array[1]==0:
        midpoint = midpoint + "15%, "
    if bin_array[2]==0:
        midpoint = midpoint + "25%, "
    if bin_array[3]==0:
        midpoint = midpoint + "35%, " 
    if bin_array[4]==0:
        midpoint = midpoint + "45%, "
    if bin_array[5]==0:
        midpoint = midpoint + "55%, "
    if bin_array[6]==0:
        midpoint = midpoint + "65%, "
    if bin_array[7]==0:
        midpoint = midpoint + "75%, "
    if bin_array[8]==0:
        midpoint = midpoint + "85%, "
    if bin_array[9]==0:
        midpoint = midpoint + "95%, "
    return "The missing bins have midpoint values of "+ str(midpoint)

def batch(data, batch_size):
    X_batches = []
    y_batches = []
    times = math.floor(data.shape[0]/batch_size)
    remainder = data.shape[0]%times
    a = 0
    start = 0
    end = start+batch_size
    if remainder ==0:
        a +=1
    while a<times:
        temp = pad(data[start:end,],0)
        x = torch.from_numpy(temp[:,:,1:-1]).float() # without icustay_id and without aki_stage columns
        y = torch.flatten(torch.from_numpy(temp[:, :,-1].reshape(-1,1)).float()).long()
        X_batches.append(x)
        y_batches.append(y)
        start = end
        end = start+batch_size
        a +=1
    temp = pad(data[start:data.shape[0]],0)
    x = torch.from_numpy(temp[:,:,1:-1]).float()
    y = torch.flatten(torch.from_numpy(temp[:, :,-1].reshape(-1,1)).float()).long()
    X_batches.append(x)
    y_batches.append(y)
    if len(X_batches) != len(y_batches):
        print("length error")
    return X_batches, y_batches # arrays

class Net(nn.Module):
    def __init__(self, input_size, emb_size, output_size, bi_directional, number_layers, dropout):
        super(Net, self).__init__()
        self.input_size = input_size
        self.emb_size = emb_size 
        self.output_size = output_size
        self.number_layers = number_layers
        self.fc1 = nn.Linear(self.input_size, self.emb_size, bias = True) # I can have a few (IV) within this line - documentation        
        self.fc2 = nn.LSTM(self.emb_size, self.output_size,num_layers=self.number_layers, batch_first = True, bidirectional = bi_directional) 
        # in bidirectional encoder we have  forward and backward hidden states
        self.encoding_size = self.output_size * 2 if bi_directional else self.output_size
        self.combination_layer = nn.Linear(self.encoding_size, self.encoding_size)
        # Create affine layer to project to the classes 
        self.projection = nn.Linear(self.encoding_size, self.output_size)
        #dropout layer for regularizetion of a sequence
        self.dropout_layer = nn.Dropout(p = dropout)  
        self.relu = nn.ReLU()
        
    def forward(self, x):
        h = self.relu(self.fc1(x))
        h, _ = self.fc2(h) # h, _ : as I have 2outputs (tuple), only take the real output [0]. 
        #print(type(h)) # Underscore throughs away the rest, _ "I do not care" variable notation in python
        h = self.relu(self.combination_layer(h))
        h = self.dropout_layer(h)
        h = self.projection(h) 
        return h
    pass

In [None]:
# optional: load best features
optimal_features = np.load("data/optimal_features.npy", allow_pickle=True)
# Extracting feature names (keys) from optimal_features
optimal_feature_names = [feature[0] for feature in optimal_features]
# include also aki_stage and icustay_id
optimal_feature_names.extend(['aki_stage', 'icustay_id', 'charttime'])
print(optimal_feature_names)

# XGB Training Loop

In [None]:
# original = pd.read_csv("data/preprocessed/preprocessed_data_6H.csv")
original = pd.read_csv(os.path.join(base_data_path, "resampled", "aki_stage_X_original_24H.csv"))
extended = pd.read_csv(os.path.join(base_data_path, "resampled", "aki_stage_X_extended_24H.csv"))

In [None]:
print(len(original.columns))
print(len(extended.columns))

In [None]:
print(original.columns)

In [None]:
print(extended.columns)

In [None]:
# normal

from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
import os
import gc

results = {}

datasets = [
    "aki_stage_X_extended_6H.csv",
              ]

for dataset in datasets:
    
    tail = dataset
    data_path = os.path.join(base_data_path,"resampled", dataset)
    X = pd.read_csv(data_path)
    # take only head 
    # X = X.head(10000)

    # For training a testing model, take only icu_stay_id, charttime,creatinine_mean,uo_rt_6hr,aki_stage
    # X = X[['icustay_id', 'charttime', 'creatinine_mean', 'uo_rt_6hr', 'aki_stage']]

    numeric_feat = X.select_dtypes(include=[np.number]).columns.tolist()
    numeric_feat.remove('aki_stage',)
    numeric_feat.remove('icustay_id',)


    # normalize data and cap features
    # X = cap_data(X)
    X, normalization_parameters = normalise_data(X, numeric_feat)
    
    print(len(X.columns))
    print(X.columns)

    # X = X.sort_values(by=['icustay_id', 'charttime'])
    X = X.sort_values(by=['icustay_id'])

    seq_lengths = X.groupby(['icustay_id'],as_index=False).size().sort_values(by = ['size'],ascending=False)
    sequence_length = seq_lengths.max() # the longest sequence per icustay-id

    #AL re-write as try except to make it work as hadm_id is not used if only one csv file is used and none are merged
    try:
        X.drop(['hadm_id'], axis=1, inplace = True)
    except:
        pass

    
    id_list = X['icustay_id'].unique()
    # take the common id list defined earlier
    # id_list = common_id_list
    
    id_train, id_test_val = train_test_split(id_list, test_size = SPLIT_SIZE, random_state = 42) # train set is 80%)
    # remaining 20% split in halves as test and validation 10% and 10%
    id_valid, id_test = train_test_split(id_test_val, test_size = 0.5, random_state = 42) # test 10% valid 10%

    # move ("aki_stage") to last column
    X = X.reindex(columns = [col for col in X.columns if col != 'aki_stage'] + ['aki_stage'])

    train = X[X.icustay_id.isin(id_train)].sort_values(by=['icustay_id'])
    test = X[X.icustay_id.isin(id_test)].sort_values(by=['icustay_id'], ignore_index = True) 
    validation = X[X.icustay_id.isin(id_valid)].sort_values(by=['icustay_id']) 

    test = test.sort_values(by=['icustay_id'], ignore_index = True)
    train = train.sort_values(by=['icustay_id'], ignore_index = True)
    validation = validation.sort_values(by=['icustay_id'], ignore_index = True)

    train.drop(['charttime'], axis=1, inplace = True)  
    test.drop(['charttime'], axis=1, inplace = True)
    validation.drop(['charttime'], axis=1, inplace = True)

    try:
        X.drop(['hadm_id'], axis=1, inplace = True)
    except:
        pass

    train = train.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)
    test = test.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)
    validation = validation.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)


    # flatten the train, test and validation data
    train_flat = np.concatenate(train, axis=0)
    test_flat = np.concatenate(test, axis=0)
    validation_flat = np.concatenate(validation, axis=0)

    # get the labels
    train_labels = np.array([x[-1] for x in train_flat])
    test_labels = np.array([x[-1] for x in test_flat])
    validation_labels = np.array([x[-1] for x in validation_flat])

    # get the features
    train_features = np.array([x[1:-1] for x in train_flat])
    validation_features = np.array([x[1:-1] for x in validation_flat])
    test_features = np.array([x[1:-1] for x in test_flat])

    # create the XGBoost classifier
    xgb = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='mlogloss', random_state=RANDOM)

    # train the classifier
    xgb.fit(train_features, train_labels)

    # get the predictions
    train_predictions = xgb.predict(train_features)
    test_predictions = xgb.predict(test_features)
    validation_predictions = xgb.predict(validation_features)

    # get the accuracy
    train_accuracy = accuracy_score(train_labels, train_predictions)
    test_accuracy = accuracy_score(test_labels, test_predictions)
    validation_accuracy = accuracy_score(validation_labels, validation_predictions)

    # get the probabilities of the positive class
    training_prob = xgb.predict_proba([x[1:-1] for x in train_flat])[:, 1]
    test_prob = xgb.predict_proba([x[1:-1] for x in test_flat])[:, 1]
    validation_prob = xgb.predict_proba([x[1:-1] for x in validation_flat])[:, 1]

    # calculate ROC AUC and PR AUC for the training set
    training_roc_auc = roc_auc_score(train_labels, training_prob)
    training_pr_auc = average_precision_score(train_labels, training_prob)

    # calculate ROC AUC and PR AUC for the test set
    test_roc_auc = roc_auc_score(test_labels, test_prob)
    test_pr_auc = average_precision_score(test_labels, test_prob)

    # calculate ROC AUC and PR AUC for the validation set
    validation_roc_auc = roc_auc_score(validation_labels, validation_prob)
    validation_pr_auc = average_precision_score(validation_labels, validation_prob)
    
    print(f"Results for {tail}")
    print(f"Train accuracy: {train_accuracy:.3f}.. Train ROC AUC: {training_roc_auc:.2f}.. Train PR AUC: {training_pr_auc:.2f}..")
    print(f"Test accuracy: {test_accuracy:.3f}.. Test ROC AUC: {test_roc_auc:.2f}.. Test PR AUC: {test_pr_auc:.2f}..")
    print(f"Validation accuracy: {validation_accuracy:.3f}.. Validation ROC AUC: {validation_roc_auc:.2f}.. Validation PR AUC: {validation_pr_auc:.2f}..")

    now = datetime.now()
    out_path = os.path.join(base_data_path, "models", f"{tail}_{now.strftime('%Y%m%d%H%M%S')}")
    os.makedirs(out_path, exist_ok=True)
    # save the xgb model
    xgb.save_model(f'{out_path}/xgb.model')
    # save normalization parameters
    try:
        np.save(f'{out_path}/normalization_parameters.npy', normalization_parameters)
    except:
        pass
    # save the train feature names
    np.save(f'{out_path}/train_feature_names.npy', X.columns[2:-1])

    results[tail] = {'train_accuracy': train_accuracy, 'test_accuracy': test_accuracy, 'validation_accuracy': validation_accuracy,
                                'train_roc_auc': training_roc_auc, 'test_roc_auc': test_roc_auc, 'validation_roc_auc': validation_roc_auc,
                                'train_pr_auc': training_pr_auc, 'test_pr_auc': test_pr_auc, 'validation_pr_auc': validation_pr_auc}

    # save results dict
    np.save(f'{out_path}/results.npy', results)
    
    gc.collect()
    

In [None]:
# hyperparameter search (grid search)
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
from sklearn.model_selection import GridSearchCV
import os
import gc
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from datetime import datetime

results = {}

datasets = [
    "aki_stage_X_extended_6H.csv",
]

for dataset in datasets:

    tail = dataset
    data_path = os.path.join(base_data_path,"resampled", dataset)
    X = pd.read_csv(data_path)
    # take only head 
    # X = X.head(10000)

    # For training a testing model, take only icu_stay_id, charttime,creatinine_mean,uo_rt_6hr,aki_stage
    # X = X[['icustay_id', 'charttime', 'creatinine_mean', 'uo_rt_6hr', 'aki_stage']]

    numeric_feat = X.select_dtypes(include=[np.number]).columns.tolist()
    numeric_feat.remove('aki_stage',)
    numeric_feat.remove('icustay_id',)


    # normalize data and cap features
    # X = cap_data(X)
    X, normalization_parameters = normalise_data(X, numeric_feat)
    
    print(len(X.columns))
    print(X.columns)

    # X = X.sort_values(by=['icustay_id', 'charttime'])
    X = X.sort_values(by=['icustay_id'])

    seq_lengths = X.groupby(['icustay_id'],as_index=False).size().sort_values(by = ['size'],ascending=False)
    sequence_length = seq_lengths.max() # the longest sequence per icustay-id

    #AL re-write as try except to make it work as hadm_id is not used if only one csv file is used and none are merged
    try:
        X.drop(['hadm_id'], axis=1, inplace = True)
    except:
        pass

    
    id_list = X['icustay_id'].unique()
    # take the common id list defined earlier
    # id_list = common_id_list
    
    id_train, id_test_val = train_test_split(id_list, test_size = SPLIT_SIZE, random_state = 42) # train set is 80%)
    # remaining 20% split in halves as test and validation 10% and 10%
    id_valid, id_test = train_test_split(id_test_val, test_size = 0.5, random_state = 42) # test 10% valid 10%

    # move ("aki_stage") to last column
    X = X.reindex(columns = [col for col in X.columns if col != 'aki_stage'] + ['aki_stage'])

    train = X[X.icustay_id.isin(id_train)].sort_values(by=['icustay_id'])
    test = X[X.icustay_id.isin(id_test)].sort_values(by=['icustay_id'], ignore_index = True) 
    validation = X[X.icustay_id.isin(id_valid)].sort_values(by=['icustay_id']) 

    test = test.sort_values(by=['icustay_id'], ignore_index = True)
    train = train.sort_values(by=['icustay_id'], ignore_index = True)
    validation = validation.sort_values(by=['icustay_id'], ignore_index = True)

    train.drop(['charttime'], axis=1, inplace = True)  
    test.drop(['charttime'], axis=1, inplace = True)
    validation.drop(['charttime'], axis=1, inplace = True)

    try:
        X.drop(['hadm_id'], axis=1, inplace = True)
    except:
        pass

    train = train.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)
    test = test.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)
    validation = validation.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)


    # flatten the train, test and validation data
    train_flat = np.concatenate(train, axis=0)
    test_flat = np.concatenate(test, axis=0)
    validation_flat = np.concatenate(validation, axis=0)

    # get the labels
    train_labels = np.array([x[-1] for x in train_flat])
    test_labels = np.array([x[-1] for x in test_flat])
    validation_labels = np.array([x[-1] for x in validation_flat])

    # get the features
    train_features = np.array([x[1:-1] for x in train_flat])
    validation_features = np.array([x[1:-1] for x in validation_flat])
    test_features = np.array([x[1:-1] for x in test_flat])
    
    # Define the parameter grid
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.3],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
    
    # Create the base model
    xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=RANDOM)
    
    # Instantiate GridSearchCV
    grid_search = GridSearchCV(
        estimator=xgb,
        param_grid=param_grid,
        cv=3,
        n_jobs=-1,
        verbose=2,
        scoring='roc_auc'
    )
    
    # Perform grid search on the training data
    grid_search.fit(train_features, train_labels)
    
    # Get the best model
    best_xgb = grid_search.best_estimator_
    
    # Print the best parameters
    print("Best parameters:", grid_search.best_params_)
    
    # # Use the best model for predictions
    # train_predictions = best_xgb.predict(train_features)
    # test_predictions = best_xgb.predict(test_features)
    # validation_predictions = best_xgb.predict(validation_features)
    
    # # Calculate accuracies
    # train_accuracy = accuracy_score(train_labels, train_predictions)
    # test_accuracy = accuracy_score(test_labels, test_predictions)
    # validation_accuracy = accuracy_score(validation_labels, validation_predictions)
    
    # # Get probabilities
    # training_prob = best_xgb.predict_proba(train_features)[:, 1]
    # test_prob = best_xgb.predict_proba(test_features)[:, 1]
    # validation_prob = best_xgb.predict_proba(validation_features)[:, 1]
    
    # # Calculate ROC AUC and PR AUC
    # training_roc_auc = roc_auc_score(train_labels, training_prob)
    # training_pr_auc = average_precision_score(train_labels, training_prob)
    # test_roc_auc = roc_auc_score(test_labels, test_prob)
    # test_pr_auc = average_precision_score(test_labels, test_prob)
    # validation_roc_auc = roc_auc_score(validation_labels, validation_prob)
    # validation_pr_auc = average_precision_score(validation_labels, validation_prob)
        
    gc.collect()

In [None]:
# hyperparameter search (bayesian optimization)

from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
import os
import gc
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from datetime import datetime

results = {}

datasets = [
    "aki_stage_X_extended_6H.csv",
]

for dataset in datasets:
    tail = dataset
    data_path = os.path.join(base_data_path,"resampled", dataset)
    X = pd.read_csv(data_path)

    numeric_feat = X.select_dtypes(include=[np.number]).columns.tolist()
    numeric_feat.remove('aki_stage',)
    numeric_feat.remove('icustay_id',)

    X, normalization_parameters = normalise_data(X, numeric_feat)

    # X = X.sort_values(by=['icustay_id', 'charttime'])
    X = X.sort_values(by=['icustay_id'])

    seq_lengths = X.groupby(['icustay_id'],as_index=False).size().sort_values(by = ['size'],ascending=False)
    sequence_length = seq_lengths.max() # the longest sequence per icustay-id

    #AL re-write as try except to make it work as hadm_id is not used if only one csv file is used and none are merged
    try:
        X.drop(['hadm_id'], axis=1, inplace = True)
    except:
        pass

    
    id_list = X['icustay_id'].unique()

    
    id_train, id_test_val = train_test_split(id_list, test_size = SPLIT_SIZE, random_state = 42) # train set is 80%)
    # remaining 20% split in halves as test and validation 10% and 10%
    id_valid, id_test = train_test_split(id_test_val, test_size = 0.5, random_state = 42) # test 10% valid 10%

    X = X.reindex(columns = [col for col in X.columns if col != 'aki_stage'] + ['aki_stage'])

    train = X[X.icustay_id.isin(id_train)].sort_values(by=['icustay_id'])
    test = X[X.icustay_id.isin(id_test)].sort_values(by=['icustay_id'], ignore_index = True) 
    validation = X[X.icustay_id.isin(id_valid)].sort_values(by=['icustay_id']) 

    test = test.sort_values(by=['icustay_id'], ignore_index = True)
    train = train.sort_values(by=['icustay_id'], ignore_index = True)
    validation = validation.sort_values(by=['icustay_id'], ignore_index = True)

    train.drop(['charttime'], axis=1, inplace = True)  
    test.drop(['charttime'], axis=1, inplace = True)
    validation.drop(['charttime'], axis=1, inplace = True)

    try:
        X.drop(['hadm_id'], axis=1, inplace = True)
    except:
        pass

    train = train.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)
    test = test.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)
    validation = validation.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)


    # flatten the train, test and validation data
    train_flat = np.concatenate(train, axis=0)
    test_flat = np.concatenate(test, axis=0)
    validation_flat = np.concatenate(validation, axis=0)

    # get the labels
    train_labels = np.array([x[-1] for x in train_flat])
    test_labels = np.array([x[-1] for x in test_flat])
    validation_labels = np.array([x[-1] for x in validation_flat])

    # get the features
    train_features = np.array([x[1:-1] for x in train_flat])
    validation_features = np.array([x[1:-1] for x in validation_flat])
    test_features = np.array([x[1:-1] for x in test_flat])
    
    param_space = {
    'n_estimators': Integer(300, 2000),
    'subsample': Real(0.1, 0.6),
    'min_child_weight': Integer(10, 100),
    'gamma': Real(0.5, 0.99, prior='log-uniform')
}

    # Create the base model with fixed parameters
    xgb = XGBClassifier(
        max_depth=8,  # Fixed parameter
        learning_rate=0.025,  # Fixed parameter
        colsample_bytree=1.0,  # Fixed parameter
        eval_metric='mlogloss',
        random_state=RANDOM
    )
    
    # Instantiate BayesSearchCV
    bayes_search = BayesSearchCV(
        estimator=xgb,
        search_spaces=param_space,
        n_iter=10,  # number of iterations
        cv=3,
        n_jobs=-1,
        verbose=2,
        scoring='roc_auc',
        random_state=RANDOM
    )
    
    # Perform Bayesian optimization on the training data
    bayes_search.fit(train_features, train_labels)
    
    # Get the best model
    best_xgb = bayes_search.best_estimator_
    
    # Print the best parameters
    print("Best parameters:", bayes_search.best_params_)
    
    # # Use the best model for predictions
    # train_predictions = best_xgb.predict(train_features)
    # test_predictions = best_xgb.predict(test_features)
    # validation_predictions = best_xgb.predict(validation_features)
    
    # # Calculate accuracies
    # train_accuracy = accuracy_score(train_labels, train_predictions)
    # test_accuracy = accuracy_score(test_labels, test_predictions)
    # validation_accuracy = accuracy_score(validation_labels, validation_predictions)
    
    # # Get probabilities
    # training_prob = best_xgb.predict_proba(train_features)[:, 1]
    # test_prob = best_xgb.predict_proba(test_features)[:, 1]
    # validation_prob = best_xgb.predict_proba(validation_features)[:, 1]
    
    # # Calculate ROC AUC and PR AUC
    # training_roc_auc = roc_auc_score(train_labels, training_prob)
    # training_pr_auc = average_precision_score(train_labels, training_prob)
    # test_roc_auc = roc_auc_score(test_labels, test_prob)
    # test_pr_auc = average_precision_score(test_labels, test_prob)
    # validation_roc_auc = roc_auc_score(validation_labels, validation_prob)
    # validation_pr_auc = average_precision_score(validation_labels, validation_prob)
    
    # # Print results
    # print(f"Results for {tail}")
    # print(f"Train accuracy: {train_accuracy:.3f}.. Train ROC AUC: {training_roc_auc:.2f}.. Train PR AUC: {training_pr_auc:.2f}..")
    # print(f"Test accuracy: {test_accuracy:.3f}.. Test ROC AUC: {test_roc_auc:.2f}.. Test PR AUC: {test_pr_auc:.2f}..")
    # print(f"Validation accuracy: {validation_accuracy:.3f}.. Validation ROC AUC: {validation_roc_auc:.2f}.. Validation PR AUC: {validation_pr_auc:.2f}..")
    
    # # Save results and model
    # now = datetime.now()
    # out_path = os.path.join(base_data_path, "models", f"{tail}_{now.strftime('%Y%m%d%H%M%S')}")
    # os.makedirs(out_path, exist_ok=True)
    
    # best_xgb.save_model(f'{out_path}/best_xgb.model')
    # np.save(f'{out_path}/best_params.npy', bayes_search.best_params_)
    # np.save(f'{out_path}/train_feature_names.npy', X.columns[2:-1])
    
    # results[tail] = {
    #     'train_accuracy': train_accuracy, 'test_accuracy': test_accuracy, 'validation_accuracy': validation_accuracy,
    #     'train_roc_auc': training_roc_auc, 'test_roc_auc': test_roc_auc, 'validation_roc_auc': validation_roc_auc,
    #     'train_pr_auc': training_pr_auc, 'test_pr_auc': test_pr_auc, 'validation_pr_auc': validation_pr_auc,
    #     'best_params': bayes_search.best_params_
    # }
    
    # np.save(f'{out_path}/results.npy', results)
    
    gc.collect()

In [10]:
data_paths = [
    "data/preprocessed/preprocessed_data_1H.csv",
    "data/preprocessed/preprocessed_data_2H.csv",
    "data/preprocessed/preprocessed_data_4H.csv",
    "data/preprocessed/preprocessed_data_6H.csv",
    "data/preprocessed/preprocessed_data_8H.csv",
    "data/preprocessed/preprocessed_data_12H.csv",
    "data/preprocessed/preprocessed_data_24H.csv",
]

for data_path in data_paths:
    X = pd.read_csv(data_path)
    X.drop(['height_first', 'hadm_id', 'weight_first', 'inr_max'], axis=1, inplace = True)
    # write back to the same file
    X.to_csv(data_path, index=False)

In [17]:
# cross validation

from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, roc_curve, precision_recall_curve, f1_score, auc, brier_score_loss
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
import os
import gc
from sklearn.metrics import roc_curve, precision_recall_curve, f1_score, roc_auc_score, auc, accuracy_score

results = {}

datasets = [
    # "aki_stage_X_extended_1H.csv",
    # "aki_stage_X_extended_2H.csv",
    # "aki_stage_X_extended_4H.csv",
    # "aki_stage_X_extended_6H.csv",
    # "aki_stage_X_extended_8H.csv",
    "aki_stage_X_extended_12H.csv",
    # "aki_stage_X_extended_24H.csv",
    # "aki_stage_X_original_1H.csv",
    # "aki_stage_X_original_2H.csv",
    # "aki_stage_X_original_4H.csv",
    # "aki_stage_X_original_6H.csv",
    # "aki_stage_X_original_8H.csv",
    # "aki_stage_X_original_12H.csv",
    # "aki_stage_X_original_24H.csv",
              ]

for dataset in datasets:
    
    tail = dataset
    data_path = os.path.join(base_data_path,"resampled", dataset)
    X = pd.read_csv(data_path)

    numeric_feat = X.select_dtypes(include=[np.number]).columns.tolist()
    numeric_feat.remove('aki_stage')
    numeric_feat.remove('icustay_id')

    X, normalization_parameters = normalise_data(X, numeric_feat)
    X = X.sort_values(by=['icustay_id'])

    try:
        X.drop(['hadm_id'], axis=1, inplace=True)
    except:
        pass

    id_list = X['icustay_id'].unique()

    id_list.sort()

    # Move "aki_stage" to last column
    X = X.reindex(columns=[col for col in X.columns if col != 'aki_stage'] + ['aki_stage'])

    X.drop(['charttime'], axis=1, inplace=True)

    # Group by icustay_id and convert to numpy arrays
    grouped_data = X.groupby('icustay_id').apply(lambda x: x.drop('icustay_id', axis=1).to_numpy())

    # Initialize KFold
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    fold_results = []

    print(len(X))

    for fold, (id_train_idx, id_val_idx) in enumerate(kf.split(id_list), 1):
        print(f"Processing fold {fold}")
        id_train = [id_list[idx] for idx in id_train_idx]
        id_val = [id_list[idx] for idx in id_val_idx]       

        train = X[X.icustay_id.isin(id_train)].sort_values(by=['icustay_id'])
        validation = X[X.icustay_id.isin(id_val)].sort_values(by=['icustay_id']) 

        train = train.sort_values(by=['icustay_id'], ignore_index = True)
        validation = validation.sort_values(by=['icustay_id'], ignore_index = True)

        try:
            X.drop(['hadm_id'], axis=1, inplace = True)
        except:
            pass
        
        train = train.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)
        validation = validation.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)

        # flatten the train, test and validation data
        train_flat = np.concatenate(train, axis=0)
        validation_flat = np.concatenate(validation, axis=0)

        # get the labels
        train_labels = np.array([x[-1] for x in train_flat])
        val_labels = np.array([x[-1] for x in validation_flat])

        # get the features
        train_features = np.array([x[1:-1] for x in train_flat])
        val_features = np.array([x[1:-1] for x in validation_flat])

        # Create and train the XGBoost classifier
        # xgb = XGBClassifier(n_estimators=1000, use_label_encoder=False, eval_metric='mlogloss', random_state=42)
        xgb = XGBClassifier(
            gamma=0.6775421629797868, 
            min_child_weight=93, 
            n_estimators=478, 
            subsample=0.31666640093511256,
            max_depth=8,  # Fixed parameter
            learning_rate=0.025,  # Fixed parameter
            colsample_bytree=1.0,  # Fixed parameter
            eval_metric='mlogloss',
            random_state=RANDOM
        )
            
        xgb.fit(train_features, train_labels)

        # Make predictions
        train_predictions = xgb.predict(train_features)
        val_predictions = xgb.predict(val_features)

        # unique values in the labels
        unique_labels = np.unique(np.concatenate([train_labels, val_labels]))
        unique_labels_pred = np.unique(np.concatenate([train_predictions, val_predictions]))
        
        # Calculate accuracies
        train_accuracy = accuracy_score(train_labels, train_predictions)
        val_accuracy = accuracy_score(val_labels, val_predictions)
        

        # Calculate probabilities
        train_prob = xgb.predict_proba(train_features)[:, 1]
        val_prob = xgb.predict_proba(val_features)[:, 1]


        # Calculate ROC AUC and PR AUC
        train_roc_auc = roc_auc_score(train_labels, train_prob)
        train_pr_auc = average_precision_score(train_labels, train_prob)
        val_roc_auc = roc_auc_score(val_labels, val_prob)
        val_pr_auc = average_precision_score(val_labels, val_prob)
        train_brier = brier_score_loss(train_labels, train_prob)
        val_brier = brier_score_loss(val_labels, val_prob)
                
        # compute roc auc
        roc_auc = roc_auc_score(val_labels, val_prob, average = 'micro')
        # compute Precision_Recall curves
        precision, recall, _ = precision_recall_curve(val_labels, val_prob)
        # compute PR_AUC
        pr_auc = metrics.auc(recall, precision)

        fold_results.append({
            'fold': fold,
            'train_accuracy': train_accuracy,
            'val_accuracy': val_accuracy,
            'train_roc_auc': train_roc_auc,
            'val_roc_auc': val_roc_auc,
            'train_pr_auc': train_pr_auc,
            'val_pr_auc': val_pr_auc,
            'roc_auc': roc_auc,
            'pr_auc': pr_auc,
            'train_brier': train_brier,
            'val_brier': val_brier
        })

        print(f"Fold {fold} results:")
        print(f"Train accuracy: {train_accuracy:.3f}, ROC AUC: {train_roc_auc:.2f}, PR AUC: {train_pr_auc:.2f}, Brier: {train_brier:.4f}")
        print(f"Validation accuracy: {val_accuracy:.3f}, ROC AUC: {val_roc_auc:.2f}, PR AUC: {val_pr_auc:.2f}, ROC AUC: {roc_auc:.2f}, PR AUC: {pr_auc:.2f}, Brier: {val_brier:.4f}")

    # Calculate average scores across folds
    avg_scores = {
        'train_accuracy': np.mean([r['train_accuracy'] for r in fold_results]),
        'val_accuracy': np.mean([r['val_accuracy'] for r in fold_results]),
        'train_roc_auc': np.mean([r['train_roc_auc'] for r in fold_results]),
        'val_roc_auc': np.mean([r['val_roc_auc'] for r in fold_results]),
        'train_pr_auc': np.mean([r['train_pr_auc'] for r in fold_results]),
        'val_pr_auc': np.mean([r['val_pr_auc'] for r in fold_results]),
        'roc_auc': np.mean([r['roc_auc'] for r in fold_results]),
        'pr_auc': np.mean([r['pr_auc'] for r in fold_results]),
        'train_brier': np.mean([r['train_brier'] for r in fold_results]),
        'val_brier': np.mean([r['val_brier'] for r in fold_results])
    }

    print("\nAverage scores across 5 folds:")
    for metric, value in avg_scores.items():
        print(f"{metric}: {value:.4f}")

    results[tail] = {
        'fold_results': fold_results,
        'average_scores': avg_scores
    }

    now = datetime.now()
    out_path = os.path.join(base_data_path, "models", f"{tail}_{now.strftime('%Y%m%d%H%M%S')}")
    os.makedirs(out_path, exist_ok=True)
    
    # Save results
    np.save(f'{out_path}/results.npy', results)

    gc.collect()

# Save overall results
np.save(os.path.join(base_data_path, "models", "xgb_cross_validation_results.npy"), results)

Normalizing in [0,1] with min-max normalization


  grouped_data = X.groupby('icustay_id').apply(lambda x: x.drop('icustay_id', axis=1).to_numpy())


872509
Processing fold 1


  train = train.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)
  validation = validation.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)


Fold 1 results:
Train accuracy: 0.837, ROC AUC: 0.79, PR AUC: 0.56, Brier: 0.1220
Validation accuracy: 0.821, ROC AUC: 0.77, PR AUC: 0.54, ROC AUC: 0.77, PR AUC: 0.54, Brier: 0.1329
Processing fold 2


  train = train.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)
  validation = validation.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)


Fold 2 results:
Train accuracy: 0.834, ROC AUC: 0.79, PR AUC: 0.56, Brier: 0.1241
Validation accuracy: 0.832, ROC AUC: 0.77, PR AUC: 0.53, ROC AUC: 0.77, PR AUC: 0.53, Brier: 0.1263
Processing fold 3


  train = train.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)
  validation = validation.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)


Fold 3 results:
Train accuracy: 0.833, ROC AUC: 0.79, PR AUC: 0.56, Brier: 0.1245
Validation accuracy: 0.836, ROC AUC: 0.78, PR AUC: 0.53, ROC AUC: 0.78, PR AUC: 0.53, Brier: 0.1235
Processing fold 4


  train = train.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)
  validation = validation.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)


Fold 4 results:
Train accuracy: 0.834, ROC AUC: 0.79, PR AUC: 0.57, Brier: 0.1233
Validation accuracy: 0.828, ROC AUC: 0.76, PR AUC: 0.51, ROC AUC: 0.76, PR AUC: 0.51, Brier: 0.1293
Processing fold 5


  train = train.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)
  validation = validation.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)


Fold 5 results:
Train accuracy: 0.834, ROC AUC: 0.79, PR AUC: 0.56, Brier: 0.1234
Validation accuracy: 0.828, ROC AUC: 0.77, PR AUC: 0.54, ROC AUC: 0.77, PR AUC: 0.54, Brier: 0.1283

Average scores across 5 folds:
train_accuracy: 0.8343
val_accuracy: 0.8290
train_roc_auc: 0.7924
val_roc_auc: 0.7701
train_pr_auc: 0.5630
val_pr_auc: 0.5306
roc_auc: 0.7701
pr_auc: 0.5306
train_brier: 0.1234
val_brier: 0.1280


# LSTM Training Loop

In [12]:
# lstm with whole sequences

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, brier_score_loss
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import math
from datetime import datetime
import os
import gc
from torch.utils.tensorboard import SummaryWriter

results = {}

datasets = [
    # "aki_stage_X_extended_1H.csv",
    # "aki_stage_X_extended_2H.csv",
    # "aki_stage_X_extended_4H.csv",
    "aki_stage_X_extended_6H.csv",
    # "aki_stage_X_extended_8H.csv",
    # "aki_stage_X_extended_12H.csv",
    # "aki_stage_X_extended_24H.csv",
    # "aki_stage_X_original_1H.csv",
    # "aki_stage_X_original_2H.csv",
    # "aki_stage_X_original_4H.csv",
    # "aki_stage_X_original_6H.csv",
    # "aki_stage_X_original_8H.csv",
    # "aki_stage_X_original_12H.csv",
    # "aki_stage_X_original_24H.csv",
              ]

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'Training on {device}')

def batch(data, batch_size):
    X_batches = []
    y_batches = []
    
    for i in range(0, len(data), batch_size):
        batch_data = data[i:i+batch_size]
        
        # Pad sequences to the same length within the batch
        max_seq_length = max(len(seq) for seq in batch_data)
        
        X_batch = []
        y_batch = []
        
        for seq in batch_data:
            padded_seq = np.pad(seq, ((0, max_seq_length - len(seq)), (0, 0)), mode='constant')
            X_batch.append(padded_seq[:, 1:-1])  # Exclude icustay_id and aki_stage
            y_batch.append(padded_seq[-1, -1])  # Take the aki_stage of the first row
        
        X_batches.append(torch.FloatTensor(X_batch))
        y_batches.append(torch.LongTensor(y_batch))
    
    return X_batches, y_batches

class Net(nn.Module):
    def __init__(self, input_size, emb_size, output_size, bi_directional, number_layers, dropout):
        super(Net, self).__init__()
        self.lstm = nn.LSTM(input_size, emb_size, num_layers=number_layers, 
                            bidirectional=bi_directional, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(emb_size * (2 if bi_directional else 1), output_size)

    def forward(self, x):
        _, (hidden, _) = self.lstm(x)
        
        if self.lstm.bidirectional:
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        else:
            hidden = hidden[-1]
        
        out = self.fc(hidden)
        return out.squeeze(1)

for dataset in datasets:
    tail = dataset
    data_path = os.path.join(base_data_path,"resampled", dataset)
    print(f"Processing {tail}")

    X = pd.read_csv(data_path)
    # X = X.head(10000)  # only take the first 10000 rows

    numeric_feat = X.select_dtypes(include=[np.number]).columns.tolist()
    numeric_feat.remove('aki_stage')
    numeric_feat.remove('icustay_id')

    X, normalization_parameters = normalise_data(X, numeric_feat)
    X = X.sort_values(by=['icustay_id'])

    try:
        X.drop(['hadm_id'], axis=1, inplace=True)
    except:
        pass

    id_list = X['icustay_id'].unique()
    id_train, id_test_val = train_test_split(id_list, test_size=SPLIT_SIZE, random_state=42)
    id_valid, id_test = train_test_split(id_test_val, test_size=0.5, random_state=42)

    X = X.reindex(columns=[col for col in X.columns if col != 'aki_stage'] + ['aki_stage'])

    train = X[X.icustay_id.isin(id_train)].sort_values(by=['icustay_id', 'charttime'])
    # print rows with first icustay_id
    print(train[train.icustay_id == id_train[0]])
    
    test = X[X.icustay_id.isin(id_test)].sort_values(by=['icustay_id', 'charttime'])
    validation = X[X.icustay_id.isin(id_valid)].sort_values(by=['icustay_id', 'charttime'])

    train.drop(['charttime'], axis=1, inplace=True)
    test.drop(['charttime'], axis=1, inplace=True)
    validation.drop(['charttime'], axis=1, inplace=True)

    train = list(train.groupby(['icustay_id']).apply(pd.DataFrame.to_numpy))
    train.sort(key=len, reverse=True)  # Sort by sequence length in descending order
    print("Number of sequences in train:", len(train))
    print("Longest sequence in train:", max(len(seq) for seq in train))
    print("Shortest sequence in train:", min(len(seq) for seq in train))
    print("Number of columns within a sequence in train:", len(train[0][0]))
    test = list(test.groupby(['icustay_id']).apply(pd.DataFrame.to_numpy))
    validation = list(validation.groupby(['icustay_id']).apply(pd.DataFrame.to_numpy))

    batch_size = 32  # You may need to adjust this
    X_train, y_train = batch(train, batch_size)
    X_test, y_test = batch(test, batch_size)
    X_val, y_val = batch(validation, batch_size)

    print(f"Number of batches in train: {len(X_train)}")
    os.makedirs(os.path.join(base_data_path, "models", tail), exist_ok=True)
    writer = SummaryWriter(os.path.join(base_data_path, "logs", tail, datetime.now().strftime("%Y%m%d-%H%M%S")))

    input_size = X_train[0].shape[2]
    output_size = 1
    emb_size = round(input_size / 1)
    number_layers = 3
    dropout = 0.1
    bi_directional = True

    nn_model = Net(input_size, emb_size, output_size, bi_directional, number_layers, dropout).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(nn_model.parameters(), lr=0.01)

    use_pretrained = False
    best_auc = 0
    start_epoch = 0

    if use_pretrained:
        model_path = f'data/models/{tail}/LSTM_best.pth'
        if os.path.exists(model_path):
            checkpoint = torch.load(model_path)
            nn_model.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            best_auc = checkpoint.get('best_auc', 0)
            start_epoch = checkpoint.get('epoch', 0)
            print(f"Loaded pretrained model from {model_path} with AUC {best_auc} at epoch {start_epoch}.")
        else:
            print(f"No pretrained model found, starting training from scratch.")

    patience = 10  # Number of epochs to wait for improvement
    no_improvement_counter = 0  # Counter to track epochs without improvement

    n_epochs = 200
    best_auc = 0  # Initialize the best AUC for comparison

    for epoch in range(start_epoch + 1, n_epochs):
        nn_model.train()
        running_loss = 0.0
        running_accuracy = 0.0

        for i, (X_batch, y_batch) in enumerate(zip(X_train, y_train)):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = nn_model(X_batch)
            loss = criterion(outputs, y_batch.float())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            predicted = torch.sigmoid(outputs) > 0.5
            train_accuracy = accuracy_score(y_batch.cpu().numpy(), predicted.cpu().numpy())
            running_accuracy += train_accuracy

            writer.add_scalar('Training/Loss', loss.item(), epoch * len(X_train) + i)
            writer.add_scalar('Training/Accuracy', train_accuracy, epoch * len(X_train) + i)

        # Validation phase
        nn_model.eval()
        total_v_loss = 0
        all_y_val = []
        all_val_prob = []
        all_accuracy = 0

        for X_val_batch, y_val_batch in zip(X_val, y_val):
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
            with torch.no_grad():
                v_out = nn_model(X_val_batch)
                v_loss = criterion(v_out, y_val_batch.float())
                val_prob = torch.sigmoid(v_out)
                total_v_loss += v_loss.item()
                all_y_val.extend(y_val_batch.cpu().numpy())
                all_val_prob.extend(val_prob.cpu().numpy())

                predicted = val_prob > 0.5
                val_accuracy = accuracy_score(y_val_batch.cpu().numpy(), predicted.cpu().numpy())
                all_accuracy += val_accuracy

        avg_v_loss = total_v_loss / len(X_val)
        roc_auc = roc_auc_score(all_y_val, all_val_prob)
        avg_accuracy = all_accuracy / len(X_val)
        brier_score = brier_score_loss(all_y_val, all_val_prob)

        writer.add_scalar('Validation/Loss', avg_v_loss, epoch)
        writer.add_scalar('Validation/AUC', roc_auc, epoch)
        writer.add_scalar('Validation/Accuracy', avg_accuracy, epoch)
        writer.add_scalar('Validation/Brier_Score', brier_score, epoch)

        print(f"Epoch {epoch+1}/{n_epochs}, "
            f"Train Loss: {running_loss/len(X_train):.4f}, "
            f"Train Accuracy: {running_accuracy/len(X_train):.4f}, "
            f"Val Loss: {avg_v_loss:.4f}, "
            f"Val AUC: {roc_auc:.4f}, "
            f"Val Accuracy: {avg_accuracy:.4f}",
            f"Val Brier Score: {brier_score:.4f}")
        # Early stopping logic
        if roc_auc > best_auc:
            best_auc = roc_auc
            no_improvement_counter = 0  # Reset counter
            save_path = f'data/models/{tail}/LSTM_best.pth'
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            torch.save({
                'model_state_dict': nn_model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'best_auc': best_auc,
                'epoch': epoch,
                'brier_score': brier_score
            }, save_path)
            print(f"Model saved with AUC: {roc_auc:.4f} and Brier Score: {brier_score:.4f}")
        else:
            no_improvement_counter += 1
            print(f"No improvement for {no_improvement_counter} epochs.")

        # Stop if no improvement for 'patience' epochs
        if no_improvement_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}. Best AUC: {best_auc:.4f}. Best Brier Score: {brier_score:.4f}")
            break

    writer.close()
    gc.collect()

Training on cuda:0
Processing aki_stage_X_extended_6H.csv
Normalizing in [0,1] with min-max normalization
         icustay_id            charttime  albumin_mean  aniongap_mean  \
1494043      288185  2141-06-22 00:00:00      0.000000       0.135135   
1494044      288185  2141-06-22 06:00:00      0.000000       0.135135   
1494045      288185  2141-06-22 12:00:00      0.000000       0.000000   
1494046      288185  2141-06-22 18:00:00      0.000000       0.000000   
1494047      288185  2141-06-23 00:00:00      0.000000       0.148649   
...             ...                  ...           ...            ...   
1494171      288185  2141-07-24 00:00:00      0.434783       0.148649   
1494172      288185  2141-07-24 06:00:00      0.000000       0.000000   
1494173      288185  2141-07-24 12:00:00      0.000000       0.000000   
1494174      288185  2141-07-24 18:00:00      0.000000       0.000000   
1494175      288185  2141-07-25 00:00:00      0.000000       0.000000   

         bands_me

  train = list(train.groupby(['icustay_id']).apply(pd.DataFrame.to_numpy))


Number of sequences in train: 37801
Longest sequence in train: 133
Shortest sequence in train: 1
Number of columns within a sequence in train: 46


  test = list(test.groupby(['icustay_id']).apply(pd.DataFrame.to_numpy))
  validation = list(validation.groupby(['icustay_id']).apply(pd.DataFrame.to_numpy))


Number of batches in train: 1182
Epoch 2/200, Train Loss: 0.3512, Train Accuracy: 0.8705, Val Loss: 0.0445, Val AUC: 0.9872, Val Accuracy: 0.9956 Val Brier Score: 0.0049
Model saved with AUC: 0.9872 and Brier Score: 0.0049
Epoch 3/200, Train Loss: 0.3273, Train Accuracy: 0.8750, Val Loss: 0.0344, Val AUC: 0.9905, Val Accuracy: 0.9945 Val Brier Score: 0.0045
Model saved with AUC: 0.9905 and Brier Score: 0.0045
Epoch 4/200, Train Loss: 0.3211, Train Accuracy: 0.8756, Val Loss: 0.0138, Val AUC: 0.9921, Val Accuracy: 0.9945 Val Brier Score: 0.0037
Model saved with AUC: 0.9921 and Brier Score: 0.0037
Epoch 5/200, Train Loss: 0.3165, Train Accuracy: 0.8780, Val Loss: 0.0160, Val AUC: 0.9897, Val Accuracy: 0.9951 Val Brier Score: 0.0047
No improvement for 1 epochs.
Epoch 6/200, Train Loss: 0.3141, Train Accuracy: 0.8794, Val Loss: 0.0116, Val AUC: 0.9944, Val Accuracy: 0.9951 Val Brier Score: 0.0034
Model saved with AUC: 0.9944 and Brier Score: 0.0034
Epoch 7/200, Train Loss: 0.3111, Train Ac

In [16]:
# lstm continous (create subsequences)

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, brier_score_loss
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import math
from datetime import datetime
import os
import gc
from torch.utils.tensorboard import SummaryWriter

results = {}

datasets = [
    # "aki_stage_X_extended_1H.csv",
    # "aki_stage_X_extended_2H.csv",
    # "aki_stage_X_extended_4H.csv",
    # "aki_stage_X_extended_6H.csv",
    # "aki_stage_X_extended_8H.csv",
    "aki_stage_X_extended_12H.csv",
    # "aki_stage_X_extended_24H.csv",
    # "aki_stage_X_original_1H.csv",
    # "aki_stage_X_original_2H.csv",
    # "aki_stage_X_original_4H.csv",
    # "aki_stage_X_original_6H.csv",
    # "aki_stage_X_original_8H.csv",
    # "aki_stage_X_original_12H.csv",
    # "aki_stage_X_original_24H.csv",
]

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'Training on {device}')

def batch(data, batch_size):
    X_batches = []
    y_batches = []
    
    for i in range(0, len(data), batch_size):
        batch_data = data[i:i+batch_size]
        
        X_batch = []
        y_batch = []
        
        # Generate subsequences for each sequence in the batch
        for seq in batch_data:
            for j in range(1, len(seq) + 1):
                subsequence = seq[:j]  # From index 0 to j (increasing subsequence)
                X_batch.append(subsequence[:, 1:-1])  # Exclude icustay_id and aki_stage
                y_batch.append(subsequence[-1, -1])  # Label of the last row of the subsequence
        
        # Pad sequences within the batch to the same length
        max_seq_length = max(len(seq) for seq in X_batch)
        X_batch_padded = [np.pad(seq, ((0, max_seq_length - len(seq)), (0, 0)), mode='constant') for seq in X_batch]
        
        X_batches.append(torch.FloatTensor(X_batch_padded))
        y_batches.append(torch.LongTensor(y_batch))
    
    return X_batches, y_batches

class Net(nn.Module):
    def __init__(self, input_size, emb_size, output_size, bi_directional, number_layers, dropout):
        super(Net, self).__init__()
        self.lstm = nn.LSTM(input_size, emb_size, num_layers=number_layers, 
                            bidirectional=bi_directional, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(emb_size * (2 if bi_directional else 1), output_size)

    def forward(self, x):
        _, (hidden, _) = self.lstm(x)
        
        if self.lstm.bidirectional:
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        else:
            hidden = hidden[-1]
        
        out = self.fc(hidden)
        return out.squeeze(1)

for dataset in datasets:
    tail = dataset
    data_path = os.path.join(base_data_path, "resampled", dataset)
    print(f"Processing {tail}")

    X = pd.read_csv(data_path)

    numeric_feat = X.select_dtypes(include=[np.number]).columns.tolist()
    numeric_feat.remove('aki_stage')
    numeric_feat.remove('icustay_id')

    X, normalization_parameters = normalise_data(X, numeric_feat)
    X = X.sort_values(by=['icustay_id'])

    try:
        X.drop(['hadm_id'], axis=1, inplace=True)
    except:
        pass

    id_list = X['icustay_id'].unique()
    id_train, id_test_val = train_test_split(id_list, test_size=SPLIT_SIZE, random_state=42)
    id_valid, id_test = train_test_split(id_test_val, test_size=0.5, random_state=42)

    X = X.reindex(columns=[col for col in X.columns if col != 'aki_stage'] + ['aki_stage'])

    train = X[X.icustay_id.isin(id_train)].sort_values(by=['icustay_id', 'charttime'])
    test = X[X.icustay_id.isin(id_test)].sort_values(by=['icustay_id', 'charttime'])
    validation = X[X.icustay_id.isin(id_valid)].sort_values(by=['icustay_id', 'charttime'])

    train.drop(['charttime'], axis=1, inplace=True)
    test.drop(['charttime'], axis=1, inplace=True)
    validation.drop(['charttime'], axis=1, inplace=True)

    train = list(train.groupby(['icustay_id']).apply(pd.DataFrame.to_numpy))
    train.sort(key=len, reverse=True)
    test = list(test.groupby(['icustay_id']).apply(pd.DataFrame.to_numpy))
    validation = list(validation.groupby(['icustay_id']).apply(pd.DataFrame.to_numpy))

    batch_size = 32
    X_train, y_train = batch(train, batch_size)
    X_test, y_test = batch(test, batch_size)
    X_val, y_val = batch(validation, batch_size)

    print(f"Number of batches in train: {len(X_train)}")
    os.makedirs(os.path.join(base_data_path, "models", tail), exist_ok=True)
    writer = SummaryWriter(os.path.join(base_data_path, "logs", tail, datetime.now().strftime("%Y%m%d-%H%M%S")))

    input_size = X_train[0].shape[2]
    print(f"Input size: {input_size}")
    output_size = 1
    emb_size = round(input_size / 2) 
    number_layers = 3
    dropout = 0.2
    bi_directional = False

    nn_model = Net(input_size, emb_size, output_size, bi_directional, number_layers, dropout).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(nn_model.parameters(), lr=0.001)

    use_pretrained = False
    best_auc = 0
    start_epoch = 0

    patience = 50
    no_improvement_counter = 0
    n_epochs = 200

    for epoch in range(start_epoch + 1, n_epochs):
        nn_model.train()
        running_loss = 0.0
        running_accuracy = 0.0

        for i, (X_batch, y_batch) in enumerate(zip(X_train, y_train)):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = nn_model(X_batch)
            loss = criterion(outputs, y_batch.float())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            predicted = torch.sigmoid(outputs) > 0.5
            train_accuracy = accuracy_score(y_batch.cpu().numpy(), predicted.cpu().numpy())
            running_accuracy += train_accuracy

            writer.add_scalar('Training/Loss', loss.item(), epoch * len(X_train) + i)
            writer.add_scalar('Training/Accuracy', train_accuracy, epoch * len(X_train) + i)

        nn_model.eval()
        total_v_loss = 0
        all_y_val = []
        all_val_prob = []
        all_accuracy = 0

        for X_val_batch, y_val_batch in zip(X_val, y_val):
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
            with torch.no_grad():
                v_out = nn_model(X_val_batch)
                v_loss = criterion(v_out, y_val_batch.float())
                val_prob = torch.sigmoid(v_out)
                total_v_loss += v_loss.item()
                all_y_val.extend(y_val_batch.cpu().numpy())
                all_val_prob.extend(val_prob.cpu().numpy())

                predicted = val_prob > 0.5
                val_accuracy = accuracy_score(y_val_batch.cpu().numpy(), predicted.cpu().numpy())
                all_accuracy += val_accuracy

        avg_v_loss = total_v_loss / len(X_val)
        roc_auc = roc_auc_score(all_y_val, all_val_prob)
        avg_accuracy = all_accuracy / len(X_val)
        brier_score = brier_score_loss(all_y_val, all_val_prob)

        writer.add_scalar('Validation/Loss', avg_v_loss, epoch)
        writer.add_scalar('Validation/AUC', roc_auc, epoch)
        writer.add_scalar('Validation/Accuracy', avg_accuracy, epoch)
        writer.add_scalar('Validation/Brier_Score', brier_score, epoch)

        print(f"Epoch {epoch+1}/{n_epochs}, "
              f"Train Loss: {running_loss/len(X_train):.4f}, "
              f"Train Accuracy: {running_accuracy/len(X_train):.4f}, "
              f"Val Loss: {avg_v_loss:.4f}, "
              f"Val AUC: {roc_auc:.4f}, "
              f"Val Accuracy: {avg_accuracy:.4f}, "
              f"Val Brier Score: {brier_score:.4f}")

        if roc_auc > best_auc:
            best_auc = roc_auc
            no_improvement_counter = 0
            save_path = f'data/models/{tail}/LSTM_best.pth'
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            torch.save({
                'model_state_dict': nn_model.state_dict(),
                'optimizer_state_dict': optimizer,
                'best_auc': best_auc,
                'epoch': epoch,
                'brier_score': brier_score
            }, save_path)
            print(f"Model saved with AUC: {roc_auc:.4f} and Brier Score: {brier_score:.4f}")
        else:
            no_improvement_counter += 1
            print(f"No improvement for {no_improvement_counter} epochs.")

        # Stop if no improvement for 'patience' epochs
        if no_improvement_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}. Best AUC: {best_auc:.4f}. Best Brier Score: {brier_score:.4f}")
            break

    writer.close()
    gc.collect()

Training on cuda:0
Processing aki_stage_X_extended_12H.csv
Normalizing in [0,1] with min-max normalization


  train = list(train.groupby(['icustay_id']).apply(pd.DataFrame.to_numpy))
  test = list(test.groupby(['icustay_id']).apply(pd.DataFrame.to_numpy))
  validation = list(validation.groupby(['icustay_id']).apply(pd.DataFrame.to_numpy))


Number of batches in train: 1182
Input size: 44
Epoch 2/200, Train Loss: 0.4792, Train Accuracy: 0.8008, Val Loss: 0.8679, Val AUC: 0.5251, Val Accuracy: 0.2408, Val Brier Score: 0.3336
Model saved with AUC: 0.5251 and Brier Score: 0.3336
Epoch 3/200, Train Loss: 0.4596, Train Accuracy: 0.8004, Val Loss: 0.7814, Val AUC: 0.5752, Val Accuracy: 0.2203, Val Brier Score: 0.2942
Model saved with AUC: 0.5752 and Brier Score: 0.2942
Epoch 4/200, Train Loss: 0.4582, Train Accuracy: 0.7983, Val Loss: 0.8088, Val AUC: 0.4585, Val Accuracy: 0.2328, Val Brier Score: 0.3082
No improvement for 1 epochs.
Epoch 5/200, Train Loss: 0.4536, Train Accuracy: 0.7996, Val Loss: 0.7246, Val AUC: 0.4383, Val Accuracy: 0.2509, Val Brier Score: 0.2669
No improvement for 2 epochs.
Epoch 6/200, Train Loss: 0.4694, Train Accuracy: 0.8009, Val Loss: 0.5679, Val AUC: 0.6527, Val Accuracy: 0.7920, Val Brier Score: 0.1654
Model saved with AUC: 0.6527 and Brier Score: 0.1654
Epoch 7/200, Train Loss: 0.4480, Train Accura

In [None]:
# evaluate pretrained lstm

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
import numpy as np
import pandas as pd
import math
import collections
from datetime import datetime
import os
import gc

data_path = "data/preprocessed/preprocessed_data_6H.csv"
tail = data_path.split("/")[-1]
print(f"Processing {tail}")

X = pd.read_csv(data_path)

# Preprocessing steps (similar to XGBoost)
numeric_feat = X.select_dtypes(include=[np.number]).columns.tolist()
numeric_feat.remove('aki_stage')
numeric_feat.remove('icustay_id')

X, normalization_parameters = normalise_data(X, numeric_feat)
X = X.sort_values(by=['icustay_id'])

try:
    X.drop(['hadm_id'], axis=1, inplace=True)
except:
    pass

# Split data (you may want to use the same splitting logic as in XGBoost)
id_list = X['icustay_id'].unique()
# id_list = common_id_list
id_train, id_test_val = train_test_split(id_list, test_size=SPLIT_SIZE, random_state=42)
id_valid, id_test = train_test_split(id_test_val, test_size=0.5, random_state=42)

X = X.reindex(columns=[col for col in X.columns if col != 'aki_stage'] + ['aki_stage'])

train = X[X.icustay_id.isin(id_train)].sort_values(by=['icustay_id'])
test = X[X.icustay_id.isin(id_test)].sort_values(by=['icustay_id'], ignore_index=True)
validation = X[X.icustay_id.isin(id_valid)].sort_values(by=['icustay_id'])

train.drop(['charttime'], axis=1, inplace=True)
test.drop(['charttime'], axis=1, inplace=True)
validation.drop(['charttime'], axis=1, inplace=True)

train = train.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)
test = test.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)
validation = validation.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)
# Prepare data for LSTM
# X_train, y_train = batch(train.to_numpy(), batch_size)
# X_test, y_test = batch(test.to_numpy(), test.shape[0])
# X_val, y_val = batch(validation.to_numpy(), validation.shape[0])
X_train, y_train = batch(train, batch_size)
X_test, y_test = batch(test, batch_size)
X_val, y_val = batch(validation, batch_size)

# LSTM parameters
input_size = X_train[0].shape[2]  # Subtract 2 for icustay_id and aki_stage
output_size = 1
emb_size = round(input_size / 1)
number_layers = 3
dropout = 0
bi_directional = True


# Assuming Net is defined elsewhere
# Assuming X_train, y_train, X_val, y_val, X_test, y_test are defined and split into batches if necessary

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

nn_model = Net(input_size, emb_size, output_size, bi_directional, number_layers, dropout).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(nn_model.parameters(), lr=0.001)

# Test evaluation with F1 score
nn_model.load_state_dict(torch.load(f'data/models/{tail}/LSTM_best.pth')['model_state_dict'])
nn_model.eval()
total_test_loss = 0
all_y_test = []
all_test_prob = []
all_test_f1 = 0

for X_test_batch, y_test_batch in zip(X_test, y_test):
    X_test_batch, y_test_batch = X_test_batch.to(device), y_test_batch.to(device)
    with torch.no_grad():
        t_out = nn_model(X_test_batch)
        t_out = torch.flatten(t_out)
        y_test_batch = y_test_batch.type_as(t_out)
        test_loss = criterion(t_out, y_test_batch)
        test_prob = torch.sigmoid(t_out)
        total_test_loss += test_loss.item()
        all_y_test.extend(y_test_batch.cpu().numpy())
        all_test_prob.extend(test_prob.cpu().numpy())
        
        predicted = torch.sigmoid(t_out) > 0.08
        test_f1 = f1_score(y_test_batch.cpu().numpy(), predicted.cpu().numpy(), zero_division=1)
        all_test_f1 += test_f1
        
# ROC Curve
fpr, tpr, _ = roc_curve(all_y_test, all_test_prob)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
os.makedirs('data/plots', exist_ok=True)
plt.savefig(f'data/plots/ROC_{tail}.png')  # Save ROC curve
plt.close()

# Precision-Recall Curve
precision, recall, _ = precision_recall_curve(all_y_test, all_test_prob)
pr_auc = auc(recall, precision)

plt.figure()
plt.plot(recall, precision, color='blue', lw=2, label='PR curve (area = %0.2f)' % pr_auc)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="lower left")
os.makedirs('data/plots', exist_ok=True)
plt.savefig(f'data/plots/PR_{tail}.png')  # Save PR curve
plt.close()

print(f"Test Loss: {total_test_loss / len(X_test):.4f}, "
    f"Test AUC: {roc_auc:.4f}, "
    f"Test F1: {all_test_f1 / len(X_test):.4f}, "
    f"Test PR AUC: {pr_auc:.4f}")
      



In [None]:
print(torch.load(f'data/models/{tail}/LSTM_best.pth')['model_state_dict'])


In [None]:
from sklearn.metrics import roc_curve
import numpy as np

fpr, tpr, thresholds = roc_curve(all_y_test, all_test_prob)
# Find the optimal threshold
optimal_idx = np.argmin(np.sqrt(np.square(1-tpr) + np.square(fpr)))
optimal_threshold = thresholds[optimal_idx]

print(f"Optimal threshold: {optimal_threshold}")

In [None]:
from sklearn.metrics import precision_recall_curve, f1_score

precision, recall, thresholds = precision_recall_curve(all_y_test, all_test_prob)
# Add a last threshold corresponding to recall = 0.
thresholds = np.append(thresholds, 1)

f1_scores = 2 * (precision * recall) / (precision + recall)
# Find the optimal threshold
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]

print(f"Optimal threshold: {optimal_threshold}")

In [None]:
# Convert all_y_test and all_test_prob to numpy arrays for easier manipulation
all_y_test = np.array(all_y_test)
all_test_prob = np.array(all_test_prob)

# Initialize variables to store the best threshold and its corresponding F1 score
best_threshold = 0.0
best_f1 = 0.0

# Iterate over a range of possible threshold values (e.g., 0 to 1, step 0.01)
for threshold in np.arange(0.0, 1.01, 0.01):
    # Convert probabilities to binary predictions based on the current threshold
    predictions = (all_test_prob >= threshold).astype(int)
    
    # Calculate F1 score for the current threshold
    f1 = f1_score(all_y_test, predictions, zero_division=1)
    
    # Update best threshold and F1 score if the current F1 score is better
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

# Print the best threshold and its corresponding F1 score
print(f"Best Threshold: {best_threshold}")
print(f"Best F1 Score: {best_f1}")

# LSTM

In [None]:
if (torch.cuda.is_available()):
    print('Training on GPU')
else:
    print('Training on CPU') # On mac book GPU is not possible =() 
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


In [None]:
print(train[0][0])

In [None]:
np.isnan(np.array(train)).any()

In [None]:
def batch(data, batch_size):
    X_batches = []
    y_batches = []
    times = math.floor(data.shape[0]/batch_size)
    remainder = data.shape[0]%times
    a = 0
    start = 0
    end = start+batch_size
    if remainder ==0:
        a +=1
    while a<times:
        temp = pad(data[start:end,],0)
        x = torch.from_numpy(temp[:,:,1:-1]).float() # without icustay_id and without aki_stage columns
        y = torch.flatten(torch.from_numpy(temp[:, :,-1].reshape(-1,1)).float()).long()
        X_batches.append(x)
        y_batches.append(y)
        start = end
        end = start+batch_size
        a +=1
    temp = pad(data[start:data.shape[0]],0)
    x = torch.from_numpy(temp[:,:,1:-1]).float()
    y = torch.flatten(torch.from_numpy(temp[:, :,-1].reshape(-1,1)).float()).long()
    X_batches.append(x)
    y_batches.append(y)
    if len(X_batches) != len(y_batches):
        print("length error")
    return X_batches, y_batches # arrays

# batching
X_train, y_train = batch(train, batch_size) # to count weights

# counting balance of the classes
y = []
for i in y_train:
    for element in i:
        y.append(element.item())

#  weights
counter=collections.Counter(y)
print(counter)
zeroes = counter[0]
ones = counter[1]

X_test, y_test = batch(test, test.shape[0]) 
X_val, y_val = batch(validation, validation.shape[0])
X_val = X_val[0]
y_val = y_val[0]
X_test = X_test[0]
y_test = y_test[0]
print(y_test.shape)


In [None]:
print(X_val[0][0])

In [None]:
#####################
# setup

bi_directional = True
n_epochs = 50
lr = 0.001
features = len(X_train[0][0][0])
print(features)
# features = 
emb_size = round(features/1)
number_layers = 3
dropout = 0 # dropout

##########################
input_size = features
output_size = 1

In [None]:
class Net(nn.Module):
    def __init__(self, input_size, emb_size, output_size, bi_directional, number_layers, dropout):
        super(Net, self).__init__()
        self.input_size = input_size
        self.emb_size = emb_size 
        self.output_size = output_size
        self.number_layers = number_layers
        self.fc1 = nn.Linear(self.input_size, self.emb_size, bias = True) # I can have a few (IV) within this line - documentation        
        self.fc2 = nn.LSTM(self.emb_size, self.output_size,num_layers=self.number_layers, batch_first = True, bidirectional = bi_directional) 
        # in bidirectional encoder we have  forward and backward hidden states
        self.encoding_size = self.output_size * 2 if bi_directional else self.output_size
        self.combination_layer = nn.Linear(self.encoding_size, self.encoding_size)
        # Create affine layer to project to the classes 
        self.projection = nn.Linear(self.encoding_size, self.output_size)
        #dropout layer for regularizetion of a sequence
        self.dropout_layer = nn.Dropout(p = dropout)  
        self.relu = nn.ReLU()
        
    def forward(self, x):
        h = self.relu(self.fc1(x))
        h, _ = self.fc2(h) # h, _ : as I have 2outputs (tuple), only take the real output [0]. 
        #print(type(h)) # Underscore throughs away the rest, _ "I do not care" variable notation in python
        h = self.relu(self.combination_layer(h))
        h = self.dropout_layer(h)
        h = self.projection(h) 
        return h

#create a network 
nn_model = Net(input_size, emb_size, output_size,bi_directional, number_layers, dropout)
#print(nn_model)
#print(list(nn_model.parameters()))


# BCE Loss and optimizer
criterion = nn.BCEWithLogitsLoss() # class imbalance
# criterion = nn.BCEWithLogitsLoss(pos_weight = torch.tensor(round(zeroes/ones,0))) # class imbalance
#print(round(zeroes/ones,0))
optimizer = optim.Adam(nn_model.parameters(), lr=lr) 
    

In [None]:
import numpy as np

# Count unique values
unique, counts = np.unique(y_val, return_counts=True)
print(dict(zip(unique, counts)))

# Count NaN values
nan_count = np.isnan(y_val).sum()
print(f"Number of NaN values: {nan_count}")

In [None]:
print(X_val)

In [None]:
# replace all nans in X_val with 0
X_val[torch.isnan(X_val)] = 0

In [None]:
X_val_original = X_val.clone()

In [None]:
X_val_original = X_val.clone()

In [None]:
X_val = X_val.clone()

In [None]:
# training loop (full data 3.5 hours)

epochs = n_epochs
starttime = datetime.now() # datetime object containing current date and time
train_losses, validation_losses = [], []
best = 0

for epoch in range(epochs):  # loop over the dataset multiple times
    print ("\n Epoch [%d] out of %d" % (epoch + 1, epochs))
    running_loss = 0.0
    validation_loss = 0.0
    roc_auc = 0.0
    pr_auc = 0.0
    m = 0
    
    #train
    #print(list(nn_model.parameters())[0])
    # pbar = tqdm(X_train, desc=f"Epoch {epoch+1}")
    # for i in pbar:
    #     # zero the parameter gradients
    #     optimizer.zero_grad() # zero the gradient buffers not to consider gradients of previous iterations
    #     X_batch = X_train[m]
    #     y_batch = y_train[m]
    #     # print(X_batch.shape)
    #     # forward + backward + optimize
    #     outputs = nn_model(X_batch)
    #     outputs = torch.flatten(outputs)
    #     y_batch = y_batch.type_as(outputs)
    #     loss = criterion(outputs, y_batch)
    #     loss.backward()
    #     optimizer.step() # Does the update
    #     running_loss += loss.item()
    #     m +=1
    #     pbar.set_postfix({"Training Loss": running_loss/len(X_train)})
        
   
    #validation 
    nn_model.eval()
    with torch.no_grad():
        v_out = nn_model(X_val) 
        v_out = torch.flatten(v_out) 
        y_val = y_val.type_as(v_out)
        v_loss = criterion(v_out, y_val)
        validation_loss = v_loss.item()
        # auc and pr auc
        val_prob = torch.nn.Sigmoid() (v_out)
        print(type(v_out))
        print(v_out)
        print(val_prob)
        print(y_val)
        roc_auc = roc_auc_score(y_val,val_prob) 
        
    validation_losses.append(validation_loss) 
    train_losses.append(running_loss/len(X_train)) 
    print(f"Training loss: {running_loss/len(X_train):.3f}.. " f"Validation loss: {validation_loss:.3f}.. ")
    print(f"AUC: {roc_auc:.2f}")  
    nn_model.train()
    
    
    if roc_auc > best:
        best = roc_auc
        PATH = './LSTMbest.pth' 
        torch.save(nn_model.state_dict(), PATH) # save the model
    else:
        pass
    
       
print('Finished Training')
print("starttime =", starttime)
now = datetime.now()
print("now =", now)

In [None]:
# save the model
PATH = './LSTM.pth' 
torch.save(nn_model.state_dict(), PATH) # save the model


In [None]:
# evaluate the model on the test set
PATH = './LSTM.pth'
nn_model.load_state_dict(torch.load(PATH))
nn_model.eval()
with torch.no_grad():
    t_out = nn_model(X_test)
    t_out = torch.flatten(t_out)
    y_test = y_test.type_as(t_out)
    t_loss = criterion(t_out, y_test)
    test_loss = t_loss.item()
    # auc and pr auc
    test_prob = torch.nn.Sigmoid() (t_out)
    roc_auc = roc_auc_score(y_test,test_prob) 
    pr_auc = average_precision_score(y_test,test_prob)
    # convert output probabilities to class labels
    test_pred = (test_prob > 0.5).float()

    # calculate accuracy
    accuracy = accuracy_score(y_test.cpu().numpy(), test_pred.cpu().numpy())

print(f"Accuracy: {accuracy:.2f}")
print(f"Test loss: {test_loss:.3f}.. " f"ROC AUC: {roc_auc:.2f}.. " f"PR AUC: {pr_auc:.2f}.. ")
    


In [None]:
# evaluate a freshly initialized model on test
nn_model = Net(input_size, emb_size, output_size,bi_directional, number_layers, dropout)
# nn_model.load_state_dict(torch.load(PATH))
nn_model.eval()
with torch.no_grad():
    t_out = nn_model(X_test)
    t_out = torch.flatten(t_out)
    y_test = y_test.type_as(t_out)
    t_loss = criterion(t_out, y_test)
    test_loss = t_loss.item()
    # auc and pr auc
    test_prob = torch.nn.Sigmoid() (t_out)
    roc_auc = roc_auc_score(y_test,test_prob) 
    pr_auc = average_precision_score(y_test,test_prob)
    print(f"Test loss: {test_loss:.3f}.. " f"ROC AUC: {roc_auc:.2f}.. " f"PR AUC: {pr_auc:.2f}.. ")

In [None]:
PATH = './i-Bidir_3_lr_0.001_nodropbest.pth'

# save the model
#torch.save(nn_model.state_dict(), PATH)

# code to load saved model
nn_model = Net(input_size, emb_size, output_size,bi_directional, number_layers, dropout)
nn_model.load_state_dict(torch.load(PATH))

In [None]:
len(y_test) # single batch with zero padding to the max shape 635208

# Next step testing the model

# Continuous performance

In [None]:
logits = nn_model(X_test)
pred = torch.nn.Sigmoid() (logits)
pred = pred.detach().numpy()
pred = pred.reshape(-1,1)
print("Performance on full X_test where it has no batching: is padded to max dimentions. \n")
print ("Area Under ROC Curve: %0.2f" % roc_auc_score(y_test, pred, average = 'micro')  )
brier = round(metrics.brier_score_loss(y_test, pred, sample_weight=None, pos_label=None),3)
print("Brier score : {:.3f}".format(brier))

In [None]:
with open('padded_lstm.npy', 'wb') as f:
    np.save(f, y_test)
    np.save(f, pred)

In [None]:
timestamps = X_test.shape[1] #133
icustays = X_test.shape[0]
times = []
auc_s = []
t = 0

while t < timestamps:
    times.append(t+1)
    row = t
    i = 0
    prob_t = []
    y_t = []
    while i < icustays:
        prob_t.append(pred[row])
        y_t.append(y_test[row])
        row += timestamps
        i +=1
    prob_t = np.array(prob_t).reshape(-1,1)
    y_t = np.array(y_t).reshape(-1,1)
    auc_s.append(roc_auc_score(y_t, prob_t, average = 'micro'))
    t +=1


In [None]:
df =  pd.DataFrame(auc_s, columns = ['AUC'])
df['Timestamps'] = times
#df[120:133]

In [None]:
# Plot 
sns.lineplot(x="Timestamps", y="AUC", color = 'g',
             data=df)

# Comparing to LogR, XGB, RF models

In [None]:
X_test, y_test = batch(test, test.shape[0]) 
X_test = X_test[0]
y_test = y_test[0]


def to_one_label (model, label_list,X_test,index_list):
    # evaluate on a test set
    labels = np.array(label_list)
    labels = labels.reshape(-1,1)
    labels = labels.astype(int)
    logits = model(X_test)
    pred = torch.nn.Sigmoid() (logits)
    max_rows = pred.shape[1]
    predictions = pred.detach().numpy()
    predictions = predictions.reshape(-1,1) 
    # select 1 per icu stay id by index
    prob_1_label = []
    row = 0
    prev = 0
    for i in index_list:
        prob_1_label.append(predictions[row+i-prev])
        row += pred.shape[1]
        prev = i
    prob_1_label = np.array(prob_1_label).reshape(-1,1)
    
    return labels, prob_1_label

In [None]:
def performance (y_test, pred_probabilities):
    # performance
    fpr, tpr, thresholds = roc_curve(y_test, pred_probabilities)
    # compute roc auc
    roc_auc = roc_auc_score(y_test, pred_probabilities, average = 'micro')
    # compute Precision_Recall curves
    precision, recall, _ = precision_recall_curve(y_test, pred_probabilities)
    # compute PR_AUC
    pr_auc = metrics.auc(recall, precision)
       
    # I add confusion matrix
    optimal_cut_off = round(thresholds[np.argmax(tpr - fpr)],4)
    a = np.where(pred_probabilities > optimal_cut_off, 1, 0)
    brier = round(metrics.brier_score_loss(y_test, pred_probabilities, sample_weight=None, pos_label=None),3)
    predictions = np.where(pred_probabilities > optimal_cut_off, 1, 0)  
    
    print ("Area Under ROC Curve: %0.2f" % roc_auc  )
    #print ("Area Under PR Curve(AP): %0.2f" % pr_auc  ) 
    print("Brier score : {:.3f}".format(brier))
    #print('Accuracy for Classifier : {:.2f}'.format(accuracy_score(y_test, predictions)))
    #print('Cut off: ' + str(optimal_cut_off))
    matrix = metrics.confusion_matrix(y_test, a, labels=None, normalize=None)
    #print(str(matrix))
    
    #f.write("\n Area Under ROC Curve: " +str(roc_auc))
    #f.write("\n Area Under PR Curve(AP): " + str(pr_auc))
    #f.write("\n Brier score: " +str(brier))
    #f.write('\n Accuracy for Classifier '+str(round((accuracy_score(labels, predictions)),3)))
    #f.write("\n Cut off: " +str(optimal_cut_off))
    #f.write(str(matrix))
    

In [None]:
labels, prob_1_label = to_one_label (nn_model, label_list,X_test,index_list)
performance(labels,prob_1_label)

In [None]:
# save labels, prob_1_label

with open('test.npy', 'wb') as f:
    #np.save(f, labels)
    np.save(f, prob_1_label)
with open('test.npy', 'rb') as f:
    #lstm_labels = np.load(f)
    lstm_prob = np.load(f)

    


# Interpretability

In [None]:
# To apply integrated gradients, we first create an IntegratedGradients object, providing the model object.
ig = IntegratedGradients(nn_model)
# To compute the integrated gradients, we use the attribute method of the IntegratedGradients object. The method takes
# tensor(s) of input examples (matching the forward function of the model), and returns the input attributions for the
# given examples. A target index, defining the index of the output for which gradients are computed is 1, 
# corresponding to AKI (1/0).

#The input tensor provided should require grad, so we call requires_grad_ on the tensor. The attribute method also 
# takes a baseline, which is the starting point from which gradients are integrated. The default value is just the 
# 0 tensor, which is a reasonable baseline / default for this task.

#The returned values of the attribute method are the attributions, which match the size of the given inputs, and delta,
# which approximates the error between the approximated integral and true integral.
print(datetime.now())
X_test.requires_grad_()
attr, delta = ig.attribute(X_test,target=1, return_convergence_delta=True)
attr = attr.detach().numpy()
attr= np.reshape(attr,(-1,35))
importances = np.mean(attr, axis=0)
print(datetime.now())

In [None]:
attr[:,0].mean()

In [None]:
attr[:,4].mean()

In [None]:
importances

In [None]:
def visualize_feature_importances(feature_names, importances, title="LSTM Average Feature Importances", axis_title="Features"):
    print(title)
    i = 0
    while i < features:
        print(feature_names[i], ": ", '%.3f'%(importances[i]))
        i +=1
    x_pos = (np.arange(len(feature_names)))
    
visualize_feature_importances(feature_names, importances)


In [None]:
lstm_df =  pd.DataFrame(importances, columns = ['Feature Importance'])
lstm_df['Features'] = feature_names
lstm_df = lstm_df.sort_values(by = ['Feature Importance'], ascending = False, ignore_index = True)
#lstm_df["Feature Importance"] =  lstm_df["Feature Importance"]
#lstm_df

In [None]:
lstm_df["Feature Importance"].sum()

In [None]:
#ax = sns.barplot(x='Feature Importance', y='Features', data=lstm_df)
ax = sns.barplot(x='Feature Importance', y='Features', data=lstm_df, color = 'grey')
ax.set_xlabel('Feature Importance', fontsize = 15)
ax.set_ylabel("Features",fontsize=15)
ax.set_yticklabels(ax.get_ymajorticklabels(), fontsize = 6)
plt.title('LSTM feature Importances')
plt.savefig('LSTM_feature_importance_grey.png', dpi = 300, bbox_inches='tight')

In [None]:
lstm_df['abs'] = abs(lstm_df['Feature Importance'])
lstm_df = lstm_df.sort_values(by = ['abs'], ascending = False, ignore_index = True)
lstm_df_10 = lstm_df.head(10)
#lstm_df_10

In [None]:
#ax = sns.barplot(x='Feature Importance', y='Features', data=lstm_df_10, palette="mako")

ax = sns.barplot(x='Feature Importance', y='Features', data=lstm_df_10, color = 'darkgreen')
ax.set_xlabel('Feature Importance', fontsize = 15)
ax.set_ylabel("Features",fontsize=15)
ax.set_yticklabels(ax.get_ymajorticklabels(), fontsize = 10)
plt.title('LSTM top 10 features by feature importance')
plt.savefig('LSTM_top10_feature_importance_darkgreen.png', dpi = 300, bbox_inches='tight')

# Plots

In [None]:
def build_graphs (y_test,pred_probabilities, classifier_name, plot_name, algorithm):
    
    def bin_total(y_true, y_prob, n_bins):
        bins = np.linspace(0., 1. + 1e-8, n_bins + 1)

        # In sklearn.calibration.calibration_curve, the last value in the array is always 0.
        binids = np.digitize(y_prob, bins) - 1

        return np.bincount(binids, minlength=len(bins))

    def missing_bin(bin_array):
        midpoint = " "    
        if bin_array[0]==0:
            midpoint = "5%, "
        if bin_array[1]==0:
            midpoint = midpoint + "15%, "
        if bin_array[2]==0:
            midpoint = midpoint + "25%, "
        if bin_array[3]==0:
            midpoint = midpoint + "35%, " 
        if bin_array[4]==0:
            midpoint = midpoint + "45%, "
        if bin_array[5]==0:
            midpoint = midpoint + "55%, "
        if bin_array[6]==0:
            midpoint = midpoint + "65%, "
        if bin_array[7]==0:
            midpoint = midpoint + "75%, "
        if bin_array[8]==0:
            midpoint = midpoint + "85%, "
        if bin_array[9]==0:
            midpoint = midpoint + "95%, "
        return "The missing bins have midpoint values of "+ str(midpoint)
    
    # performance
    fpr, tpr, thresholds = roc_curve(y_test, pred_probabilities)
    # compute roc auc
    roc_auc = roc_auc_score(y_test, pred_probabilities, average = 'micro')
    # compute Precision_Recall curves
    precision, recall, _ = precision_recall_curve(y_test, pred_probabilities)
    # compute PR_AUC
    pr_auc = metrics.auc(recall, precision)

    # compute calibration curve
    LR_y, LR_x = calibration_curve(y_test, pred_probabilities, n_bins=10)
    #find out which one are the missing bins
    bin_array = bin_total(y_test, pred_probabilities , n_bins=10)
    print(missing_bin(bin_array))

    print("plot curves and save in one png file")
    #save three plots in one png file
    fig, (ax1, ax2, ax3) = plt.subplots(3, figsize=(7, 24))
    fig.subplots_adjust(wspace=0.3, hspace= 0.3)
    fig.suptitle('Evaluation of '+ plot_name)

    fpr, tpr, thresholds = roc_curve(y_test, pred_probabilities)
    
    # plot roc curve
    ax1.plot(fpr, tpr,'C2', label=algorithm+" "+"Classifier " + str(classifier_name) + ", auc=" +str(round(roc_auc,2)))
    ax1.title.set_text('ROC AUC')
    ax1.set(xlabel='False Positive Rate', ylabel='True Positive Rate')
    ax1.legend(loc="lower right")

    # plot PR curve
    ax2.plot(recall, precision,'C2', label=algorithm+" "+"Classifier " + str(classifier_name) + ", auc="+str(round(pr_auc,2)))
    ax2.title.set_text('PR AUC')
    ax2.set(xlabel='Recall', ylabel='Precision')
    ax2.legend(loc="lower right")

    # plot calibration curve
    ax3.plot(LR_x, LR_y, 'C2',marker='o', linewidth=1, label='LR')
    line = mlines.Line2D([0, 1], [0, 1], color='black')
    transform = ax3.transAxes
    line.set_transform(transform)
    ax3.add_line(line)
    ax3.title.set_text('Calibration plot for '+str(plot_name))
    ax3.set(xlabel= 'Predicted probability', ylabel= 'True probability in each bin')
    ax3.legend(loc="lower right")

    plt.savefig(plot_name+".png")
    plt.show()
    

In [None]:
def distribution(pred_probabilities, y_test, dist_name):
    #probabilities distributions graphs
    true_1 = pd.DataFrame(pred_probabilities, columns=['Predicted probabilities'])
    true_1['labels'] = y_test.tolist()
    true_0 = true_1.copy(deep = True) 
    indexNames = true_1[true_1['labels'] == 0].index
    true_1.drop(indexNames , inplace=True)
    indexNames = true_0[ true_0['labels'] == 1 ].index
    true_0.drop(indexNames , inplace=True)
    true_1.drop(columns=['labels'], inplace = True)
    true_0.drop(columns=['labels'], inplace = True)
    
    sns.distplot(true_1['Predicted probabilities'], hist = False, kde = True,
                 kde_kws = {'shade': True, 'linewidth': 3,"color": "g"}, label = 'Class 1')
    plt.ylabel('Density')
    sns.distplot(true_0['Predicted probabilities'], hist = False, kde = True,
                     kde_kws = {'shade': True, 'linewidth': 3}, label = 'Class 0')
    plt.title('Density Plot'+ dist_name)
    

In [None]:
distribution(prob_1_label, labels.flatten(), " Bidirectional LSTM no imputation ")
plt.savefig('dist_LSTM_bi_NOimp.png')

In [None]:
classifier_name = "None vs. Any AKI"    ###change every time #Moderate vs. Severe #None vs. Any #Others vs. Severe
plot_name = "LSTM NO imputation"
build_graphs(labels.flatten(), prob_1_label.flatten(), classifier_name, plot_name, "LSTM")


In [None]:
precision, recall, thresholds = precision_recall_curve(labels, prob_1_label)
fpr, tpr, thresholds = roc_curve(labels, prob_1_label)
optimal_cut_off = round(thresholds[np.argmax(tpr - fpr)],2)
prediction = np.where(prob_1_label > optimal_cut_off, 1, 0)
f1 = f1_score(labels,prediction)
prauc =auc(recall, precision)
print('F1 = %.3f, PR auc =%.3f' % (f1,prauc))

# plot the precision-recall curves
no_skill = len(labels[labels==1]) / len(labels)
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
plt.plot(recall,precision, marker='.', label='LSTM')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
# show the legend
plt.legend()
# show the plot
plt.show()



# Hyperparameters tuning

In [None]:
# search grid 
layers = [1,2,3]
l_rate = [0.001, 0.0001]
drop = [0,0.2]
bidirectionality = [True,False]
#loops count
hypercount = 0
# static parameters
n_epochs = 80
emb_size = round(features/1)
input_size = features
output_size = 1
###############################

f = open('lstm_no_imp_uni.txt', 'w+') #change with or without imp

for q1 in bidirectionality:
    for q2 in layers:
        for q3 in drop:
            for q4 in l_rate:
                hypercount +=1
                name = "i-Bidir_" if q1 else "i-Onedir_"
                name = name+str(q2) + "_lr_"+str(q4)
                name = name+"_drop"+str(q3) if q3 == 0.2 else name+"_nodrop"
                #set parameters
                bi_directional = q1
                lr = q4
                number_layers = q2
                dropout = q3 # dropout
                print('hypercount: %d' % hypercount)
                print('\n')
                print(name)
                f.write('\n\n' + str(name)+ '\n\n')
                    
                # create the NN
                class Net(nn.Module):
                    def __init__(self, input_size, emb_size, output_size, bi_directional, number_layers, dropout):
                        super(Net, self).__init__()
                        self.input_size = input_size
                        self.emb_size = emb_size 
                        self.output_size = output_size
                        self.number_layers = number_layers
                        self.fc1 = nn.Linear(self.input_size, self.emb_size, bias = True) # I can have a few (IV) within this line - documentation        
                        self.fc2 = nn.LSTM(self.emb_size, self.output_size,num_layers=self.number_layers, batch_first = True, bidirectional = bi_directional) 
                        # in bidirectional encoder we have  forward and backward hidden states
                        self.encoding_size = self.output_size * 2 if bi_directional else self.output_size
                        self.combination_layer = nn.Linear(self.encoding_size, self.encoding_size)
                        # Create affine layer to project to the classes 
                        self.projection = nn.Linear(self.encoding_size, self.output_size)
                        #dropout layer for regularizetion of a sequence
                        self.dropout_layer = nn.Dropout(p = dropout)  
                        self.relu = nn.ReLU()

                    def forward(self, x):
                        h = self.relu(self.fc1(x))
                        h, _ = self.fc2(h) # h, _ : as I have 2outputs (tuple), only take the real output [0]. 
                        #print(type(h)) # Underscore throughs away the rest, _ "I do not care" variable notation in python
                        h = self.relu(self.combination_layer(h))
                        h = self.dropout_layer(h)
                        h = self.projection(h) 
                        return h

                #create a network 
                nn_model = Net(input_size, emb_size, output_size,bi_directional, number_layers, dropout)
                print(nn_model)
                #print(list(nn_model.parameters()))
                
                # BCE Loss and optimizer
                criterion = nn.BCEWithLogitsLoss(pos_weight = torch.tensor(round(zeroes/ones,0))) # class imbalance
                #print(round(zeroes/ones,0))
                optimizer = optim.Adam(nn_model.parameters(), lr=lr) 
    
    
                # TRAINING LOOP 
                epochs = n_epochs
                starttime = datetime.now() # datetime object containing current date and time
                train_losses, validation_losses = [], []
                best = 0
                patience = 0
                old_auc = 0
                old_pr = 0

                for epoch in range(epochs):  # loop over the dataset multiple times
                    print ("\n Epoch [%d] out of %d" % (epoch + 1, epochs))
                    running_loss = 0.0
                    validation_loss = 0.0
                    roc_auc = 0.0
                    pr_auc = 0.0
                    m = 0
                    
                    #train
                    #print(list(nn_model.parameters())[0])
                    for i in X_train:
                        # zero the parameter gradients
                        optimizer.zero_grad() # zero the gradient buffers not to consider gradients of previous iterations
                        X_batch = X_train[m]
                        y_batch = y_train[m]
                        # forward + backward + optimize
                        outputs = nn_model(X_batch)
                        outputs = torch.flatten(outputs)
                        y_batch = y_batch.type_as(outputs)
                        loss = criterion(outputs, y_batch)
                        loss.backward()
                        optimizer.step() # Does the update
                        running_loss += loss.item()
                        m +=1
                    #validation 
                    nn_model.eval()
                    with torch.no_grad():
                        v_out = nn_model(X_val) 
                        v_out = torch.flatten(v_out) 
                        y_val = y_val.type_as(v_out)
                        v_loss = criterion(v_out, y_val)
                        validation_loss = v_loss.item()
                        # auc and pr auc
                        val_prob = torch.nn.Sigmoid() (v_out)
                        precision, recall, thresholds = precision_recall_curve(y_val, val_prob)
                        pr_auc = auc(recall, precision)
                        roc_auc = roc_auc_score(y_val,val_prob) 

                    validation_losses.append(validation_loss) 
                    train_losses.append(running_loss/len(X_train)) 
                    print(f"Training loss: {running_loss/len(X_train):.3f}.. " f"Validation loss: {validation_loss:.3f}.. ")
                    print(f"AUC: {roc_auc:.2f} " f"PR AUC: {pr_auc:.2f} ")  
                    nn_model.train()

                    
                    if roc_auc > best:
                        best = roc_auc
                        PATH1 = './'+str(name)+'best.pth' 
                        torch.save(nn_model.state_dict(), PATH1) # save the model
                    else:
                        pass
                    
                    if roc_auc == old_auc and pr_auc==old_pr:
                        patience +=1
                    old_auc = roc_auc
                    old_pr = pr_auc
                    if patience ==10:
                        print("out of patience")
                        break

                print('\n Finished Training')
                print("starttime =", starttime)
                now = datetime.now()
                print("endtime =", now)
                # end of training loop
                
                PATH2 = './'+str(name)+'last.pth' 
                torch.save(nn_model.state_dict(), PATH2) # save the model
                print('\n Last model \n')
                labels, probs = to_one_label(nn_model,label_list,X_test,index_list)
                performance (nn_model, labels, probs)
                
                #load the best model
                best_model = Net(input_size, emb_size, output_size,bi_directional, number_layers, dropout)
                best_model.load_state_dict(torch.load(PATH1))
                print('\n Best model \n')
                labels, probs = to_one_label(best_model,label_list,X_test,index_list)
                performance (best_model, labels, probs)
f.close() 
        