In [1]:
# Import libraries
import os
import random
import sys

import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

from keras import backend as K
from keras.constraints import maxnorm
from keras.layers import Dense
from keras.layers import Dropout
from keras.models import Sequential
from keras.optimizers import SGD

Using TensorFlow backend.


In [2]:
import os

# Set the OpenMP runtime flag for this process.
# NOTE(review): presumably a workaround for the "duplicate OpenMP library" abort
# seen with some conda builds -- confirm it is still needed.
os.environ.update(KMP_DUPLICATE_LIB_OK="TRUE")

In [3]:
# Check virtual environment: display the path of the Python interpreter running
# this kernel. It should point at the project's "mimic" conda environment
# (i.e. end with 'envs/mimic/bin/python').
sys.executable

'/Users/James/anaconda3/envs/mimic/bin/python'

In [4]:
# Set up paths
# The project root is taken to be the parent of the current working directory;
# the importable project code lives in its 'src' sub-folder.
parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
project_root = os.path.abspath(parent_of_cwd)
src_folder = os.path.join(project_root, 'src')

In [5]:
# Import src functions
# Put the project's src folder first on the module search path so the local
# modules below can be imported.
sys.path.insert(0, src_folder)
# NOTE(review): star imports hide where names come from. final_cleaning (used
# further down) presumably comes from one of these modules -- confirm and
# consider importing the needed names explicitly.
from modeling import *
from stats_and_visualisations import *

In [6]:
### ---- PARAMETERS
# Settings shared by the baseline model and the random hyperparameter search.
iterations = 50  # number of random-search runs handed to tune_model
n_folds = 4  # number of cross-validation folds used by train_model
epochs = 25  # training epochs per model fit
batch_size = 64  # mini-batch size for fitting and prediction

In [7]:
# Import data
# The training CSV sits in the 'data' folder of the working directory's parent;
# its first column is used as the row index.
train_csv_path = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, 'data', 'acute_respiratory_failure_train.csv')
)
train = pd.read_csv(train_csv_path, index_col=0)
print('--> Importing done')

--> Importing done


In [8]:
# Run the project's final_cleaning helper on the raw training frame to obtain
# the model inputs (X_train) and labels (y_train).
# NOTE(review): presumably drops the id columns and splits off the 'target'
# column -- confirm against the implementation in src.
X_train, y_train = final_cleaning(ids = ['subject_id', 'hadm_id'], target = 'target', train = train)
print('--> Cleaning done')

--> Cleaning done


In [9]:
# Function to create and compile the feed-forward binary classifier
def create_model(input_shape, neurons=1500, weight_constraint=2, dropout_rate=0.1,
                 hidden_layers=3, learn_rate=0.1, momentum=0.1):
    """Build and compile a fully connected network for binary classification.

    The network is an input Dense layer followed by ``hidden_layers`` further
    Dense layers, each with ``neurons`` ReLU units, a max-norm kernel
    constraint and a Dropout layer after it, and a single-unit output layer.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, e.g. ``(n_features,)``.
    neurons : int
        Number of units in every Dense layer except the output layer.
    weight_constraint : int or float
        Limit passed to ``maxnorm`` for each Dense layer's kernel.
    dropout_rate : float
        Rate passed to each Dropout layer.
    hidden_layers : int
        Number of Dense+Dropout pairs added after the input layer.
    learn_rate : float
        Learning rate of the SGD optimizer.
    momentum : float
        Momentum of the SGD optimizer.

    Returns
    -------
    The compiled ``Sequential`` model (binary cross-entropy loss, accuracy metric).
    """
    # Initialize the constructor
    model = Sequential()

    # Add an input layer
    model.add(Dense(neurons,
                    input_shape=input_shape,
                    activation='relu',
                    kernel_initializer='normal',
                    kernel_constraint=maxnorm(weight_constraint)))
    model.add(Dropout(dropout_rate))

    for _ in range(hidden_layers):
        # Add one hidden layer
        model.add(Dense(neurons,
                        activation='relu',
                        kernel_initializer='normal',
                        kernel_constraint=maxnorm(weight_constraint)))
        model.add(Dropout(dropout_rate))

    # Add an output layer.
    # A single unit needs a sigmoid: softmax normalises across units, so on one
    # unit it always outputs 1.0 and the model can never learn or rank samples.
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

    # Compile model
    optimizer = SGD(lr=learn_rate, momentum=momentum)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return model

In [10]:
# Hold out 20% of the cleaned data to validate a single baseline model.
# NOTE: this overwrites X_train / y_train with the 80% split, so re-running this
# cell without re-running the cleaning cell shrinks the training set again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

model = create_model(input_shape=(X_train.shape[1],))

# Balanced class weights. Keras expects a {class_index: weight} dict, so the
# array returned by scikit-learn is converted (assumes integer-coded labels).
classes = np.unique(y_train)
balanced_weights = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = {int(label): float(weight) for label, weight in zip(classes, balanced_weights)}

# Train the model
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1,
          class_weight=class_weights, validation_data=(X_val, y_val))

# Flatten the (n_samples, 1) prediction arrays before scoring
train_predictions = model.predict(X_train).ravel()
valid_predictions = model.predict(X_val).ravel()

train_score = roc_auc_score(y_train, train_predictions)
valid_score = roc_auc_score(y_val, valid_predictions)

print('--> Training score: ', train_score)
print('--> Valid score: ', valid_score)

Train on 18185 samples, validate on 4547 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/10

In [None]:
######################

In [None]:
# define the grid search parameters
# Each key is a create_model keyword argument; tune_model draws one value per
# key at random for every run. Values are plain Python numbers so they print
# and store cleanly.
param_grid = {
    'learn_rate': [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1],
    # Momentum must stay below 1: at 1 or above the update velocity never decays
    # and training diverges (newer Keras rejects such values outright).
    'momentum': [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 0.9],
    'neurons': np.arange(500, 2000, 100).tolist(),
    'hidden_layers': np.arange(0, 6).tolist(),
    'weight_constraint': np.arange(1, 8).tolist(),
    # endpoint=False keeps 1.0 out of the candidates: a dropout rate of 1 is
    # not a valid fraction of units to drop.
    'dropout_rate': np.linspace(0.01, 1, endpoint=False).tolist()
}

In [None]:
def train_model(X_train, y_train, n_folds, params, epochs, batch_size):
    """Cross-validate one hyperparameter configuration and report ROC AUC scores.

    For each of ``n_folds`` folds a fresh model is built with ``create_model``
    using ``params``, trained on the in-fold rows with balanced class weights,
    and scored on both the in-fold and the held-out rows.

    Parameters
    ----------
    X_train : array-like, 2-D
        Feature matrix; converted with ``np.asarray`` so rows are always
        selected by position.
    y_train : array-like
        Binary labels, one per row of ``X_train``.
    n_folds : int
        Number of cross-validation folds.
    params : dict or None
        Keyword arguments forwarded to ``create_model``. ``input_shape`` is
        filled in here; the caller's dict is not modified.
    epochs : int
        Training epochs per fold.
    batch_size : int
        Batch size for fitting and prediction.

    Returns
    -------
    metrics : pandas.DataFrame
        One row per fold plus an ``'overall'`` row, with columns ``fold``,
        ``train`` and ``valid``.
    train_auc : float
        Mean of the per-fold training AUCs.
    valid_auc : float
        AUC of the out-of-fold predictions over all rows.
    """
    # Work on positional numpy arrays so fold indices never hit a pandas label index
    features = np.asarray(X_train)
    labels = np.asarray(y_train).ravel()

    # Create the kfold object. Folds are contiguous (no shuffling); random_state
    # is omitted because scikit-learn rejects it when shuffle is False.
    k_fold = KFold(n_splits=n_folds, shuffle=False)

    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])

    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []

    # Iterate through each fold
    for fold, (train_indices, valid_indices) in enumerate(k_fold.split(features), start=1):

        print('----> Fold: ', fold)

        # Training data for the fold
        train_features = features[train_indices]
        train_labels = labels[train_indices]

        # Validation data for the fold
        valid_features = features[valid_indices]
        valid_labels = labels[valid_indices]

        # Create the model from the supplied hyperparameters (copied so the
        # caller's dict is left untouched)
        model_params = dict(params) if params else {}
        model_params['input_shape'] = (train_features.shape[1],)
        model = create_model(**model_params)

        # Balanced class weights as the {class_index: weight} dict Keras expects
        # (assumes integer-coded labels)
        classes = np.unique(train_labels)
        weights = class_weight.compute_class_weight(class_weight='balanced',
                                                    classes=classes, y=train_labels)
        class_weights = {int(label): float(weight) for label, weight in zip(classes, weights)}

        # Train the model
        model.fit(train_features, train_labels, epochs=epochs, batch_size=batch_size, verbose=0,
                  class_weight=class_weights, validation_data=(valid_features, valid_labels))

        # Record the training score
        train_predict = np.ravel(model.predict(train_features, batch_size=batch_size))
        train_score = roc_auc_score(train_labels, train_predict)
        train_scores.append(train_score)
        print('training: ', train_score)

        # Record the out of fold predictions and the validation score
        out_of_fold[valid_indices] = np.ravel(model.predict(valid_features, batch_size=batch_size))
        valid_score = roc_auc_score(valid_labels, out_of_fold[valid_indices])
        valid_scores.append(valid_score)
        print('cross validation: ', valid_score)

        # Clear the model to free up memory
        K.clear_session()

    # Overall validation score
    valid_auc = roc_auc_score(labels, out_of_fold)

    # Overall training score
    train_auc = np.mean(train_scores)

    print('------------------------')
    print('OVERALL ---- > Training score: ', train_auc, ', Valid score: ', valid_auc)
    print('------------------------')

    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(train_auc)

    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')

    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    return metrics, train_auc, valid_auc

In [None]:
def tune_model(X_train, y_train, param_grid, runs, n_folds, epochs, batch_size):
    """Random search over ``param_grid`` using cross-validated ``train_model`` runs.

    Parameters
    ----------
    X_train, y_train :
        Training data, passed through unchanged to ``train_model``.
    param_grid : dict
        Maps each hyperparameter name to a collection of candidate values.
    runs : int
        Number of random configurations to evaluate.
    n_folds, epochs, batch_size : int
        Passed through unchanged to ``train_model``.

    Returns
    -------
    pandas.DataFrame
        One row per run (index 0..runs-1) with one column per hyperparameter
        plus ``training_score`` and ``valid_score``.
    """
    ## -- Output columns: the hyperparameters followed by the two scores
    param_names = list(param_grid.keys())
    df_cols = param_names + ['training_score', 'valid_score']

    # Collect one record per run and build the frame once at the end;
    # DataFrame.append no longer exists in current pandas and was quadratic anyway.
    records = []

    for run in range(1, runs + 1):

        # Select the random parameters
        random_params = {name: random.choice(list(values)) for name, values in param_grid.items()}

        print('=========')
        print('RUN IS ' + str(run))
        print('=========')

        print('PARAMS: ', random_params)

        # The per-fold metrics frame is not needed here, only the overall scores
        _, train_score, valid_score = train_model(X_train=X_train,
                                                  y_train=y_train,
                                                  n_folds=n_folds,
                                                  params=random_params,
                                                  epochs=epochs,
                                                  batch_size=batch_size)

        record = dict(random_params)
        record['training_score'] = train_score
        record['valid_score'] = valid_score
        records.append(record)

    return pd.DataFrame(records, columns=df_cols)

In [None]:
# Run the random hyperparameter search with the settings from the parameters cell
runs_df = tune_model(X_train, y_train, param_grid, runs=iterations,
                    n_folds=n_folds, epochs=epochs, batch_size=batch_size)

# Display the runs ordered from highest to lowest validation score
runs_df.sort_values(by='valid_score', ascending=False)