In [1]:
# Import libraries
import os
import sys

import pandas as pd
import numpy as np
import random
import math
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import statistics
import datetime as dt

from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC

from sklearn.utils import class_weight

#import lightgbm as lgb

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.constraints import maxnorm
from keras.optimizers import SGD
from keras import backend as K

Using TensorFlow backend.


In [2]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [3]:
# Set some display options
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
np.set_printoptions(threshold=np.nan)

In [4]:
# Check virtual environment: should be: '/Users/James/anaconda3/envs/mimic/bin/python'
sys.executable

'/Users/James/anaconda3/envs/mimic/bin/python'

In [5]:
# Set up paths
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
src_folder = os.path.join(project_root, 'src')

src_preparation_folder = os.path.join(src_folder, 'preparation')
src_processing_folder = os.path.join(src_folder, 'processing')
src_modeling_folder = os.path.join(src_folder, 'modeling')

In [6]:
# # Import src functions
# sys.path.insert(0, src_preparation_folder)
# from import_data import get_table
# from import_data import get_data_simple
# from import_data import get_patient_admissions_diagnoses
# from import_data import get_admission_data
# from import_data import get_chartevents
# from import_data import get_labevents
# from extract_codes import find_ndc_codes

# sys.path.insert(0, src_processing_folder)
# from stats import plot_KDE
# from stats import plot_perc_bar_chart
# from stats import compare_groups
# from stats import graph_comparisons
# from patient_selection import select_test_groups
# from clean import replace_itemid_with_label
# from clean import find_populated_cols

# sys.path.insert(0, src_modeling_folder)
# from models import train_lgb

In [7]:
# Import data
train = pd.read_csv(os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'data', 'acute_respiratory_failure_train.csv')),index_col=0)
test = pd.read_csv(os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'data', 'acute_respiratory_failure_test.csv')),index_col=0)

In [8]:
def final_cleaning(train, test):

    # Shuffle
    train = train.sample(frac=1).reset_index(drop=True)
    test = test.sample(frac=1).reset_index(drop=True)

    # Split features and labels
    X_train = train.drop(columns=['subject_id', 'hadm_id', 'target'])
    y_train = np.array(train.target.tolist())
    
    X_test = test.drop(columns=['subject_id', 'hadm_id', 'target'])
    y_test = np.array(test.target.tolist())

    # Impute missing values
    imputer = Imputer(strategy = 'median')
    imputer.fit(X_train)
    X_train = imputer.transform(X_train)
    X_test = imputer.transform(X_test)

    # Scale each feature to 0-1
    scaler = MinMaxScaler(feature_range = (0, 1)) 
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
    print(X_train.shape)
    print(X_test.shape)
    print(y_train.shape)
    print(y_test.shape)
    
    return X_train, X_test, y_train, y_test

In [9]:
X_train, X_test, y_train, y_test = final_cleaning(train, test)

(22732, 42)
(5684, 42)
(22732,)
(5684,)




In [10]:
# define the grid search parameters
param_grid = {    
#    'learn_rate' : [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1],
#    'momentum': [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3],
#    'neurons': list(np.arange(5, 100, 5)),
#    'hidden_layers': list(np.arange(0, 10, 1)),
    'weight_constraint': list(np.arange(0, 10, 1)),
    'dropout_rate': list(np.linspace(0, 1))
}

In [11]:
# Function to create model, required for KerasClassifier
def create_model(input_shape, neurons=100, weight_constraint=0, dropout_rate=0,
                 hidden_layers=10, learn_rate=0.0001, momentum=0):
        
    # Initialize the constructor
    model = Sequential()
    print('Sequential created')
    
    # Add an input layer
    model.add(Dense(neurons,
                    input_shape=input_shape,
                    activation='relu',
                    kernel_initializer = 'normal',
                    kernel_constraint=maxnorm(weight_constraint)))
    model.add(Dropout(dropout_rate))
    print('Layer 1 added')

    for i in range(hidden_layers):
        # Add one hidden layer
        model.add(Dense(neurons,
                        activation='relu',
                        kernel_initializer = 'normal',
                        kernel_constraint=maxnorm(weight_constraint)))
        model.add(Dropout(dropout_rate))
        print('Hidden layer {} added'.format(i))

    # Add an output layer 
    model.add(Dense(1, kernel_initializer = 'normal', activation='sigmoid'))
    print('Output layer added')
    
    #compile model
    optimizer = SGD(lr=learn_rate, momentum=momentum)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    print('model compiled')
    
    return model

In [12]:
def train_model(X_train, y_train, n_folds, params, epochs, batch_size):
    
    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = False, random_state = 50)
    
    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(X_train.shape[0])
    
    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []
        
    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(X_train):
        
        # Training data for the fold
        train_features  = X_train[train_indices]
        train_labels = y_train[train_indices]
        
        # Validation data for the fold
        valid_features = X_train[valid_indices]
        valid_labels = y_train[valid_indices]
        
        # Create the model
        params['input_shape'] = (train_features.shape[1],)
        model = create_model(**params)
        
        print(model.summary())
        print('-->', 'training shape: ', train_features.shape)
        print('-->', 'labels shape: ', train_labels.shape)
        
        # Find the class weights so predictions match these weights
        #class_weights = class_weight.compute_class_weight('balanced', np.unique(train_labels), train_labels)
        #print(class_weights)
        
        # Train the model
        model.fit(train_features, train_labels, epochs=epochs, batch_size=batch_size, verbose=1,
                 #class_weight=class_weights,
                 validation_data=(valid_features, valid_labels)
                 )
        
        # Record the training and validation scores and append to the training & valid scores list
        train_predict = model.predict(train_features, batch_size=batch_size)
        train_predict = [item for sublist in train_predict for item in sublist]
        train_score = roc_auc_score(train_labels, train_predict)
        train_scores.append(train_score)
        print('-->', 'training: ', train_score)
        #print(train_predict)
        
        # Record the out of fold predictions
        valid_predictions = model.predict(valid_features, batch_size=batch_size)
        out_of_fold[valid_indices] = [item for sublist in valid_predictions for item in sublist]
        valid_score = roc_auc_score(valid_labels, out_of_fold[valid_indices])
        valid_scores.append(valid_score)
        print('-->', 'cross validation: ', valid_score)
        #print(out_of_fold[valid_indices])
        
        # Clear the model to free up memory
        #K.clear_session()
    
    # Overall validation score
    valid_auc = roc_auc_score(y_train, out_of_fold)

    # Overall training score
    train_auc = np.mean(train_scores)
    
    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(train_auc)
    
    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    
    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})
    
    print(metrics)
    
    return metrics, train_auc, valid_auc, model

In [13]:
def tune_model(X_train, y_train, param_grid, runs, n_folds, epochs, batch_size):
    
    ## -- Create output dataframe showing scores and associated hyperparameters
    df_cols = list(param_grid.keys())
    df_cols = df_cols + ['training_score', 'valid_score']

    runs_df = pd.DataFrame(columns=df_cols)
    total_runs = runs
    run =0

    while run < total_runs:

        run += 1

        # Select the random parameters
        random_params = {k: random.sample(v, 1)[0] for k, v in param_grid.items()}
        print(random_params)

        print('=========')
        print('RUN IS ' + str(run))
        print('=========')

        metrics, train_score, valid_score, model = train_model(X_train=X_train,
                                                      y_train=y_train,
                                                      n_folds = n_folds,
                                                      params = random_params,
                                                      epochs=epochs,
                                                      batch_size=batch_size)

        temp_df = pd.DataFrame(columns=df_cols)

        for c in list(param_grid.keys()):
            temp_df.loc[0, c] = random_params[c]

        temp_df.loc[0, 'training_score'] = train_score
        temp_df.loc[0, 'valid_score'] = valid_score

        runs_df = runs_df.append(temp_df)

        del temp_df, train_score, valid_score
        
    return runs_df, model

In [14]:
runs_df, model = tune_model(X_train, y_train, param_grid, runs=1, n_folds=3, epochs=100, batch_size=64)
runs_df.sort_values(by='valid_score', ascending=False)

{'weight_constraint': 0, 'dropout_rate': 0.26530612244897955}
RUN IS 1
Sequential created
Layer 1 added
Hidden layer 0 added
Hidden layer 1 added
Hidden layer 2 added
Hidden layer 3 added
Hidden layer 4 added
Hidden layer 5 added
Hidden layer 6 added
Hidden layer 7 added
Hidden layer 8 added
Hidden layer 9 added
Output layer added
model compiled
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 100)               4300      
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
__________________________________________________________

Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100


Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
--> training:  0.5
--> cross validation:  0.5
Sequential created
Layer 1 added
Hidden layer 0 added
Hidden layer 1 added
Hidden layer 2 added
Hidden layer 3 added
Hidden layer 4 added
Hidden layer 5 added
Hidden layer 6 added
Hidden layer 7 added
Hidden layer 8 added
Hidden layer 9 added
Output layer added
model compiled
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 100)               4300      
_________________________________________________________________
dropout_12 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_13 (Dropou

Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100


Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
--> training:  0.5
--> cross validation:  0.5
Sequential created
Layer 1 added
Hidden layer 0 added
Hidden layer 1 added
Hidden layer 2 added
Hidden layer 3 added
Hidden layer 4 added
Hidden layer 5 added
Hidden layer 6 added
Hidden layer 7 added
Hidden layer 8 added
Hidden layer 9 added
Output layer added
model compiled
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_25 (Dense)             (None, 100)               4300      
_________________________________________________________________
dropout_23 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_2

Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100


Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
--> training:  0.5
--> cross validation:  0.5
      fold  train     valid
0        0    0.5  0.500000
1        1    0.5  0.500000
2        2    0.5  0.500000
3  overall    0.5  0.496305


Unnamed: 0,weight_constraint,dropout_rate,training_score,valid_score
0,0,0.265306,0.5,0.496305


In [15]:
print(model.predict(X_test))

[[0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33415255]
 [0.33