# 1Ch-NoMC
- Input = single slices
- Standard Dropout applied: Only at training time, not at test time
- 5 fold cross validation

In [None]:
# !nvidia-smi

In [None]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as imgplot
import time
import h5py
import os
import pandas as pd
import random
from os.path import exists
import tensorflow as tf
tf.set_random_seed(3004)

In [None]:
# Create the output folder together with its checkpoint subfolder
output_folder = "C:/Users/hezo/Dropbox/PhD/Stroke/Stroke_classification/Analyses_Oct_2018/outputs/5fold_CV_bl_without_mc_dropout/"

# makedirs creates missing parent folders as well; exist_ok=True makes the cell
# safe to re-run and also creates "checkpoints" when the output folder already
# exists without it (the former exists() check skipped that case).
os.makedirs(output_folder + "/checkpoints", exist_ok=True)

## Data processing

In [None]:
# Read the data from the hdf5 file
import h5py
import numpy as np

def decode_data(string):
    """Decode every bytes element of an iterable to a UTF-8 ``str``.

    Bytes that are not valid UTF-8 are dropped (error handler "ignore").

    Parameters
    ----------
    string : iterable of bytes-like objects exposing ``.decode``

    Returns
    -------
    list of str, in the same order as the input
    """
    decoded = []
    for raw in string:
        decoded.append(raw.decode("UTF-8", "ignore"))
    return decoded

# Load every dataset of the HDF5 file fully into memory ([:] reads the whole dataset);
# the file is closed automatically when the with-block exits.
with h5py.File('data/data_oct_18.h5', 'r') as h5:
    print('H5-file: ', list(h5.keys()))
    
    # Image matrices
    X = h5["X"][:]
    # Image labels (1=stroke, 0=no-stroke)
    Y_img = h5["Y"][:]
    # Patient ID's
    pat = h5["pat"][:]
    # Path to images (stored as bytes, decoded to str)
    path = decode_data(h5["path"][:])
    # Image names/number (stored as bytes, decoded to str)
    img = decode_data(h5["img_id"][:])
    # Patient labels (1=stroke, 0=TIA)
    Y_pat = h5["stroke"][:]
    
# Sanity check: all six collections should have the same length (one entry per image)
print(len(X), len(Y_img), len(Y_pat), len(pat), len(path), len(img))

### Remove black images

In [None]:
# Get the functions that are used to remove the black images
from functions.get_quantiles import get_quantiles

# Helper that drops the same entries from all per-image collections
def delete_by_index(X, Y_img, Y_pat, pat, path, img, idx):
    """Remove the entries at ``idx`` (along axis 0) from all six collections.

    The inputs are not modified; ``np.delete`` returns new arrays, so list
    inputs come back as NumPy arrays.

    Returns
    -------
    tuple ``(X, Y_img, Y_pat, pat, path, img)`` with the entries removed
    """
    collections = (X, Y_img, Y_pat, pat, path, img)
    return tuple(np.delete(entries, idx, axis=0) for entries in collections)

In [None]:
print('Before: ', len(X), len(Y_img), len(Y_pat), len(pat), len(path), len(img))

# Get the 1st and 99th quantiles
# (presumably one row per image; only column 0 is used below -- verify against get_quantiles)
q1, q99 = get_quantiles(X)

# If the 1st and 99th quantiles are equal --> image is black
idx_black_img = np.where(q99[:,0]==q1[:,0])
X, Y_img, Y_pat, pat, path, img = delete_by_index(X, Y_img, Y_pat, pat, path, img, idx_black_img)
# Keep the quantile arrays aligned with the remaining images for the second filter
q1=np.delete(q1, idx_black_img, axis=0) 
q99=np.delete(q99, idx_black_img, axis=0) 

# If the 99th quantile is smaller than 10 --> image is black
idx_black_img=np.where((q99[:,0]<10))
X, Y_img, Y_pat, pat, path, img = delete_by_index(X, Y_img, Y_pat, pat, path, img, idx_black_img)
q1=np.delete(q1, idx_black_img, axis=0) 
q99=np.delete(q99, idx_black_img, axis=0) 

print('After: ', len(X), len(Y_img), len(Y_pat), len(pat), len(path), len(img))

### Define datasets for Cross Validation
Split the indices of stroke and no-stroke patients such that the proportion of stroke and no-stroke patients is the same within each dataset. We have 355 stroke patients (out of 511), i.e. ~70%. We use the following splits for test1, ..., test4:
- train1: 196 stroke, 83 no-stroke
- valid1: 35 stroke, 15 no-stroke
- valid2: 54 stroke, 26 no-stroke
- test: 71 stroke, 31 no-stroke

For test5 we use:
- train1: 196 stroke, 82 no-stroke
- valid1: 35 stroke, 15 no-stroke
- valid2: 54 stroke, 26 no-stroke
- test: 71 stroke, 32 no-stroke

In [None]:
# check if the seed works and we always get the same results
# np.random.seed(1)
# random_pat = np.random.choice(np.unique(pat), size=len(np.unique(pat)), replace=False)
# print(random_pat[:10])
# np.random.seed(1) 
# random_pat = np.random.choice(np.unique(pat), size=len(np.unique(pat)), replace=False)
# print(random_pat[:10])
# np.random.seed(1) 
# random_pat = np.random.choice(np.unique(pat), size=len(np.unique(pat)), replace=False)
# print(random_pat[:10])

In [None]:
# Build the sorted, unique patient IDs of each class from the per-image patient labels
stroke_patients = np.unique(pat[np.where(Y_pat == 1)[0]])
idx = np.where(Y_pat == 0)[0]
non_stroke_patients = np.unique(pat[idx])
print(len(stroke_patients), len(non_stroke_patients))

In [None]:
# randomly shuffle indices of stroke and no-stroke patients
# (fixed seed; the order of the np.random calls in this cell determines the folds,
# so do not reorder them)
np.random.seed(1)
stroke_patients_test = np.random.choice(stroke_patients, size=len(stroke_patients), replace=False)
non_stroke_patients_test = np.random.choice(non_stroke_patients, size=len(non_stroke_patients), replace=False)
print(stroke_patients_test[:3])

# define the different test sets:
# 71 stroke and 31 no-stroke patients within each test set
# NOTE(review): the slices cover stroke indices 0..354 and no-stroke indices 0..155;
# a patient beyond these bounds would never be part of a test set -- confirm against
# the patient counts printed in the previous cell.
test_tmp_1 = np.concatenate([stroke_patients_test[:71], non_stroke_patients_test[:31]], axis=0)
test_tmp_2 = np.concatenate([stroke_patients_test[71:142], non_stroke_patients_test[31:62]], axis=0)
test_tmp_3 = np.concatenate([stroke_patients_test[142:213], non_stroke_patients_test[62:93]], axis=0)
test_tmp_4 = np.concatenate([stroke_patients_test[213:284], non_stroke_patients_test[93:124]], axis=0)
# the last dataset contains up to 32 no-stroke patients (slice 124:156)
test_tmp_5 = np.concatenate([stroke_patients_test[284:355], non_stroke_patients_test[124:156]], axis=0)

# randomly shuffle the data sets such that stroke and no-stroke patients are mixed
test_1 = np.random.choice(test_tmp_1, size=len(test_tmp_1), replace=False)
test_2 = np.random.choice(test_tmp_2, size=len(test_tmp_2), replace=False)
test_3 = np.random.choice(test_tmp_3, size=len(test_tmp_3), replace=False)
test_4 = np.random.choice(test_tmp_4, size=len(test_tmp_4), replace=False)
test_5 = np.random.choice(test_tmp_5, size=len(test_tmp_5), replace=False)

In [None]:
# Display the patient IDs of the first test fold as a visual sanity check
test_1 # looks good

In [None]:
#### RUN 1

# get the patients that are not contained in test_1
stroke_patients_run = [i for i in stroke_patients if i not in test_1]
non_stroke_patients_run = [i for i in non_stroke_patients if i not in test_1]

# randomly shuffle the remaining patients (run-specific seed; the order of the
# np.random calls below determines the split, so do not reorder them)
np.random.seed(100)
stroke_patients_tmp = np.random.choice(stroke_patients_run, size=len(stroke_patients_run), replace=False)
non_stroke_patients_tmp = np.random.choice(non_stroke_patients_run, size=len(non_stroke_patients_run), replace=False)
print(len(stroke_patients_tmp), len(non_stroke_patients_tmp))

# take the patients for the different datasets
# - train1: 196 stroke, 83 no-stroke
# - valid1: 35 stroke, 15 no-stroke
# - valid2: 54 stroke, 26 no-stroke
# - test: 71 stroke, 31 no-stroke
# train2 is train1 and valid1 combined.
# NOTE(review): the slices stop at index 285 (stroke) and 124 (no-stroke); a remaining
# patient beyond these indices is assigned to no dataset in this run -- confirm against
# the counts printed above.
train1_tmp = np.concatenate([stroke_patients_tmp[0:196],non_stroke_patients_tmp[:83]], axis=0)
valid1_tmp = np.concatenate([stroke_patients_tmp[196:231], non_stroke_patients_tmp[83:98]], axis=0)
train2_tmp = np.concatenate([train1_tmp,valid1_tmp], axis=0)
valid2_tmp = np.concatenate([stroke_patients_tmp[231:285], non_stroke_patients_tmp[98:124]], axis=0)
print(len(train1_tmp), train1_tmp[:10])
print(len(valid1_tmp), valid1_tmp[:10])
print(len(train2_tmp), train2_tmp[:10])
print(len(valid2_tmp), valid2_tmp[:10])

# randomly shuffle the datasets such that stroke and no-stroke patients are mixed
train1_1 = np.random.choice(train1_tmp, size=len(train1_tmp), replace=False)
valid1_1 = np.random.choice(valid1_tmp, size=len(valid1_tmp), replace=False)
train2_1 = np.random.choice(train2_tmp, size=len(train2_tmp), replace=False)
valid2_1 = np.random.choice(valid2_tmp, size=len(valid2_tmp), replace=False)
test_1 = np.random.choice(test_1, size=len(test_1), replace=False)

In [None]:
#### RUN 2

# get the patients that are not contained in test_2
stroke_patients_run = [i for i in stroke_patients if i not in test_2]
non_stroke_patients_run = [i for i in non_stroke_patients if i not in test_2]

# randomly shuffle the remaining patients (run-specific seed; the order of the
# np.random calls below determines the split, so do not reorder them)
np.random.seed(200)
stroke_patients_tmp = np.random.choice(stroke_patients_run, size=len(stroke_patients_run), replace=False)
non_stroke_patients_tmp = np.random.choice(non_stroke_patients_run, size=len(non_stroke_patients_run), replace=False)
print(len(stroke_patients_tmp), len(non_stroke_patients_tmp))

# take the patients for the different datasets
# - train1: 196 stroke, 83 no-stroke
# - valid1: 35 stroke, 15 no-stroke
# - valid2: 54 stroke, 26 no-stroke
# - test: 71 stroke, 31 no-stroke
# train2 is train1 and valid1 combined.
# NOTE(review): the slices stop at index 285 (stroke) and 124 (no-stroke); a remaining
# patient beyond these indices is assigned to no dataset in this run -- confirm against
# the counts printed above.
train1_tmp = np.concatenate([stroke_patients_tmp[0:196],non_stroke_patients_tmp[:83]], axis=0)
valid1_tmp = np.concatenate([stroke_patients_tmp[196:231], non_stroke_patients_tmp[83:98]], axis=0)
train2_tmp = np.concatenate([train1_tmp,valid1_tmp], axis=0)
valid2_tmp = np.concatenate([stroke_patients_tmp[231:285], non_stroke_patients_tmp[98:124]], axis=0)
print(len(train1_tmp), train1_tmp[:10])
print(len(valid1_tmp), valid1_tmp[:10])
print(len(train2_tmp), train2_tmp[:10])
print(len(valid2_tmp), valid2_tmp[:10])

# randomly shuffle the datasets such that stroke and no-stroke patients are mixed
train1_2 = np.random.choice(train1_tmp, size=len(train1_tmp), replace=False)
valid1_2 = np.random.choice(valid1_tmp, size=len(valid1_tmp), replace=False)
train2_2 = np.random.choice(train2_tmp, size=len(train2_tmp), replace=False)
valid2_2 = np.random.choice(valid2_tmp, size=len(valid2_tmp), replace=False)
test_2 = np.random.choice(test_2, size=len(test_2), replace=False)

In [None]:
#### RUN 3

# get the patients that are not contained in test_3
stroke_patients_run = [i for i in stroke_patients if i not in test_3]
non_stroke_patients_run = [i for i in non_stroke_patients if i not in test_3]

# randomly shuffle the remaining patients (run-specific seed; the order of the
# np.random calls below determines the split, so do not reorder them)
np.random.seed(300)
stroke_patients_tmp = np.random.choice(stroke_patients_run, size=len(stroke_patients_run), replace=False)
non_stroke_patients_tmp = np.random.choice(non_stroke_patients_run, size=len(non_stroke_patients_run), replace=False)
print(len(stroke_patients_tmp), len(non_stroke_patients_tmp))

# take the patients for the different datasets
# - train1: 196 stroke, 83 no-stroke
# - valid1: 35 stroke, 15 no-stroke
# - valid2: 54 stroke, 26 no-stroke
# - test: 71 stroke, 31 no-stroke
# train2 is train1 and valid1 combined.
# NOTE(review): the slices stop at index 285 (stroke) and 124 (no-stroke); a remaining
# patient beyond these indices is assigned to no dataset in this run -- confirm against
# the counts printed above.
train1_tmp = np.concatenate([stroke_patients_tmp[0:196],non_stroke_patients_tmp[:83]], axis=0)
valid1_tmp = np.concatenate([stroke_patients_tmp[196:231], non_stroke_patients_tmp[83:98]], axis=0)
train2_tmp = np.concatenate([train1_tmp,valid1_tmp], axis=0)
valid2_tmp = np.concatenate([stroke_patients_tmp[231:285], non_stroke_patients_tmp[98:124]], axis=0)
print(len(train1_tmp), train1_tmp[:10])
print(len(valid1_tmp), valid1_tmp[:10])
print(len(train2_tmp), train2_tmp[:10])
print(len(valid2_tmp), valid2_tmp[:10])

# randomly shuffle the datasets such that stroke and no-stroke patients are mixed
train1_3 = np.random.choice(train1_tmp, size=len(train1_tmp), replace=False)
valid1_3 = np.random.choice(valid1_tmp, size=len(valid1_tmp), replace=False)
train2_3 = np.random.choice(train2_tmp, size=len(train2_tmp), replace=False)
valid2_3 = np.random.choice(valid2_tmp, size=len(valid2_tmp), replace=False)
test_3 = np.random.choice(test_3, size=len(test_3), replace=False)

In [None]:
#### RUN 4

# get the patients that are not contained in test_4
stroke_patients_run = [i for i in stroke_patients if i not in test_4]
non_stroke_patients_run = [i for i in non_stroke_patients if i not in test_4]

# randomly shuffle the remaining patients (run-specific seed; the order of the
# np.random calls below determines the split, so do not reorder them)
np.random.seed(400)
stroke_patients_tmp = np.random.choice(stroke_patients_run, size=len(stroke_patients_run), replace=False)
non_stroke_patients_tmp = np.random.choice(non_stroke_patients_run, size=len(non_stroke_patients_run), replace=False)
print(len(stroke_patients_tmp), len(non_stroke_patients_tmp))

# take the patients for the different datasets
# - train1: 196 stroke, 83 no-stroke
# - valid1: 35 stroke, 15 no-stroke
# - valid2: 54 stroke, 26 no-stroke
# - test: 71 stroke, 31 no-stroke
# train2 is train1 and valid1 combined.
# NOTE(review): the slices stop at index 285 (stroke) and 124 (no-stroke); a remaining
# patient beyond these indices is assigned to no dataset in this run -- confirm against
# the counts printed above.
train1_tmp = np.concatenate([stroke_patients_tmp[0:196],non_stroke_patients_tmp[:83]], axis=0)
valid1_tmp = np.concatenate([stroke_patients_tmp[196:231], non_stroke_patients_tmp[83:98]], axis=0)
train2_tmp = np.concatenate([train1_tmp,valid1_tmp], axis=0)
valid2_tmp = np.concatenate([stroke_patients_tmp[231:285], non_stroke_patients_tmp[98:124]], axis=0)
print(len(train1_tmp), train1_tmp[:10])
print(len(valid1_tmp), valid1_tmp[:10])
print(len(train2_tmp), train2_tmp[:10])
print(len(valid2_tmp), valid2_tmp[:10])

# randomly shuffle the datasets such that stroke and no-stroke patients are mixed
train1_4 = np.random.choice(train1_tmp, size=len(train1_tmp), replace=False)
valid1_4 = np.random.choice(valid1_tmp, size=len(valid1_tmp), replace=False)
train2_4 = np.random.choice(train2_tmp, size=len(train2_tmp), replace=False)
valid2_4 = np.random.choice(valid2_tmp, size=len(valid2_tmp), replace=False)
test_4 = np.random.choice(test_4, size=len(test_4), replace=False)

In [None]:
#### RUN 5

# get the patients that are not contained in test_5
stroke_patients_run = [i for i in stroke_patients if i not in test_5]
non_stroke_patients_run = [i for i in non_stroke_patients if i not in test_5]

# randomly shuffle the remaining patients (run-specific seed; the order of the
# np.random calls below determines the split, so do not reorder them)
np.random.seed(500)
stroke_patients_tmp = np.random.choice(stroke_patients_run, size=len(stroke_patients_run), replace=False)
non_stroke_patients_tmp = np.random.choice(non_stroke_patients_run, size=len(non_stroke_patients_run), replace=False)
print(len(stroke_patients_tmp), len(non_stroke_patients_tmp))

# take the patients for the different datasets
# (the slices below are the same as in runs 1-4)
# - train1: 196 stroke, 83 no-stroke
# - valid1: 35 stroke, 15 no-stroke
# - valid2: 54 stroke, 26 no-stroke
# - test: 71 stroke, up to 32 no-stroke (test_tmp_5 was built from slice 124:156)
# train2 is train1 and valid1 combined.
# NOTE(review): the markdown above announces 82 no-stroke patients in train1 for this
# run, while the slice [:83] takes up to 83 -- confirm against the counts printed above.
train1_tmp = np.concatenate([stroke_patients_tmp[0:196],non_stroke_patients_tmp[:83]], axis=0)
valid1_tmp = np.concatenate([stroke_patients_tmp[196:231], non_stroke_patients_tmp[83:98]], axis=0)
train2_tmp = np.concatenate([train1_tmp,valid1_tmp], axis=0)
valid2_tmp = np.concatenate([stroke_patients_tmp[231:285], non_stroke_patients_tmp[98:124]], axis=0)
print(len(train1_tmp), train1_tmp[:10])
print(len(valid1_tmp), valid1_tmp[:10])
print(len(train2_tmp), train2_tmp[:10])
print(len(valid2_tmp), valid2_tmp[:10])

# randomly shuffle the datasets such that stroke and no-stroke patients are mixed
train1_5 = np.random.choice(train1_tmp, size=len(train1_tmp), replace=False)
valid1_5 = np.random.choice(valid1_tmp, size=len(valid1_tmp), replace=False)
train2_5 = np.random.choice(train2_tmp, size=len(train2_tmp), replace=False)
valid2_5 = np.random.choice(valid2_tmp, size=len(valid2_tmp), replace=False)
test_5 = np.random.choice(test_5, size=len(test_5), replace=False)

### Normalize images
- make sure that image values range between -1 and 1

In [None]:
# Plot the first n_images slices along the last axis of image 1 (before normalization)
n_images = 3
plt.figure(figsize=(10,10))
for i in range(n_images):
    fig = plt.subplot(1,n_images,i+1)
    fig.imshow(X[1,:,:,i], cmap='gray')
# Value range of the raw image
print(np.min(X[1]), np.max(X[1]))

In [None]:
from keras.applications.vgg16 import preprocess_input
# Normalize a copy of the images: preprocess_input may write its result over a
# floating-point input array, which would silently rescale X as well and
# normalize the data a second time whenever this cell is re-run.
X_norm = preprocess_input(np.copy(X), mode='tf')

In [None]:
# Value range after normalization (expected to lie between -1 and 1, see the heading above)
print(np.min(X_norm), np.max(X_norm))

In [None]:
# Display the shape of the normalized image array
X_norm.shape

In [None]:
# Plot the first n_images slices along the last axis of normalized image 0
plt.figure(figsize=(10,10))
for i in range(n_images):
    fig = plt.subplot(1,n_images,i+1)
    fig.imshow(X_norm[0,:,:,i], cmap='gray')

## Train and predict with the CNN

In [None]:
def get_datasets(set_i, X, Y_img, Y_pat, pat, path, img):
    """Select all images that belong to the patients listed in ``set_i``.

    Parameters
    ----------
    set_i : array-like of patient IDs that define the dataset
    X : 4-D image array, indexed by image along axis 0
    Y_img, Y_pat, pat, path, img : per-image sequences aligned with ``X``
        (image label, patient label, patient ID, image path, image name)

    Returns
    -------
    tuple ``(X_set, Y_img_set, Y_pat_set, pat_set, path_set, img_set)`` of
    NumPy arrays restricted to the selected images, in their original order.
    """
    # Vectorized membership test instead of scanning set_i once per image
    idx = np.flatnonzero(np.isin(np.asarray(pat), np.asarray(set_i)))
    X_set = X[idx, :, :, :]
    # Fancy indexing returns copies and keeps the dtype even for an empty selection
    Y_img_set, Y_pat_set, pat_set, path_set, img_set = (
        np.asarray(values)[idx] for values in (Y_img, Y_pat, pat, path, img)
    )
    return (X_set, Y_img_set, Y_pat_set, pat_set, path_set, img_set)

### Define the CNN and the function for prediction

In [None]:
# Initialize hyperparameters
# input_shape: dimensions 1-3 of the normalized image array (axis 0 indexes the images)
input_shape = (X_norm.shape[1], X_norm.shape[2], X_norm.shape[3])
batch_size = 64
n_epochs = 400

In [None]:
# import libraries
import pandas as pd
import keras
from keras.utils import np_utils
from keras import backend as K
from keras.callbacks import TensorBoard, ModelCheckpoint
from keras import layers
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Activation, Flatten, Lambda, Convolution2D, MaxPooling2D, Reshape, concatenate
from keras.layers.normalization import BatchNormalization
from keras import initializers
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator


# define the model
print('Define model')

def conv_block2_all_dropout(input_x, size, dropout_level):
    """Two Conv(3x3) -> BatchNorm -> ReLU -> Dropout stages followed by 2x2 max pooling.

    Parameters
    ----------
    input_x : input tensor of the block
    size : number of filters of each convolution
    dropout_level : dropout rate applied after every activation

    Returns
    -------
    output tensor of the pooling layer
    """
    out = input_x
    for _ in range(2):
        out = Convolution2D(size, (3,3), kernel_initializer=initializers.he_normal(seed=3004), padding='same')(out)
        out = BatchNormalization(axis=3)(out)
        out = Activation('relu')(out)
        out = Dropout(dropout_level)(out)
    return MaxPooling2D(pool_size=(2, 2))(out)

def conv_block3_all_dropout(input_x, size, dropout_level):
    """Three Conv(3x3) -> BatchNorm -> ReLU -> Dropout stages followed by 2x2 max pooling.

    Parameters
    ----------
    input_x : input tensor of the block
    size : number of filters of each convolution
    dropout_level : dropout rate applied after every activation

    Returns
    -------
    output tensor of the pooling layer
    """
    out = input_x
    for _ in range(3):
        out = Convolution2D(size, (3,3), kernel_initializer=initializers.he_normal(seed=3004), padding='same')(out)
        out = BatchNormalization(axis=3)(out)
        out = Activation('relu')(out)
        out = Dropout(dropout_level)(out)
    return MaxPooling2D(pool_size=(2, 2))(out)


def cnn_all_dropout(input_shape):
    """Build the CNN with a Dropout layer (rate 0.3) after every hidden activation.

    Architecture: two 2-convolution blocks (32, 64 filters), four
    3-convolution blocks (128, 256, 512, 512 filters), two dense stages
    (400, 100 units, each with BatchNorm, ReLU and Dropout) and a
    2-unit softmax output.

    Parameters
    ----------
    input_shape : shape of one input image (without the batch axis)

    Returns
    -------
    the uncompiled Keras ``Model``
    """
    drop_level = 0.3
    img_input = Input(shape=input_shape)

    # Convolutional part
    features = img_input
    for n_filters in (32, 64):
        features = conv_block2_all_dropout(features, n_filters, dropout_level=drop_level)
    for n_filters in (128, 256, 512, 512):
        features = conv_block3_all_dropout(features, n_filters, dropout_level=drop_level)

    # Dense part
    features = Flatten()(features)
    for n_units in (400, 100):
        features = Dense(n_units, kernel_initializer=initializers.he_normal(seed=3004))(features)
        features = BatchNormalization()(features)
        features = Activation('relu')(features)
        features = Dropout(drop_level)(features)
    probabilities = Dense(2, kernel_initializer=initializers.he_normal(seed=3004), activation='softmax')(features)

    return Model(img_input, probabilities)

In [None]:
# Build the network once for the image dimensions of X and print its layer summary
model = cnn_all_dropout((X.shape[1],X.shape[2],X.shape[3]))
model.summary()

In [None]:
import tensorflow as tf

# The function we use to get the T predictions within the T forward passes through the network with MC Dropout
# As we don't apply MC Dropout in this example, we get the same prediction within each run (by using the regular Dropout function in Keras)
def get_predictions_cnn_all_dropout(X, Y, output_folder, name, mod, pat, img, stroke, path, n_passes=500):
    """Predict every image ``n_passes`` times and save summary statistics and raw predictions.

    Parameters
    ----------
    X : 4-D image array, one image per entry along axis 0
    Y : one-hot encoded image labels aligned with ``X``
    output_folder : folder that contains ``checkpoints/`` and receives the output files
    name : dataset name used in the output file names (e.g. 'test')
    mod : checkpoint file name without the '.hdf5' extension
    pat, img, stroke, path : per-image patient ID, image name, patient label and image path
    n_passes : number of forward passes per image (default 500, the value that was
        previously hard-coded); use at least 2, because the standard deviations are
        computed with ``ddof=1``

    Writes
    ------
    predictions_dropout_<name>.csv : per-image statistics (means, standard deviations,
        votes, variation ratio 'vr', predictive entropy 'pe', mutual information 'mi')
    raw_predictions_<name>_pred0.csv / _pred1.csv : raw class-0 / class-1 predictions
    raw_predictions_dropout_<name>.npy : raw predictions, shape (n_images, n_passes, 2)
    """
    # Convert labels back from one hot encoding:
    Y = np.argmax(Y, axis=1)

    # start a new session
    print('Start new session for predictions')
    tf.reset_default_graph()
    sess = tf.Session()
    K.set_session(sess)

    # Rebuild the architecture in the fresh graph and load the checkpoint weights.
    # (The log message still mentions lambda layers; this model has none.)
    print('Load model cnn_all_dropout with lambda layers with old weights')
    model2 = cnn_all_dropout((X.shape[1],X.shape[2],X.shape[3]))
    model2.load_weights(output_folder + '/checkpoints/' + mod + '.hdf5')

    # get the predictions
    print('Get the predictions')
    n_classes = 2
    n_images = len(X)

    predictions = np.zeros((n_images, n_passes, n_classes))
    mean0 = np.zeros(n_images)
    mean1 = np.zeros(n_images)
    sd0 = np.zeros(n_images)
    sd1 = np.zeros(n_images)
    votes0 = np.zeros(n_images)
    votes1 = np.zeros(n_images)
    total_var = np.zeros(n_images)
    total_sd = np.zeros(n_images)
    vr = []
    pe = []
    mi = []
    for j in range(n_images):
        # repeat the current image n_passes times
        X_rep = np.empty((n_passes, X.shape[1], X.shape[2], X.shape[3]))
        X_rep[:] = X[j:j+1]
        # get n_passes predictions for this image
        pred = sess.run(model2.output, feed_dict={model2.input: X_rep})

        # output of mean and sd == #classes
        predictions[j] = pred # save the raw predictions
        mean0[j], mean1[j] = np.mean(pred, axis=0)
        # fraction of passes that assign a probability >= 0.5 to each class
        votes0[j] = np.count_nonzero(pred[:,0] >= 0.5) / n_passes
        votes1[j] = np.count_nonzero(pred[:,1] >= 0.5) / n_passes
        sd0[j], sd1[j] = np.array(np.std(pred, ddof=1, axis=0))
        total_var[j] = sd0[j]**2 + sd1[j]**2
        total_sd[j] = np.sqrt(sd0[j]**2 + sd1[j]**2)
        # avoid log(0) in the entropy terms below (raw predictions were stored before)
        pred[pred==0]=1e-40
        # variation ratio: 1 - relative frequency of the most frequent predicted class
        vr.append(1-(np.max(np.histogram(np.argmax(pred, axis=1), bins=n_classes, range=[0,n_classes])[0])/len(pred)))
        # predictive entropy of the mean prediction
        mean_pred = np.mean(pred, axis=0)
        pe_tmp = (-1)*np.sum(mean_pred*np.log(mean_pred))
        pe.append(pe_tmp)
        # mutual information: predictive entropy minus the mean entropy of the single passes
        mi.append(pe_tmp + np.sum(np.array([np.sum(pred[:,i]*np.log(pred[:,i]))for i in range(0,n_classes)]))/len(pred))

    # Save the predictions with additional information
    dat = pd.DataFrame({'p_id':pat, 'img':img, 'pat_true':stroke, 'img_true':Y, 'path':path,
                        'mean0':mean0, 'mean1':mean1, 'vr':vr, 'pe':pe, 'mi':mi, 'sd0':sd0, 'sd1':sd1,
                        'votes0':votes0, 'votes1':votes1, 'total_var':total_var, 'total_sd':total_sd})
    dat.to_csv(output_folder + '/predictions_dropout_' + name + '.csv', index=False)
    # save the predictions separately
    pred0 = predictions[:,:,0]
    pred1 = predictions[:,:,1]
    pred0_df = pd.DataFrame(pred0)
    pred1_df = pd.DataFrame(pred1)
    pred0_df = pred0_df.assign(p_id=pat, img=img, pat_true=stroke, img_true=Y)
    pred1_df = pred1_df.assign(p_id=pat, img=img, pat_true=stroke, img_true=Y)
    pred0_df.to_csv(output_folder + '/raw_predictions_' + name + '_pred0.csv', index=False)
    pred1_df.to_csv(output_folder + '/raw_predictions_' + name + '_pred1.csv', index=False)
    np.save(output_folder + '/raw_predictions_dropout_' + name, predictions)

    # drop the Keras graph/session state built for this prediction run
    K.clear_session()

In [None]:
# Function to convert labels to one hot
def convertToOneHot(vector, num_classes=None):
    """Return the one-hot encoding of an integer label vector.

    Parameters
    ----------
    vector : 1-D sequence of integer class labels
    num_classes : number of columns of the result; when None it is inferred as
        ``max(vector) + 1`` (0 for an empty vector). The former default of None
        was passed to ``np.zeros`` unchanged and raised a TypeError.

    Returns
    -------
    int32 array of shape ``(len(vector), num_classes)`` with a single 1 per row
    """
    if num_classes is None:
        num_classes = int(np.max(vector)) + 1 if len(vector) else 0
    result = np.zeros((len(vector), num_classes), dtype='int32')
    result[np.arange(len(vector)), vector] = 1
    return result

## CV training

In [None]:
# Collect the patient-ID sets of the five runs; entry i of each collection belongs to run i+1
train1_cv = np.array([train1_1, train1_2, train1_3, train1_4, train1_5])
valid1_cv = np.array([valid1_1, valid1_2, valid1_3, valid1_4, valid1_5])
train2_cv = np.array([train2_1, train2_2, train2_3, train2_4, train2_5])
valid2_cv = np.array([valid2_1, valid2_2, valid2_3, valid2_4, valid2_5])
test_cv = np.array([test_1, test_2, test_3, test_4, test_5])
# NOTE(review): a 2-D shape (5, n) is only printed when all five sets of a collection
# have the same length -- check the printed shapes
print(train1_cv.shape, valid1_cv.shape, train2_cv.shape, valid2_cv.shape, test_cv.shape)

In [None]:
# train1_cv = np.array([train1_1[:2], train1_2[:2], train1_3[:2], train1_4[:2], train1_5[:2]])
# valid1_cv = np.array([valid1_1[:2], valid1_2[:2], valid1_3[:2], valid1_4[:2], valid1_5[:2]])
# train2_cv = np.array([train2_1[:2], train2_2[:2], train2_3[:2], train2_4[:2], train2_5[:2]])
# valid2_cv = np.array([valid2_1[:2], valid2_2[:2], valid2_3[:2], valid2_4[:2], valid2_5[:2]])
# test_cv = np.array([test_1[:2], test_2[:2], test_3[:2], test_4[:2], test_5[:2]])
# print(train1_cv.shape, valid1_cv.shape, train2_cv.shape, valid2_cv.shape, test_cv.shape)

In [None]:
def cv(train1, valid1, train2, valid2, test, X, Y_img, Y_pat, pat, path, img, batch_size, n_epochs):
    """Train the CNN and save its predictions once per cross-validation run.

    For every run i the function
    1. selects the images of the patients in train1[i], valid1[i], train2[i],
       valid2[i] and test[i],
    2. duplicates the stroke images of train1 (oversampling),
    3. trains a fresh ``cnn_all_dropout`` model with data augmentation and saves
       a checkpoint after every epoch,
    4. reloads the checkpoint with the lowest validation loss and writes the
       predictions for train1, valid1, valid2 and test.

    All output goes to ``<output_folder>/run<i>``; ``output_folder`` is the
    module-level variable defined at the top of the notebook.

    Parameters
    ----------
    train1, valid1, train2, valid2, test : sequences with one set of patient IDs per run
    X : image array, one image per entry along axis 0
    Y_img, Y_pat, pat, path, img : per-image label, patient label, patient ID,
        image path and image name, aligned with ``X``
    batch_size : mini-batch size used for training
    n_epochs : number of training epochs
    """
    # iterate over all runs (one per provided split; formerly hard-coded to 5)
    for i in range(len(train1)):

        print('####### Run ', i, '#######')


        #### Get the information for one run

        # extract the data
        print('Get data')
        train1_run = train1[i]
        valid1_run = valid1[i]
        train2_run = train2[i]
        valid2_run = valid2[i]
        test_run = test[i]

        # assign the data for the patients in train1_run, etc.
        X_train1, Y_img_train1, Y_pat_train1, pat_train1, path_train1, img_train1 = get_datasets(train1_run, X, Y_img, Y_pat, pat, path, img)
        X_valid1, Y_img_valid1, Y_pat_valid1, pat_valid1, path_valid1, img_valid1 = get_datasets(valid1_run, X, Y_img, Y_pat, pat, path, img)
        X_train2, Y_img_train2, Y_pat_train2, pat_train2, path_train2, img_train2 = get_datasets(train2_run, X, Y_img, Y_pat, pat, path, img)
        X_valid2, Y_img_valid2, Y_pat_valid2, pat_valid2, path_valid2, img_valid2 = get_datasets(valid2_run, X, Y_img, Y_pat, pat, path, img)
        X_test, Y_img_test, Y_pat_test, pat_test, path_test, img_test = get_datasets(test_run, X, Y_img, Y_pat, pat, path, img)


        #### Duplicate the stroke images in train1

        # get the indices corresponding to the stroke images
        idx = np.where(np.array(Y_img_train1) == 1)[0]

        # attach the stroke images
        print("train1 before stroke duplication: ", len(X_train1), len(Y_img_train1), len(Y_pat_train1), len(pat_train1), len(path_train1), len(img_train1))
        X_train1 = np.concatenate((X_train1, X_train1[idx]), axis=0)
        Y_img_train1 = np.concatenate((Y_img_train1, Y_img_train1[idx]), axis=0)
        Y_pat_train1 = np.concatenate((Y_pat_train1, Y_pat_train1[idx]), axis=0)
        pat_train1 = np.concatenate((pat_train1, pat_train1[idx]), axis=0)
        path_train1 = np.concatenate((path_train1, path_train1[idx]), axis=0)
        img_train1 = np.concatenate((img_train1, img_train1[idx]), axis=0)
        print("train1 after stroke duplication: ", len(X_train1), len(Y_img_train1), len(Y_pat_train1), len(pat_train1), len(path_train1), len(img_train1))


        #### One hot encoding

        print('One hot encoding')
        Y_img_train1 = convertToOneHot(Y_img_train1.astype(int), 2)
        Y_img_valid1 = convertToOneHot(Y_img_valid1.astype(int), 2)
        Y_img_train2 = convertToOneHot(Y_img_train2.astype(int), 2)
        Y_img_valid2 = convertToOneHot(Y_img_valid2.astype(int), 2)
        Y_img_test = convertToOneHot(Y_img_test.astype(int), 2)


        #### Train the model

        print('#### Training')

        # generate an output folder which contains the output of the current run
        print('Generate an output folder')
        output_folder_tmp = output_folder + '/run' + str(i)
        # exist_ok=True also creates "checkpoints" when the run folder already exists
        os.makedirs(output_folder_tmp + '/checkpoints', exist_ok=True)

        # load the model
        print('Load model and compile')
        # Define hyperparameters
        input_shape = (X_train1.shape[1], X_train1.shape[2], X_train1.shape[3])

        # load the model and compile
        model = cnn_all_dropout(input_shape)
        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

        # Data augmentation
        print('Data Augmentation')
        datagen = ImageDataGenerator(width_shift_range=0.2,
                                     height_shift_range=0.2,
                                     rotation_range=20,
                                     zoom_range=0.5,
                                     shear_range=0.2,
                                     vertical_flip=True)
        datagen.fit(X_train1, seed=3004)

        # save the full model after every epoch; the file name carries the 1-based epoch number
        cp_callback = ModelCheckpoint(output_folder_tmp + '/checkpoints/model-{epoch:02d}.hdf5', monitor='val_loss', verbose=0, save_best_only=False, save_weights_only=False, mode='auto', period=1)

        print('Start training')
        # an integer number of steps (rounded up) so that a partial last batch is still used
        steps_per_epoch = int(np.ceil(len(X_train1) / batch_size))
        results = model.fit_generator(datagen.flow(X_train1, Y_img_train1, batch_size=batch_size, shuffle = True),
                                      steps_per_epoch=steps_per_epoch,
                                      epochs=n_epochs,
                                      callbacks=[cp_callback],
                                      validation_data=(X_valid1, Y_img_valid1))

        # save the loss, acc etc. in a csv
        pd.DataFrame(results.history).to_csv(output_folder_tmp + '/history.csv', index=False)


        #### Predictions

        print('#### Predicting')

        # Load the history file and find the model with the lowest validation loss
        # (pd.read_csv replaces pd.DataFrame.from_csv, which no longer exists in current pandas)
        dat = pd.read_csv(output_folder_tmp + '/history.csv', index_col=None)

        # row k of the history belongs to epoch k+1; zero-pad to match the checkpoint names
        best_epoch = int(np.where(dat.val_loss == np.min(dat.val_loss))[0][0]) + 1
        mod = 'model-' + '{:02d}'.format(best_epoch)

        # Apply the function to the different sets
        print('predict train1')
        get_predictions_cnn_all_dropout(X_train1, Y_img_train1, output_folder_tmp, 'train1', mod,
                                        pat_train1, img_train1, Y_pat_train1, path_train1)
        print('predict valid1')
        get_predictions_cnn_all_dropout(X_valid1, Y_img_valid1, output_folder_tmp, 'valid1', mod,
                                        pat_valid1, img_valid1, Y_pat_valid1, path_valid1)
        print('predict valid2')
        get_predictions_cnn_all_dropout(X_valid2, Y_img_valid2, output_folder_tmp, 'valid2', mod,
                                        pat_valid2, img_valid2, Y_pat_valid2, path_valid2)
        print('predict test')
        get_predictions_cnn_all_dropout(X_test, Y_img_test, output_folder_tmp, 'test', mod,
                                        pat_test, img_test, Y_pat_test, path_test)
    

In [None]:
# Run the cross validation on the normalized images; batch_size and n_epochs are the
# hyperparameters defined in the "Initialize hyperparameters" cell (64 and 400), so
# changing them there now takes effect here as well.
cv(train1_cv, valid1_cv, train2_cv, valid2_cv, test_cv, X_norm, Y_img, Y_pat, pat, path, img, batch_size, n_epochs)