# Test S3 for training

Let's run an end-to-end Keras training script with data from our S3 bucket. The data is stored on the S3 bucket in an HDF5 file. This test will give us an idea of the speed and cost of the training.

In [2]:
import keras
import numpy as np
import h5py
import os

Using TensorFlow backend.


## Load HDF5 on the S3 bucket for training in Keras

This assumes you have [goofys](https://github.com/kahing/goofys) set up on your local machine.
You'll probably first need to download and install the [AWS CLI](https://aws.amazon.com/cli/). If AWS CLI is properly installed then you should be able to run this command from your local Linux machine:

` aws s3 ls s3://dse-cohort3-group5`

If that works, then you can create a local directory with the command:

`mkdir -p s3bucket`

If that works, then you can use goofys to link the local directory with the s3 bucket.

`./goofys dse-cohort3-group5 s3bucket`

Once that is done, then you can access the s3bucket as if it were a local folder on your Linux machine.


In [3]:
# Directory through which the S3 bucket is reached (mounted with goofys).
s3bucket_path = '/Users/aluthra/Documents/DSE/s3bucket/' # remote S3 via goofys
#s3bucket_path = '/Users/aluthra/Documents/DSE/ucsd-dse-capstone/' # Local storage (for sanity test)

# Full path of the HDF5 file that holds the patches.
path_to_hdf5 = os.path.join(s3bucket_path, 'LUNA16/hdf5-files/32dim_patches.hdf5')

# Read-only handle; it is left open because the following cells read from it.
hdf5_file = h5py.File(path_to_hdf5, mode='r')

In [4]:
# Show the open file handle, then the size of the file on disk in GB (2**30 bytes).
print("Valid hdf5 file in 'read' mode: " + str(hdf5_file))
file_size = os.path.getsize(path_to_hdf5)  # size in bytes
bytes_per_gb = 2.0**30
print('Size of hdf5 file: {:.3f} GB'.format(file_size / bytes_per_gb))

Valid hdf5 file in 'read' mode: <HDF5 file "32dim_patches.hdf5" (mode r)>
Size of hdf5 file: 0.012 GB


## Custom HDF5 dataloader

This is the first pass at our custom HDF5 data loader.
We'll need to add data augmentation and class balancing to this.

In [82]:
def get_random_idx(hdf5_file, batch_size = 32, exclude_subset=0):
    '''
    Return a sorted, class-balanced array of random row indices.

    `batch_size // 2` indices are drawn without replacement from the rows
    of class 0 and the same number from the rows of class 1, after removing
    every row that belongs to `exclude_subset`.

    `hdf5_file` must provide a "classes" dataset (the label is read from
    column 0) and a "subset" dataset indexed by row along its first axis.
    `batch_size` needs to be even; an odd value yields `batch_size - 1` indices.
    `exclude_subset` is the subset id whose rows are never sampled.

    If a class has fewer than `batch_size // 2` eligible rows, all of them
    are used and the result is shorter than requested.
    '''

    # 1. Select indices from class 0 and 1
    classes = hdf5_file["classes"][:,0]

    idx0 = np.where(classes == 0)[0]  # Indices for class 0
    idx1 = np.where(classes == 1)[0]  # Indices for class 1
    # TODO - Anil: check that len(idx1) >= batch_size // 2 (the classes are imbalanced)

    # Read the subset ids into memory before comparing: comparing the dataset
    # handle itself with an int is not an element-wise comparison, so no row
    # would ever be excluded.
    subsets = hdf5_file["subset"][...]
    excluded_idx = np.where(subsets == exclude_subset)[0]  # Row indices of the excluded subset
    idx0 = np.setdiff1d(idx0, excluded_idx)  # Remove the indices of the excluded subset
    idx1 = np.setdiff1d(idx1, excluded_idx)  # Remove the indices of the excluded subset

    # 2. Shuffle the two index arrays (setdiff1d returned fresh arrays, so
    #    shuffling in place does not touch anything shared)
    np.random.shuffle(idx0)
    np.random.shuffle(idx1)

    # 3. Take half of the batch from each class
    half = batch_size // 2
    idx0_shuffle = idx0[:half]
    idx1_shuffle = idx1[:half]

    # The final list must be sorted so it can be used to slice the HDF5 datasets
    return np.sort(np.append(idx0_shuffle, idx1_shuffle))


In [103]:
# Scratch check of the axis sampling used by the augmentation helpers.
shape = [32,32,32,1]

# Draw two distinct axes from 1 .. len(shape)-1 (axis 0 is never drawn).
ax = np.random.choice(len(shape) - 1, size=2, replace=False) + 1

In [104]:
# Display the two axes that were sampled into `ax`.
ax

array([2, 3])

In [106]:
# Draw a single uniform random number in [0, 1) and display it.
np.random.rand()

0.36830395318360354

In [None]:
def img_rotate(img):
    '''
    Rotate the tensor by 90 degrees in a randomly chosen plane.
    `img` is the tensor. Axis 0 is never part of the rotation plane.
    The sign of the rotation depends on the order in which the two
    axes happen to be drawn.
    '''
    n_axes = len(img.shape)
    # Draw two distinct axes from 1 .. n_axes-1 to span the rotation plane
    first, second = np.random.choice(n_axes - 1, 2, replace=False) + 1
    # Transposing the plane and then reversing one of its axes is a quarter turn
    transposed = img.swapaxes(first, second)
    return np.flip(transposed, first)

In [None]:
def img_flip(img):
    '''
    Mirror the tensor along a random selection of its axes.
    Axis 0 is never flipped. Of the remaining n-1 axes, n-2 are drawn
    without replacement and reversed:
    a 4-axis tensor is flipped along two of its last three axes,
    a 3-axis tensor along one of its last two axes.
    `img` is the tensor
    '''
    n_axes = len(img.shape)
    # Leaving one candidate axis untouched keeps the outcome random
    # (reversing every candidate axis would give the same result on every call)
    chosen_axes = np.random.choice(n_axes - 1, n_axes - 2, replace=False) + 1
    flipped = img
    for axis in chosen_axes:
        flipped = np.flip(flipped, axis)
    return flipped

In [107]:
def augment_data(imgs, channels_last=True):
    '''
    Randomly rotate and/or flip every sample of a batch, in place.

    `imgs` is the batch; axis 0 indexes the samples. Each sample is,
    independently, rotated with probability 0.5 (`img_rotate`) and then
    flipped with probability 0.5 (`img_flip`).

    `channels_last` tells where the channel axis of a sample is. The two
    helpers never touch axis 0 of the tensor they receive, so for
    channels-last samples (e.g. 32 x 32 x 32 x 1) the channel axis is moved
    to the front before the helpers run and moved back afterwards. Pass
    False for samples that are already channels-first.

    A rotation swaps two axes, so those axes must have the same length for
    the sample to keep its shape and be written back into the batch.

    Returns `imgs`, the same array that was passed in.
    '''

    imgs_length = imgs.shape[0]

    for idx in range(imgs_length):
        # Work on a copy so the transformed views never alias the slot
        # they are written back to.
        img = imgs[idx].copy()

        if channels_last:
            img = np.moveaxis(img, -1, 0)  # channel axis to the front

        if np.random.rand() > 0.5:
            img = img_rotate(img)

        if np.random.rand() > 0.5:
            img = img_flip(img)

        if channels_last:
            img = np.moveaxis(img, 0, -1)  # restore the original layout

        imgs[idx] = img

    return imgs

In [4]:
def generate_data(hdf5_file, batch_size=50, num_rows=96, input_shape = (32,32,32,1), exclude_subset=0):
    """
    Yield an endless stream of (images, classes) training batches.

    Replaces Keras' native ImageDataGenerator: each batch is a random,
    class-balanced selection of rows from the HDF5 file (see
    `get_random_idx`), reshaped and passed through `augment_data`.

    `hdf5_file` must provide the "patches" and "classes" datasets, plus
    whatever `get_random_idx` reads.
    `batch_size` is the number of rows requested per batch.
    `num_rows` is currently unused; it is kept so existing calls still work.
    `input_shape` is the shape of a single sample.
    `exclude_subset` is forwarded to `get_random_idx`.
    """

    sample_shape = tuple(input_shape)
    while True:

        random_idx = get_random_idx(hdf5_file, batch_size, exclude_subset=exclude_subset)
        imgs = hdf5_file["patches"][random_idx,:]
        # Size the batch from the indices actually returned: get_random_idx
        # can return fewer than batch_size rows (odd batch size, scarce class).
        imgs = imgs.reshape((len(random_idx),) + sample_shape)
        imgs = augment_data(imgs)

        classes = hdf5_file["classes"][random_idx,0]
        yield imgs, classes

## 3D CNN

This is a very simple 3D CNN just to test the pipeline.

In [5]:
from keras.layers import Dense, Activation,Conv3D,MaxPooling3D,Flatten,Dropout, Input
from keras.models import Model

# One sample is a 32 x 32 x 32 volume with a single trailing channel.
input_shape = (32,32,32,1)
inputs = Input(shape=input_shape, name='Images')

# Feature extraction: one 3D convolution followed by 2x2x2 max pooling.
conv1 = Conv3D(96, (3, 3, 3), padding='valid', activation='relu',
               kernel_initializer='glorot_uniform')(inputs)
max2 = MaxPooling3D(pool_size=(2,2,2))(conv1)

# Classifier head: flatten, two small dense layers with dropout in between,
# and a single sigmoid unit for the binary decision.
layer6 = Flatten()(max2)
layer7 = Dense(units=32, activation='relu')(layer6)
layer8 = Dropout(rate=0.5)(layer7)
layer9 = Dense(units=4, activation='relu')(layer8)
layer10 = Dense(units=1, activation='sigmoid')(layer9)

model = Model(inputs=[inputs], outputs=[layer10])
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Images (InputLayer)          (None, 32, 32, 32, 1)     0         
_________________________________________________________________
conv3d_1 (Conv3D)            (None, 30, 30, 30, 96)    2688      
_________________________________________________________________
max_pooling3d_1 (MaxPooling3 (None, 15, 15, 15, 96)    0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 324000)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                10368032  
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 132       
__________

## Train with fit_generator


In [None]:
# Train the model on batches streamed from the HDF5 file.
batch_size = 50
train_batches = generate_data(hdf5_file, batch_size, input_shape = (32,32,32,1))
history = model.fit_generator(train_batches, steps_per_epoch=10000, epochs=10)

Epoch 1/10
 2207/10000 [=====>........................] - ETA: 38:42 - loss: 0.0028 - acc: 0.9998