# Test 2D patches for training

Let's run an end-to-end Keras training script with data from our S3 bucket. The data is stored on the S3 bucket in an HDF5 file. This test will give us an idea of the speed and cost of the training.

In [1]:
import keras
import numpy as np
import h5py
import os

Using TensorFlow backend.


In [2]:
# Locate the notebook's working directory with the IPython shell escape.
# The result is indexed with [0] below to get the directory as a string.
root_dir = !pwd
s3bucket_path = root_dir[0] + '/../s3bucket_goofys/' # remote S3 via goofys
path_to_hdf5 = s3bucket_path + 'LUNA16/hdf5-files/64x64x1-patch.hdf5'
# The handle stays open: later cells read datasets from it on demand.
hdf5_file = h5py.File(path_to_hdf5, 'r') # open in read-only mode

In [3]:
# Sanity check: show the open file handle and report the file's size.
print("Valid hdf5 file in 'read' mode: " + str(hdf5_file))
file_size = os.path.getsize(path_to_hdf5)
# Bytes are divided by 2.0**30; the float literal keeps this a true division.
print('Size of hdf5 file: {:.3f} GB'.format(file_size/2.0**30))

Valid hdf5 file in 'read' mode: <HDF5 file "64x64x1-patch.hdf5" (mode r)>
Size of hdf5 file: 11.851 GB


In [4]:
# The length of the first axis of the "input" dataset is reported as the image count.
print("There are {} images in the dataset.".format(hdf5_file['input'].shape[0]))

There are 754962 images in the dataset.


In [5]:
# List the members stored at the top level of the HDF5 file.
print("The datasets within the HDF5 file are:\n {}".format(list(hdf5_file.values())))

The datasets within the HDF5 file are:
 [<HDF5 dataset "centroid": shape (754962, 3), type "<f8">, <HDF5 dataset "input": shape (754962, 4096), type "<f4">, <HDF5 dataset "notrain": shape (754962, 1), type "<i8">, <HDF5 dataset "output": shape (754962, 1), type "<i8">, <HDF5 dataset "subsets": shape (754962, 1), type "<i8">, <HDF5 dataset "uuid": shape (754962, 1), type "|O">]


In [6]:
# Slice 25 rows out of the "output" dataset and check what type the slice comes back as.
less_idx = hdf5_file["output"][50:75]
type (less_idx)

numpy.ndarray

In [None]:
# WARNING (AL): the following line makes the machine struggle -- `[:]` pulls the
# whole "output" dataset from the goofys-mounted file in a single read.
tst1 = (hdf5_file['output'][:] == 0)

In [7]:
# Read the "notrain" column from the remote file once and derive both boolean
# masks from that single in-memory copy (the original read the column twice).
notrain_flags = hdf5_file["notrain"][:, 0]

# NOTE(review): rows with notrain == 1 are the ones marked "for training" here.
# Given the column name this may be inverted -- confirm against the code that
# wrote the HDF5 file before relying on these masks.
for_training = (notrain_flags == 1)
for_testing = (notrain_flags == 0)

In [8]:
def get_class_idx(hdf5_file, classid = 0):
    '''
    Return the row indices whose label equals `classid` and that are also
    selected by the notebook-level boolean mask `for_training`.

    `hdf5_file` must expose an "output" dataset with the label in column 0.
    '''
    labels = hdf5_file['output'][:, 0]

    # Keep only rows that carry the requested label AND are allowed for training
    wanted = (labels == classid) & for_training

    return np.nonzero(wanted)[0]


In [9]:
def remove_exclude_subset_idx(hdf5_file, idx, excluded_subset=0):
    '''
    Drop from `idx` every row index that belongs to subset `excluded_subset`.

    The subset id of each row is read from column 0 of the "subsets" dataset.
    The result comes from np.setdiff1d, so it is sorted and free of duplicates.
    '''
    subset_of_row = hdf5_file["subsets"][:, 0]

    # Row numbers of everything that lives in the subset being held out
    rows_to_drop = np.nonzero(subset_of_row == excluded_subset)[0]

    return np.setdiff1d(idx, rows_to_drop)
    

In [None]:
# Scratch check of get_class_idx. `idx` has to exist before it can be indexed;
# the original cell raised NameError because nothing defined it beforehand.
idx = {}
idx[0] = get_class_idx(hdf5_file, 0)

In [46]:
def get_idx_for_classes(hdf5_file, excluded_subset=0):
    '''
    Build a dict mapping each class label (0 and 1) to the row indices of that
    class, with every row of `excluded_subset` removed.
    '''
    # First collect the candidate rows of both classes ...
    candidates = {label: get_class_idx(hdf5_file, label) for label in (0, 1)}

    # ... then strip the held-out subset from each of them.
    return {
        label: remove_exclude_subset_idx(hdf5_file, rows, excluded_subset)
        for label, rows in candidates.items()
    }

## Custom HDF5 dataloader

This is the first pass at our custom HDF5 data loader.
We'll need to add data augmentation and class balancing to this.

In [50]:
def get_random_idx(hdf5_file, idx, batch_size = 20):
    '''
    Yield a class-balanced, sorted set of random row indices.

    hdf5_file  -- not used by this function; kept so existing callers keep working.
    idx        -- mapping with keys 0 and 1, each holding a 1-D array of the
                  candidate row indices for that class.
    batch_size -- total number of indices requested. Half of them
                  (batch_size // 2) are drawn from each class, so the batch
                  size needs to be even; an odd value yields batch_size - 1
                  indices. A class with fewer candidates than half a batch
                  contributes all of its candidates.

    Returns a sorted 1-D array of the selected indices. The arrays stored in
    `idx` are left untouched.
    '''
    half_batch = batch_size // 2

    # np.random.permutation hands back a shuffled *copy*, so the caller's
    # master index arrays are not reordered as a side effect of sampling.
    picks0 = np.random.permutation(idx[0])[:half_batch]
    picks1 = np.random.permutation(idx[1])[:half_batch]

    # Need to sort the final list so it can be used to index the HDF5 datasets
    return np.sort(np.append(picks0, picks1))


In [51]:
def img_rotate(img):
    '''
    Rotate the tensor by a quarter turn in a randomly chosen plane.
    `img` is the tensor

    Two distinct axes are drawn from all axes except the last one. The tensor
    is transposed across that pair and then reversed along the first axis
    drawn, which gives a +90 or -90 degree rotation depending on the order in
    which the two axes came out.
    '''
    n_candidate_axes = img.ndim - 1  # the last axis is never picked
    first, second = np.random.choice(n_candidate_axes, 2, replace=False)

    transposed = img.swapaxes(first, second)
    return np.flip(transposed, first)

In [52]:
def img_flip(img, channel_axis=-1):
    '''
    Performs a random flip on the tensor.
    `img` is the tensor
    `channel_axis` is the axis that holds the channels and is never flipped.
    It defaults to the last axis, which matches the channel-last patches this
    notebook feeds to the model and the axes img_rotate draws from. Pass 0 for
    channel-first tensors.

    All but one of the remaining (spatial) axes are flipped:
    If the tensor is H x W x D x C this will perform flips on two of the H, W, D dimensions
    If the tensor is H x W x C this will perform a flip on either the H or the W dimension.
    Tensors with fewer than three axes are returned unchanged.

    The previous version always skipped axis 0 and could pick the last axis,
    so on channel-last data it never flipped the rows and often "flipped" the
    size-1 channel axis, which does nothing.
    '''
    if img.ndim < 3:
        return img

    channel = channel_axis % img.ndim  # normalise negative axis numbers
    spatial_axes = [axis for axis in range(img.ndim) if axis != channel]

    # Flip along all but one spatial axis. (If we flipped all of them we'd get
    # the same result every time.)
    chosen = np.random.choice(spatial_axes, len(spatial_axes) - 1, replace=False)
    for axis in chosen:
        img = np.flip(img, axis)
    return img

In [53]:
def augment_data(imgs):
    '''
    Randomly rotates and/or flips every image tensor of the batch.

    For each image a coin toss decides whether rotation is tried before the
    flip or the other way round; each of the two operations is then applied
    with its own 50% chance. The batch is modified in place and also returned.
    '''
    for sample_no in range(imgs.shape[0]):
        sample = imgs[sample_no, :]

        # First draw: the order in which the two operations are attempted
        if np.random.rand() > 0.5:
            operations = (img_rotate, img_flip)
        else:
            operations = (img_flip, img_rotate)

        # One further draw per operation: apply it or skip it
        for operation in operations:
            if np.random.rand() > 0.5:
                sample = operation(sample)

        imgs[sample_no, :] = sample

    return imgs

In [65]:
def generate_data(hdf5_file, batch_size=50, exclude_subset=0):
    """Endlessly yield augmented, class-balanced (images, classes) batches.

    Replaces Keras' native ImageDataGenerator by randomly selecting
    batch_size rows from the hdf5 file dataset on every iteration.

    hdf5_file      -- open HDF5 file with "input" and "output" datasets; the
                      "input" dataset must carry an "lshape" attribute.
    batch_size     -- number of rows requested per batch (see get_random_idx
                      for how it is split between the two classes).
    exclude_subset -- id of the subset whose rows are never sampled.

    Yields a tuple (imgs, classes): imgs has one leading batch axis followed
    by the per-sample shape, classes holds column 0 of "output" for the same
    rows.
    """
    # Per-sample shape: the stored "lshape" attribute minus its last entry.
    # NOTE(review): why the last entry is dropped is still an open question
    # ("discuss Tony on lshape") -- confirm against the file writer.
    sample_shape = tuple(list(hdf5_file["input"].attrs["lshape"])[:-1])
    idx_master = get_idx_for_classes(hdf5_file, exclude_subset)

    while True:

        random_idx = get_random_idx(hdf5_file, idx_master, batch_size)
        imgs = hdf5_file["input"][random_idx, :]

        # Keep the batch axis: -1 lets NumPy infer the number of rows that
        # were actually drawn. (Reshaping to the bare per-sample shape fails
        # for any batch of more than one row.)
        imgs = imgs.reshape((-1,) + sample_shape)

        imgs = augment_data(imgs)

        # Labels for the same rows. The previous `list(random_idx, 0)` raised
        # TypeError because list() accepts a single argument.
        classes = hdf5_file["output"][random_idx, 0]

        yield imgs, classes

In [68]:
#input_shape = tuple(list(hdf5_file["input"].attrs["lshape"]) + [1])  # Get the original shape of the tensor
# Per-sample shape handed to the first model layer below: the "lshape"
# attribute stored on the "input" dataset, minus its last entry.
input_shape = tuple(list(hdf5_file["input"].attrs["lshape"]) [:-1])  # Removing last dim
batch_size = 20   # Batch size to use
print (input_shape)

In [None]:
#imgs, classes = generate_data(hdf5_file, batch_size=batch_size, exclude_subset=2)

In [None]:
#imgs = hdf5_file["inputs"][random_idx,:]

In [None]:
#imgs = imgs.reshape(10,64,64,64, 1)

In [70]:
import matplotlib.pyplot as plt
%matplotlib inline

In [75]:
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K

In [76]:
# Small CNN for binary classification of the patches: two 3x3 convolutions,
# max-pooling and dropout, then a dense head ending in one sigmoid unit.
model = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
    Conv2D(64, (3,3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

In [77]:
# Echo the batch size that the cells below will use.
batch_size

20

In [78]:
# TensorBoard callback writing to ./tb_2D_logs. It only takes effect when it
# is passed through `callbacks=[tb_log]` to the training call.
# NOTE(review): per the Keras documentation write_grads requires
# histogram_freq > 0, so with histogram_freq=0 no gradient histograms are
# written -- raise histogram_freq if they are wanted.
tb_log = keras.callbacks.TensorBoard(log_dir='./tb_2D_logs',
                                     histogram_freq=0,
                                     batch_size=batch_size,
                                     write_graph=True,
                                     write_grads=True,
                                     write_images=True,
                                     embeddings_freq=0,
                                     embeddings_layer_names=None,
                                     embeddings_metadata=None)

# Binary cross-entropy pairs with the single sigmoid output unit of the model.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 62, 62, 32)        320       
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 60, 60, 64)        18496     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 30, 30, 64)        0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 30, 30, 64)        0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 57600)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               7372928   
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
__________

## Train with fit_generator


In [None]:
# Short trial run: 3 epochs of 10 generator batches each, with the rows of
# subset 2 excluded from sampling.
history = model.fit_generator(generate_data(hdf5_file, batch_size, exclude_subset=2),
                    steps_per_epoch=10, epochs=3)
# Same call with TensorBoard logging enabled:
# history = model.fit_generator(generate_data(hdf5_file, batch_size, exclude_subset=2),
#                     steps_per_epoch=10, epochs=3, callbacks=[tb_log])

In [None]:
# Render the model architecture to model.png in the working directory.
from keras.utils import plot_model
plot_model(model, to_file='model.png')

In [None]:
# Show the model.png file inline in the notebook.
from IPython.display import Image
Image("model.png")