In [1]:
#######################################################################################################
# Summary
# 1. Keras Multi-GPU example
#######################################################################################################

In [2]:
# Switch for the multi-GPU path: when True, later cells scale LR/BATCHSIZE by the
# GPU count and wrap the model with keras.utils.multi_gpu_model.
MULTI_GPU = True

In [3]:
import os
import sys
import time
import pandas as pd
import numpy as np
os.environ['KERAS_BACKEND'] = "tensorflow"
import keras as K 
import tensorflow
import multiprocessing
import random
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.densenet import DenseNet121
from keras.applications.imagenet_utils import preprocess_input
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, Callback, ModelCheckpoint
from keras.layers import Dense
from keras.models import Model
from keras.utils import multi_gpu_model
from sklearn.metrics.ranking import roc_auc_score
from sklearn.model_selection import train_test_split
from PIL import Image
from common.utils import download_data_chextxray, get_imgloc_labels, get_train_valid_test_split
from common.utils import compute_roc_auc, get_cuda_version, get_cudnn_version, get_gpu_name
from common.params_dense import *

Using TensorFlow backend.


In [4]:
# Record the software/hardware environment of this run next to the results.
print("OS: ", sys.platform)
print("Python: ", sys.version)
print("Keras: ", K.__version__)
print("Numpy: ", np.__version__)
print("Tensorflow: ", tensorflow.__version__)
# Active Keras backend and the image data format in effect at this point
# (the format is switched to channels_first in a later cell).
print(K.backend.backend())
print(K.backend.image_data_format())
# GPU / CUDA / cuDNN details come from helpers in common.utils.
print("GPU: ", get_gpu_name())
print(get_cuda_version())
print("CuDNN Version ", get_cudnn_version())

OS:  linux
Python:  3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
Keras:  2.1.4
Numpy:  1.14.2
Tensorflow:  1.6.0
tensorflow
channels_last
GPU:  ['Tesla P100-PCIE-16GB', 'Tesla P100-PCIE-16GB', 'Tesla P100-PCIE-16GB', 'Tesla P100-PCIE-16GB']
CUDA Version 9.0.176
CuDNN Version  7.0.5


In [5]:
# Refuse to run on anything but Keras 2.1.4: the data-loading code below relies on
# the preprocessing function receiving an array (2.1.5 passes an image instead).
if K.__version__ != "2.1.4":
    raise Exception("Keras 2.1.5 introduces some breaking changes for data-loader")

In [6]:
# Hardware counts: CPU_COUNT is used later as the number of data-loading workers,
# GPU_COUNT as the multi-GPU scaling factor and replica count.
CPU_COUNT = multiprocessing.cpu_count()
GPU_COUNT = len(get_gpu_name())
print("CPUs: ", CPU_COUNT)
print("GPUs: ", GPU_COUNT)

CPUs:  24
GPUs:  4


In [7]:
# Model-params
# Normalising is done by keras.applications.imagenet_utils.preprocess_input()
# (called with mode='torch' inside XrayData._preprocess_fn)
# Paths
# Root folder for the dataset; images and the label CSV are expected beneath it.
CSV_DEST = "chestxray"
IMAGE_FOLDER = os.path.join(CSV_DEST, "images")
LABEL_FILE = os.path.join(CSV_DEST, "Data_Entry_2017.csv")
print(IMAGE_FOLDER, LABEL_FILE)

chestxray/images chestxray/Data_Entry_2017.csv


In [8]:
# Manually scale to multi-gpu
# Always derive the scaled values from the untouched base values in
# common.params_dense rather than using in-place `*=`: the in-place form compounds
# the scaling every time this cell is re-run without restarting the kernel.
import common.params_dense as params_dense  # already loaded via the star-import above

# Never scale by less than 1, so a machine that reports zero GPUs keeps the base values.
gpu_scale = max(1, GPU_COUNT) if MULTI_GPU else 1
LR = params_dense.LR * gpu_scale
BATCHSIZE = params_dense.BATCHSIZE * gpu_scale
#Make sure channels-first (not last)
K.backend.set_image_data_format('channels_first')

In [9]:
%%time
# Download data
# The printed hint points at AzCopy, which the download helper presumably
# relies on -- TODO confirm in common.utils.
print("Please make sure to download")
print("https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux#download-and-install-azcopy")
# Helper from common.utils; called with the dataset root folder.
download_data_chextxray(CSV_DEST)

Please make sure to download
https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux#download-and-install-azcopy
Data already exists
CPU times: user 667 ms, sys: 257 ms, total: 925 ms
Wall time: 923 ms


In [10]:
#####################################################################################################
## Data Loading

In [12]:
class XrayData():
    """Keras directory iterator restricted to the images of a given set of patients.

    The iterator is created with ``flow_from_directory`` on the *parent* of
    ``img_dir`` (so the image folder itself plays the role of the single class
    folder). Its ``filenames``, ``classes``, ``n`` and ``num_classes`` attributes
    are then overwritten with the result of ``get_imgloc_labels``. The finished
    iterator is available as ``self.generator``.

    Args:
        img_dir: Folder that directly contains the image files.
        lbl_file: Path to the label file handed to ``get_imgloc_labels``.
        patient_ids: Patients to include; handed to ``get_imgloc_labels``.
        width: Target image width in pixels.
        height: Target image height in pixels.
        batch_size: Number of images per batch.
        num_classes: Value stored on the iterator as ``num_classes``.
        shuffle: Whether the iterator shuffles the samples.
        seed: Random seed forwarded to the iterator.
        augment: If True, enable horizontal flips and the random-crop
            preprocessing path.
    """

    def __init__(self, img_dir, lbl_file, patient_ids,
                 width=WIDTH, height=HEIGHT, batch_size=BATCHSIZE, num_classes=CLASSES,
                 shuffle=True, seed=None, augment=False):

        self.patient_ids = patient_ids
        self.lbl_file = lbl_file
        self.width = width
        self.height = height

        # Hack for flow_from_directory to work, give it path above
        # Otherwise it requires images to be kept in folders.
        # Split the path with os.path instead of str.replace(): replace() strips
        # *every* occurrence of the folder name (e.g. "images_v2/images"), and
        # normpath makes a trailing separator harmless.
        parent_path, self.child_path = os.path.split(os.path.normpath(img_dir))
        # An image folder given without a parent lives in the current directory.
        self.parent_path = parent_path or os.curdir

        # With version 2.1.5 the input has become a picture not array
        if augment:
            datagen = ImageDataGenerator(
                horizontal_flip=True,
                preprocessing_function=self.preprocess_fn_augment)
        else:
            datagen = ImageDataGenerator(
                preprocessing_function=self._preprocess_fn)

        # Create flow-from-directory
        flowgen = datagen.flow_from_directory(
            directory=self.parent_path,  # hack: this is one directory up
            target_size=(height, width),  # Keras expects (height, width)
            batch_size=batch_size,
            shuffle=shuffle,
            seed=seed,
            class_mode='binary')  # this can be none since overwritten

        # Override previously created classes variables
        # filenames, classes
        flowgen.filenames, flowgen.classes = get_imgloc_labels(
            self.child_path, lbl_file, patient_ids)
        # number of files
        flowgen.n = len(flowgen.filenames)
        flowgen.num_classes = num_classes

        self.generator = flowgen
        print("Loaded {} labels and {} images".format(
            len(self.generator.classes), len(self.generator.filenames)))

    def preprocess_fn_augment(self, x):
        """Preprocessing callback for the training iterator (normalise + random crop)."""
        return self._preprocess_fn(x, augment=True)

    def _preprocess_fn(self, x, augment=False):
        """Normalise one image and optionally random-crop it to (height, width).

        Args:
            x: Image array as handed over by ImageDataGenerator.
            augment: If True, apply ``random_crop`` after normalisation.

        Returns:
            The preprocessed image array.
        """
        # K 2.1.4 and below return CHW array
        # k 2.1.5 onwards returns an image
        x = preprocess_input(x, data_format='channels_first', mode='torch')
        # Data augmentation
        if augment:
            x = random_crop(x, (self.height, self.width))
        return x
                        
# Random crop has to be applied within the preprocessing function
def random_crop(img, size):
    """Cut a randomly positioned window out of a CHW image.

    Args:
        img (~numpy.ndarray): Image array in CHW layout.
        size (tuple): Requested output size as :math:`(height, width)`.

    Returns:
        ~numpy.ndarray: The cropped region, all channels kept.
    """
    crop_h, crop_w = size
    # Draw the top-left corner; the vertical offset is drawn first.
    top = random.randint(0, img.shape[1] - crop_h)
    left = random.randint(0, img.shape[2] - crop_w)
    return img[:, top:top + crop_h, left:left + crop_w]

In [13]:
# Partition the patients into train / validation / test subsets (helper from common.utils).
train_set, valid_set, test_set = get_train_valid_test_split(TOT_PATIENT_NUMBER)

train:21563 valid:3080 test:6162


In [14]:
# Training iterator: shuffled (class default) with augmentation enabled; only the
# underlying Keras iterator is kept.
train_dataset = XrayData(IMAGE_FOLDER, LABEL_FILE, train_set, augment=True).generator

Found 112120 images belonging to 1 classes.
Loaded 87306 labels and 87306 images


In [15]:
# Validation and test iterators: no augmentation and no shuffling, so predictions
# can be compared against the iterator's .classes in order.
valid_dataset = XrayData(IMAGE_FOLDER, LABEL_FILE, valid_set, shuffle=False).generator
test_dataset = XrayData(IMAGE_FOLDER, LABEL_FILE, test_set, shuffle=False).generator

Found 112120 images belonging to 1 classes.
Loaded 7616 labels and 7616 images
Found 112120 images belonging to 1 classes.
Loaded 17198 labels and 17198 images


In [16]:
#####################################################################################################
## Helper Functions

In [17]:
def get_symbol(out_features=CLASSES):
    """Build the network: an ImageNet-initialised DenseNet-121 with a sigmoid head.

    Args:
        out_features: Number of units in the final sigmoid Dense layer.

    Returns:
        An uncompiled keras Model mapping the DenseNet input to the new head.
    """
    # Headless backbone with global average pooling, channels-first input.
    backbone = DenseNet121(weights='imagenet', include_top=False,
                           pooling='avg', input_shape=(3, 224, 224))
    # One independent sigmoid output per class on top of the pooled features.
    head = Dense(out_features, activation='sigmoid')
    return Model(inputs=backbone.input, outputs=head(backbone.output))

In [18]:
def init_symbol(sym, lr=LR):
    """Compile the model for multi-label training and set up its callbacks.

    Args:
        sym: Keras model to compile (compiled in place).
        lr: Learning rate handed to Adam.

    Returns:
        Tuple ``(sym, callbacks)``: the compiled model and a list holding a
        ReduceLROnPlateau callback.
    """
    # Sigmoid outputs + binary cross-entropy because the classes are not mutually exclusive.
    optimiser = Adam(lr, beta_1=0.9, beta_2=0.999, epsilon=None)
    sym.compile(loss="binary_crossentropy", optimizer=optimiser)
    # Cut the learning rate tenfold after 5 epochs without val_loss improvement.
    lr_schedule = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=1)
    # No ModelCheckpoint here: it does not work with Keras multi-gpu.
    # FLAG: check if fixed in future versions.
    return sym, [lr_schedule]

In [19]:
#####################################################################################################
## Train CheXNet

In [20]:
%%time
# multi_gpu_model() rejects gpus < 2, so only take the multi-GPU path when more
# than one GPU is actually present; otherwise build a plain single-device model.
if MULTI_GPU and GPU_COUNT > 1:
    with tensorflow.device('/cpu:0'):
        # Recommended to instantiate base model on CPU
        # https://keras.io/utils/#multi_gpu_model
        sym = get_symbol()
    chexnet_sym = multi_gpu_model(sym, gpus=GPU_COUNT)
else:
    chexnet_sym = get_symbol()

CPU times: user 1min 26s, sys: 6.95 s, total: 1min 33s
Wall time: 1min 30s


In [21]:
%%time
# Load optimiser, loss
# init_symbol compiles the model in place and returns it together with the callback list.
model, callbacks = init_symbol(chexnet_sym)

CPU times: user 35.8 ms, sys: 7.14 ms, total: 43 ms
Wall time: 41.6 ms


In [22]:
%%time
# 1 GPU - Main training loop: 51min 16s
# 4 GPU - Main training loop: 22min 10s
# Train from the augmented iterator and validate on valid_dataset each epoch;
# the returned History object is the cell's displayed value.
model.fit_generator(train_dataset,
                    epochs=EPOCHS,
                    verbose=1,
                    callbacks=callbacks,
                    workers=CPU_COUNT,  # Num of CPUs since multiprocessing
                    use_multiprocessing=True,  # Faster than with threading
                    validation_data=valid_dataset,
                    max_queue_size=20)  

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 1h 8min 48s, sys: 22min 49s, total: 1h 31min 37s
Wall time: 22min 10s


<keras.callbacks.History at 0x7fa4d64c3f98>

In [23]:
#####################################################################################################
## Test CheXNet

In [24]:
# Load model for testing
# Currently multi-GPU checkpointing is broken on Keras
# For now use in-RAM model

In [25]:
%%time
## Evaluate
# Predict on the unshuffled test iterator using the in-memory model.
y_guess = model.predict_generator(test_dataset, workers=CPU_COUNT)

CPU times: user 2min 2s, sys: 31.3 s, total: 2min 33s
Wall time: 41.9 s


In [26]:
# AUC: 0.8165
# The score is computed on the held-out test iterator, so label it as test AUC
# (it was previously reported as "Validation AUC").
print("Test AUC: {0:.4f}".format(compute_roc_auc(test_dataset.classes, y_guess, CLASSES)))

Full AUC [0.8088044653439874, 0.8775121361627931, 0.7921371625311039, 0.8875306269275469, 0.8860785919116722, 0.9217052193065758, 0.7410311050741578, 0.8605489258004981, 0.6237352093672446, 0.8509947887638167, 0.7407802675379115, 0.8048855712684062, 0.7561975433724842, 0.878709287595659]
Validation AUC: 0.8165


In [27]:
#####################################################################################################
## Synthetic Data (Pure Training)

In [28]:
# Test on fake-data -> no IO lag
# Use only whole batches so every step sees exactly BATCHSIZE samples.
batch_in_epoch = train_dataset.n//BATCHSIZE
tot_num = batch_in_epoch * BATCHSIZE
# np.random.rand() only produces float64, so generating the full tensor at once
# and casting afterwards needs the float64 and float32 copies in memory together.
# Fill a preallocated float32 array one batch at a time instead.
fake_X = np.empty((tot_num, 3, 224, 224), dtype=np.float32)
for start in range(0, tot_num, BATCHSIZE):
    chunk = fake_X[start:start + BATCHSIZE]
    chunk[...] = np.random.rand(*chunk.shape)
fake_y = np.random.rand(tot_num, CLASSES).astype(np.float32)

In [29]:
%%time
# 4 GPU - Synthetic data: 18min 25s
# Same model trained on in-memory random arrays, so the timing excludes image
# loading and preprocessing.
model.fit(fake_X,
          fake_y,
          batch_size=BATCHSIZE,
          epochs=EPOCHS,
          verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 1h 5min 29s, sys: 16min 46s, total: 1h 22min 15s
Wall time: 18min 25s


<keras.callbacks.History at 0x7fa4d64c36a0>