In [1]:
#######################################################################################################
# Summary
# 1. Keras Multi-GPU example
#######################################################################################################

In [2]:
# Master switch for multi-GPU training: later cells check it before scaling
# LR / BATCHSIZE by the GPU count and before wrapping the model with multi_gpu_model.
MULTI_GPU = True

In [3]:
import os
import sys
import time
import pandas as pd
import numpy as np
os.environ['KERAS_BACKEND'] = "tensorflow"
import keras as K  # 2.1.5 introduces breaking API change
import tensorflow
import multiprocessing
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.densenet import DenseNet121, preprocess_input
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, Callback, ModelCheckpoint
from keras.layers import Dense
from keras.models import Model
from keras.utils import multi_gpu_model
from sklearn.metrics.ranking import roc_auc_score
from sklearn.model_selection import train_test_split
from common.utils import download_data_chextxray, get_imgloc_labels, get_train_valid_test_split
from common.utils import compute_roc_auc, get_cuda_version, get_cudnn_version, get_gpu_name
from common.params_dense import *

Using TensorFlow backend.


In [4]:
# Record the software and hardware environment of this run so the timings
# further down can be interpreted / reproduced.
print("OS: ", sys.platform)
print("Python: ", sys.version)
print("Keras: ", K.__version__)
print("Numpy: ", np.__version__)
print("Tensorflow: ", tensorflow.__version__)
# Active Keras backend and the image data format in effect at this point
print(K.backend.backend())
print(K.backend.image_data_format())
# GPU / CUDA / cuDNN details come from the project helpers in common.utils
print("GPU: ", get_gpu_name())
print(get_cuda_version())
print("CuDNN Version ", get_cudnn_version())

OS:  linux
Python:  3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
Keras:  2.1.4
Numpy:  1.14.2
Tensorflow:  1.6.0
tensorflow
channels_last
GPU:  ['Tesla P100-PCIE-16GB', 'Tesla P100-PCIE-16GB', 'Tesla P100-PCIE-16GB', 'Tesla P100-PCIE-16GB']
CUDA Version 9.0.176
CuDNN Version  7.0.5


In [5]:
# Host resources: CPU_COUNT is later used as the number of data-loading workers,
# GPU_COUNT as the multi-GPU scaling factor and replica count.
CPU_COUNT = multiprocessing.cpu_count()
# get_gpu_name() returned a list of device names in the recorded run, so its length is the GPU count
GPU_COUNT = len(get_gpu_name())
print("CPUs: ", CPU_COUNT)
print("GPUs: ", GPU_COUNT)

CPUs:  24
GPUs:  4


In [6]:
# Model-params
# (WIDTH, HEIGHT, BATCHSIZE, LR, EPOCHS, CLASSES, ... are not defined in this notebook;
#  presumably they arrive via `from common.params_dense import *` - confirm there)
# Normalising done by keras.applications.densenet.preprocess_input()
# Paths
# Root folder of the dataset; images and the label CSV are located relative to it
CSV_DEST = "chestxray"
IMAGE_FOLDER = os.path.join(CSV_DEST, "images")
LABEL_FILE = os.path.join(CSV_DEST, "Data_Entry_2017.csv")
print(IMAGE_FOLDER, LABEL_FILE)

chestxray/images chestxray/Data_Entry_2017.csv


In [7]:
# Manually scale to multi-gpu
# The scaled values are derived from the untouched base values in
# common.params_dense instead of multiplying LR / BATCHSIZE in place, so
# re-running this cell can no longer compound the scaling (x4, x16, ...).
import common.params_dense as _base_params
_gpu_scale = GPU_COUNT if MULTI_GPU else 1
LR = _base_params.LR * _gpu_scale
BATCHSIZE = _base_params.BATCHSIZE * _gpu_scale
#Make sure channels-first (not last)
K.backend.set_image_data_format('channels_first')

In [8]:
%%time
# Download data
# download_data_chextxray is a project helper from common.utils; in the recorded run it
# printed "Data already exists", so it appears to skip the transfer when data is present.
# NOTE(review): the link printed below is the AzCopy install page - presumably the helper
# relies on AzCopy being installed; confirm in common.utils.
print("Please make sure to download")
print("https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux#download-and-install-azcopy")
download_data_chextxray(CSV_DEST)

Please make sure to download
https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux#download-and-install-azcopy
Data already exists
CPU times: user 711 ms, sys: 209 ms, total: 920 ms
Wall time: 919 ms


In [9]:
#####################################################################################################
## Data Loading

In [10]:
class XrayData():
    """Keras directory iterator over the chest X-ray images of a set of patients.

    An ``ImageDataGenerator`` (with DenseNet pre-processing and optional
    augmentation) is pointed at the directory *above* ``img_dir`` so that
    ``flow_from_directory`` accepts it. The iterator's ``filenames`` and
    ``classes`` are then overwritten with the values returned by
    ``get_imgloc_labels`` for the requested patients, and the finished
    iterator is exposed as ``self.generator``.

    Args:
        img_dir: Folder holding the images (e.g. ``chestxray/images``).
        lbl_file: Label file handed to ``get_imgloc_labels``.
        patient_ids: Patient identifiers handed to ``get_imgloc_labels``.
        width: Target image width in pixels.
        height: Target image height in pixels.
        batch_size: Number of images per batch.
        num_classes: Value stored on the iterator as ``num_classes``.
        shuffle: Whether the iterator shuffles the samples.
        seed: Random seed forwarded to ``flow_from_directory``.
        augment: When True, add random flip / zoom / rotation.
    """

    def __init__(self, img_dir, lbl_file, patient_ids,
                 width=WIDTH, height=HEIGHT, batch_size=BATCHSIZE, num_classes=CLASSES,
                 shuffle=True, seed=None, augment=False):

        self.patient_ids = patient_ids
        self.lbl_file = lbl_file

        # Hack for flow_from_directory to work, give it path above.
        # Split the path properly instead of str.replace(): replace() strips
        # *every* occurrence of the leaf name (e.g. "images/x/images") and
        # mis-handles a trailing separator.
        self.parent_path, self.child_path = os.path.split(os.path.normpath(img_dir))
        if not self.parent_path:
            # img_dir was a bare folder name: its parent is the working directory
            self.parent_path = os.curdir

        # Create ImageDataGenerator with DenseNet pre-processing
        # imagenet_utils.preprocess_input(x, data_format, mode='torch')
        if augment:
            datagen = ImageDataGenerator(
                horizontal_flip=True,
                # Best match to?
                # transforms.RandomResizedCrop(size=WIDTH),
                zoom_range=0.2,
                rotation_range=10,
                preprocessing_function=preprocess_input)
        else:
            datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

        # Create flow-from-directory
        flowgen = datagen.flow_from_directory(
            directory=self.parent_path,  # hack: this is one directory up
            # Keras expects (height, width); the two were previously swapped,
            # which only went unnoticed because the images are square.
            target_size=(height, width),
            batch_size=batch_size,
            shuffle=shuffle,
            seed=seed,
            class_mode='binary')

        # Override previously created classes variables
        # filenames, classes
        flowgen.filenames, flowgen.classes = get_imgloc_labels(
            self.child_path, lbl_file, patient_ids)
        # number of files
        flowgen.n = len(flowgen.filenames)
        # number of classes (not sure if this last one needed)
        flowgen.num_classes = num_classes

        self.generator = flowgen
        print("Loaded {} labels and {} images".format(len(self.generator.classes),
                                                      len(self.generator.filenames)))

In [11]:
# Three ID sets (train / validation / test) from the project helper; they are passed
# to XrayData as patient_ids below. The split sizes in the recorded output are
# presumably printed by the helper itself.
train_set, valid_set, test_set = get_train_valid_test_split(TOT_PATIENT_NUMBER)

train:21563 valid:3080 test:6162


In [12]:
# Training iterator: augmentation on, shuffling left at the XrayData default (True)
train_dataset = XrayData(IMAGE_FOLDER, LABEL_FILE, train_set, augment=True).generator

Found 112120 images belonging to 1 classes.
Loaded 87306 labels and 87306 images


In [13]:
# Evaluation iterators: no augmentation and no shuffling, so sample order stays fixed
# (the test AUC later pairs predictions with test_dataset.classes by position)
valid_dataset = XrayData(IMAGE_FOLDER, LABEL_FILE, valid_set, shuffle=False).generator
test_dataset = XrayData(IMAGE_FOLDER, LABEL_FILE, test_set, shuffle=False).generator

Found 112120 images belonging to 1 classes.
Loaded 7616 labels and 7616 images
Found 112120 images belonging to 1 classes.
Loaded 17198 labels and 17198 images


In [14]:
#####################################################################################################
## Helper Functions

In [15]:
def get_symbol(model_name='densenet121', out_features=CLASSES, input_shape=None):
    """Build the CheXNet-style network: ImageNet DenseNet-121 plus a sigmoid head.

    Args:
        model_name: Backbone to use; only 'densenet121' is supported.
        out_features: Number of sigmoid output units.
        input_shape: Channels-first input shape. Defaults to
            ``(3, HEIGHT, WIDTH)`` so the model always matches the size the
            data generators produce instead of a hard-coded 224x224.

    Returns:
        An uncompiled Keras ``Model`` mapping images to ``out_features``
        independent probabilities.

    Raises:
        ValueError: If ``model_name`` is not a supported backbone.
    """
    if model_name != 'densenet121':
        raise ValueError("Unknown model-name")
    if input_shape is None:
        input_shape = (3, HEIGHT, WIDTH)
    # Headless backbone with global average pooling -> one feature vector per image
    backbone = DenseNet121(include_top=False, weights='imagenet',
                           input_shape=input_shape, pooling='avg')
    # Add classifier to model FC-14
    classifier = Dense(out_features, activation='sigmoid')(backbone.output)
    return Model(inputs=backbone.input, outputs=classifier)

In [16]:
def init_symbol(sym, lr=LR):
    """Compile ``sym`` for multi-label training and build its callbacks.

    Args:
        sym: Keras model to compile; it is compiled in place.
        lr: Learning rate passed to the Adam optimiser.

    Returns:
        Tuple ``(sym, callbacks)``: the same, now compiled, model and the list
        of callbacks to hand to the training call.
    """
    # Sigmoid FC-layer + classes that are not mutually exclusive -> BCE loss
    optimiser = Adam(lr, beta_1=0.9, beta_2=0.999, epsilon=None)
    sym.compile(loss="binary_crossentropy", optimizer=optimiser)
    # Drop the learning rate 10x once val_loss has stalled for 5 epochs
    lr_schedule = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=1)
    # ModelCheckpoint('best_chexnet.pth.hdf5', monitor='val_loss', save_weights_only=False)
    # is deliberately left out: it doesn't work with Keras multi-gpu and
    # another hack to fix it is not wanted here.
    return sym, [lr_schedule]

In [17]:
#####################################################################################################
## Train CheXNet

In [18]:
%%time
if MULTI_GPU:
    with tensorflow.device('/cpu:0'):
        # Recommended to instantiate base model on CPU
        # https://keras.io/utils/#multi_gpu_model
        sym = get_symbol()
    chexnet_sym = multi_gpu_model(sym, gpus=GPU_COUNT)
else:
    chexnet_sym = get_symbol()

CPU times: user 1min 22s, sys: 5.25 s, total: 1min 27s
Wall time: 1min 25s


In [19]:
%%time
# Load optimiser, loss
# init_symbol compiles its argument in place and returns it, so `model` is the
# same object as chexnet_sym; `callbacks` holds the LR scheduler.
model, callbacks = init_symbol(chexnet_sym)

CPU times: user 32.6 ms, sys: 3.71 ms, total: 36.4 ms
Wall time: 35.1 ms


In [20]:
%%time
# Wall-clock time of this cell as recorded on Tesla P100 GPUs:
# 1 GPU - Main training loop: 51min 27s
# 2 GPU - Main training loop: 32min 1s
# 4 GPU - Main training loop: 22min 49s
# Train from the augmenting generator and validate on valid_dataset after each epoch
model.fit_generator(train_dataset,
                    epochs=EPOCHS,
                    verbose=1,
                    callbacks=callbacks,
                    workers=CPU_COUNT,  # Num of CPUs since multiprocessing
                    use_multiprocessing=True,  # Faster than with threading
                    validation_data=valid_dataset,
                    max_queue_size=20)  # Default is 10 (most prob no difference)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 1h 7min 8s, sys: 23min 4s, total: 1h 30min 12s
Wall time: 22min 49s


<keras.callbacks.History at 0x7fdf6e7143c8>

In [21]:
#####################################################################################################
## Test CheXNet

In [22]:
# Load model for testing
# Multi-GPU checkpointing is currently broken in Keras, so no checkpoint was saved during training.
# For now, evaluate the in-RAM model left over from the training cell.

In [23]:
%%time
## Evaluate
# AUC: 0.8216 (noted from an earlier run; the recorded output of the scoring cell reports 0.8224)
# test_dataset was created with shuffle=False, so the predictions are expected to line up
# with test_dataset.classes by position - TODO confirm ordering is preserved with workers > 1
y_guess = model.predict_generator(test_dataset, workers=CPU_COUNT)

CPU times: user 5min 35s, sys: 1min 44s, total: 7min 20s
Wall time: 2min 14s


In [24]:
# The score is computed on the held-out test set (test_dataset), so label it as
# such; the previous "Validation AUC" label was misleading.
print("Test AUC: {0:.4f}".format(compute_roc_auc(test_dataset.classes, y_guess, CLASSES)))

Full AUC [0.810400224263596, 0.8642047989855159, 0.801330086449206, 0.9072074321344181, 0.8906798540400607, 0.9213575843667169, 0.7088805005859234, 0.9128299199053916, 0.6267736564423316, 0.8542487673046052, 0.7531549949370517, 0.803228785418665, 0.7709379338811964, 0.8884575500057307]
Validation AUC: 0.8224


In [25]:
#####################################################################################################
## Synthetic Data (Pure Training)

In [26]:
# Test on fake-data -> no IO lag
# Keep only whole batches: round the training-set size down to a multiple of BATCHSIZE
batch_in_epoch = train_dataset.n//BATCHSIZE
tot_num = batch_in_epoch * BATCHSIZE
# Uniform random images in the channels-first layout (N, 3, 224, 224) the model was built with.
# NOTE(review): np.random.rand produces float64 before the float32 cast, so peak memory is
# roughly 3x the final array; this held on the recorded machine - verify before running elsewhere.
fake_X = np.random.rand(tot_num, 3, 224, 224).astype(np.float32)
# Uniform random targets, one value per class
fake_y = np.random.rand(tot_num, CLASSES).astype(np.float32) 

In [29]:
%%time
# Same model trained on in-memory random data, to compare against the
# generator-fed run above without any image loading / augmentation cost:
# 4 GPU - Main training loop: 22min 49s
# 4 GPU - Synthetic data: 18min 30s
# NOTE: this continues training the already-fitted `model` on random targets
model.fit(fake_X,
          fake_y,
          batch_size=BATCHSIZE,
          epochs=EPOCHS,
          verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 1h 5min 19s, sys: 16min 44s, total: 1h 22min 3s
Wall time: 18min 30s


<keras.callbacks.History at 0x7fdda382a5c0>