In [1]:
%%bash 
# Download the DenseNet-121 module (densenet.py) from pudae's tensorflow-densenet
# repo and the checkpoint archive used to warm-start the model.
# TODO: check if tf-slim will have densenet121 at some point (this download could then go away)
# wget -N (timestamping) skips the download when the local copy is up to date
wget -N https://github.com/pudae/tensorflow-densenet/raw/master/nets/densenet.py
wget -N https://ikpublictutorial.blob.core.windows.net/deeplearningframeworks/tf-densenet121.tar.gz
# Unpack the tf-densenet121.ckpt.* files into the working directory
tar xzvf tf-densenet121.tar.gz

tf-densenet121.ckpt.data-00000-of-00001
tf-densenet121.ckpt.index
tf-densenet121.ckpt.meta


File ‘tf-densenet121.tar.gz’ not modified on server. Omitting download.



In [2]:
#######################################################################################################
# Summary
# 1. Tensorflow Multi-GPU example using the Estimator & Dataset high-level APIs
# 2. On-the-fly data-augmentation (random crop, random flip)
# ToDo:
# 3. Investigate tfrecord speed improvement (to match MXNet)
# References:
# https://www.tensorflow.org/performance/performance_guide
# 1. https://jhui.github.io/2017/03/07/TensorFlow-Perforamnce-and-advance-topics/
# 2. https://www.tensorflow.org/versions/master/performance/datasets_performance
# 3. https://github.com/pudae/tensorflow-densenet
# 4. https://stackoverflow.com/a/48096625/6772173
# 5. https://stackoverflow.com/questions/47867748/transfer-learning-with-tf-estimator-estimator-framework
# 6. https://github.com/BobLiu20/Classification_Nets/blob/master/tensorflow/common/average_gradients.py
# 7. https://github.com/BobLiu20/Classification_Nets/blob/master/tensorflow/training/train_estimator.py
#######################################################################################################

In [3]:
# True: LR and BATCHSIZE are multiplied by GPU_COUNT and model_fn_multigpu is used.
# False: model_fn_single is used with the unscaled hyper-parameters.
MULTI_GPU = True  # TOGGLE THIS

In [4]:
import os
import sys
import time
import multiprocessing
import numpy as np
import pandas as pd
from PIL import Image
import random
import tensorflow as tf
from tensorflow.python.framework import dtypes
from tensorflow.python.framework.ops import convert_to_tensor
from common.utils import download_data_chextxray, get_imgloc_labels, get_train_valid_test_split
from common.utils import compute_roc_auc, get_cuda_version, get_cudnn_version, get_gpu_name
from common.params_dense import *
slim = tf.contrib.slim

  from ._conv import register_converters as _register_converters


In [5]:
#https://github.com/pudae/tensorflow-densenet/raw/master/nets/densenet.py
import densenet  # Download from https://github.com/pudae/tensorflow-densenet

In [6]:
# Report the software / hardware stack so the benchmark timings can be put in context
print("OS: ", sys.platform)
print("Python: ", sys.version)
print("Numpy: ", np.__version__)
print("Tensorflow: ", tf.__version__)
# get_gpu_name() returns one entry per GPU (see the list printed below)
print("GPU: ", get_gpu_name())
print(get_cuda_version())
print("CuDNN Version ", get_cudnn_version())

OS:  linux
Python:  3.5.4 |Anaconda custom (64-bit)| (default, Nov 20 2017, 18:44:38) 
[GCC 7.2.0]
Numpy:  1.14.1
Tensorflow:  1.8.0
GPU:  ['Tesla V100-PCIE-16GB', 'Tesla V100-PCIE-16GB', 'Tesla V100-PCIE-16GB', 'Tesla V100-PCIE-16GB']
CUDA Version 9.0.176
CuDNN Version  7.0.5


In [7]:
# Number of CPUs reported by the OS
CPU_COUNT = multiprocessing.cpu_count()
# One entry per GPU is returned by get_gpu_name(); GPU_COUNT later scales LR/BATCHSIZE
# and defines the list of towers passed to the multi-GPU model function
GPU_COUNT = len(get_gpu_name())
print("CPUs: ", CPU_COUNT)
print("GPUs: ", GPU_COUNT)

CPUs:  24
GPUs:  4


In [8]:
# Model-params
# Per-channel value subtracted from the decoded RGB pixels (see XrayData._preprocess_image_labels)
IMAGENET_RGB_MEAN_CAFFE = np.array([123.68, 116.78, 103.94], dtype=np.float32)
# Multiplier applied to the mean-centred pixels
IMAGENET_SCALE_FACTOR_CAFFE = 0.017
# Paths
CSV_DEST = "chestxray"
IMAGE_FOLDER = os.path.join(CSV_DEST, "images")
LABEL_FILE = os.path.join(CSV_DEST, "Data_Entry_2017.csv")
print(IMAGE_FOLDER, LABEL_FILE)
# Checkpoint prefix of the files extracted from tf-densenet121.tar.gz
CHKPOINT = 'tf-densenet121.ckpt'  # Downloaded tensorflow-checkpoint

chestxray/images chestxray/Data_Entry_2017.csv


In [9]:
# Manually scale to multi-gpu
# Remember the unscaled values the first time this cell runs, so that re-running
# it (or toggling MULTI_GPU and re-running) never compounds the scaling.
if "_BASE_LR" not in globals():
    _BASE_LR, _BASE_BATCHSIZE = LR, BATCHSIZE
# Learning rate and global batch size both grow linearly with the number of GPUs
_GPU_SCALE = GPU_COUNT if MULTI_GPU else 1
LR = _BASE_LR * _GPU_SCALE
BATCHSIZE = _BASE_BATCHSIZE * _GPU_SCALE

In [10]:
%%time
# Download data
# The helper fetches the dataset into CSV_DEST; per the log below it reports
# "Data already exists" instead of downloading again when the data is present.
# NOTE(review): the helper presumably relies on AzCopy being installed (see printed link) - confirm.
print("Please make sure to download")
print("https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux#download-and-install-azcopy")
download_data_chextxray(CSV_DEST)

Please make sure to download
https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux#download-and-install-azcopy
Data already exists
CPU times: user 625 ms, sys: 225 ms, total: 850 ms
Wall time: 849 ms


In [11]:
#####################################################################################################
## Data Loading

In [12]:
class XrayData():
    """tf.data input pipeline over chest X-ray PNG files and their labels.

    File locations and labels are obtained from ``get_imgloc_labels``; the
    batched, prefetched dataset is exposed as ``self.data``. Every image is
    mean-centred, scaled and emitted channels-first as (3, height, width).

    Args:
        img_dir, lbl_file, patient_ids: forwarded unchanged to ``get_imgloc_labels``.
        mode: 'training'   -> shuffle, repeat, random crop + random horizontal flip;
              'validation' -> repeat, resize;
              'testing'    -> single pass, resize.
        width, height: spatial size of the emitted images.
        batch_size: number of examples per batch.
        imagenet_mean: value subtracted from the decoded RGB pixels.
        imagenet_scaling: multiplier applied after the mean subtraction.
        buffer: number of batches to prefetch.

    Raises:
        ValueError: if ``mode`` is not one of the supported modes.
    """

    _MODES = ('training', 'validation', 'testing')

    def __init__(self, img_dir, lbl_file, patient_ids, mode,
                 width=WIDTH, height=HEIGHT, batch_size=BATCHSIZE,
                 imagenet_mean=IMAGENET_RGB_MEAN_CAFFE, imagenet_scaling=IMAGENET_SCALE_FACTOR_CAFFE,
                 buffer=10):
        # Fail fast: an unknown mode used to fall through every branch and leave
        # self.data as an unparsed, unbatched dataset of (filename, label) pairs.
        if mode not in self._MODES:
            raise ValueError("mode must be one of {}, got {!r}".format(self._MODES, mode))

        self.img_locs, self.labels = get_imgloc_labels(img_dir, lbl_file, patient_ids)
        self.data_size = len(self.labels)
        self.imagenet_mean = imagenet_mean
        self.imagenet_scaling = imagenet_scaling
        self.width = width
        self.height = height
        data = tf.data.Dataset.from_tensor_slices((self.img_locs, self.labels))

        # Processing
        # The parse functions emit channels-first images (see the tf.transpose calls)
        if mode == 'training':
            # Augmentation and repeat (shuffle over the whole dataset before repeating)
            data = data.shuffle(self.data_size).repeat()
            parse_fn = self._parse_function_train
        elif mode == 'validation':
            # Repeat, no augmentation
            data = data.repeat()
            parse_fn = self._parse_function_inference
        else:
            # 'testing': no repeat, no augmentation
            parse_fn = self._parse_function_inference

        # Fused map + batch, then keep `buffer` batches ready ahead of the consumer
        self.data = data.apply(
            tf.contrib.data.map_and_batch(parse_fn, batch_size)).prefetch(buffer)
        print("Loaded {} labels and {} images".format(len(self.labels), len(self.img_locs)))

    def _parse_function_train(self, filename, label):
        """Load one example and apply the training-time augmentation."""
        img_rgb, label = self._preprocess_image_labels(filename, label)
        # Random crop (from 264x264)
        img_rgb = tf.random_crop(img_rgb, [self.height, self.width, 3])
        # Random flip
        img_rgb = tf.image.random_flip_left_right(img_rgb)
        # Channels-first
        img_rgb = tf.transpose(img_rgb, [2, 0, 1])
        return img_rgb, label

    def _parse_function_inference(self, filename, label):
        """Load one example and deterministically resize it (no augmentation)."""
        img_rgb, label = self._preprocess_image_labels(filename, label)
        # Resize to final dimensions
        img_rgb = tf.image.resize_images(img_rgb, [self.height, self.width])
        # Channels-first
        img_rgb = tf.transpose(img_rgb, [2, 0, 1])
        return img_rgb, label

    def _preprocess_image_labels(self, filename, label):
        """Decode a PNG as 3-channel float, centre and scale it; cast the label to float32."""
        # load and preprocess the image
        img_decoded = tf.to_float(tf.image.decode_png(tf.read_file(filename), channels=3))
        img_centered = tf.subtract(img_decoded, self.imagenet_mean)
        img_rgb = img_centered * self.imagenet_scaling
        return img_rgb, tf.cast(label, dtype=tf.float32)

In [13]:
# Split into train/valid/test id sets; these are later passed to XrayData as patient_ids
train_set, valid_set, test_set = get_train_valid_test_split(TOT_PATIENT_NUMBER)

train:21563 valid:3080 test:6162


In [14]:
# Build the three input pipelines with their ops pinned to the CPU
with tf.device('/cpu:0'):
    # Arguments are (img_dir, lbl_file, patient_ids, mode)
    train_dataset = XrayData(IMAGE_FOLDER, LABEL_FILE, train_set, 'training')
    valid_dataset = XrayData(IMAGE_FOLDER, LABEL_FILE, valid_set, 'validation')
    test_dataset = XrayData(IMAGE_FOLDER, LABEL_FILE, test_set, 'testing')

Loaded 87306 labels and 87306 images
Loaded 7616 labels and 7616 images
Loaded 17198 labels and 17198 images


In [15]:
#####################################################################################################
## Helper Functions

In [16]:
def average_gradients(tower_grads):
    """Average the gradients computed by several towers for shared variables.

    Args:
        tower_grads: list with one entry per tower; each entry is a list of
            (gradient, variable) pairs as returned by
            ``optimizer.compute_gradients``. Every tower must list the
            variables in the same order.

    Returns:
        A list of (gradient, variable) pairs, one per variable, where the
        gradient is the element-wise mean over the towers that produced one.
        A variable for which no tower produced a gradient is returned as
        ``(None, variable)``, which ``apply_gradients`` ignores.
    """
    average_grads = []
    # zip(*...) regroups the pairs per variable: ((g_tower0, v), (g_tower1, v), ...)
    for grad_and_vars in zip(*tower_grads):
        # The variable is shared between towers, so the first tower's handle is enough
        shared_var = grad_and_vars[0][1]
        # compute_gradients yields None for variables the loss does not depend on
        grads = [g for g, _ in grad_and_vars if g is not None]
        if not grads:
            average_grads.append((None, shared_var))
            continue
        # Stack along a new leading "tower" axis and average over it
        mean_grad = tf.reduce_mean(tf.stack(grads, axis=0), 0)
        average_grads.append((mean_grad, shared_var))
    return average_grads

In [17]:
def get_symbol(in_tensor, out_features):
    """Build DenseNet-121 on `in_tensor` and return its output flattened to (-1, out_features)."""
    # The network is always built with is_training=True, see
    # https://github.com/tensorflow/models/issues/3556
    nchw_scope = densenet.densenet_arg_scope(data_format="NCHW")
    with slim.arg_scope(nchw_scope):
        net_out, _ = densenet.densenet121(
            in_tensor, num_classes=out_features, is_training=True)
        # Collapse the singleton spatial dims, e.g. (?, 1, 1, 14) -> (?, 14)
        logits = tf.reshape(net_out, shape=[-1, out_features])
    return logits

In [18]:
def model_fn_single(features, labels, mode, params):
    """Estimator model function for single-device training.

    Args:
        features: batch of input images fed to ``get_symbol``.
        labels: batch of multi-label targets (unused in PREDICT mode).
        mode: a ``tf.estimator.ModeKeys`` value.
        params: dict providing "n_classes" and "lr".

    Returns:
        A ``tf.estimator.EstimatorSpec``: sigmoid probabilities in PREDICT
        mode, otherwise loss, train op and a "val_loss" evaluation metric.
    """
    # Logits
    sym = get_symbol(features, out_features=params["n_classes"])
    # Predictions
    predictions = tf.sigmoid(sym)
    # ModeKeys.PREDICT
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    # Optimizer & Loss
    optimizer = tf.train.AdamOptimizer(params['lr'], beta1=0.9, beta2=0.999)
    # sigmoid_cross_entropy applies the sigmoid itself and already returns a scalar
    loss = tf.losses.sigmoid_cross_entropy(labels, sym)
    train_op = optimizer.minimize(loss, tf.train.get_global_step())
    # Create eval metric ops
    # The metric must be computed from the logits: feeding the sigmoid outputs
    # (as was done before) applies the sigmoid twice and reports a wrong loss.
    eval_metric_ops = {"val_loss": slim.metrics.streaming_mean(loss)}

    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        eval_metric_ops=eval_metric_ops)

In [19]:
def multi_gpu_X_y_split(features, labels, batchsize, gpus):
    """Split a batch of features and labels along axis 0 into one chunk per GPU.

    All chunks have ``batchsize // len(gpus)`` rows except the last one, which
    also takes the remainder so that the chunk sizes sum to ``batchsize``.
    Returns ``(features_split, labels_split)``, two lists of tensors.
    """
    n_towers = len(gpus)
    per_tower = batchsize // n_towers
    last_tower = batchsize - per_tower * (n_towers - 1)
    sizes = [per_tower] * (n_towers - 1) + [last_tower]
    # Same size vector for both tensors keeps images and labels aligned
    return tf.split(features, sizes, axis=0), tf.split(labels, sizes, axis=0)

In [20]:
def model_fn_multigpu(features, labels, mode, params):
    """Estimator model function that replicates the model on several GPUs.

    The incoming batch is split into one chunk per GPU, a tower sharing the
    same variables is built on each device, and the per-tower gradients are
    averaged before a single optimizer update.

    Args:
        features: batch of input images.
        labels: batch of multi-label targets (unused in PREDICT mode).
        mode: a ``tf.estimator.ModeKeys`` value.
        params: dict providing "n_classes", "lr", "batchsize" (size of the
            incoming batch) and "gpus" (list of GPU device indices).

    Returns:
        A ``tf.estimator.EstimatorSpec``. In PREDICT mode a single,
        un-replicated model produces sigmoid probabilities. Otherwise the
        spec carries the train op, a "val_loss" metric computed over the
        logits of all towers, and the loss of the last tower built.
    """
    if mode == tf.estimator.ModeKeys.PREDICT:
        # Create symbol
        sym = get_symbol(features, out_features=params["n_classes"])
        # Predictions
        predictions = tf.sigmoid(sym)
        # ModeKeys.PREDICT
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # For multi-gpu split features and labels
    features_split, labels_split = multi_gpu_X_y_split(features, labels, params["batchsize"], params["gpus"])
    tower_grads = []
    eval_logits = []
    # Training operation
    global_step = tf.train.get_global_step()
    # Learning rate comes from params (as in model_fn_single), not from the notebook global
    optimizer = tf.train.AdamOptimizer(params['lr'], beta1=0.9, beta2=0.999)
    # Load model on multiple GPUs
    with tf.variable_scope(tf.get_variable_scope()):
        for i, gpu_id in enumerate(params['gpus']):
            # Place tower i on the device id listed in params['gpus']
            with tf.device('/gpu:%d' % gpu_id), tf.name_scope('%s_%d' % ("classification", i)) as scope:
                # Symbol
                sym = get_symbol(features_split[i], out_features=params["n_classes"])
                # Loss (registered in the LOSSES collection under this tower's name scope)
                tf.losses.sigmoid_cross_entropy(labels_split[i], sym)
                # Training-ops
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope)
                updates_op = tf.group(*update_ops)
                # Make the tower loss depend on this tower's update ops so they run every step
                with tf.control_dependencies([updates_op]):
                    losses = tf.get_collection(tf.GraphKeys.LOSSES, scope)
                    total_loss = tf.add_n(losses, name='total_loss')
                # reuse var: every tower after the first shares the variables created here
                tf.get_variable_scope().reuse_variables()
                # grad compute
                grads = optimizer.compute_gradients(total_loss)
                tower_grads.append(grads)
                eval_logits.append(sym)

    # We must calculate the mean of each gradient
    grads = average_gradients(tower_grads)
    # Apply the gradients to adjust the shared variables.
    apply_gradient_op = optimizer.apply_gradients(grads, global_step=global_step)
    # Group all updates to into a single train op.
    train_op = tf.group(apply_gradient_op)
    # Create eval metric ops (predict on multi-gpu)
    # Concatenating in tower order restores the original batch order of `labels`
    all_logits = tf.concat(eval_logits, 0)
    eval_metric_ops = {"val_loss": slim.metrics.streaming_mean(
        tf.losses.sigmoid_cross_entropy(labels, all_logits))}

    # NOTE: total_loss is the loss of the last tower only
    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        eval_metric_ops=eval_metric_ops)

In [21]:
def train_input_fn():
    """Return the next-batch tensors of the training dataset."""
    iterator = train_dataset.data.make_one_shot_iterator()
    return iterator.get_next()


def valid_input_fn():
    """Return the next-batch tensors of the validation dataset."""
    iterator = valid_dataset.data.make_one_shot_iterator()
    return iterator.get_next()


def test_input_fn():
    """Return the next-batch tensors of the test dataset."""
    iterator = test_dataset.data.make_one_shot_iterator()
    return iterator.get_next()

In [22]:
# Initialise from the downloaded checkpoint every variable whose name does not
# contain "logits" (the classification layer is therefore not warm-started)
ws = tf.estimator.WarmStartSettings(
    ckpt_to_initialize_from=CHKPOINT,
    vars_to_warm_start="^(?!.*(logits))")
# Hyper-parameters handed to the model function through `params`
params = {
    "lr": LR,
    "n_classes": CLASSES,
    "batchsize": BATCHSIZE,
    "gpus": list(range(GPU_COUNT)),
}
# Pick the model function matching the MULTI_GPU toggle
model_fn = model_fn_multigpu if MULTI_GPU else model_fn_single

In [23]:
%%time
# Create Estimator
# No model_dir or config is given, so the default config is used (see the log below)
nn = tf.estimator.Estimator(model_fn=model_fn, params=params, warm_start_from=ws)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_num_ps_replicas': 0, '_model_dir': '/tmp/tmpcfbb3x5w', '_log_step_count_steps': 100, '_master': '', '_is_chief': True, '_keep_checkpoint_every_n_hours': 10000, '_keep_checkpoint_max': 5, '_session_config': None, '_global_id_in_cluster': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f79611c6ef0>, '_task_id': 0, '_tf_random_seed': None, '_num_worker_replicas': 1, '_train_distribute': None, '_task_type': 'worker', '_evaluation_master': '', '_service': None}
CPU times: user 5.5 ms, sys: 162 µs, total: 5.66 ms
Wall time: 5.03 ms


In [24]:
%%time
# Create train & eval specs
# max_steps = EPOCHS x (number of full batches in one pass over the training data)
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                    max_steps=EPOCHS*(train_dataset.data_size//BATCHSIZE))
# Validating after every epoch is hard to arrange here, so throttle_secs was tuned
# by hand to obtain about 5 evaluation runs over the whole training.
# valid_input_fn repeats indefinitely, so the length of each evaluation is bounded
# by EvalSpec's `steps` argument (left at its default).
eval_spec = tf.estimator.EvalSpec(input_fn=valid_input_fn,
                                  throttle_secs=400)

CPU times: user 99 µs, sys: 11 µs, total: 110 µs
Wall time: 115 µs


In [25]:
%%time
# Recorded wall-clock times of this cell:
# 1 GPU - Main training loop: 33min 29s
# 4 GPU - Main training loop: 22min 11s
# Run train and evaluate (on validation data)
tf.estimator.train_and_evaluate(nn, train_spec, eval_spec)

INFO:tensorflow:Running training and evaluation locally (non-distributed).
Trimmed log ...CPU times: user 1h 57min 47s, sys: 9min 51s, total: 2h 7min 38s
Wall time: 22min 11s


In [26]:
%%time
# Recorded results of this cell:
# 1 GPU AUC: 0.8009
# 4 GPU AUC: 0.8120
# Materialise the predictions yielded by nn.predict over the (non-repeating) test set
predictions = list(nn.predict(test_input_fn))
# The test pipeline is not shuffled, so predictions line up with the stored labels
y_truth = test_dataset.labels
y_guess = np.array(predictions)
print("Test AUC: {0:.4f}".format(compute_roc_auc(y_truth, y_guess, CLASSES))) 

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpcfbb3x5w/model.ckpt-1705
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Full AUC [0.8113181918227069, 0.8603255067467157, 0.7916277297097226, 0.8790063417235601, 0.8791099267683136, 0.9115475111698327, 0.7023541384026243, 0.858907165510939, 0.6190543626664032, 0.8471454439175482, 0.7650787561316246, 0.7907428147497167, 0.7714048056837656, 0.8798529046882989]
Test AUC: 0.8120
CPU times: user 2min 58s, sys: 12.9 s, total: 3min 11s
Wall time: 32.1 s


In [27]:
#####################################################################################################
## Synthetic Data (Pure Training)

In [28]:
# Test on fake-data -> no IO lag
# Round the dataset size down to a whole number of batches
batch_in_epoch = train_dataset.data_size//BATCHSIZE
tot_num = batch_in_epoch * BATCHSIZE
# Uniform random values in [0, 1): channels-first images (N, 3, 224, 224) and (N, CLASSES) targets
# NOTE(review): 224 is hard-coded; presumably equal to HEIGHT/WIDTH - confirm
fake_X = np.random.rand(tot_num, 3, 224, 224).astype(np.float32)
fake_y = np.random.rand(tot_num, CLASSES).astype(np.float32) 

In [29]:
%%time
# Create Estimator
# Rebinds `nn` to a fresh Estimator without warm_start_from (no checkpoint initialisation)
nn = tf.estimator.Estimator(model_fn=model_fn, params=params)  

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_num_ps_replicas': 0, '_model_dir': '/tmp/tmptsc3de0g', '_log_step_count_steps': 100, '_master': '', '_is_chief': True, '_keep_checkpoint_every_n_hours': 10000, '_keep_checkpoint_max': 5, '_session_config': None, '_global_id_in_cluster': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f796293a160>, '_task_id': 0, '_tf_random_seed': None, '_num_worker_replicas': 1, '_train_distribute': None, '_task_type': 'worker', '_evaluation_master': '', '_service': None}
CPU times: user 4.85 ms, sys: 54 µs, total: 4.91 ms
Wall time: 4.14 ms


In [30]:
%%time
# Recorded wall-clock times of this cell:
# 1 GPU - Synthetic data: 25min 25s 
# 4 GPU - Synthetic data: 13min 55s
# Train on the in-memory synthetic arrays for EPOCHS passes, unshuffled, in batches of BATCHSIZE
nn.train(tf.estimator.inputs.numpy_input_fn(
    fake_X,
    fake_y,
    shuffle=False,
    num_epochs=EPOCHS,
    batch_size=BATCHSIZE))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmptsc3de0g/model.ckpt.
INFO:tensorflow:loss = 0.76794666, step = 0
INFO:tensorflow:global_step/sec: 1.9302
INFO:tensorflow:loss = 0.6935522, step = 100 (51.812 sec)
INFO:tensorflow:global_step/sec: 2.44545
INFO:tensorflow:loss = 0.69338816, step = 200 (40.893 sec)
INFO:tensorflow:global_step/sec: 2.39242
INFO:tensorflow:loss = 0.69193834, step = 300 (41.798 sec)
INFO:tensorflow:global_step/sec: 2.40031
INFO:tensorflow:loss = 0.6942548, step = 400 (41.661 sec)
INFO:tensorflow:global_step/sec: 2.3377
INFO:tensorflow:loss = 0.6941431, step = 500 (42.778 sec)
INFO:tensorflow:global_step/sec: 2.34509
INFO:tensorflow:loss = 0.6940154, step = 600 (42.642 sec)
INFO:tensorflow:global_step/sec: 2.31676
INFO:tens

<tensorflow.python.estimator.estimator.Estimator at 0x7f796293a128>