# MXNet Gluon Multi-GPU

In [2]:
import os
import sys
import multiprocessing
import logging
import numpy as np
import pandas as pd

import mxnet as mx
from mxnet.io import DataDesc
from mxnet import nd, gluon, autograd
from mxnet.gluon.data import RecordFileDataset, ArrayDataset, Dataset
from mxnet.gluon.data.vision import transforms
from mxnet.gluon.data.vision.datasets import ImageFolderDataset
from mxnet.gluon.data.dataloader import DataLoader
from mxnet.gluon.model_zoo import vision as models
from mxnet import recordio

from sklearn.metrics.ranking import roc_auc_score
from sklearn.model_selection import train_test_split
from PIL import Image
from common.utils import *
from common.params_dense import *
import math
from time import time

%load_ext autoreload
%autoreload 2

In [3]:
print("OS: ", sys.platform)
print("Python: ", sys.version)
print("Numpy: ", np.__version__)
print("MXNet: ", mx.__version__)
print("GPU: ", get_gpu_name())
print(get_cuda_version())
print("CuDNN Version ", get_cudnn_version())

OS:  linux
Python:  3.6.3 |Anaconda custom (64-bit)| (default, Oct 13 2017, 12:02:49) 
[GCC 7.2.0]
Numpy:  1.13.3
MXNet:  1.3.0
GPU:  ['Tesla V100-SXM2-16GB', 'Tesla V100-SXM2-16GB', 'Tesla V100-SXM2-16GB', 'Tesla V100-SXM2-16GB']
CUDA Version 9.1.85
CuDNN Version  7.1.3


In [4]:
# User-set
# Note if NUM_GPUS > 1 then MULTI_GPU = True and ALL GPUs will be used
# Set below to affect batch-size
# E.g. 1 GPU = 64, 2 GPUs =64*2, 4 GPUs = 64*4
# Note that the effective learning-rate will be decreased this way
CPU_COUNT = multiprocessing.cpu_count() 
GPU_COUNT = len(get_gpu_name())
MULTI_GPU = GPU_COUNT > 1
print("CPUs: ", CPU_COUNT)
print("GPUs: ", GPU_COUNT)

CPUs:  32
GPUs:  4


In [5]:
# Manually scale to multi-gpu
if MULTI_GPU:
    LR *= GPU_COUNT
    BATCHSIZE *= (GPU_COUNT)
    BATCHSIZE = BATCHSIZE//GPU_COUNT*GPU_COUNT

## Data Download

In [6]:
# Model-params
# Paths
CSV_DEST = "/data/chestxray"
IMAGE_FOLDER = os.path.join(CSV_DEST, "images")
LABEL_FILE = os.path.join(CSV_DEST, "Data_Entry_2017.csv")

In [7]:
%%time
# Download data
print("Please make sure to download")
print("https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux#download-and-install-azcopy")
download_data_chextxray(CSV_DEST)

Please make sure to download
https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux#download-and-install-azcopy
Data already exists
CPU times: user 440 ms, sys: 176 ms, total: 616 ms
Wall time: 617 ms


## Data prep
https://github.com/apache/incubator-mxnet/issues/1480


In [8]:
train_set, valid_set, test_set = get_train_valid_test_split(TOT_PATIENT_NUMBER)

train:21563 valid:3080 test:6162


## Data Loading

### Creating the datasets

In [9]:
class XrayData(Dataset):
    def __init__(self, img_dir, lbl_file, patient_ids, transform=None):
        
        self.img_locs, self.labels = get_imgloc_labels(img_dir, lbl_file, patient_ids)
        self.transform = transform
        print("Loaded {} labels and {} images".format(len(self.labels), len(self.img_locs)))
    
    def __getitem__(self, idx):
        im_file = self.img_locs[idx]
        im_rgb = Image.open(im_file)
        label = self.labels[idx]
        im_rgb = mx.nd.array(im_rgb)
        if self.transform is not None:
            im_rgb = self.transform(im_rgb)

        return im_rgb, mx.nd.array(label)
        
    def __len__(self):
        return len(self.img_locs)

In [10]:
def no_augmentation_dataset(img_dir, lbl_file, patient_ids, normalize):
    dataset = XrayData(img_dir, lbl_file, patient_ids,
                       transform=transforms.Compose([
                           transforms.Resize(WIDTH),
                           transforms.ToTensor(),  
                           transforms.Normalize(IMAGENET_RGB_MEAN, IMAGENET_RGB_SD)]))
    return dataset

In [11]:
# Dataset for training
train_dataset = XrayData(img_dir=IMAGE_FOLDER,
                         lbl_file=LABEL_FILE,
                         patient_ids=train_set,
                         transform=transforms.Compose([
                             transforms.RandomResizedCrop(size=WIDTH),
                             transforms.RandomFlipLeftRight(),
                             transforms.ToTensor(),
                             transforms.Normalize(IMAGENET_RGB_MEAN, IMAGENET_RGB_SD)]))

Loaded 87306 labels and 87306 images


In [12]:
valid_dataset = no_augmentation_dataset(IMAGE_FOLDER, LABEL_FILE, valid_set, transforms.Normalize(IMAGENET_RGB_MEAN, IMAGENET_RGB_SD))
test_dataset = no_augmentation_dataset(IMAGE_FOLDER, LABEL_FILE, test_set, transforms.Normalize(IMAGENET_RGB_MEAN, IMAGENET_RGB_SD))

Loaded 7616 labels and 7616 images
Loaded 17198 labels and 17198 images


In [21]:
# DataLoaders
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCHSIZE,
                          shuffle=True, num_workers=6, last_batch='discard')
valid_loader = DataLoader(dataset=valid_dataset, batch_size=BATCHSIZE,
                          shuffle=False, num_workers=6, last_batch='discard')
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCHSIZE,
                         shuffle=False, num_workers=6, last_batch='discard')

## Creating the network

### Loading the pretrained model

In [14]:
ctx = [mx.gpu(i) for i in range(GPU_COUNT)]   

In [15]:
net = mx.gluon.model_zoo.vision.densenet121(pretrained=True, ctx=ctx)
with net.name_scope():
    net.output = mx.gluon.nn.Dense(CLASSES)
net.output.initialize(ctx=ctx)
net.hybridize()

## Trainer

In [16]:
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': LR})

## Loss 

In [17]:
binary_cross_entropy = gluon.loss.SigmoidBinaryCrossEntropyLoss()

## Output

In [18]:
sig = gluon.nn.Activation('sigmoid')

## Evaluation loop

In [22]:
def evaluate_accuracy(data_iterator, net):
    acc = 0
    for i, (data, label) in enumerate(data_iterator):
        data_split = gluon.utils.split_and_load(data, ctx)
        label_split = gluon.utils.split_and_load(label, ctx)
        outputs = [(sig(net(X)),Y) for X, Y in zip(data_split, label_split)]
        for output, label in outputs:
            acc += float((label.asnumpy() == np.round(output.asnumpy())).sum()) / CLASSES / output.shape[0]
    data_split = gluon.utils.split_and_load(data, [mx.cpu()])
    label_split = gluon.utils.split_and_load(label, [mx.cpu()])
    return acc/i/len(ctx)

## Training loop

In [23]:
n_batch = 5 # Blocking call every 5 batches
n_print = 100 # Print every 100 batches

In [24]:
def train_epoch(net, dataloader, trainer, loss_fn, ctx, n_batch=7, n_print=100):
    losses_acc = [mx.nd.zeros((1), ctx=c) for c in ctx]
    print_loss = 0
    for i, (data, label) in enumerate(dataloader):        
        data_split = gluon.utils.split_and_load(data, ctx)
        label_split = gluon.utils.split_and_load(label, ctx)  
        
        if i > 0:
            for j, l in enumerate(losses):
                # Accumulate losses asynchronously on each GPU
                losses_acc[j] += l.mean()
            if i%n_batch == 0:
                # Blocking call
                print_loss = sum(losses_acc).asscalar()/i/len(ctx)
            if i%n_print == 0:
                print('Batch {0}: Loss: {1:.4f}'.format(i, print_loss))            
            
        with autograd.record():
            losses = [loss_fn(net(X), Y) for X, Y in zip(data_split, label_split)]
        for l in losses:
            l.backward()
        trainer.step(data.shape[0]) 

In [25]:
%%time
for e in range(EPOCHS):
    tick = time()
    train_epoch(net, train_loader, trainer, binary_cross_entropy, ctx)
    test_accuracy = evaluate_accuracy(valid_loader, net)
    print('Epoch {0}, {1:.6f} test_accuracy after {2:.2f} seconds'.format(e, test_accuracy, time()-tick))

Batch 100: Loss: 0.2135
Batch 200: Loss: 0.1906
Batch 300: Loss: 0.1821
Epoch 0, 0.959542 test_accuracy after 107.81 seconds
Batch 100: Loss: 0.1616
Batch 200: Loss: 0.1613
Batch 300: Loss: 0.1603
Epoch 1, 0.982243 test_accuracy after 100.29 seconds
Batch 100: Loss: 0.1568
Batch 200: Loss: 0.1572
Batch 300: Loss: 0.1564
Epoch 2, 0.985999 test_accuracy after 100.09 seconds
Batch 100: Loss: 0.1556
Batch 200: Loss: 0.1545
Batch 300: Loss: 0.1544
Epoch 3, 0.985760 test_accuracy after 100.09 seconds
Batch 100: Loss: 0.1523
Batch 200: Loss: 0.1527
Batch 300: Loss: 0.1531
Epoch 4, 0.985132 test_accuracy after 100.26 seconds
CPU times: user 30min 15s, sys: 11min 58s, total: 42min 14s
Wall time: 8min 28s


## Evaluate

In [26]:
%%time
predictions = np.zeros((0, CLASSES))
labels = np.zeros((0, CLASSES))
for (data, label) in (test_loader):        
    data_split = gluon.utils.split_and_load(data, ctx)
    label_split = gluon.utils.split_and_load(label, ctx)  
    outputs = [sig(net(X)) for X in data_split]
    predictions = np.concatenate([predictions, np.concatenate([output.asnumpy() for output in outputs])])
    labels = np.concatenate([labels, np.concatenate([label.asnumpy() for label in label_split])])

CPU times: user 18.9 s, sys: 8.77 s, total: 27.6 s
Wall time: 14.8 s


In [27]:
print("Validation AUC: {0:.4f}".format(compute_roc_auc(labels, predictions, CLASSES)))

Full AUC [0.8119550768682271, 0.86638286690232225, 0.79959289382709886, 0.88130488886646741, 0.88259198273314099, 0.93064172059202255, 0.74659162250557209, 0.85676299643367249, 0.62677039835496062, 0.84243622250569472, 0.7539441086823172, 0.81413582659915884, 0.74745772595671178, 0.88639745267190717]
Validation AUC: 0.8176


## Synthetic Data (Pure Training)

In [28]:
# Test on fake-data -> no IO lag
batch_in_epoch = len(train_dataset.labels)//BATCHSIZE
tot_num = batch_in_epoch * BATCHSIZE
print(tot_num)

87296


In [29]:
fake_X = mx.nd.ones((tot_num, 3, 224, 224), dtype=np.float32)
fake_y = mx.nd.ones((tot_num, CLASSES), dtype=np.float32)

In [30]:
train_dataset_synth = ArrayDataset(fake_X, fake_y)
train_dataloader_synth = DataLoader(train_dataset_synth, BATCHSIZE, shuffle=False, num_workers=0, last_batch='discard')

In [31]:
%%time
n_batch = 100
for e in range(EPOCHS):
    tick = time()
    train_epoch(net, train_dataloader_synth, trainer, binary_cross_entropy, ctx)
    nd.waitall()
    print('Epoch {0}, {1:.2f} seconds'.format(e, time()-tick))

Batch 100: Loss: 0.1683
Batch 200: Loss: 0.0842
Batch 300: Loss: 0.0561
Epoch 0, 85.67 seconds
Batch 100: Loss: 0.0000
Batch 200: Loss: 0.0000
Batch 300: Loss: 0.0000
Epoch 1, 85.55 seconds
Batch 100: Loss: 0.0000
Batch 200: Loss: 0.0000
Batch 300: Loss: 0.0000
Epoch 2, 85.53 seconds
Batch 100: Loss: 0.0000
Batch 200: Loss: 0.0000
Batch 300: Loss: 0.0000
Epoch 3, 85.72 seconds
Batch 100: Loss: 0.0000
Batch 200: Loss: 0.0000
Batch 300: Loss: 0.0000
Epoch 4, 85.43 seconds
CPU times: user 27min 53s, sys: 11min 22s, total: 39min 15s
Wall time: 7min 7s
