In [1]:
import os
import sys
import time
import multiprocessing
import logging
import numpy as np
import pandas as pd
import mxnet as mx
from mxnet.io import DataDesc
from mxnet.gluon.model_zoo import vision as models
from mxnet import nd
from sklearn.metrics.ranking import roc_auc_score
from sklearn.model_selection import train_test_split
from PIL import Image
from common.utils import *

%load_ext autoreload
%autoreload 2

In [2]:
print("OS: ", sys.platform)
print("Python: ", sys.version)
print("MXNet: ", mx.__version__)
print("Numpy: ", np.__version__)
print("GPU: ", get_gpu_name())
print(get_cuda_version())
print("CuDNN Version ", get_cudnn_version())
CPU_COUNT = multiprocessing.cpu_count()
print("CPUs: ", CPU_COUNT)

OS:  linux
Python:  3.5.4 |Anaconda custom (64-bit)| (default, Nov  3 2017, 20:01:27) 
[GCC 7.2.0]
MXNet:  0.12.0
Numpy:  1.13.3
GPU:  ['Tesla P100-PCIE-16GB', 'Tesla P100-PCIE-16GB', 'Tesla P100-PCIE-16GB', 'Tesla P100-PCIE-16GB']
CUDA Version 8.0.61
CuDNN Version  6.0.21
CPUs:  24


In [3]:
# User-set
# Note if NUM_GPUS > 1 then MULTI_GPU = True and ALL GPUs will be used
# Set below to affect batch-size
# E.g. 1 GPU = 64, 2 GPUs = 64*2, 4 GPUs = 64*4
# Note that the effective learning-rate will be decreased this way
NUM_GPUS = 4 # Scaling factor for batch
MULTI_GPU=NUM_GPUS>1

In [4]:
# Globals
CLASSES = 14
WIDTH = 224
HEIGHT = 224
CHANNELS = 3
LR = 0.0001*NUM_GPUS  # Effective learning-rate will decrease as BATCHSIZE rises
EPOCHS = 5
BATCHSIZE = 64*NUM_GPUS
IMAGENET_RGB_MEAN = [0.485, 0.456, 0.406]
IMAGENET_RGB_SD = [0.229, 0.224, 0.225]
TOT_PATIENT_NUMBER = 30805  # From data

In [5]:
# Paths
CSV_DEST = "chestxray"
IMAGE_FOLDER = os.path.join(CSV_DEST, "images")
LABEL_FILE = os.path.join(CSV_DEST, "Data_Entry_2017.csv")
TRAIN_LST = os.path.join(CSV_DEST, "train.lst")
VALID_LST = os.path.join(CSV_DEST, "valid.lst")
TEST_LST = os.path.join(CSV_DEST, "test.lst")


In [6]:
%%time
# Download data
print("Please make sure to download")
print("https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux#download-and-install-azcopy")
download_data_chextxray(CSV_DEST)

Please make sure to download
https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux#download-and-install-azcopy
Data already exists
CPU times: user 537 ms, sys: 221 ms, total: 758 ms
Wall time: 757 ms


## Data prep
https://github.com/apache/incubator-mxnet/issues/1480


In [7]:
df = pd.read_csv(LABEL_FILE)
df.head()    

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


In [10]:
def data_prep(df, img_dir, patient_ids):
    # Split labels on unfiltered data
    df_label = df['Finding Labels'].str.split(
        '|', expand=False).str.join(sep='*').str.get_dummies(sep='*')

    # Filter by patient-ids (both)
    df_label['Patient ID'] = df['Patient ID']
    df_label = df_label[df_label['Patient ID'].isin(patient_ids)]
    df = df[df['Patient ID'].isin(patient_ids)]
    # Remove unncessary columns
    df_label.drop(['Patient ID','No Finding'], axis=1, inplace=True)  

    # List of images (full-path)
    img_locs =  df['Image Index'].map(lambda im: os.path.join(img_dir, im)).values
    # One-hot encoded labels (float32 for BCE loss)
    df_label['Image_path'] = img_locs
    return df_label


In [14]:
# Training / Valid / Test split (70% / 10% / 20%)
train_set, valid_set, test_set = get_train_valid_test_split(TOT_PATIENT_NUMBER)
df_train = data_prep(df, IMAGE_FOLDER, train_set)
df_valid = data_prep(df, IMAGE_FOLDER, valid_set)
df_test = data_prep(df, IMAGE_FOLDER, test_set)
print(df_train.shape)
print(df_valid.shape)
print(df_test.shape)

train:21563 valid:3080 test:6162
(87306, 15)
(7616, 15)
(17198, 15)


In [15]:
df_train.to_csv(TRAIN_LST, sep='\t', header=False)
df_valid.to_csv(VALID_LST, sep='\t', header=False)
df_test.to_csv(TEST_LST, sep='\t', header=False)

In [24]:
!/anaconda/envs/py35/bin/python ./common/im2rec.py chestxray/train.lst $PWD --resize 250 --center-crop --quality 100 --num-thread 24
!/anaconda/envs/py35/bin/python ./common/im2rec.py chestxray/valid.lst $PWD --resize 250 --center-crop --quality 100 --num-thread 24
!/anaconda/envs/py35/bin/python ./common/im2rec.py chestxray/test.lst $PWD --resize 250 --center-crop --quality 100 --num-thread 24

Creating .rec file from /home/hoaphumanoid/notebooks/repos/DeepLearningFrameworks/notebooks/chestxray/train.lst in /home/hoaphumanoid/notebooks/repos/DeepLearningFrameworks/notebooks/chestxray
time: 6.997234344482422  count: 0
time: 0.7863016128540039  count: 1000
time: 0.7199516296386719  count: 2000
time: 0.7553024291992188  count: 3000
time: 0.875377893447876  count: 4000
time: 0.6988627910614014  count: 5000
time: 0.8525941371917725  count: 6000
time: 0.865654468536377  count: 7000
time: 0.7837262153625488  count: 8000
time: 0.846381664276123  count: 9000
time: 1.3273906707763672  count: 10000
time: 0.3824479579925537  count: 11000
time: 0.7547008991241455  count: 12000
time: 0.7022714614868164  count: 13000
time: 0.6992290019989014  count: 14000
time: 0.6906943321228027  count: 15000
time: 0.7144792079925537  count: 16000
time: 0.7061898708343506  count: 17000
time: 0.7346706390380859  count: 18000
time: 0.8415727615356445  count: 19000
time: 0.8398227691650391  count: 20000
time:

## Data Loading
https://mxnet.incubator.apache.org/architecture/note_data_loading.html#mxnet-io-python-interface

https://github.com/miraclewkf/multilabel-MXNet/blob/master/train_multilabel.py

In [25]:
#https://mxnet.incubator.apache.org/versions/0.11.0/api/python/io.html#mxnet.io.ImageRecordIter
train = mx.io.ImageRecordIter(
    path_imgrec = "chestxray/train.rec",
    data_shape = (3, HEIGHT, WIDTH),
    path_imglist = "chestxray/train.lst",
    label_width = CLASSES,
    batch_size = BATCHSIZE,
    shuffle = True,
    mean_r = IMAGENET_RGB_MEAN[0],
    mean_g = IMAGENET_RGB_MEAN[1],
    mean_b = IMAGENET_RGB_MEAN[2],
    std_r = IMAGENET_RGB_SD[0],
    std_g = IMAGENET_RGB_SD[1],
    std_b = IMAGENET_RGB_SD[2],
    rand_crop = 1,
    rand_mirror = 1, #flip horizontally
    max_rotate_angle = 10,
    preprocess_threads = 24
)

valid = mx.io.ImageRecordIter(
    path_imgrec = "chestxray/valid.rec",
    data_shape = (3, HEIGHT, WIDTH),
    path_imglist = "chestxray/valid.lst",
    label_width = CLASSES,
    batch_size = BATCHSIZE,
    shuffle = False,
    rand_crop = 0,
    rand_mirror = 0, #flip horizontally
    preprocess_threads = 24
)


test = mx.io.ImageRecordIter(
    path_imgrec = "chestxray/test.rec",
    data_shape = (3, HEIGHT, WIDTH),
    path_imglist = "chestxray/test.lst",
    label_width = CLASSES,
    batch_size = BATCHSIZE,
    shuffle = False,
    rand_mirror = 0, #flip horizontally
    round_batch = False,
    preprocess_threads = 24
)


## Helper Functions

In [26]:
#https://github.com/miraclewkf/multilabel-MXNet/blob/master/crossentropy.py
class CrossEntropyLoss(mx.operator.CustomOp):
    """An output layer that calculates gradient for cross-entropy loss
    y * log(p) + (1-y) * log(p)
    for label "y" and prediction "p".
    However, the output of this layer is the original prediction -- same as
    the "data" input, making it useful for tasks like "predict".
    If you actually want to use the calculated loss, see CrossEntropyLoss op.

    This is useful for multi-label prediction where each possible output
    label is considered independently.
    Cross-entropy loss provides a very large penalty for guessing
    the wrong answer (0 or 1) confidently.
    The gradient calculation is optimized for y only being 0 or 1.
    """

    eps = 1e-6 # Avoid -inf when taking log(0)
    eps1 = 1. + eps
    eps_1 = 1. - eps

    def forward(self, is_train, req, in_data, out_data, aux):
        # Shapes:
        #  b = minibatch size
        #  d = number of dimensions
        actually_calculate_loss = False
        if actually_calculate_loss:
            p = in_data[0]  # shape=(b,d)
            y = in_data[1]
            out = y * nd.log(p+self.eps) + (1.-y) * nd.log((self.eps1) - p)
            self.assign(out_data[0], req[0], out)
        else:
            # Just copy the predictions forward
            self.assign(out_data[0], req[0], in_data[0])


    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        self.approx_backward(req, out_grad, in_data, out_data, in_grad, aux)
        #self.exact_backward(req, out_grad, in_data, out_data, in_grad, aux)

    def approx_backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        """Correct grad = (y-p)/(p-p^2)
        But if y is just 1 or 0, then this simplifies to
        grad = 1/(p-1+y)
        which is more numerically stable
        """
        p = in_data[0]  # shape=(b,d)
        y = in_data[1]
        grad = -1. / (p - self.eps_1 + y)
        self.assign(in_grad[0], req[0], grad)


    def exact_backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        """grad = (y-p)/(p-p^2)
        """
        p = in_data[0] # shape=(b,d)
        y = in_data[1]  # seems right
        grad = (p - y) / ((p+self.eps) * (self.eps1 - p))
        self.assign(in_grad[0], req[0], grad)


@mx.operator.register("CrossEntropyLoss")
class CrossEntropyProp(mx.operator.CustomOpProp):
    def __init__(self):
        super(CrossEntropyProp, self).__init__(need_top_grad=False)

    def list_arguments(self):
        return ['data','label']

    def list_outputs(self):
        return ['preds']

    def create_operator(self, ctx, shapes, dtypes):
        return CrossEntropyLoss()

    def infer_shape(self, in_shape):
        if in_shape[0] != in_shape[1]:
            raise ValueError("Input shapes differ. data:%s. label:%s. must be same"
                    % (str(in_shape[0]),str(in_shape[1])))
        output_shape = in_shape[0]
        return in_shape, [output_shape], []

In [27]:
#https://hackernoon.com/transfer-learning-with-mxnet-gluon-8203005afafe
#http://mxnet.incubator.apache.org/faq/finetune.html
def get_symbol(model_name='densenet121', out_features=CLASSES):
    if model_name == 'densenet121':
        #models: https://github.com/miraclewkf/DenseNet
        get_mxnet_model('https://migonzastorage.blob.core.windows.net/deep-learning/models/mxnet/densenet-121', 0)
        sym, arg_params, aux_params = mx.model.load_checkpoint('densenet-121', 0)
        layer_name='pool5'
    elif model_name == 'resnet50':
        get_mxnet_model('http://data.mxnet.io/models/imagenet/resnet/50-layers/resnet-50', 0)
        sym, arg_params, aux_params = mx.model.load_checkpoint('resnet-50', 0)
        layer_name='flatten0'
    else:
        raise ValueError("Unknown model-name")
    all_layers = sym.get_internals()
    sym = all_layers[layer_name+'_output']
    sym = mx.symbol.FullyConnected(data=sym, num_hidden=out_features, name='fc1')
    sym = mx.symbol.sigmoid(data=sym, name='sig')
    sym = mx.symbol.Custom(data=sym, name='softmax', op_type='CrossEntropyLoss')
    #sym = mx.symbol.SoftmaxOutput(data=sym, name='softmax')
    #sym = mx.symbol.softmax_cross_entropy(data=sym, name='softmax')
    #sym = mx.symbol.LogisticRegressionOutput(data=sym, name='softmax')#https://github.com/apache/incubator-mxnet/issues/1758
    new_args = dict({k:arg_params[k] for k in arg_params if 'fc1' not in k})
    return sym, new_args, aux_params

In [28]:
def init_symbol(sym, lr=LR, gpus=NUM_GPUS, batch_size=BATCHSIZE, epochs=EPOCHS, 
                num_examples=TOT_PATIENT_NUMBER, step=[5,10], num_class=CLASSES):    
    devs = [mx.gpu(i) for i in range(gpus)]   
    model = mx.mod.Module(
        context       = devs,
        symbol        = sym
    )
    model.bind(data_shapes=[('data', (batch_size, 3, HEIGHT, WIDTH))],
             label_shapes=[('softmax_label', (batch_size, num_class))])
    #Criterion
    def acc(label, pred, label_width = num_class):
        return float((label == np.round(pred)).sum()) / label_width / pred.shape[0]

#    def loss(label, pred):
#        loss_all = 0
#        for i in range(len(pred)):
#            loss = 0
#            loss -= label[i] * np.log(pred[i] + 1e-6) + (1.- label[i]) * np.log(1. + 1e-6 - pred[i])
#            loss_all += np.sum(loss)
#        loss_all = float(loss_all)/float(len(pred) + 0.000001)
#        return loss_all
    
    cri = list()
    cri.append(mx.metric.np(acc))
#    cri.append(mx.metric.np(loss))
    
    #Scheduler
    def multi_factor_scheduler(begin_epoch, epoch_size, step=step, factor=0.1):
        step_ = [epoch_size * (x-begin_epoch) for x in step if x-begin_epoch > 0]
        return mx.lr_scheduler.MultiFactorScheduler(step=step_, factor=factor) if len(step_) else None

    epoch_size = max(int(num_examples / batch_size), 1)
    sch=multi_factor_scheduler(epochs, epoch_size)
    
    return model, cri, sch 

In [29]:
sym, arg_params, aux_params = get_symbol()

In [30]:
model, criterion, scheduler = init_symbol(sym, num_examples=df_train.shape[0])

In [31]:
%%time
head = '%(asctime)-15s %(message)s'
logging.basicConfig(level=logging.DEBUG, format=head)

optimizer_params = {
        'learning_rate': LR,
        'lr_scheduler': scheduler,
}

model.fit(train,
          valid,
          num_epoch=EPOCHS,
          arg_params=arg_params,
          aux_params=aux_params,
          allow_missing=True,
          batch_end_callback = mx.callback.Speedometer(BATCHSIZE, BATCHSIZE),
          kvstore='device',
          optimizer='adam',
          optimizer_params = optimizer_params,
          eval_metric=criterion
)



2018-03-21 17:11:45,839 Already bound, ignoring bind()
2018-03-21 17:13:55,247 Epoch[0] Batch [256]	Speed: 523.05 samples/sec	acc=0.949326
2018-03-21 17:14:36,767 Epoch[0] Train-acc=0.940471
2018-03-21 17:14:36,774 Epoch[0] Time cost=170.644
2018-03-21 17:14:41,630 Epoch[0] Validation-acc=0.952548
2018-03-21 17:16:46,726 Epoch[1] Batch [256]	Speed: 524.79 samples/sec	acc=0.951888
2018-03-21 17:17:27,729 Epoch[1] Train-acc=0.940795
2018-03-21 17:17:27,732 Epoch[1] Time cost=166.100
2018-03-21 17:17:32,379 Epoch[1] Validation-acc=0.952409
2018-03-21 17:19:37,502 Epoch[2] Batch [256]	Speed: 524.91 samples/sec	acc=0.952049
2018-03-21 17:20:18,257 Epoch[2] Train-acc=0.940984
2018-03-21 17:20:18,262 Epoch[2] Time cost=165.879
2018-03-21 17:20:22,919 Epoch[2] Validation-acc=0.952214
2018-03-21 17:22:28,507 Epoch[3] Batch [256]	Speed: 522.95 samples/sec	acc=0.952339
2018-03-21 17:23:09,102 Epoch[3] Train-acc=0.941234
2018-03-21 17:23:09,105 Epoch[3] Time cost=166.184
2018-03-21 17:23:13,665 Ep

CPU times: user 1h 2min 33s, sys: 19min, total: 1h 21min 34s
Wall time: 14min 18s


## Test CheXNet

In [32]:
y_guess = model.predict(test)
print(y_guess.shape)

(17408, 14)


In [33]:
def get_labels(iterator, shape):
    iterator.reset()
    data = np.zeros(shape, dtype=np.float32)
    batch_size = iterator.batch_size
    c = 0
    for batch in iterator:
        label = batch.label
        data[c*batch_size:(c+1)*batch_size] = label[0].asnumpy()
        c += 1
    return data    

In [34]:
labels = get_labels(test, y_guess.shape)
print(labels.shape)

(17408, 14)


In [35]:
def compute_roc_auc(data_gt, data_pd, full=True, classes=CLASSES):
    roc_auc = []
    for i in range(classes):
        roc_auc.append(roc_auc_score(data_gt[:, i], data_pd[:, i]))
    print("Full AUC", roc_auc)
    roc_auc = np.mean(roc_auc)
    return roc_auc

In [36]:
print("Validation AUC: {0:.4f}".format(compute_roc_auc(labels, y_guess.asnumpy())))

Full AUC [0.76038465872124505, 0.79686609686609688, 0.75042660146444251, 0.85506960094684126, 0.82954897183148457, 0.7885669844379638, 0.6842608649111539, 0.68850402272908318, 0.58971309413438378, 0.70868490148925112, 0.65033492409468974, 0.68877773703647227, 0.67590832115717325, 0.80016679386145784]
Validation AUC: 0.7334
