In [20]:
import os
import sys
import time
import multiprocessing
import logging
import numpy as np
import pandas as pd
import mxnet as mx
from mxnet.io import DataDesc
from mxnet.gluon.model_zoo import vision as models
from mxnet import nd
from sklearn.metrics.ranking import roc_auc_score
from sklearn.model_selection import train_test_split
from PIL import Image
from common.utils import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
print("OS: ", sys.platform)
print("Python: ", sys.version)
print("MXNet: ", mx.__version__)
print("Numpy: ", np.__version__)
print("GPU: ", get_gpu_name())
print(get_cuda_version())
print("CuDNN Version ", get_cudnn_version())
CPU_COUNT = multiprocessing.cpu_count()
print("CPUs: ", CPU_COUNT)

OS:  linux
Python:  3.5.4 |Anaconda custom (64-bit)| (default, Nov  3 2017, 20:01:27) 
[GCC 7.2.0]
MXNet:  0.12.0
Numpy:  1.13.3
GPU:  ['Tesla P100-PCIE-16GB', 'Tesla P100-PCIE-16GB', 'Tesla P100-PCIE-16GB', 'Tesla P100-PCIE-16GB']
CUDA Version 8.0.61
CuDNN Version  6.0.21
CPUs:  24


In [5]:
# User-set
# Note if NUM_GPUS > 1 then MULTI_GPU = True and ALL GPUs will be used
# Set below to affect batch-size
# E.g. 1 GPU = 64, 2 GPUs = 64*2, 4 GPUs = 64*4
# Note that the effective learning-rate will be decreased this way
NUM_GPUS = 4 # Scaling factor for batch
MULTI_GPU=NUM_GPUS>1

In [6]:
# Globals
CLASSES = 14
WIDTH = 224
HEIGHT = 224
CHANNELS = 3
LR = 0.0001*NUM_GPUS  # Effective learning-rate will decrease as BATCHSIZE rises
EPOCHS = 5
BATCHSIZE = 64*NUM_GPUS
IMAGENET_RGB_MEAN = [0.485, 0.456, 0.406]
IMAGENET_RGB_SD = [0.229, 0.224, 0.225]
TOT_PATIENT_NUMBER = 30805  # From data

In [7]:
# Paths
CSV_DEST = "chestxray"
IMAGE_FOLDER = os.path.join(CSV_DEST, "images")
LABEL_FILE = os.path.join(CSV_DEST, "Data_Entry_2017.csv")
TRAIN_LST = os.path.join(CSV_DEST, "train.lst")
VALID_LST = os.path.join(CSV_DEST, "valid.lst")
TEST_LST = os.path.join(CSV_DEST, "test.lst")


In [8]:
%%time
# Download data
print("Please make sure to download")
print("https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux#download-and-install-azcopy")
download_data_chextxray(CSV_DEST)

Please make sure to download
https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux#download-and-install-azcopy
Data already exists
CPU times: user 598 ms, sys: 654 ms, total: 1.25 s
Wall time: 17.9 s


## Data prep
https://github.com/apache/incubator-mxnet/issues/1480


In [9]:
df = pd.read_csv(LABEL_FILE)
df.head()    

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


In [10]:
# Split labels on unfiltered data
df_label = df['Finding Labels'].str.split('|', expand=False).str.join(sep='*').str.get_dummies(sep='*')
df_label['Image_path'] = IMAGE_FOLDER + os.path.sep + df['Image Index']
df_label.drop('No Finding', axis=1, inplace=True)
df_label.head()

Unnamed: 0,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,Image_path
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,chestxray/images/00000001_000.png
1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,chestxray/images/00000001_001.png
2,0,1,0,0,1,0,0,0,0,0,0,0,0,0,chestxray/images/00000001_002.png
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,chestxray/images/00000002_000.png
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,chestxray/images/00000003_000.png


In [14]:
# Training / Valid / Test split (70% / 10% / 20%)
train_set, valid_set, test_set = get_train_valid_test_split(TOT_PATIENT_NUMBER)
df_train = df_label.iloc[train_set]
df_valid = df_label.iloc[valid_set]
df_test = df_label.iloc[test_set]

train:21563 valid:3080 test:6162


In [19]:
df_train.to_csv(TRAIN_LST, sep='\t', header=False)
df_valid.to_csv(VALID_LST, sep='\t', header=False)
df_test.to_csv(TEST_LST, sep='\t', header=False)

In [11]:
#%tb
#%run ./common/im2rec.py $TRAIN_LST $PWD --resize 224 --center-crop --quality 90 --num-thread 24
#run not working!?
#python ./common/im2rec.py chestxray/train.lst $PWD --resize 250 --center-crop --quality 90 --num-thread 24
#python ./common/im2rec.py chestxray/valid.lst $PWD --resize 250 --center-crop --quality 90 --num-thread 24
#python ./common/im2rec.py chestxray/test.lst $PWD --resize 250 --center-crop --quality 90 --num-thread 24

## Data Loading
https://mxnet.incubator.apache.org/architecture/note_data_loading.html#mxnet-io-python-interface

https://github.com/miraclewkf/multilabel-MXNet/blob/master/train_multilabel.py

In [21]:
#https://mxnet.incubator.apache.org/versions/0.11.0/api/python/io.html#mxnet.io.ImageRecordIter
train = mx.io.ImageRecordIter(
    path_imgrec = "chestxray/train.rec",
    data_shape = (3, HEIGHT, WIDTH),
    path_imglist = "chestxray/train.lst",
    label_width = CLASSES,
    batch_size = BATCHSIZE,
    shuffle = True,
    mean_r = IMAGENET_RGB_MEAN[0],
    mean_g = IMAGENET_RGB_MEAN[1],
    mean_b = IMAGENET_RGB_MEAN[2],
    std_r = IMAGENET_RGB_SD[0],
    std_g = IMAGENET_RGB_SD[1],
    std_b = IMAGENET_RGB_SD[2],
    rand_crop = 1,
    rand_mirror = 1, #flip horizontally
    max_rotate_angle = 10,
    preprocess_threads = 24
)

valid = mx.io.ImageRecordIter(
    path_imgrec = "chestxray/valid.rec",
    data_shape = (3, HEIGHT, WIDTH),
    path_imglist = "chestxray/valid.lst",
    label_width = CLASSES,
    batch_size = BATCHSIZE,
    shuffle = False,
    rand_crop = 0,
    rand_mirror = 0, #flip horizontally
    preprocess_threads = 24
)


test = mx.io.ImageRecordIter(
    path_imgrec = "chestxray/test.rec",
    data_shape = (3, HEIGHT, WIDTH),
    path_imglist = "chestxray/test.lst",
    label_width = CLASSES,
    batch_size = BATCHSIZE,
    shuffle = False,
    rand_mirror = 0, #flip horizontally
    round_batch = False,
    preprocess_threads = 24
)


## Helper Functions

In [22]:
#https://github.com/miraclewkf/multilabel-MXNet/blob/master/crossentropy.py
class CrossEntropyLoss(mx.operator.CustomOp):
    """An output layer that calculates gradient for cross-entropy loss
    y * log(p) + (1-y) * log(p)
    for label "y" and prediction "p".
    However, the output of this layer is the original prediction -- same as
    the "data" input, making it useful for tasks like "predict".
    If you actually want to use the calculated loss, see CrossEntropyLoss op.

    This is useful for multi-label prediction where each possible output
    label is considered independently.
    Cross-entropy loss provides a very large penalty for guessing
    the wrong answer (0 or 1) confidently.
    The gradient calculation is optimized for y only being 0 or 1.
    """

    eps = 1e-6 # Avoid -inf when taking log(0)
    eps1 = 1. + eps
    eps_1 = 1. - eps

    def forward(self, is_train, req, in_data, out_data, aux):
        # Shapes:
        #  b = minibatch size
        #  d = number of dimensions
        actually_calculate_loss = False
        if actually_calculate_loss:
            p = in_data[0]  # shape=(b,d)
            y = in_data[1]
            out = y * nd.log(p+self.eps) + (1.-y) * nd.log((self.eps1) - p)
            self.assign(out_data[0], req[0], out)
        else:
            # Just copy the predictions forward
            self.assign(out_data[0], req[0], in_data[0])


    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        self.approx_backward(req, out_grad, in_data, out_data, in_grad, aux)
        #self.exact_backward(req, out_grad, in_data, out_data, in_grad, aux)

    def approx_backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        """Correct grad = (y-p)/(p-p^2)
        But if y is just 1 or 0, then this simplifies to
        grad = 1/(p-1+y)
        which is more numerically stable
        """
        p = in_data[0]  # shape=(b,d)
        y = in_data[1]
        grad = -1. / (p - self.eps_1 + y)
        self.assign(in_grad[0], req[0], grad)


    def exact_backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        """grad = (y-p)/(p-p^2)
        """
        p = in_data[0] # shape=(b,d)
        y = in_data[1]  # seems right
        grad = (p - y) / ((p+self.eps) * (self.eps1 - p))
        self.assign(in_grad[0], req[0], grad)


@mx.operator.register("CrossEntropyLoss")
class CrossEntropyProp(mx.operator.CustomOpProp):
    def __init__(self):
        super(CrossEntropyProp, self).__init__(need_top_grad=False)

    def list_arguments(self):
        return ['data','label']

    def list_outputs(self):
        return ['preds']

    def create_operator(self, ctx, shapes, dtypes):
        return CrossEntropyLoss()

    def infer_shape(self, in_shape):
        if in_shape[0] != in_shape[1]:
            raise ValueError("Input shapes differ. data:%s. label:%s. must be same"
                    % (str(in_shape[0]),str(in_shape[1])))
        output_shape = in_shape[0]
        return in_shape, [output_shape], []

In [23]:
#https://hackernoon.com/transfer-learning-with-mxnet-gluon-8203005afafe
#http://mxnet.incubator.apache.org/faq/finetune.html
def get_symbol(model_name='densenet121', out_features=CLASSES):
    if model_name == 'densenet121':
        #models: https://github.com/miraclewkf/DenseNet
        get_mxnet_model('https://migonzastorage.blob.core.windows.net/deep-learning/models/mxnet/densenet-121', 0)
        sym, arg_params, aux_params = mx.model.load_checkpoint('densenet-121', 0)
        layer_name='pool5'
    elif model_name == 'resnet50':
        get_mxnet_model('http://data.mxnet.io/models/imagenet/resnet/50-layers/resnet-50', 0)
        sym, arg_params, aux_params = mx.model.load_checkpoint('resnet-50', 0)
        layer_name='flatten0'
    else:
        raise ValueError("Unknown model-name")
    all_layers = sym.get_internals()
    sym = all_layers[layer_name+'_output']
    sym = mx.symbol.FullyConnected(data=sym, num_hidden=out_features, name='fc1')
    sym = mx.symbol.sigmoid(data=sym, name='sig')
    sym = mx.symbol.Custom(data=sym, name='softmax', op_type='CrossEntropyLoss')
    #sym = mx.symbol.SoftmaxOutput(data=sym, name='softmax')
    #sym = mx.symbol.softmax_cross_entropy(data=sym, name='softmax')
    #sym = mx.symbol.LogisticRegressionOutput(data=sym, name='softmax')#https://github.com/apache/incubator-mxnet/issues/1758
    new_args = dict({k:arg_params[k] for k in arg_params if 'fc1' not in k})
    return sym, new_args, aux_params

In [24]:
def init_symbol(sym, lr=LR, gpus=NUM_GPUS, batch_size=BATCHSIZE, epochs=EPOCHS, 
                num_examples=TOT_PATIENT_NUMBER, step=[5,10], num_class=CLASSES):    
    devs = [mx.gpu(i) for i in range(gpus)]   
    model = mx.mod.Module(
        context       = devs,
        symbol        = sym
    )
    model.bind(data_shapes=[('data', (batch_size, 3, HEIGHT, WIDTH))],
             label_shapes=[('softmax_label', (batch_size, num_class))])
    #Criterion
    def acc(label, pred, label_width = num_class):
        return float((label == np.round(pred)).sum()) / label_width / pred.shape[0]

#    def loss(label, pred):
#        loss_all = 0
#        for i in range(len(pred)):
#            loss = 0
#            loss -= label[i] * np.log(pred[i] + 1e-6) + (1.- label[i]) * np.log(1. + 1e-6 - pred[i])
#            loss_all += np.sum(loss)
#        loss_all = float(loss_all)/float(len(pred) + 0.000001)
#        return loss_all
    
    cri = list()
    cri.append(mx.metric.np(acc))
#    cri.append(mx.metric.np(loss))
    
    #Scheduler
    def multi_factor_scheduler(begin_epoch, epoch_size, step=step, factor=0.1):
        step_ = [epoch_size * (x-begin_epoch) for x in step if x-begin_epoch > 0]
        return mx.lr_scheduler.MultiFactorScheduler(step=step_, factor=factor) if len(step_) else None

    epoch_size = max(int(num_examples / batch_size), 1)
    sch=multi_factor_scheduler(epochs, epoch_size)
    
    return model, cri, sch 

In [25]:
sym, arg_params, aux_params = get_symbol()

In [26]:
model, criterion, scheduler = init_symbol(sym, num_examples=df_train.shape[0])

In [27]:
%%time
head = '%(asctime)-15s %(message)s'
logging.basicConfig(level=logging.DEBUG, format=head)

optimizer_params = {
        'learning_rate': LR,
        'lr_scheduler': scheduler,
}

model.fit(train,
          valid,
          num_epoch=EPOCHS,
          arg_params=arg_params,
          aux_params=aux_params,
          allow_missing=True,
          batch_end_callback = mx.callback.Speedometer(BATCHSIZE, BATCHSIZE),
          kvstore='device',
          optimizer='adam',
          optimizer_params = optimizer_params,
          eval_metric=criterion
)



2018-03-21 16:23:36,795 Already bound, ignoring bind()
2018-03-21 16:24:21,828 Epoch[0] Train-acc=0.948431
2018-03-21 16:24:21,830 Epoch[0] Time cost=44.744
2018-03-21 16:24:24,001 Epoch[0] Validation-acc=0.957868
2018-03-21 16:25:04,378 Epoch[1] Train-acc=0.955414
2018-03-21 16:25:04,383 Epoch[1] Time cost=40.378
2018-03-21 16:25:06,563 Epoch[1] Validation-acc=0.958031
2018-03-21 16:25:46,849 Epoch[2] Train-acc=0.955377
2018-03-21 16:25:46,853 Epoch[2] Time cost=40.287
2018-03-21 16:25:49,017 Epoch[2] Validation-acc=0.957915
2018-03-21 16:26:29,519 Epoch[3] Train-acc=0.955470
2018-03-21 16:26:29,521 Epoch[3] Time cost=40.500
2018-03-21 16:26:31,728 Epoch[3] Validation-acc=0.958008
2018-03-21 16:27:11,770 Update[421]: Change learning rate to 4.00000e-05
2018-03-21 16:27:12,673 Epoch[4] Train-acc=0.955410
2018-03-21 16:27:12,677 Epoch[4] Time cost=40.946
2018-03-21 16:27:14,876 Epoch[4] Validation-acc=0.958054


CPU times: user 14min 57s, sys: 4min 56s, total: 19min 54s
Wall time: 3min 38s


## Test CheXNet

In [28]:
y_guess = model.predict(test)
print(y_guess.shape)

(6400, 14)


In [29]:
def get_labels(iterator, shape):
    iterator.reset()
    data = np.zeros(shape, dtype=np.float32)
    batch_size = iterator.batch_size
    c = 0
    for batch in iterator:
        label = batch.label
        data[c*batch_size:(c+1)*batch_size] = label[0].asnumpy()
        c += 1
    return data    

In [30]:
labels = get_labels(test, y_guess.shape)
print(labels.shape)

(6400, 14)


In [31]:
def compute_roc_auc(data_gt, data_pd, full=True, classes=CLASSES):
    roc_auc = []
    for i in range(classes):
        roc_auc.append(roc_auc_score(data_gt[:, i], data_pd[:, i]))
    print("Full AUC", roc_auc)
    roc_auc = np.mean(roc_auc)
    return roc_auc

In [32]:
print("Validation AUC: {0:.4f}".format(compute_roc_auc(labels, y_guess.asnumpy())))

Full AUC [0.57753324505814274, 0.48787919085092069, 0.41411440747063766, 0.28096788296502995, 0.6422041630192592, 0.53673588259767768, 0.62747673895749023, 0.78126859827721218, 0.40543812190530426, 0.59213467541225306, 0.55718067341459887, 0.59773583406469966, 0.49339953922187957, 0.55275329565479836]
Validation AUC: 0.5391
