In [1]:
# TODO:
# Create a data-augmentor + data-preprocessing class
# Using cv2 not pil and maybe using https://github.com/aleju/imgaug
# Will ensure that pre-processing + augmentation standardised across frameworks
# Also maybe PIL is bottlenecking pytorch?

In [2]:
import os
import sys
import time
import multiprocessing
import numpy as np
import pandas as pd

import chainer
import chainer.functions as F
import chainer.links as L
from chainer import optimizers, cuda, dataset, training
from chainer.training import extensions, StandardUpdater

from sklearn.metrics.ranking import roc_auc_score
from sklearn.model_selection import train_test_split

#from PIL import Image
import random
import cv2

from common.utils import *

In [3]:
# Performance Improvement
# 1. Auto-tune
# This adds very little now .. not sure if True by default?
chainer.global_config.autotune = True

In [4]:
print("OS: ", sys.platform)
print("Python: ", sys.version)
print("Chainer: ", chainer.__version__)
print("CuPy: ", chainer.cuda.cupy.__version__)
print("Numpy: ", np.__version__)
print("GPU: ", get_gpu_name())
print(get_cuda_version())
print("CuDNN Version ", get_cudnn_version())
CPU_COUNT = multiprocessing.cpu_count()
print("CPUs: ", CPU_COUNT)

OS:  linux
Python:  3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
Chainer:  3.4.0
CuPy:  2.4.0
Numpy:  1.14.1
GPU:  ['Tesla P100-PCIE-16GB', 'Tesla P100-PCIE-16GB']
CUDA Version 8.0.61
CuDNN Version  6.0.21
CPUs:  12


In [5]:
# User-set
# Note if NUM_GPUS > 1 then MULTI_GPU = True and ALL GPUs will be used
# Set below to affect batch-size
# E.g. 1 GPU = 64, 2 GPUs = 64*2, 4 GPUs = 64*4
# Note that the effective learning-rate will be decreased this way
NUM_GPUS = 1 # Scaling factor for batch
MULTI_GPU=NUM_GPUS>1

In [6]:
# Globals
CLASSES = 14
WIDTH = 224
HEIGHT = 224
CHANNELS = 3
LR = 0.0001  # Effective learning-rate will decrease as BATCHSIZE rises
EPOCHS = 5
BATCHSIZE = 64*NUM_GPUS
IMAGENET_RGB_MEAN =  np.array([0.485, 0.456, 0.406], dtype=np.float32)
IMAGENET_RGB_SD =  np.array([0.229, 0.224, 0.225], dtype=np.float32)
TOT_PATIENT_NUMBER = 30805  # From data

In [7]:
# Paths
CSV_DEST = "chestxray"
IMAGE_FOLDER = os.path.join(CSV_DEST, "images")
LABEL_FILE = os.path.join(CSV_DEST, "Data_Entry_2017.csv")
print(IMAGE_FOLDER, LABEL_FILE)

chestxray/images chestxray/Data_Entry_2017.csv


In [8]:
%%time
# Download data
print("Please make sure to download")
print("https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux#download-and-install-azcopy")
download_data_chextxray(CSV_DEST)

Please make sure to download
https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux#download-and-install-azcopy
Data already exists
CPU times: user 678 ms, sys: 202 ms, total: 880 ms
Wall time: 881 ms


In [9]:
#####################################################################################################
## Data Loading

In [10]:
class XrayData(dataset.DatasetMixin):
    def __init__(self, patient_ids, img_dir=IMAGE_FOLDER, lbl_file=LABEL_FILE, augmentation=None):
        # Read labels-csv
        df = pd.read_csv(lbl_file)
        
        # Split labels on unfiltered data
        df_label = df['Finding Labels'].str.split(
            '|', expand=False).str.join(sep='*').str.get_dummies(sep='*')
        
        # Filter by patient-ids (both)
        df_label['Patient ID'] = df['Patient ID']
        df_label = df_label[df_label['Patient ID'].isin(patient_ids)]
        df = df[df['Patient ID'].isin(patient_ids)]
        
        # Remove unncessary columns
        df_label.drop(['Patient ID','No Finding'], axis=1, inplace=True)  
        
        # List of images (full-path)
        self.img_locs =  df['Image Index'].map(lambda im: os.path.join(img_dir, im)).values
        # One-hot encoded labels (float32 for BCE loss)
        self.labels = df_label.values
        
        # Processing
        self.augmentation = augmentation
        print("Loaded {} labels and {} images".format(len(self.labels), len(self.img_locs)))
        
    def __len__(self):
        return len(self.img_locs)   
    
    def get_example(self, idx):
        im_file = self.img_locs[idx]
        # RGB Image
        im = cv2.imread(im_file) 
        im_rgb = self._apply_data_preprocessing(im)
        label = self.labels[idx]
        if self.augmentation is not None:
            im_rgb = self._apply_data_augmentation(im_rgb)
        return np.array(im_rgb, dtype=np.float32), np.array(label, dtype=np.int32)
    
    def _apply_data_preprocessing(self, im, w=WIDTH, h=HEIGHT, 
                                  rgb_m=IMAGENET_RGB_MEAN, rgb_sd=IMAGENET_RGB_SD):
        # REDO Method!
        # BGR to RGB
        im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) 
        # Resize
        im = cv2.resize(im, dsize=(w, h), interpolation=cv2.INTER_CUBIC)
        # Channels first (3, 224, 224)
        im = np.moveaxis(im, -1, 0)
        # Normalise image with imagenet-values
        # This is torch-model normalisation
        # Ref: https://github.com/keras-team/keras/blob/master/keras/applications/imagenet_utils.py
        im = im / 255.
        im = (im - rgb_m[:, None, None]) / rgb_sd[:, None, None]
        return im
    
    def _apply_data_augmentation(self, im):
        # REDO Method!
        # Random horizontal flip
        if random.randint(0, 1):
            im = cv2.flip(im, flipCode=1)
        # Random rotation
        # ...
        # Random crop/zoom
        # ...
        return im

In [11]:
# Training / Valid / Test split (70% / 10% / 20%)
train_set, other_set = train_test_split(
    range(1,TOT_PATIENT_NUMBER+1), train_size=0.7, test_size=0.3, shuffle=False)
valid_set, test_set = train_test_split(other_set, train_size=1/3, test_size=2/3, shuffle=False)
print("train:{} valid:{} test:{}".format(
    len(train_set), len(valid_set), len(test_set)))

train:21563 valid:3080 test:6162


In [12]:
train_dataset = XrayData(img_dir=IMAGE_FOLDER, lbl_file=LABEL_FILE, patient_ids=train_set, augmentation=True)
valid_dataset = XrayData(img_dir=IMAGE_FOLDER, lbl_file=LABEL_FILE, patient_ids=valid_set, augmentation=False)
test_dataset  = XrayData(img_dir=IMAGE_FOLDER, lbl_file=LABEL_FILE, patient_ids=test_set, augmentation=False)

Loaded 87306 labels and 87306 images
Loaded 7616 labels and 7616 images
Loaded 17198 labels and 17198 images


In [13]:
#####################################################################################################
## Helper Functions

In [14]:
class chexnet(chainer.Chain):
    def __init__(self, base_model, out_features=CLASSES):
        super(chexnet, self).__init__()
        with self.init_scope():
            self.base_model = base_model
            in_features = 2048  # How to get this programmatically?
            self.classifier = L.Linear(in_features, out_features)
    
    def __call__(self, x):
        h = self.base_model(x, layers=['pool5']) # How to get this programatically?
        #return F.sigmoid(self.classifier(h))
        return self.classifier(h)  # sigmoid applied in BCE function

In [15]:
def get_symbol(model_name='densenet121'):
    if model_name == 'resnet50':
        base_model = chainer.links.ResNet50Layers(pretrained_model="auto")
    elif model_name == 'densenet121':
        # https://github.com/chainer/chainer/issues/4426
        raise ValueError("Densenet is not yet officially implemented in Chainer")
    else:
        raise ValueError("Unknown model-name")
    # Change last-layer
    model = chexnet(base_model)
    # CUDA
    chainer.cuda.get_device(0).use()  # Make a specified GPU current
    model.to_gpu()  
    return model

In [16]:
def init_symbol(sym, lr=LR):
    opt = optimizers.Adam(alpha=lr, beta1=0.9, beta2=0.999)
    opt.setup(sym)
    return opt

In [17]:
%%time
# Load symbol
chexnet_sym = get_symbol(model_name='resnet50')

CPU times: user 671 ms, sys: 399 ms, total: 1.07 s
Wall time: 2.16 s


In [18]:
%%time
# Load optimiser
optimizer = init_symbol(chexnet_sym)

CPU times: user 545 µs, sys: 110 µs, total: 655 µs
Wall time: 662 µs


In [19]:
# Data-iterators
train_iter = chainer.iterators.MultiprocessIterator(train_dataset, BATCHSIZE, n_processes=CPU_COUNT)
valid_iter = chainer.iterators.MultiprocessIterator(valid_dataset, 16*BATCHSIZE, n_processes=CPU_COUNT)
test_iter = chainer.iterators.MultiprocessIterator(test_dataset, 16*BATCHSIZE, n_processes=CPU_COUNT)

In [20]:
# High-level trainer-class for easy multi-gpu later

In [21]:
updater = StandardUpdater(train_iter, optimizer, loss_func=F.sigmoid_cross_entropy, device=0)
trainer = training.Trainer(updater, stop_trigger=(EPOCHS, 'epoch'))
trainer.extend(extensions.ProgressBar(update_interval=10))

In [None]:
#trainer.run()

[J

Exception in main training loop: 
Invalid operation is performed in: SigmoidCrossEntropy (Forward)

Expect: in_types[0].shape == in_types[1].shape
Actual: (64, 3, 224, 224) != (64, 14)
Traceback (most recent call last):
  File "/anaconda/envs/py35/lib/python3.5/site-packages/chainer/training/trainer.py", line 299, in run
    update()
  File "/anaconda/envs/py35/lib/python3.5/site-packages/chainer/training/updater.py", line 223, in update
    self.update_core()
  File "/anaconda/envs/py35/lib/python3.5/site-packages/chainer/training/updater.py", line 234, in update_core
    optimizer.update(loss_func, *in_arrays)
  File "/anaconda/envs/py35/lib/python3.5/site-packages/chainer/optimizer.py", line 541, in update
    loss = lossfun(*args, **kwds)
  File "/anaconda/envs/py35/lib/python3.5/site-packages/chainer/functions/loss/sigmoid_cross_entropy.py", line 168, in sigmoid_cross_entropy
    return SigmoidCrossEntropy(normalize, reduce).apply((x, t))[0]
  File "/anaconda/envs/py35/lib/python3

# Debug

In [22]:
for one in train_iter:
    break

In [23]:
len(one)

64

In [26]:
dta = np.expand_dims(one[0][0], 0)
label = np.expand_dims(one[0][1], 0)
print(dta.shape, label.shape)

(1, 3, 224, 224) (1, 14)


In [27]:
# Try forward and back pass
data = cuda.to_gpu(dta)
target = cuda.to_gpu(label)
# Forwards
output = chexnet_sym(data)
# Loss
loss = F.sigmoid_cross_entropy(output, target)
chexnet_sym.cleargrads()
# Back-prop
loss.backward()
optimizer.update()

AttributeError: 'dict' object has no attribute 'ndim'