In [1]:
# GPU: 32*40 in 12s = 107/s
# CPU: 32*8 in 78s = 3.3/s

In [2]:
import os
import sys
import numpy as np
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import optimizers
from chainer import cuda
# Record the platform and library versions used for this benchmark so the
# timings further down can be tied to a specific software stack.
print("OS: ", sys.platform)
print("Python: ", sys.version)
print("Numpy: ", np.__version__)
print("Chainer: ", chainer.__version__)
# CuPy is reached through chainer.cuda rather than imported directly.
print("CuPy: ", chainer.cuda.cupy.__version__)

OS:  linux
Python:  3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
Numpy:  1.13.3
Chainer:  3.1.0
CuPy:  2.1.0


In [3]:
# Count the lines mentioning "processor" in /proc/cpuinfo.
!grep -c processor /proc/cpuinfo

6


In [4]:
# Ask nvidia-smi for the GPU name, printed as CSV (header row plus one row per GPU).
!nvidia-smi --query-gpu=gpu_name --format=csv

name
Tesla K80


In [5]:
# Number of images sent through the model per mini-batch.
BATCH_SIZE = 32
# Width of the feature vector stored per image by predict_fn.
RESNET_FEATURES = 2048
# Number of mini-batches of fake data generated for the GPU run.
BATCHES_GPU = 40
# Number of mini-batches of fake data generated for the CPU run.
BATCHES_CPU = 8

In [6]:
def give_fake_data(batches):
    """Generate reproducible random images for benchmarking inference.

    The global NumPy random state is re-seeded with 0 on every call, so
    repeated calls with the same ``batches`` return identical values.

    Args:
        batches: Number of mini-batches to generate; the total image count
            is ``BATCH_SIZE * batches``.

    Returns:
        A pair ``(channels_last, channels_first)`` of float32 arrays with
        shapes ``(N, 224, 224, 3)`` and ``(N, 3, 224, 224)``. The second is
        the first with axes 1 and 3 swapped (a view, not a copy), so the two
        spatial axes are exchanged as well.
    """
    np.random.seed(0)
    n_images = BATCH_SIZE * batches
    channels_last = np.random.rand(n_images, 224, 224, 3).astype(np.float32)
    channels_first = np.swapaxes(channels_last, 1, 3)
    return channels_last, channels_first

In [7]:
def yield_mb(X, batchsize):
    """Yield ``(index, mini_batch)`` pairs covering only complete batches.

    Args:
        X: Any sliceable sequence with a length.
        batchsize: Number of items per mini-batch.

    Yields:
        Tuples of the zero-based batch index and the slice of ``X`` holding
        that batch. Items left over after the last full batch are dropped.
    """
    n_full = len(X) // batchsize
    for batch_no in range(n_full):
        start = batch_no * batchsize
        yield batch_no, X[start:start + batchsize]

In [8]:
# Create BATCHES_GPU mini-batches of fake data for the GPU benchmark, in both
# channels-last (_cl) and channels-first (_cf) layouts.
fake_input_data_cl, fake_input_data_cf = give_fake_data(BATCHES_GPU)
print(fake_input_data_cl.shape, fake_input_data_cf.shape)

(1280, 224, 224, 3) (1280, 3, 224, 224)


In [9]:
# Downloaded from https://github.com/KaimingHe/deep-residual-networks

In [10]:
#%%bash
#cd /home/iliauk/.chainer/dataset/pfnet/chainer/models/
#wget https://ikpublictutorial.blob.core.windows.net/deeplearningframeworks/ResNet-50-model.caffemodel

## 1. GPU

In [11]:
# Build the ResNet-50 feature extractor with pretrained weights.
resnet50 = chainer.links.ResNet50Layers(pretrained_model="auto")
# GPU
# get_device() is deprecated in Chainer; get_device_from_id() is the supported
# way to select a GPU by its integer ID.
chainer.cuda.get_device_from_id(0).use()  # Make a specified GPU current
resnet50.to_gpu()  # Copy the model to the GPU

<chainer.links.model.vision.resnet.ResNet50Layers at 0x7f0bdf64f5f8>

In [12]:
# List the layer names that can be requested via the `layers=` argument.
resnet50.available_layers

['conv1', 'pool1', 'res2', 'res3', 'res4', 'res5', 'pool5', 'fc6', 'prob']

In [13]:
def predict_fn(classifier, data, batchsize):
    """Extract ``pool5`` features from ``data`` in mini-batches on the GPU.

    Args:
        classifier: Model called as ``classifier(batch, layers=['pool5'])``
            that returns a mapping containing a ``'pool5'`` variable.
        data: Host array of inputs, sliced along its first axis.
        batchsize: Number of rows sent through the model per call.

    Returns:
        ``np.float32`` array of shape ``(len(data), RESNET_FEATURES)``.
        Only complete mini-batches are processed (see ``yield_mb``), so
        when ``len(data)`` is not a multiple of ``batchsize`` the trailing
        rows are left as zeros.
    """
    out = np.zeros((len(data), RESNET_FEATURES), np.float32)
    # Pure inference: use test-mode behaviour and do not record the
    # computational graph, which would otherwise be kept alive per batch.
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        for idx, dta in yield_mb(data, batchsize):
            # Move the batch to the GPU, run the model, copy features back.
            pred = classifier(cuda.to_gpu(dta), layers=['pool5'])
            out[idx*batchsize:(idx+1)*batchsize] = cuda.to_cpu(pred['pool5'].data).squeeze()
    return out

In [14]:
# Untimed first pass over the data; presumably absorbs one-time start-up
# costs so they do not distort the timed run that follows.
cold_start = predict_fn(resnet50, fake_input_data_cf, BATCH_SIZE)

In [15]:
%%time
# GPU: 12s wall time for 40 batches of 32 images
features = predict_fn(resnet50, fake_input_data_cf, BATCH_SIZE)

CPU times: user 10.3 s, sys: 1.66 s, total: 12 s
Wall time: 12 s


## 2. CPU

In [16]:
# Kill all GPUs ...
# NOTE(review): this variable is set after earlier cells have already used the
# GPU in this kernel, so it likely does not hide the GPU from the running
# process; confirm. The CPU section never calls to_gpu() on its model.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
# Drop the reference to the GPU-resident model before a new one is created.
del resnet50

In [17]:
# Re-create the model; to_gpu() is not called this time.
resnet50 = chainer.links.ResNet50Layers(pretrained_model="auto")

In [18]:
def predict_fn(classifier, data, batchsize):
    """Extract ``pool5`` features from ``data`` in mini-batches on the CPU.

    This definition replaces the earlier GPU variant of the same name; it
    passes the batches to the model without any host/device transfers.

    Args:
        classifier: Model called as ``classifier(batch, layers=['pool5'])``
            that returns a mapping containing a ``'pool5'`` variable.
        data: Array of inputs, sliced along its first axis.
        batchsize: Number of rows sent through the model per call.

    Returns:
        ``np.float32`` array of shape ``(len(data), RESNET_FEATURES)``.
        Only complete mini-batches are processed (see ``yield_mb``), so
        when ``len(data)`` is not a multiple of ``batchsize`` the trailing
        rows are left as zeros.
    """
    out = np.zeros((len(data), RESNET_FEATURES), np.float32)
    # Pure inference: use test-mode behaviour and do not record the
    # computational graph, which would otherwise be kept alive per batch.
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        for idx, dta in yield_mb(data, batchsize):
            pred = classifier(dta, layers=['pool5'])
            out[idx*batchsize:(idx+1)*batchsize] = pred['pool5'].data.squeeze()
    return out

In [20]:
# Create BATCHES_CPU mini-batches of fake data for the CPU benchmark; this
# rebinds the names that previously held the GPU-run data.
fake_input_data_cl, fake_input_data_cf = give_fake_data(BATCHES_CPU)
print(fake_input_data_cl.shape, fake_input_data_cf.shape)

(256, 224, 224, 3) (256, 3, 224, 224)


In [21]:
# Untimed first pass over the data; presumably absorbs one-time start-up
# costs so they do not distort the timed run that follows.
cold_start = predict_fn(resnet50, fake_input_data_cf, BATCH_SIZE)

In [22]:
%%time
# CPU: 78s wall time (1min 18s) for 8 batches of 32 images
features = predict_fn(resnet50, fake_input_data_cf, BATCH_SIZE)

CPU times: user 2min 18s, sys: 2min 35s, total: 4min 53s
Wall time: 1min 18s
