In [1]:
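# Measured throughput (ResNet-50 feature extraction, batch size 32):
# GPU (Tesla K80):    32*40 = 1280 images in 9.87s = ~130 images/s
# CPU (6 processors): 32*8  =  256 images in 31.9s = ~8 images/s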

In [2]:
import os
import sys
import numpy as np
import mxnet as mx
from collections import namedtuple
# Record the platform and library versions this benchmark was run with.
for label, value in (("OS: ", sys.platform),
                     ("Python: ", sys.version),
                     ("Numpy: ", np.__version__),
                     ("MXNet: ", mx.__version__)):
    print(label, value)

OS:  linux
Python:  3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
Numpy:  1.13.3
MXNet:  0.12.1


In [3]:
!cat /proc/cpuinfo | grep processor | wc -l

6


In [4]:
# Report the model name of each GPU as CSV (a header row, then one row per GPU).
!nvidia-smi --query-gpu=gpu_name --format=csv

name
Tesla K80


In [5]:
# --- Benchmark configuration ---
BATCH_SIZE = 32          # images per forward pass
BATCHES_GPU = 40         # number of mini-batches generated for the GPU run
BATCHES_CPU = 8          # number of mini-batches generated for the CPU run
RESNET_FEATURES = 2048   # length of the feature vector stored per image

# Minimal container with a single `data` field, handed to the module's forward()
Batch = namedtuple('Batch', ['data'])

In [6]:
def give_fake_data(batches):
    """Create random fake images to run inference on.

    Args:
        batches: number of mini-batches; BATCH_SIZE * batches images are
            generated.

    Returns:
        Tuple (channels_last, channels_first) of float32 arrays with shapes
        (N, 224, 224, 3) and (N, 3, 224, 224), where N = BATCH_SIZE * batches.
        The second array is a transposed view of the first, so both hold the
        same images.
    """
    np.random.seed(0)  # fixed seed so every call produces the same data
    dta = np.random.rand(BATCH_SIZE*batches, 224, 224, 3).astype(np.float32)
    # NHWC -> NCHW. transpose keeps height and width in order, whereas
    # swapaxes(1, 3) would give (N, C, W, H), i.e. every image transposed.
    return dta, np.transpose(dta, (0, 3, 1, 2))

In [7]:
def yield_mb(X, batchsize):
    """Generate (batch_index, mini_batch) pairs over the complete batches of X.

    Only full batches of `batchsize` items are produced; any leftover items
    at the end of X that do not fill a whole batch are not yielded.
    """
    n_full_batches = len(X) // batchsize
    batch_no = 0
    while batch_no < n_full_batches:
        start = batch_no * batchsize
        yield batch_no, X[start:start + batchsize]
        batch_no += 1

In [8]:
# Generate the GPU workload: BATCHES_GPU mini-batches of random images,
# returned in channels-last (cl) and channels-first (cf) layouts.
fake_input_data_cl, fake_input_data_cf = give_fake_data(batches=BATCHES_GPU)
print(fake_input_data_cl.shape, fake_input_data_cf.shape)

(1280, 224, 224, 3) (1280, 3, 224, 224)


In [9]:
# Fetch the pretrained ResNet-50 symbol definition and parameter file
path = 'http://data.mxnet.io/models/imagenet/'
[mx.test_utils.download(path + rel_path)
 for rel_path in ('resnet/50-layers/resnet-50-symbol.json',
                  'resnet/50-layers/resnet-50-0000.params')]

['resnet-50-symbol.json', 'resnet-50-0000.params']

In [10]:
# Load the network symbol and its argument / auxiliary parameters
# from the 'resnet-50' checkpoint (epoch 0)
sym, arg_params, aux_params = mx.model.load_checkpoint('resnet-50', 0)
# Expose every internal output of the network so an intermediate layer
# can be selected by name
all_layers = sym.get_internals()

In [11]:
# Show the names of the last 10 internal outputs to locate the feature layer
print(all_layers.list_outputs()[-10:])

['bn1_moving_var', 'bn1_output', 'relu1_output', 'pool1_output', 'flatten0_output', 'fc1_weight', 'fc1_bias', 'fc1_output', 'softmax_label', 'softmax_output']


In [12]:
def predict_fn(classifier, data, batchsize):
    """Run inference over `data` in mini-batches and collect the features.

    Args:
        classifier: bound module exposing `forward` and `get_outputs`
            (this notebook passes an `mx.mod.Module`).
        data: array-like of input samples, sliced along its first axis.
        batchsize: number of samples per forward pass; expected to match the
            batch size the classifier was bound with.

    Returns:
        np.float32 array of shape (len(data), RESNET_FEATURES) holding one
        feature row per input sample. When len(data) is not a multiple of
        `batchsize`, the trailing samples are run as one zero-padded batch
        instead of being left as all-zero rows.
    """
    n_samples = len(data)
    out = np.zeros((n_samples, RESNET_FEATURES), np.float32)

    def _forward(batch):
        # Explicit inference mode, independent of how the module was bound.
        classifier.forward(Batch(data=[mx.nd.array(batch)]), is_train=False)
        # Collapse everything after the batch axis; unlike squeeze(), this
        # never drops the batch axis itself when the batch holds one sample.
        return classifier.get_outputs()[0].asnumpy().reshape(len(batch), -1)

    for idx, dta in yield_mb(data, batchsize):
        out[idx*batchsize:(idx+1)*batchsize] = _forward(dta)

    # yield_mb only produces complete batches; handle the leftover samples.
    remainder = n_samples % batchsize
    if remainder:
        tail = np.asarray(data[n_samples - remainder:])
        # Pad up to the fixed batch size; the padded rows are discarded below.
        padding = np.zeros((batchsize - remainder,) + tail.shape[1:],
                           dtype=tail.dtype)
        padded = np.concatenate([tail, padding])
        out[n_samples - remainder:] = _forward(padded)[:remainder]
    return out

## 1. GPU

In [13]:
# Feature extractor: cut the network at 'flatten0_output', which precedes the
# fc1 / softmax outputs in the layer listing above
fe_sym = all_layers['flatten0_output']
# Create the module on GPU 0; no label inputs are declared (label_names=None)
fe_mod = mx.mod.Module(symbol=fe_sym, context=[mx.gpu(0)], label_names=None)
# Bind for inference only (no training, no input gradients) with a fixed
# channels-first input shape of BATCH_SIZE x 3 x 224 x 224
fe_mod.bind(for_training=False, inputs_need_grad=False,
            data_shapes=[('data', (BATCH_SIZE,3,224,224))])
# Load the pretrained parameters read from the checkpoint
fe_mod.set_params(arg_params, aux_params)

In [14]:
# Untimed first pass on the GPU, run before the timed cell
# (presumably so one-off start-up costs are excluded from the measurement)
cold_start = predict_fn(fe_mod, fake_input_data_cf, BATCH_SIZE)

In [15]:
%%time
# Timed GPU run over all of the fake data; recorded wall time: 9.87s
features = predict_fn(fe_mod, fake_input_data_cf, BATCH_SIZE)

CPU times: user 8.08 s, sys: 1.7 s, total: 9.78 s
Wall time: 9.87 s


## 2. CPU

In [16]:
# Ask CUDA to hide all GPUs from this process before the CPU benchmark.
# NOTE(review): CUDA reads CUDA_VISIBLE_DEVICES when it initialises, and the
# GPU has already been used earlier in this kernel, so this assignment
# probably has no effect here - confirm. The CPU run does not rely on it:
# the module in the next cell is created with context=mx.cpu().
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [17]:
# Feature extractor: cut the network at 'flatten0_output', which precedes the
# fc1 / softmax outputs in the layer listing above
fe_sym = all_layers['flatten0_output']
# Create the module on the CPU; this rebinds fe_mod, replacing the GPU module
fe_mod = mx.mod.Module(symbol=fe_sym, context=mx.cpu(), label_names=None)
# Bind for inference only (no training, no input gradients) with a fixed
# channels-first input shape of BATCH_SIZE x 3 x 224 x 224
fe_mod.bind(for_training=False, inputs_need_grad=False,
            data_shapes=[('data', (BATCH_SIZE,3,224,224))])
# Load the pretrained parameters read from the checkpoint
fe_mod.set_params(arg_params, aux_params)

In [18]:
# Generate the smaller CPU workload: BATCHES_CPU mini-batches of random images.
# This rebinds the arrays that previously held the GPU workload.
fake_input_data_cl, fake_input_data_cf = give_fake_data(batches=BATCHES_CPU)
print(fake_input_data_cl.shape, fake_input_data_cf.shape)

(256, 224, 224, 3) (256, 3, 224, 224)


In [19]:
# Untimed first pass on the CPU, run before the timed cell
# (presumably so one-off start-up costs are excluded from the measurement)
cold_start = predict_fn(fe_mod, fake_input_data_cf, BATCH_SIZE)

In [20]:
%%time
# Timed CPU run over all of the fake data; recorded wall time: 31.9s
features = predict_fn(fe_mod, fake_input_data_cf, BATCH_SIZE)

CPU times: user 58.8 s, sys: 1.42 s, total: 1min
Wall time: 31.9 s
