In [1]:
# GPU: 32*40 images in 18.1s = 71 images/s
# CPU: 32*8 images in 44s = 6 images/s

In [2]:
import os
import sys
import numpy as np
import caffe2
from caffe2.proto import caffe2_pb2
from caffe2.python import model_helper, core, workspace, models
# Log the platform and library versions so the timings can be put in context.
print("OS: ", sys.platform)
print("Python: ", sys.version)
print("Numpy: ", np.__version__)

OS:  linux
Python:  3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
Numpy:  1.13.3


In [3]:
# Count logical CPUs: one line starting with "processor" per CPU in /proc/cpuinfo.
!grep -c ^processor /proc/cpuinfo

6


In [4]:
# Print the model name of each GPU that nvidia-smi reports, as CSV.
!nvidia-smi --query-gpu=gpu_name --format=csv

name
Tesla K80


In [5]:
# Benchmark configuration.
BATCHES_CPU = 8          # mini-batches of fake data generated for the CPU run
BATCHES_GPU = 40         # mini-batches of fake data generated for the GPU run
BATCH_SIZE = 32          # images per mini-batch
RESNET_FEATURES = 2048   # width of each feature row collected from 'pool5'

In [6]:
def give_fake_data(batches, batch_size=None):
    """Create a reproducible array of random images to run inference on.

    Args:
        batches: Number of mini-batches worth of images to generate.
        batch_size: Images per mini-batch. Defaults to the notebook-level
            ``BATCH_SIZE`` when omitted.

    Returns:
        A ``(channels_last, channels_first)`` tuple of float32 arrays with
        shapes ``(N, 224, 224, 3)`` and ``(N, 3, 224, 224)``, where
        ``N = batch_size * batches``. The second array is a transposed view
        of the first (no copy), so ``cf[n, c, h, w] == cl[n, h, w, c]``.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    # Re-seed the global NumPy RNG so every call yields the same data.
    np.random.seed(0)
    dta = np.random.rand(batch_size * batches, 224, 224, 3).astype(np.float32)
    # Move only the channel axis (NHWC -> NCHW). np.swapaxes(dta, 1, 3) gives
    # the same shape but additionally exchanges the height and width axes.
    return dta, np.transpose(dta, (0, 3, 1, 2))

In [7]:
def yield_mb(X, batchsize):
    """Generate ``(batch_index, batch)`` pairs of full-sized mini-batches.

    ``X`` is sliced into consecutive chunks of ``batchsize`` items. A trailing
    remainder shorter than ``batchsize`` is never yielded.
    """
    n_complete = len(X) // batchsize
    start = 0
    for batch_idx in range(n_complete):
        stop = start + batchsize
        yield batch_idx, X[start:stop]
        start = stop

In [8]:
# Generate the GPU benchmark input in both layouts:
# channels-last (_cl) and channels-first (_cf).
fake_input_data_cl, fake_input_data_cf = give_fake_data(batches=BATCHES_GPU)
print(fake_input_data_cl.shape, fake_input_data_cf.shape)

(1280, 224, 224, 3) (1280, 3, 224, 224)


In [9]:
#%%bash
#wget https://github.com/leonardvandriel/caffe2_models/raw/master/model/resnet50_init_net.pb
#wget https://github.com/leonardvandriel/caffe2_models/raw/master/model/resnet50_predict_net.pb

In [10]:
def load_net(INIT_NET, PREDICT_NET, device_opts):
    """Load a serialized Caffe2 model into the workspace on a given device.

    The init net is parsed from the file `INIT_NET`, assigned `device_opts`
    and run once. The predict net is then parsed from `PREDICT_NET`, assigned
    the same `device_opts` and created in the workspace, overwriting any
    existing net with that name.

    Returns:
        The name of the predict net, as stored in its NetDef.
    """
    def _parse_net(path):
        # Read a serialized NetDef from `path` and set its device option.
        proto = caffe2_pb2.NetDef()
        with open(path, 'rb') as handle:
            serialized = handle.read()
        proto.ParseFromString(serialized)
        proto.device_option.CopyFrom(device_opts)
        return proto

    # The init net runs before the predict net is even read, as in the original order.
    init_proto = _parse_net(INIT_NET)
    workspace.RunNetOnce(init_proto.SerializeToString())

    predict_proto = _parse_net(PREDICT_NET)
    workspace.CreateNet(predict_proto.SerializeToString(), overwrite=True)
    return predict_proto.name

In [11]:
def predict_fn(classifier, data, batchsize, device_option=None):
    """Run a workspace net over `data` in mini-batches and collect features.

    Args:
        classifier: Name of a net that already exists in the workspace.
        data: Input array; it is sliced along its first axis into
            mini-batches and each slice is fed to the "data" blob.
        batchsize: Number of rows per mini-batch.
        device_option: Device option used when feeding the input blob. When
            omitted, the notebook-level `device_opts` is used, which is what
            this function always did before the parameter existed.

    Returns:
        float32 array of shape (len(data), RESNET_FEATURES) holding the
        squeezed 'pool5' blob for each mini-batch. Only complete mini-batches
        are processed, so when len(data) is not a multiple of `batchsize`
        the trailing rows stay zero.
    """
    if device_option is None:
        # Backward-compatible fallback to the notebook global. Pass the option
        # explicitly to avoid depending on which cell last rebound it.
        device_option = device_opts
    out = np.zeros((len(data), RESNET_FEATURES), np.float32)
    for idx, dta in yield_mb(data, batchsize):
        workspace.FeedBlob("data", dta, device_option=device_option)
        workspace.RunNet(classifier, 1)
        # Write this batch's features into its slot of the output array.
        out[idx*batchsize:(idx+1)*batchsize] = workspace.FetchBlob('pool5').squeeze()
    return out

In [12]:
# Serialized ResNet-50 nets, looked up relative to the working directory.
predict_net_loc = 'resnet50_predict_net.pb'
init_net_loc = 'resnet50_init_net.pb'

## 1. GPU

In [13]:
# Build a CUDA device option (second argument 0) and load the model with it.
device_opts = core.DeviceOption(caffe2_pb2.CUDA, 0)
test_net = load_net(init_net_loc, predict_net_loc, device_opts=device_opts)

In [14]:
# Untimed first pass over the data; presumably a warm-up so that one-off
# start-up costs stay out of the timed cell that follows.
cold_start = predict_fn(classifier=test_net,
                        data=fake_input_data_cf,
                        batchsize=BATCH_SIZE)

In [15]:
%%time
# GPU: 18.1s wall time for the 40 batches of 32 images generated above
# (second pass over the same data; the first pass was not timed).
features = predict_fn(test_net, fake_input_data_cf, BATCH_SIZE)

CPU times: user 15.2 s, sys: 2.89 s, total: 18.1 s
Wall time: 18.1 s


## 2. CPU

In [16]:
# Hide the GPUs from any CUDA initialisation that happens after this point.
# NOTE(review): CUDA normally reads this variable once, when the runtime
# initialises. This kernel has already run on the GPU above, so the setting
# may have no effect here; CPU execution is selected by the CPU DeviceOption
# built in the following cell. Confirm, or restart the kernel with the
# variable set up front.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
# Clear the workspace left over from the GPU run. The call is kept outside the
# assert so that it still executes when assertions are disabled (python -O).
workspace_was_reset = workspace.ResetWorkspace()
assert workspace_was_reset, "workspace.ResetWorkspace() failed"

In [17]:
# Rebind the device option to CPU and reload the model into the fresh workspace.
device_opts = core.DeviceOption(caffe2_pb2.CPU)
test_net = load_net(init_net_loc, predict_net_loc, device_opts=device_opts)

In [18]:
# Regenerate a smaller input set for the CPU benchmark. This rebinds the same
# two names that held the GPU input.
fake_input_data_cl, fake_input_data_cf = give_fake_data(batches=BATCHES_CPU)
print(fake_input_data_cl.shape, fake_input_data_cf.shape)

(256, 224, 224, 3) (256, 3, 224, 224)


In [19]:
# Untimed first pass on the CPU; presumably a warm-up so that one-off
# start-up costs stay out of the timed cell that follows.
cold_start = predict_fn(classifier=test_net,
                        data=fake_input_data_cf,
                        batchsize=BATCH_SIZE)

In [20]:
%%time
# CPU: 44s wall time for the 8 batches of 32 images generated above
# (second pass over the same data; the first pass was not timed).
features = predict_fn(test_net, fake_input_data_cf, BATCH_SIZE)

CPU times: user 4min, sys: 2.39 s, total: 4min 2s
Wall time: 44 s
