In [1]:
# GPU: 32*40 in 9.86s = 130/s
# CPU: 32*8 in 43.8s = 6/s

In [2]:
import os
import sys
import numpy as np
import torch
import torchvision.models as models
from torch.autograd import Variable
print("OS: ", sys.platform)
print("Python: ", sys.version)
print("Numpy: ", np.__version__)
print("Pytorch: ", torch.__version__)

OS:  linux
Python:  3.5.2 |Anaconda custom (64-bit)| (default, Jul  2 2016, 17:53:06) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]
Numpy:  1.13.3
Pytorch:  0.2.0_4


In [3]:
!cat /proc/cpuinfo | grep processor | wc -l

6


In [4]:
!nvidia-smi --query-gpu=gpu_name --format=csv

name
Tesla K80


In [5]:
BATCH_SIZE = 32
RESNET_FEATURES = 2048
BATCHES_GPU = 40
BATCHES_CPU = 8

In [6]:
def give_fake_data(batches):
    """ Create an array of fake data to run inference on"""
    np.random.seed(0)
    dta = np.random.rand(BATCH_SIZE*batches, 224, 224, 3).astype(np.float32)
    return dta, np.swapaxes(dta, 1, 3)

In [7]:
def yield_mb(X, batchsize):
    """ Function yield (complete) mini_batches of data"""
    for i in range(len(X)//batchsize):
        yield i, X[i*batchsize:(i+1)*batchsize]

In [8]:
# Create batches of fake data
fake_input_data_cl, fake_input_data_cf = give_fake_data(BATCHES_GPU)
print(fake_input_data_cl.shape, fake_input_data_cf.shape)

(1280, 224, 224, 3) (1280, 3, 224, 224)


## 1. GPU

In [9]:
# Download ResNet
resnet50 = models.resnet50(pretrained=True)
print(list(resnet50.children())[-1])
chopped_resnet50 = torch.nn.Sequential(*list(resnet50.children())[:-1])
print(list(chopped_resnet50.children())[-1])

Linear (2048 -> 1000)
AvgPool2d (size=7, stride=7, padding=0, ceil_mode=False, count_include_pad=True)


In [10]:
chopped_resnet50.cuda()

Sequential (
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
  (2): ReLU (inplace)
  (3): MaxPool2d (size=(3, 3), stride=(2, 2), padding=(1, 1), dilation=(1, 1))
  (4): Sequential (
    (0): Bottleneck (
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True)
      (relu): ReLU (inplace)
      (downsample): Sequential (
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True)
      )
    )
    (1): Bottleneck (
      (

In [11]:
def predict_fn(classifier, data, batchsize):
    """ Return features from classifier """
    classifier.eval()
    out = np.zeros((len(data), RESNET_FEATURES), np.float32)
    for idx, dta in yield_mb(data, batchsize):
        pred = classifier(Variable(torch.FloatTensor(dta).cuda()))
        out[idx*batchsize:(idx+1)*batchsize] = pred.data.cpu().numpy().squeeze()
    return out

In [12]:
cold_start = predict_fn(chopped_resnet50, fake_input_data_cf, BATCH_SIZE)

In [13]:
%%time
# GPU: 9.86s
features = predict_fn(chopped_resnet50, fake_input_data_cf, BATCH_SIZE)

CPU times: user 7.97 s, sys: 1.89 s, total: 9.86 s
Wall time: 9.86 s


## 2. CPU

In [14]:
# Kill all GPUs ...
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
del resnet50, chopped_resnet50

In [15]:
# Initialise CPU
# Download ResNet
resnet50 = models.resnet50(pretrained=True)
chopped_resnet50 = torch.nn.Sequential(*list(resnet50.children())[:-1])

In [16]:
def predict_fn(classifier, data, batchsize):
    """ Return features from classifier """
    classifier.eval()
    out = np.zeros((len(data), RESNET_FEATURES), np.float32)
    for idx, dta in yield_mb(data, batchsize):
        pred = classifier(Variable(torch.FloatTensor(dta)))
        out[idx*batchsize:(idx+1)*batchsize] = pred.data.numpy().squeeze()
    return out

In [17]:
# Create batches of fake data
fake_input_data_cl, fake_input_data_cf = give_fake_data(BATCHES_CPU)
print(fake_input_data_cl.shape, fake_input_data_cf.shape)

(256, 224, 224, 3) (256, 3, 224, 224)


In [18]:
cold_start = predict_fn(chopped_resnet50, fake_input_data_cf, BATCH_SIZE)

In [19]:
%%time
# CPU: 43.8s
features = predict_fn(chopped_resnet50, fake_input_data_cf, BATCH_SIZE)

CPU times: user 3min 47s, sys: 12.7 s, total: 4min
Wall time: 43.8 s
