In [1]:
# Example of using StagingAreaCallback for GPU prefetch 
#
# https://gist.github.com/bzamecnik/b9dbd50cdc195d54513cd2f9dfb7e21b

import math
from keras.applications import ResNet50
from keras.layers import Dense, Input, Conv2D, MaxPooling2D, Dropout, Flatten
from keras.models import Model
from keras.utils import to_categorical
from keras.utils.training_utils import multi_gpu_model
import numpy as np

from callbacks import StagingAreaCallback, SamplesPerSec
from keras_tf_multigpu.examples.datasets import create_synth_cifar10
from keras_tf_multigpu.examples.datasets import create_synth_imagenet

# Seed NumPy's global random generator with a fixed value.
np.random.seed(seed=42)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### CIFAR-10 training

In [2]:
def make_convnet(input, num_classes):
    """Apply the convolutional classifier layers to `input`.

    The stack is two convolution stages (three and two conv layers, each
    stage followed by 2x2 max-pooling and dropout of 0.25), then a flatten,
    a 1024-unit ReLU dense layer with dropout of 0.5, and a softmax layer.

    Args:
        input: tensor that the first convolution is called on.
        num_classes: number of units in the final softmax layer.

    Returns:
        The result of calling the final softmax `Dense` layer.
    """
    kernel = (3, 3)
    window = (2, 2)

    # Stage 1: 32 -> 64 -> 128 filters; only the first conv pads to 'same'.
    stage1 = Conv2D(32, kernel, padding='same', activation='relu')(input)
    stage1 = Conv2D(64, kernel, activation='relu')(stage1)
    stage1 = Conv2D(128, kernel, activation='relu')(stage1)
    stage1 = MaxPooling2D(pool_size=window)(stage1)
    stage1 = Dropout(0.25)(stage1)

    # Stage 2: 256 -> 512 filters, same pooling/dropout tail as stage 1.
    stage2 = Conv2D(256, kernel, padding='same', activation='relu')(stage1)
    stage2 = Conv2D(512, kernel, activation='relu')(stage2)
    stage2 = MaxPooling2D(pool_size=window)(stage2)
    stage2 = Dropout(0.25)(stage2)

    # Classifier head.
    hidden = Flatten()(stage2)
    hidden = Dense(1024, activation='relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    return Dense(num_classes, activation='softmax')(hidden)

In [3]:
def make_plain_model(input_shape, num_classes, nb_gpus):
    """Build and compile the convnet that is fed from regular arrays.

    Args:
        input_shape: shape of a single sample, passed to `Input(shape=...)`.
        num_classes: number of units in the softmax output layer.
        nb_gpus: number of GPUs (or a list of GPU ids) to replicate the
            model on. An integer below 2 skips `multi_gpu_model`, which
            rejects such values with a ValueError, and keeps the
            single-device model instead.

    Returns:
        The model compiled with the 'sgd' optimizer and the
        'categorical_crossentropy' loss.
    """
    # Named `image_input` so the builtin `input` is not shadowed.
    image_input = Input(shape=input_shape)
    model = Model(inputs=image_input,
                  outputs=make_convnet(image_input, num_classes))

    # multi_gpu_model only accepts gpus >= 2; lists of GPU ids are passed
    # through unchanged so that existing callers behave as before.
    replicate = isinstance(nb_gpus, (list, tuple)) or nb_gpus > 1
    if replicate:
        model = multi_gpu_model(model, gpus=nb_gpus)

    model.compile(optimizer='sgd', loss='categorical_crossentropy')
    return model

In [4]:
def make_tensor_model(staging_area_callback, num_classes, nb_gpus):
    """Build and compile the convnet that is fed from staging-area tensors.

    Args:
        staging_area_callback: object exposing `input_tensor`,
            `target_tensor` and `extra_ops`. The first becomes the model
            input, the second the training target, and the third is
            forwarded to `compile` as `fetches`.
        num_classes: number of units in the softmax output layer.
        nb_gpus: number of GPUs (or a list of GPU ids) to replicate the
            model on. An integer below 2 skips `multi_gpu_model`, which
            rejects such values with a ValueError, and keeps the
            single-device model instead.

    Returns:
        The model compiled with the 'sgd' optimizer and the
        'categorical_crossentropy' loss.
    """
    # Named `image_input` so the builtin `input` is not shadowed.
    image_input = Input(tensor=staging_area_callback.input_tensor)
    model = Model(inputs=image_input,
                  outputs=make_convnet(image_input, num_classes))

    # multi_gpu_model only accepts gpus >= 2; lists of GPU ids are passed
    # through unchanged so that existing callers behave as before.
    replicate = isinstance(nb_gpus, (list, tuple)) or nb_gpus > 1
    if replicate:
        model = multi_gpu_model(model, gpus=nb_gpus)

    model.compile(optimizer='sgd', loss='categorical_crossentropy',
        target_tensors=[staging_area_callback.target_tensor],
        fetches=staging_area_callback.extra_ops)
    return model

In [5]:
# Settings for the CIFAR-10 sized benchmark run.
nb_gpus = 8
num_classes = 10
dataset_size = 50000
epochs = 5

# The batch size scales with the GPU count (2048 per GPU).
batch_size = 2048 * nb_gpus

In [6]:
# Create the synthetic CIFAR-10 sized arrays and cast both of them to float32.
x_train, y_train = (
    array.astype('float32')
    for array in create_synth_cifar10(dataset_size)
)

In [7]:
# Batches per epoch, rounded up because the last batch might be smaller.
# Integer ceiling division is exact and does not depend on `/` being true
# division (on Python 2 it floors, which would drop that last partial batch).
steps_per_epoch = (len(x_train) + batch_size - 1) // batch_size
# Callback given the batch size; presumably the source of the "Samples/sec"
# lines printed during training -- confirm in callbacks.py.
gauge = SamplesPerSec(batch_size)
# Supplies the input/target tensors and extra ops used by the pipelined model.
staging_area_callback = StagingAreaCallback(x_train, y_train, batch_size, prefetch_count=2)

In [8]:
print('training plain model:')
# Baseline: the arrays are handed to fit() directly.
plain_model = make_plain_model(
    input_shape=x_train.shape[1:],
    num_classes=num_classes,
    nb_gpus=nb_gpus,
)
history1 = plain_model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[gauge],
)

training plain model:
Epoch 1/5
Samples/sec: 26696.64
Epoch 2/5
Samples/sec: 26750.96
Epoch 3/5
Samples/sec: 26957.27
Epoch 4/5
Samples/sec: 27176.73
Epoch 5/5
Samples/sec: 27331.62


In [9]:
print('training pipelined model:')
pipelined_model = make_tensor_model(
    staging_area_callback=staging_area_callback,
    num_classes=num_classes,
    nb_gpus=nb_gpus,
)
# No x/y arrays are passed here: the model was built on the callback's input
# and target tensors, so only the number of steps per epoch is given.
history2 = pipelined_model.fit(
    epochs=epochs,
    steps_per_epoch=steps_per_epoch,
    callbacks=[staging_area_callback, gauge],
)

training pipelined model:
Epoch 1/5
Samples/sec: 30587.05
Epoch 2/5
Samples/sec: 31266.97
Epoch 3/5
Samples/sec: 31090.30
Epoch 4/5
Samples/sec: 31102.63
Epoch 5/5
Samples/sec: 31102.63


### ImageNet training

In [10]:
def make_tensor_model(staging_area_callback, num_classes, nb_gpus):
    """Build and compile a ResNet50 that is fed from staging-area tensors.

    NOTE: this deliberately reuses the name of the CIFAR-10 helper defined
    earlier in the notebook and shadows it from this cell onwards.

    Args:
        staging_area_callback: object exposing `input_tensor`,
            `target_tensor` and `extra_ops`. The first becomes the model
            input, the second the training target, and the third is
            forwarded to `compile` as `fetches`.
        num_classes: value passed to ResNet50 as `classes`.
        nb_gpus: number of GPUs (or a list of GPU ids) to replicate the
            model on. An integer below 2 skips `multi_gpu_model`, which
            rejects such values with a ValueError, and keeps the
            single-device model instead.

    Returns:
        The model compiled with the 'sgd' optimizer and the
        'categorical_crossentropy' loss.
    """
    # weights=None is passed so no pretrained weights are requested.
    model = ResNet50(input_tensor=staging_area_callback.input_tensor,
        classes=num_classes, weights=None)

    # multi_gpu_model only accepts gpus >= 2; lists of GPU ids are passed
    # through unchanged so that existing callers behave as before.
    replicate = isinstance(nb_gpus, (list, tuple)) or nb_gpus > 1
    if replicate:
        model = multi_gpu_model(model, gpus=nb_gpus)

    model.compile(optimizer='sgd', loss='categorical_crossentropy',
        target_tensors=[staging_area_callback.target_tensor],
        fetches=staging_area_callback.extra_ops)
    return model

In [11]:
# Settings for the ImageNet sized benchmark run.
nb_gpus = 8
num_classes = 1000
dataset_size = 10000
epochs = 5

# The batch size scales with the GPU count (32 per GPU).
batch_size = 32 * nb_gpus

In [12]:
# Create the synthetic ImageNet sized arrays (224 is the first argument of
# the generator) and cast both of them to float32.
x_train, y_train = (
    array.astype('float32')
    for array in create_synth_imagenet(224, dataset_size)
)

In [13]:
# Display the shapes of the input and target arrays.
tuple(array.shape for array in (x_train, y_train))

((10000, 224, 224, 3), (10000, 1000))

In [14]:
# Batches per epoch, rounded up because the last batch might be smaller.
# Integer ceiling division is exact and does not depend on `/` being true
# division (on Python 2 it floors, which would drop that last partial batch).
steps_per_epoch = (len(x_train) + batch_size - 1) // batch_size

# Callback given the batch size; presumably the source of the "Samples/sec"
# lines printed during training -- confirm in callbacks.py.
gauge = SamplesPerSec(batch_size)
# Supplies the input/target tensors and extra ops used by the pipelined model.
staging_area_callback = StagingAreaCallback(x_train, y_train, batch_size, prefetch_count=5)

In [15]:
print('training pipelined model:')
# Build the model on the staging area callback's tensors.
pipelined_model = make_tensor_model(staging_area_callback, num_classes, nb_gpus)
# %time is an IPython line magic, so the whole fit() call must stay on this
# single line; it reports the CPU and wall time of the training run.
%time pipelined_model.fit(steps_per_epoch=steps_per_epoch, epochs=epochs, callbacks=[staging_area_callback, gauge])

training pipelined model:
Epoch 1/5
Samples/sec: 367.41
Epoch 2/5
Samples/sec: 368.29
Epoch 3/5
Samples/sec: 367.88
Epoch 4/5
Samples/sec: 367.51
Epoch 5/5
Samples/sec: 368.14
CPU times: user 13min 3s, sys: 1min 12s, total: 14min 16s
Wall time: 4min 2s


<keras.callbacks.History at 0x7fde126afa90>