# MULTI-GPU TRAINING (WORK-IN-PROGRESS)


**Notes as of 2018-03-10** Attempted multi-gpu training using [Keras multi_gpu_model()](https://keras.io/utils/) capability. Tested on AWS p2.8xlarge instance with 8 Nvidia K80 gpus. Encountered couple of problems:

Did not see significant reduction in time per epoch. Original times about 165 seconds per epoch. With 8 GPU per epoch time reduced to about 157 seconds. One possibility is bottleneck when updating the model template housed on /cpu:0 device. Tried various batch/sub-batch sizes. Unable to keep the 8 gpus highly utlized.

ModelChekcpoint call-back to save best weights did not work in muti-gpu mode. Disabled that call-back. More research needed to see how this could work with multiple gpus.

In [1]:
import tensorflow as tf
import keras
keras.__version__

  from ._conv import register_converters as _register_converters
  (fname, cnt))
  (fname, cnt))
Using TensorFlow backend.


'2.1.4'

In [2]:
import os
import numpy as np
from keras.preprocessing.image import ImageDataGenerator

In [3]:
from keras import models
from keras import layers
from keras import optimizers
from keras.preprocessing.image import ImageDataGenerator

from keras.utils import multi_gpu_model

In [4]:
from keras import backend as K

## Setup run-time configuration

In [5]:
# training data specific names
base_model_name = './models/vgg16_full_multi_gpu_base_model.h5'
finetuned_model_name = './models/vgg16_full_multi_gpu_finetuned_model.h5'

train_dir = '/mystuff/cats_dogs_data/train_full'

# location for validation and test data
validation_dir = '/mystuff/cats_dogs_data/validation'
test_dir = '/mystuff/cats_dogs_data/test'


# training run specific parameters
NUMBER_GPUS = 8
BASE_MODEL_EPOCHS=30
FINETUNED_MODEL_EPOCHS=50
BATCH_SIZE = 1024

TRAIN_SAMPLES = len(os.listdir(os.path.join(train_dir,'cats'))) + len(os.listdir(os.path.join(train_dir,'dogs')))
VALIDATION_SAMPLES = len(os.listdir(os.path.join(validation_dir,'cats'))) + \
    len(os.listdir(os.path.join(validation_dir,'dogs')))
    
TEST_SAMPLES = len(os.listdir(os.path.join(test_dir,'cats'))) + len(os.listdir(os.path.join(test_dir,'dogs')))

print('Number of samples: training {:d}, validation {:d} and test {:d}'.format(TRAIN_SAMPLES, 
                                                                               VALIDATION_SAMPLES,TEST_SAMPLES))

Number of samples: training 21000, validation 2000 and test 2000


# Using a pre-trained convnet



## Feature extraction for VGG16 model



In [6]:
from keras.applications import VGG16

In [7]:
with tf.device('/cpu:0'):
    conv_base = VGG16(weights='imagenet',
                      include_top=False,
                      input_shape=(150, 150, 3))

    model = models.Sequential()
    model.add(conv_base)
    model.add(layers.Flatten())
    model.add(layers.Dense(256, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))


This is what our model looks like now:

In [8]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg16 (Model)                (None, 4, 4, 512)         14714688  
_________________________________________________________________
flatten_1 (Flatten)          (None, 8192)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               2097408   
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 16,812,353
Trainable params: 16,812,353
Non-trainable params: 0
_________________________________________________________________


### Freeze base model layers

In [9]:
print('This is the number of trainable weights '
      'before freezing the conv base:', len(model.trainable_weights))

This is the number of trainable weights before freezing the conv base: 30


In [10]:
conv_base.trainable = False

In [11]:
print('This is the number of trainable weights '
      'after freezing the conv base:', len(model.trainable_weights))

This is the number of trainable weights after freezing the conv base: 4


In [12]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg16 (Model)                (None, 4, 4, 512)         14714688  
_________________________________________________________________
flatten_1 (Flatten)          (None, 8192)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               2097408   
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 16,812,353
Trainable params: 2,097,665
Non-trainable params: 14,714,688
_________________________________________________________________


### Setup training data pipeline using data augmenation

In [13]:


train_datagen = ImageDataGenerator(
      rescale=1./255,
      rotation_range=40,
      width_shift_range=0.2,
      height_shift_range=0.2,
      shear_range=0.2,
      zoom_range=0.2,
      horizontal_flip=True,
      fill_mode='nearest')

# Note that the validation data should not be augmented!
test_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
        # This is the target directory
        train_dir,
        # All images will be resized to 150x150
        target_size=(150, 150),
        batch_size=BATCH_SIZE,  # original
        #batch_size=50,
        # Since we use binary_crossentropy loss, we need binary labels
        class_mode='binary')

validation_generator = test_datagen.flow_from_directory(
        validation_dir,
        target_size=(150, 150),
        batch_size=BATCH_SIZE,
        class_mode='binary')

Found 21000 images belonging to 2 classes.
Found 2000 images belonging to 2 classes.


### Train new classifer specific to cat vs dog using features created by baseline model

In [14]:
parallel_model = multi_gpu_model(model, gpus=NUMBER_GPUS)

parallel_model.compile(loss='binary_crossentropy',
              optimizer=optimizers.Adam(),
              metrics=['acc'])

In [15]:
# set up to capture best performance weights ecountered during training
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint('./models/weights_best.hdf5', monitor='val_acc', verbose=1, save_best_only=True,
mode='max')
callbacks_list = [checkpoint]


history = parallel_model.fit_generator(
      train_generator,
      steps_per_epoch=TRAIN_SAMPLES/BATCH_SIZE,
      epochs=BASE_MODEL_EPOCHS,  
      validation_data=validation_generator,
      validation_steps=VALIDATION_SAMPLES/BATCH_SIZE,
      #callbacks=callbacks_list,
      verbose=2)

Epoch 1/30
 - 240s - loss: 0.9476 - acc: 0.5773 - val_loss: 0.5220 - val_acc: 0.7950
Epoch 2/30
 - 153s - loss: 0.5194 - acc: 0.7671 - val_loss: 0.3754 - val_acc: 0.8515
Epoch 3/30
 - 158s - loss: 0.4246 - acc: 0.8161 - val_loss: 0.3022 - val_acc: 0.8790
Epoch 4/30


KeyboardInterrupt: 

In [None]:
# load best performance weights
#model.load_weights('./models/weights_best.hdf5')

# save trained model
model.save(base_model_name)

Examine learning rates:

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

In [None]:

test_generator = test_datagen.flow_from_directory(
        test_dir,
        target_size=(150, 150),
        batch_size=BATCH_SIZE,
        class_mode='binary')

test_loss, test_acc = model.evaluate_generator(test_generator, steps=TEST_SAMPLES/BATCH_SIZE)
print('base model test acc:', test_acc)

## Fine-tuning



Model before setup for fine tuning

In [None]:
model.summary()

In [None]:
conv_base.summary()

In [None]:
conv_base.trainable = True

set_trainable = False
for layer in conv_base.layers:
    if layer.name == 'block4_conv1':
        set_trainable = True
    if set_trainable:
        layer.trainable = True
    else:
        layer.trainable = False

In [None]:
conv_base.summary()

In [None]:
model.summary()


Now let's proceed with fine-tuning:

In [None]:
checkpoint = ModelCheckpoint('./models/weights_best.hdf5', monitor='val_acc', verbose=1, save_best_only=True,
mode='max')
callbacks_list = [checkpoint]


model.compile(loss='binary_crossentropy',
              optimizer=optimizers.RMSprop(lr=1e-5),  # use low rate to avoid large changes in cnn layer
              metrics=['acc'])

history = model.fit_generator(
      train_generator,
      steps_per_epoch=TRAIN_SAMPLES/BATCH_SIZE,
      epochs=FINETUNED_MODEL_EPOCHS,
      validation_data=validation_generator,
      validation_steps=VALIDATION_SAMPLES/BATCH_SIZE,
      callbacks=callbacks_list)

In [None]:
model.load_weights('./models/weights_best.hdf5')
model.save(finetuned_model_name)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

In [None]:
test_generator = test_datagen.flow_from_directory(
        test_dir,
        target_size=(150, 150),
        batch_size=BATCH_SIZE,
        class_mode='binary')

test_loss, test_acc = model.evaluate_generator(test_generator, steps=TEST_SAMPLES/BATCH_SIZE)
print('fine tuned test acc:', test_acc)