# Transfer Learning to detect cats / dogs using Vgg16

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import skimage
import skimage.io
import skimage.transform

In [2]:
# Per-channel values subtracted from every input image in `load_image`;
# they are applied to the channels after the image has been flipped to BGR.
vgg_mean = [103.939, 116.779, 123.68]

# One class label per line of synset.txt; the context manager closes the
# file handle deterministically instead of leaving it to the garbage collector.
with open('synset.txt') as synset_file:
    classes = [line.strip() for line in synset_file]

## Vgg16 Model Class

In [3]:
class Vgg16Model:
    """VGG16 network assembled from pre-trained weights stored in a ``.npy`` file.

    The weight file must contain a pickled dict mapping each layer name
    (``'conv1_1'`` ... ``'fc8'``) to a ``[kernel, bias]`` pair; these values
    are used as constant initializers for the corresponding layers.
    """

    def __init__(self, weights_path='./vgg16.npy'):
        """Load the pre-trained weights and set the layer defaults.

        Args:
            weights_path: Path of the ``.npy`` weight file.
        """
        # The path argument used to be ignored in favour of a hard-coded file name.
        # The file is a pickled dict, so ``allow_pickle=True`` is required on
        # NumPy >= 1.16.3. SECURITY: unpickling can execute arbitrary code, so
        # only load weight files obtained from a trusted source.
        self.weights = np.load(weights_path, encoding='latin1', allow_pickle=True).item()
        self.activation_fn = tf.nn.relu
        self.conv_padding = 'SAME'
        self.pool_padding = 'SAME'
        self.use_bias = True

    def build(self, input_tensor, trainable=False):
        """Create the VGG16 graph on top of ``input_tensor``.

        Every layer output is stored as an attribute named after the layer
        (``self.conv1_1`` ... ``self.fc8``, ``self.predictions``).

        Args:
            input_tensor: Batch of images. The flatten step before ``fc6``
                assumes a 7x7x512 feature map, i.e. 224x224 inputs as used
                in this notebook.
            trainable: Whether the created variables are trainable.
        """
        self.conv1_1 = self.conv2d(input_tensor, 'conv1_1', 64, trainable)
        self.conv1_2 = self.conv2d(self.conv1_1, 'conv1_2', 64, trainable)

        # Max-pooling is performed over a 2 x 2 pixel window, with stride 2.
        self.max_pool1 = tf.layers.max_pooling2d(self.conv1_2, (2, 2), (2, 2), padding=self.pool_padding)

        self.conv2_1 = self.conv2d(self.max_pool1, 'conv2_1', 128, trainable)
        self.conv2_2 = self.conv2d(self.conv2_1, 'conv2_2', 128, trainable)

        self.max_pool2 = tf.layers.max_pooling2d(self.conv2_2, (2, 2), (2, 2), padding=self.pool_padding)

        self.conv3_1 = self.conv2d(self.max_pool2, 'conv3_1', 256, trainable)
        self.conv3_2 = self.conv2d(self.conv3_1, 'conv3_2', 256, trainable)
        self.conv3_3 = self.conv2d(self.conv3_2, 'conv3_3', 256, trainable)

        self.max_pool3 = tf.layers.max_pooling2d(self.conv3_3, (2, 2), (2, 2), padding=self.pool_padding)

        self.conv4_1 = self.conv2d(self.max_pool3, 'conv4_1', 512, trainable)
        self.conv4_2 = self.conv2d(self.conv4_1, 'conv4_2', 512, trainable)
        self.conv4_3 = self.conv2d(self.conv4_2, 'conv4_3', 512, trainable)

        self.max_pool4 = tf.layers.max_pooling2d(self.conv4_3, (2, 2), (2, 2), padding=self.pool_padding)

        self.conv5_1 = self.conv2d(self.max_pool4, 'conv5_1', 512, trainable)
        self.conv5_2 = self.conv2d(self.conv5_1, 'conv5_2', 512, trainable)
        self.conv5_3 = self.conv2d(self.conv5_2, 'conv5_3', 512, trainable)

        self.max_pool5 = tf.layers.max_pooling2d(self.conv5_3, (2, 2), (2, 2), padding=self.pool_padding)

        # Flatten the final feature map for the fully connected layers.
        reshaped = tf.reshape(self.max_pool5, shape=(-1, 7 * 7 * 512))

        self.fc6 = self.fc(reshaped, 'fc6', 4096, trainable)
        self.fc7 = self.fc(self.fc6, 'fc7', 4096, trainable)

        # fc8 yields the raw class logits: applying ReLU here would clip all
        # negative logits to zero and distort the softmax probabilities.
        self.fc8 = self.fc(self.fc7, 'fc8', 1000, trainable, apply_activation=False)

        self.predictions = tf.nn.softmax(self.fc8, name='predictions')

    def conv2d(self, layer, name, n_filters, trainable, k_size=3):
        """Add a convolution layer initialised from ``self.weights[name]``.

        Args:
            layer: Input tensor of the layer.
            name: Layer name; also the key into the loaded weights.
            n_filters: Number of output filters.
            trainable: Whether the layer variables are trainable.
            k_size: Side length of the square kernel.

        Returns:
            The output tensor of the layer.
        """
        return tf.layers.conv2d(layer, n_filters, kernel_size=(k_size, k_size),
                                activation=self.activation_fn, padding=self.conv_padding, name=name, trainable=trainable,
                                kernel_initializer=tf.constant_initializer(self.weights[name][0], dtype=tf.float32),
                                bias_initializer=tf.constant_initializer(self.weights[name][1], dtype=tf.float32),
                                use_bias=self.use_bias)

    def fc(self, layer, name, size, trainable, apply_activation=True):
        """Add a fully connected layer initialised from ``self.weights[name]``.

        Args:
            layer: Input tensor of the layer.
            name: Layer name; also the key into the loaded weights.
            size: Number of output units.
            trainable: Whether the layer variables are trainable.
            apply_activation: If False the layer is linear (no activation),
                as needed for the logits layer that feeds the softmax.

        Returns:
            The output tensor of the layer.
        """
        activation = self.activation_fn if apply_activation else None
        return tf.layers.dense(layer, size, activation=activation,
                               name=name, trainable=trainable,
                               kernel_initializer=tf.constant_initializer(self.weights[name][0], dtype=tf.float32),
                               bias_initializer=tf.constant_initializer(self.weights[name][1], dtype=tf.float32),
                               use_bias=self.use_bias)

## Images conversion for Vgg16

Input images must have shape (224, 224, 3), with the last (channel) dimension ordered BGR (blue, green, red).

In [4]:
# https://github.com/machrisaa/tensorflow-vgg/blob/master/utils.py
def load_image(image_path, mean=vgg_mean):
    """Load an image file and convert it to the VGG16 input format used here.

    The image is centre-cropped to a square, resized to 224x224, its channel
    axis is reversed and ``mean`` is subtracted per channel.

    Args:
        image_path: Path of the image file, handed to ``skimage.io.imread``.
        mean: Per-channel values subtracted after the channel reversal.

    Returns:
        Float array of shape (224, 224, 3).
    """
    image = skimage.io.imread(image_path)

    # NOTE(review): casting to float before resizing presumably keeps the
    # 0-255 value range that `mean` is expressed in -- confirm against the
    # installed scikit-image version.
    image = image.astype(float)

    if image.ndim == 2:
        # Grayscale file: replicate the single channel so that the channel
        # reversal and mean subtraction below work on three channels.
        image = np.stack([image] * 3, axis=-1)
    elif image.shape[2] == 4:
        # RGBA file: drop the alpha channel, the network takes three channels.
        image = image[:, :, :3]

    # Centre-crop to a square with the length of the shorter edge.
    short_edge = min(image.shape[:2])
    yy = (image.shape[0] - short_edge) // 2
    xx = (image.shape[1] - short_edge) // 2
    crop_image = image[yy: yy + short_edge, xx: xx + short_edge]

    resized_image = skimage.transform.resize(crop_image, (224, 224), mode='constant')

    # Reverse the channel axis (RGB -> BGR for files read as RGB) and centre.
    bgr = resized_image[:, :, ::-1] - mean

    return bgr

## Extract Vgg16 features

In [5]:
import os
import math

# Directory holding the training images that are fed through VGG16.
dataset_dir = './datasets/dogs-vs-cats-redux-kernels-edition/train/'

# Number of images per forward pass.
batch_size = 32

# Every directory entry is treated as one image file.
filenames = os.listdir(dataset_dir)
num_files = len(filenames)

# Ceiling division: a final, smaller batch picks up the remainder.
num_batches = (num_files + batch_size - 1) // batch_size

In [6]:
import time

tf.reset_default_graph()

# create mapping of filename -> vgg features
codes_fc6 = {}
codes_fc7 = {}
predictions = {}  # not populated in this cell

# allow_soft_placement lets TensorFlow fall back to an available device when
# '/gpu:0' does not exist, instead of failing on CPU-only machines.
session_config = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)

with tf.device('/gpu:0'):
    with tf.Session(config=session_config) as sess:
        _input = tf.placeholder(tf.float32, shape=(None, 224, 224, 3), name="images")

        vgg = Vgg16Model()
        vgg.build(_input)

        sess.run(tf.global_variables_initializer())

        for batch_index in range(num_batches):
            batch_start = batch_index * batch_size
            batch_filenames = filenames[batch_start:batch_start + batch_size]

            print("batch {} of {}".format(batch_index + 1, num_batches))

            start = time.time()
            images = np.array([load_image(os.path.join(dataset_dir, f)) for f in batch_filenames])
            end = time.time()
            print("\timage loading took {:.4f} sec".format(end-start))

            start = end

            batch_codes_fc6, batch_codes_fc7 = sess.run(
                [vgg.fc6, vgg.fc7],
                feed_dict={ _input: images }
            )

            end = time.time()
            print("\tprediction took {:.4f} sec".format(end-start))

            # Pair each file with its feature rows; a dedicated loop variable
            # avoids shadowing the batch counter of the outer loop.
            for filename, code_fc6, code_fc7 in zip(batch_filenames, batch_codes_fc6, batch_codes_fc7):
                codes_fc6[filename] = code_fc6
                codes_fc7[filename] = code_fc7

        # Each dict is written as a pickled object array.
        np.save('codes_fc6.npy', codes_fc6)
        np.save('codes_fc7.npy', codes_fc7)


batch 1 of 782
	image loading took 0.2762 sec
	prediction took 2.0549 sec
batch 2 of 782
	image loading took 0.2554 sec
	prediction took 0.3229 sec
batch 3 of 782
	image loading took 0.2393 sec
	prediction took 0.3170 sec
batch 4 of 782
	image loading took 0.2508 sec
	prediction took 0.3232 sec
batch 5 of 782
	image loading took 0.2529 sec
	prediction took 0.3166 sec
batch 6 of 782
	image loading took 0.2636 sec
	prediction took 0.3328 sec
batch 7 of 782
	image loading took 0.2826 sec
	prediction took 0.3327 sec
batch 8 of 782
	image loading took 0.2571 sec
	prediction took 0.3146 sec
batch 9 of 782
	image loading took 0.2320 sec
	prediction took 0.3223 sec
batch 10 of 782
	image loading took 0.2507 sec
	prediction took 0.3153 sec
batch 11 of 782
	image loading took 0.2427 sec
	prediction took 0.3172 sec
batch 12 of 782
	image loading took 0.2591 sec
	prediction took 0.3344 sec
batch 13 of 782
	image loading took 0.2411 sec
	prediction took 0.3268 sec
batch 14 of 782
	image loading too

## Checkpoint - Vgg16 Features extracted and serialized

The features for each image should now be stored in the files `codes_fc6.npy` and `codes_fc7.npy` (one file per extracted layer).

In [7]:
#reset python environment
%reset -f

In [8]:
import numpy as np
import tensorflow as tf

In [9]:
from collections import OrderedDict
# codes_fc6.npy holds a pickled dict (filename -> fc6 feature vector), so
# allow_pickle=True is required on NumPy >= 1.16.3.
# SECURITY: unpickling can execute arbitrary code; only load files that were
# produced by the feature-extraction cell of this notebook.
codes_array = np.load('codes_fc6.npy', allow_pickle=True)

# .item() unwraps the 0-d object array back into the dict.
codes = OrderedDict(codes_array.item())

In [10]:
from sklearn.model_selection import StratifiedShuffleSplit

# Fixed seed so the train/validation split is identical on every run.
SPLIT_SEED = 42

keys = list(codes.keys())

# one hot encode labels: (1, 0) when the file name starts with 'dog', else (0, 1)
labels = np.array([(1, 0) if name[:3] == 'dog' else (0, 1) for name in keys])

# extract the feature vectors (the model inputs), in the same order as `keys`
images = np.array(list(codes.values()))

# sanity check: row i of `images` belongs to keys[i]
for i, key in enumerate(keys):
    assert (codes.get(key) == images[i]).all()

# Hold out 10% of the samples for validation, stratified by label.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=SPLIT_SEED)

train_indices, val_indices = next(splitter.split(images, labels))

train_images, train_labels = images[train_indices], labels[train_indices]
val_images, val_labels = images[val_indices], labels[val_indices]

In [11]:
def get_batches(x, y, batch_size=32):
    """Yield consecutive ``(x_batch, y_batch)`` slices of at most ``batch_size`` rows.

    The row count is taken from the arguments themselves (it used to be read
    from the global ``train_labels``, which made the function wrong for any
    other input).

    Args:
        x: Sliceable sequence of inputs (e.g. list or NumPy array).
        y: Sliceable sequence of targets with the same length as ``x``.
        batch_size: Maximum number of rows per batch; must be >= 1.

    Yields:
        Tuples ``(x[start:stop], y[start:stop])``. The last batch is smaller
        when the row count is not a multiple of ``batch_size``. Nothing is
        yielded for empty input.

    Raises:
        ValueError: If ``batch_size`` < 1 or ``x`` and ``y`` differ in length
            (raised when the generator is first advanced).
    """
    if batch_size < 1:
        raise ValueError('batch_size must be >= 1, got {}'.format(batch_size))

    num_rows = len(x)
    if len(y) != num_rows:
        raise ValueError('x and y must have the same length, got {} and {}'.format(num_rows, len(y)))

    for start in range(0, num_rows, batch_size):
        stop = start + batch_size
        yield x[start:stop], y[start:stop]

In [17]:
# Hyper-parameters of the classifier trained on the extracted VGG16 features.
num_epochs = 5
learning_rate = 0.01
keep_prob = 0.5               # dropout keep probability used while training
batch_size = 64
accuracy_print_steps = 10     # report validation accuracy every N training steps
iteration = 0

tf.reset_default_graph()

# Inputs: 4096-dimensional feature vectors and one-hot labels with two classes.
_images = tf.placeholder(tf.float32, shape=(None, 4096), name='images')
_labels = tf.placeholder(tf.float32, shape=(None, 2), name='labels')
_keep_prob = tf.placeholder(tf.float32, name='keep_probability')

hidden = tf.contrib.layers.fully_connected(_images, 256)
hidden = tf.nn.dropout(hidden, keep_prob=_keep_prob, name='hidden_dropout')

# Linear output layer; the softmax is applied inside the loss / `predictions`.
logits = tf.contrib.layers.fully_connected(hidden, 2, activation_fn=None)

cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=_labels, name='cross_entropy')

cost = tf.reduce_mean(cross_entropy, name='cost')

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

predictions = tf.nn.softmax(logits, name='predictions')

correct_predictions = tf.equal(tf.argmax(predictions, 1), tf.argmax(_labels, 1), name='correct_predictions')
accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name='accuracy')

# allow_soft_placement lets TensorFlow fall back to an available device when
# '/gpu:0' does not exist, instead of failing on CPU-only machines.
session_config = tf.ConfigProto(allow_soft_placement=True)

with tf.device('/gpu:0'):
    with tf.Session(config=session_config) as sess:
        sess.run(tf.global_variables_initializer())

        for epoch in range(num_epochs):
            for batch_train_images, batch_train_labels in get_batches(train_images, train_labels, batch_size=batch_size):
                # Only the loss is fetched besides the training op; the logits
                # were previously fetched as well but never used.
                train_loss, _ = sess.run(
                    [cost, optimizer],
                    feed_dict = {
                        _images: batch_train_images,
                        _labels: batch_train_labels,
                        _keep_prob: keep_prob
                    })

                iteration = iteration + 1

                if iteration % accuracy_print_steps == 0:
                    # Dropout is disabled (keep probability 1) for evaluation.
                    val_acc = sess.run(accuracy, feed_dict ={
                        _images: val_images,
                        _labels: val_labels,
                        _keep_prob: 1.
                    })

                    # Accuracy is measured on the validation set, Loss on the last training batch.
                    print('{} / {} Accuracy: {} Loss: {}'.format(epoch + 1, num_epochs, val_acc, train_loss))

1 / 5 Accuracy: 0.9744000434875488 Loss: 5.036487102508545
1 / 5 Accuracy: 0.9819999933242798 Loss: 0.0
1 / 5 Accuracy: 0.9772000312805176 Loss: 0.9204965829849243
1 / 5 Accuracy: 0.9839999675750732 Loss: 0.13784725964069366
1 / 5 Accuracy: 0.9860000014305115 Loss: 0.19984659552574158
1 / 5 Accuracy: 0.9843999743461609 Loss: 0.01619289256632328
1 / 5 Accuracy: 0.985200047492981 Loss: 0.0748998373746872
1 / 5 Accuracy: 0.9851999282836914 Loss: 0.0004492068837862462
1 / 5 Accuracy: 0.9860000014305115 Loss: 0.023588810116052628
1 / 5 Accuracy: 0.9863998889923096 Loss: 0.03603707626461983
1 / 5 Accuracy: 0.9855999946594238 Loss: 0.0067691137082874775
1 / 5 Accuracy: 0.9816000461578369 Loss: 0.0016983392415568233
1 / 5 Accuracy: 0.9796000719070435 Loss: 0.1684042364358902
1 / 5 Accuracy: 0.9844000339508057 Loss: 0.058293215930461884
1 / 5 Accuracy: 0.9871999025344849 Loss: 0.019754894077777863
1 / 5 Accuracy: 0.9843999743461609 Loss: 0.01443050429224968
1 / 5 Accuracy: 0.9835999608039856 Lo