## The Dog v Cat Problem using a CNN + STN
A 2-layer convolutional network followed by a fully connected layer is combined with Spatial Transformer Network (STN) layers for image classification (dog vs. cat).  
There are two STN layers: one before the first convolution layer and one before the second convolution layer.

I modified the original implementation of the STN to be compatible with TensorFlow 1.0.

The `utils` folder contains:  
1. The modified spatial transformer implementation (`spatial_transformer`) in ./utils/
2. The `batchify` function, which yields batches of the data

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import scipy.misc
import tensorflow as tf

from tflearn.data_utils import to_categorical
from utils import load_data
from utils import spatial_transformer
from utils import vgg19
from utils import batchify

In [None]:
# Number of samples per training batch; passed to stn_cnn as batch_size.
BATCH_SIZE = 10
# Two-element image size; passed to load_data.load_data and to stn_cnn as image_size.
IMAGE_SIZE = (64, 64)

Load the data using the `load_data` function in ./utils/.

In [None]:
# Replace the placeholder path with the folder that holds the data.
# load_data.load_data returns three values; the third one is not used here.
# NOTE(review): Y_train is later fed to a placeholder of shape (None, 2) --
# confirm that load_data returns labels in that layout.
X_train, Y_train, _ = load_data.load_data('path_to_training_and_test_folder', IMAGE_SIZE)

### CNN + STN

In [None]:
def stn_cnn(X_data, Y_data, image_size, batch_size, epochs, learning_rate=1e-3):
    '''
    A CNN network enhanced with Spatial Transformer Networks

    The graph built here is:
        STN -> conv -> ReLU -> STN -> conv -> ReLU -> fully connected
    and it is trained with Adam on a softmax cross-entropy loss.

    INPUTS:
    - X_data        - image data
    - Y_data        - classifier data; batches of it are fed to a placeholder
                      of shape (None, 2)
    - image_size    - size of the input images (a 2-tuple, unpacked into the
                      input placeholder shape and used as the STN output size)
    - batch_size    - Size of the batches of input data
    - epochs        - number of epochs
    - learning_rate - step size of the Adam optimizer (default 1e-3)

    OUTPUTS:
    - returns the training loss list (one entry per training batch)
    '''

    def _flatten(input_layer):
        '''
        Flattens every dimension of a layer except the first one

        INPUTS:
        - input_layer : preceding layer from network

        OUTPUTS:
        - returns the tuple (flattened layer of shape [-1, dim], dim)
        '''
        dim = 1
        for d in input_layer.get_shape().as_list()[1:]:
            dim *= d
        return tf.reshape(input_layer, [-1, dim]), dim

    def _initial_weights(shape):
        '''
        Creates random initial values for a weight tensor

        The standard deviation is sqrt(2 / fan_in), where fan_in is the
        product of every dimension except the last one, so the scale of the
        activations does not depend on the size of the layer.

        INPUTS:
        - shape : shape of the weight tensor, output dimension last

        OUTPUTS:
        - returns a float32 tensor of normally distributed values
        '''
        fan_in = 1
        for d in shape[:-1]:
            fan_in *= d
        return tf.random_normal(shape, stddev=(2.0 / fan_in) ** 0.5)

    def _conv_layer(input_layer, weights, bias):
        '''
        Creates a convolutional layer

        INPUTS:
        - input_layer : preceding layer from network
        - weights     : initial value of the filter for the convolution operation
        - bias        : initial value of the bias vector

        OUTPUTS:
        - returns a bias added convoluted layer
        '''
        weights = tf.Variable(initial_value=weights, trainable=True, name='conv_weights')
        bias = tf.Variable(initial_value=bias, trainable=True, name='conv_bias')
        # a stride of 2 in both spatial dimensions is used instead of pooling
        conv_layer = tf.nn.conv2d(input_layer,
                                  filter=weights,
                                  strides=(1, 2, 2, 1),
                                  padding='SAME')
        return tf.nn.bias_add(conv_layer, bias)

    def _full_layer(input_layer, weights, bias):
        '''
        Creates a fully connected layer

        INPUTS:
        - input_layer : preceding layer from network
        - weights     : initial value of the weight matrix; its last
                        dimension is the number of units of this layer
        - bias        : initial value of the bias vector

        OUTPUTS:
        - returns the flattened input multiplied by the weights, bias added
        '''
        flat, _ = _flatten(input_layer)
        weights = tf.Variable(initial_value=weights, trainable=True, name='full_layer_weights')
        bias = tf.Variable(initial_value=bias, trainable=True, name='full_layer_bias')
        return tf.nn.bias_add(tf.matmul(flat, weights), bias)

    def transform_layer(input_layer, out_size):
        '''
        Creates a Spatially Transformer Network (STN) layer

        A single linear layer maps the flattened input to the 6 parameters
        (theta) that are handed to spatial_transformer.transformer.

        INPUTS:
        - input_layer : preceding layer from network
        - out_size    : the size of the output image

        OUTPUTS:
        - outputs a Spatially Transformed Layer from the input layer
        '''
        flat, dim = _flatten(input_layer)
        # Zero weights together with the bias [1, 0, 0, 0, 1, 0] make theta
        # start out as exactly the identity transform for every input.
        weights = tf.Variable(tf.zeros([dim, 6]), trainable=True, name='transform_weights')
        bias = tf.Variable(initial_value=np.array([1., 0., 0., 0., 1., 0.], dtype=np.float32),
                           trainable=True, name='transform_bias')
        # No ReLU here: the transform parameters must be free to become
        # negative, otherwise half of the possible transforms are unreachable.
        theta = tf.nn.bias_add(tf.matmul(flat, weights), bias)
        return spatial_transformer.transformer(input_layer, theta, out_size)

    def _drop_layer(input_layer, keep_prob=0.5):
        '''
        Creates a dropout layer (a ReLU is applied before the dropout)

        INPUTS:
        - input_layer : preceding layer from network
        - keep_prob   : the probability of keeping unit
                        default value = 0.5

        OUTPUTS:
        - outputs a dropout layer
        '''
        return tf.nn.dropout(tf.nn.relu(input_layer), keep_prob)

    def _relu_layer(input_layer):
        '''
        Creates a RELU activated layer

        INPUTS:
        - input_layer : preceding layer from network

        OUTPUTS:
        - outputs a RELU activated layer
        '''
        return tf.nn.relu(input_layer)

    def _pool_layer(input_layer, pool_func='avg'):
        '''
        Creates a {avg, max}-pool layer with a 2x2 window and stride 2

        INPUTS:
        - input_layer : preceding layer from network
        - pool_func   : 'avg' for average pooling, anything else for max pooling

        OUTPUTS:
        - 'avg' : An average pooled layer from the input layer
        - Else  : A max pooled layer from the input layer
        '''
        pool = tf.nn.avg_pool if pool_func == 'avg' else tf.nn.max_pool
        return pool(input_layer,
                    ksize=(1, 2, 2, 1),
                    strides=(1, 2, 2, 1),
                    padding='SAME')

    X = tf.placeholder(tf.float32, shape=(batch_size, *image_size, 3), name='X')
    Y = tf.placeholder(tf.float32, shape=(None, 2), name='Y')

    # the first Spatially Transformed Network layer
    transformed_layer_1 = transform_layer(X, image_size)

    # we choose the filter size and the bias here
    # conv1_weight.shape = (filter_size, filter_size, no_of_channels, no_of_filters)
    # conv1_bias.shape = (no_of_filters, )
    # The filters start random and the biases at zero: all-zero filters would
    # make the initial layer output independent of its input.
    conv1_weight = _initial_weights((3, 3, 3, 64))
    conv1_bias = tf.zeros([64], dtype=tf.float32)
    conv_layer1 = _conv_layer(transformed_layer_1, conv1_weight, conv1_bias)
    relu_layer1 = _relu_layer(conv_layer1)

    # the second Spatially Transformed Network layer
    # (image_size is again passed as the requested output size)
    transformed_layer_2 = transform_layer(relu_layer1, image_size)

    # we choose the filter size and the bias here
    # conv2_weight.shape = (filter_size, filter_size, no_of_filters_conv1, no_of_new_filters)
    # conv2_bias.shape = (no_of_new_filters, )
    conv2_weight = _initial_weights((3, 3, 64, 64))
    conv2_bias = tf.zeros([64], dtype=tf.float32)
    conv_layer2 = _conv_layer(transformed_layer_2, conv2_weight, conv2_bias)
    relu_layer2 = _relu_layer(conv_layer2)

    # the fully connected layer producing the 2 class logits
    _, dim = _flatten(relu_layer2)
    full_weights = _initial_weights((dim, 2))
    full_biases = tf.zeros([2], dtype=tf.float32)
    full_layer1 = _full_layer(relu_layer2, full_weights, full_biases)

    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=full_layer1, labels=Y))

    train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    init = tf.global_variables_initializer()

    training_loss_list = []
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(epochs):
            for X_batch, Y_batch in batchify.batchify(X_data, Y_data, batch_size):
                _, training_loss_value = sess.run([train_step, loss],
                                                  feed_dict={X: X_batch, Y: Y_batch})
                training_loss_list.append(training_loss_value)

    return training_loss_list

In [None]:
# Build the network and train it for 100 epochs on the loaded data.
# stn_cnn returns the list of training losses.
stn_cnn(X_train, Y_train, IMAGE_SIZE, BATCH_SIZE, epochs=100)