## Part IV:  [Mobilenet](https://arxiv.org/pdf/1704.04861.pdf) separable convolutions
- modify the model in part I to use *separable* convolutions
- check number of parameters and compare with the previous model
- train classifier

Architecture (same as in part I):
- conv output channels 64, 64, 128, 128, 128, 256, 256, 256, 512, 512, 512
- kernel shape (3,3)
- strides: 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1
- padding: SAME (snt.SAME)
- num_output_classes = 10


### Imports

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import time

import tensorflow as tf

# Don't forget to select GPU runtime environment in Runtime -> Change runtime type
# Ask TensorFlow for the name of the GPU device it can see and stop
# immediately unless it is exactly the first GPU, so the rest of the notebook
# never runs on an unexpected device.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

# we will use Sonnet on top of TF 
!pip install -q dm-sonnet
import sonnet as snt

import numpy as np

# Plotting library.
from matplotlib import pyplot as plt
import pylab as pl
from IPython import display

InternalError: ignored

In [0]:
# Reset graph: start from an empty default graph (presumably so that re-running
# the notebook does not keep adding ops and variables to the previous graph).
tf.reset_default_graph()

### Download dataset to be used for training and testing
- CIFAR-10 is the equivalent of MNIST for natural RGB images
- 60000 32x32 colour images in 10 classes: airplane, automobile, bird, cat, deer, dog, frog, horse, ship, truck
- train: 50000; test: 10000

In [0]:
cifar10 = tf.keras.datasets.cifar10
# (down)load dataset; yields a (images, labels) pair for the training split and
# another one for the test split (their shapes are printed in the next cell).
(train_images, train_labels), (test_images, test_labels) = cifar10.load_data()

In [0]:
#@title (optional) Check sizes of tensors
# Print a caption followed by the shape of each array returned by the loader.
for _caption, _array in (
    ('Size of training images', train_images),
    ('Size of training labels', train_labels),
    ('Size of test images', test_images),
    ('Size of test labels', test_labels),
):
  print(_caption)
  print(_array.shape)

# The training split must contain as many labels as images.
assert train_images.shape[0] == train_labels.shape[0]

Size of training images
(50000, 32, 32, 3)
Size of training labels
(50000, 1)
Size of test images
(10000, 32, 32, 3)
Size of test labels
(10000, 1)


### Prepare the data for training and testing
- for training, we use stochastic optimizers (e.g. SGD, Adam), so we need to sample at random mini-batches from the training dataset
- for testing, we iterate sequentially through the test set

In [0]:
# define dimension of the batches to sample from the datasets
BATCH_SIZE_TRAIN = 128 #@param
BATCH_SIZE_TEST = 100 #@param

# create Dataset objects using the data previously downloaded; each element is
# one (image, label) pair
dataset_train = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
# we shuffle the data and sample repeatedly batches for training; the shuffle
# buffer (100000) is larger than the 50000 training examples, and repeat()
# placed before batch() means the stream of training batches never ends
batched_dataset_train = dataset_train.shuffle(100000).repeat().batch(BATCH_SIZE_TRAIN)
# create iterator to retrieve batches
iterator_train = batched_dataset_train.make_one_shot_iterator()
# get a training batch of images and labels; these are symbolic tensors, a new
# batch is produced whenever they are evaluated in a session
(batch_train_images, batch_train_labels) = iterator_train.get_next()

# check that the shape of the training batches is the expected one
# print ('Shape of training images')
# print (batch_train_images)
# print ('Shape of training labels')
# print (batch_train_labels)

# we do the same for test dataset, but without shuffling: the test set is read
# sequentially and, because of repeat(), starts over once it is exhausted
dataset_test = tf.data.Dataset.from_tensor_slices((test_images, test_labels))
batched_dataset_test = dataset_test.repeat().batch(BATCH_SIZE_TEST)
iterator_test = batched_dataset_test.make_one_shot_iterator() 
(batch_test_images, batch_test_labels) = iterator_test.get_next()
# print ('Shape of test images')
# print (batch_test_images)
# print ('Shape of test labels')
# print (batch_test_labels)

In [0]:
#@title Preprocessing of data
# preprocess input for training and testing
def random_flip_left_right(image, flip_index, seed=None):
  """Randomly reverse an image, or every image of a batch, along one axis.

  Args:
    image: a 3-D tensor (one image) or a 4-D tensor (a batch of images). A
      tensor whose static rank is unknown is handled like a single image.
    flip_index: the axis to reverse, counted within a *single* image. For a
      4-D batch the leading batch axis is not counted, because the reversal is
      applied to each example separately.
    seed: optional seed forwarded to `tf.random_uniform`.

  Returns:
    A tensor of the same dtype as `image`. A single image is reversed with
    probability 0.5; for a batch an independent draw is made per example.

  Raises:
    ValueError: if the static rank of `image` is known and is neither 3 nor 4.
  """
  shape = image.get_shape()
  if shape.ndims == 3 or shape.ndims is None:
    uniform_random = tf.random_uniform([], 0, 1.0, seed=seed)
    mirror_cond = tf.less(uniform_random, .5)
    result = tf.cond(
        mirror_cond,
        lambda: tf.reverse(image, [flip_index]),
        lambda: image
    )
    # Re-attach the static shape to the conditional's output. This is done
    # inline because no `fix_image_flip_shape` helper is defined or imported in
    # this notebook, so calling it would raise a NameError.
    if shape.ndims is None:
      result.set_shape([None, None, None])
    else:
      result.set_shape(shape)
    return result
  elif shape.ndims == 4:
    # One random draw per example in the batch.
    uniform_random = tf.random_uniform(
        [tf.shape(image)[0]], 0, 1.0, seed=seed
    )
    mirror_cond = tf.less(uniform_random, .5)
    # Examples whose draw is below 0.5 are kept as they are; the others are
    # replaced by their reversed version.
    return tf.where(
        mirror_cond,
        image,
        tf.map_fn(lambda x: tf.reverse(x, [flip_index]), image, dtype=image.dtype)
    )
  else:
    raise ValueError("\'image\' must have either 3 or 4 dimensions.")
    
def train_image_preprocess(h, w, random_flip=True):
  """Build the preprocessing function applied to training batches.

  Args:
    h: height of the random crop.
    w: width of the random crop.
    random_flip: if True, each cropped image is mirrored left-right at random.

  Returns:
    A function taking a 4-D batch of 3-channel images and returning a float32
    batch whose spatial size is (h, w).
  """

  def fn(image):
    # Use the statically known batch size when there is one; otherwise fall
    # back to the configured training batch size so that the crop size (and
    # therefore the output shape) is fully defined.
    batch_size = image.get_shape().as_list()[0]
    if batch_size is None:
      batch_size = BATCH_SIZE_TRAIN
    # Ensure the data is in range [-1, 1] (integer images are first rescaled
    # to [0, 1] by convert_image_dtype).
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    image = image * 2.0 - 1.0
    # Randomly choose a (h, w, 3) patch to be used for training; the same
    # crop window is applied to the whole batch.
    image = tf.random_crop(image, size=(batch_size, h, w, 3))
    if random_flip:
      # Axis 1 of a single (h, w, 3) example is its width. Axis 2 would
      # reverse the colour channels instead of mirroring the image.
      image = random_flip_left_right(image, 1)
    return image

  return fn

def test_image_preprocess():
  """Build the preprocessing function applied to test batches.

  Returns:
    A function that converts an image tensor to float32 and then maps it
    through `x * 2 - 1`; no cropping or flipping is performed.
  """

  def _rescale(image):
    as_float = tf.image.convert_image_dtype(image, dtype=tf.float32)
    return as_float * 2.0 - 1.0

  return _rescale

### Separable convolutions

For example, a 2D convolution with a separable filter can be written as a sequence of two 1D convolutions, i.e. \begin{equation} y[m,n]=x[m,n]*h[m,n] = h_1[m]*(h_2[n]*x[m,n])\end{equation}

assuming $x$ is a 2D input signal, $h$ is a 2D filter that can be separated into 2 1D filters $h_1$ and $h_2$, and $y$ is the output of convolving $x$ with $h$.  


Similarly, for the 3D case, we apply the separability between the feature-channel dimension and the spatial dimensions (as shown on the left of the figure below), i.e. \begin{equation} y[m,n,p]=x[m,n,p]*h[m,n,p] = h_1[p]*(h_2[m,n]*x[m,n,p])\end{equation}

![alt text](https://tmlss.ro/lab_tmp/separable.png)

### Modify the previous classifier to use *separable* convolutions; the first conv unit stays the same.

In [0]:
class Mobilenet(snt.AbstractModule):
  """Skeleton of a MobileNet-style image classifier (exercise template).

  The constructor stores the configuration of eleven convolutional units, but
  the convolutional stack itself is left to be written in `_build`. Until it
  is, the module only averages its input over axes 1 and 2 and applies a
  linear layer producing `num_classes` logits.
  """

  def __init__(self, num_classes, name="mobilenet"):
    """Stores the layer configuration.

    Args:
      num_classes: number of logits produced by the final linear layer.
      name: name of the Sonnet module.
    """
    super(Mobilenet, self).__init__(name=name)
    self._num_classes = num_classes
    # NOTE(review): not read anywhere in this skeleton; apart from the leading
    # 0 the values coincide with `_strides` -- confirm the intended meaning
    # before using them for the depthwise convolutions.
    self._channel_multipliers = [
        0, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1
    ]
    # Number of output channels of each convolutional unit.
    self._output_channels = [
        64, 64, 128, 128, 128, 256, 256, 256, 512, 512, 512
    ]
    self._num_layers = len(self._output_channels)

    self._kernel_shapes = [[3, 3]] * self._num_layers  # All kernels are 3x3.
    self._strides = [1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1]
    self._paddings = [snt.SAME] * self._num_layers

  def _build(self, inputs, is_training=None, test_local_stats=False):
    """Connects the module to `inputs` and returns the class logits.

    Args:
      inputs: batch of images; axes 1 and 2 are averaged out before the
        linear layer.
      is_training: not used by the skeleton; reserved for the layers to be
        added.
      test_local_stats: not used by the skeleton; reserved for the layers to
        be added.

    Returns:
      The output of the final linear layer, one row of `num_classes` logits
      per input example.
    """
    net = inputs
    # instantiate all the convolutional layers

    # instantiate depthwise conv layers

    # instantiate 1x1 conv layers

    # construct network

    # Global average pooling. `axis` is used instead of the deprecated
    # `reduction_indices` alias.
    net = tf.reduce_mean(net, axis=[1, 2], keepdims=False,
                         name="avg_pool")

    logits = snt.Linear(self._num_classes)(net)

    return logits

## Parameter Comparison

What is the number of parameters of this new model? How does it compare with the number computed for the baseline?

In [0]:
#@title Function to compute nr of params

def get_num_params(scope):
  """Count the scalar parameters held by the trainable variables of a scope.

  Args:
    scope: scope filter passed to `tf.trainable_variables`.

  Returns:
    The sum, over the selected variables, of the product of each variable's
    dimensions.
  """

  def _num_elements(var):
    # Multiply together the sizes of all dimensions of one variable.
    size = 1
    for axis_dim in var.get_shape():
      size *= axis_dim.value
    return size

  return sum(_num_elements(var) for var in tf.trainable_variables(scope))

#get_num_params('mobilenet')

### Connect the model to the data 


In [0]:
# First define the preprocessing ops for the train/test data
# Spatial size of the random crop taken from every training batch.
crop_height = 24 #@param
# NOTE(review): `cropt_width` looks like a typo for `crop_width`; the name is
# kept because it is a notebook-level variable.
cropt_width = 24 #@param
# Training batches are rescaled, randomly cropped and randomly flipped; test
# batches are only rescaled.
preprocess_fn_train = train_image_preprocess(crop_height, cropt_width)
preprocess_fn_test = test_image_preprocess()

# Number of target classes of the classifier.
num_classes = 10 #@param

In [0]:
# for evaluation, we look at top_k_accuracy since it's easier to interpret; normally k=1 or k=5
def top_k_accuracy(k, labels, logits):
  """Fraction of examples whose true class is among the k highest logits.

  Args:
    k: number of top-scoring classes to consider.
    labels: integer class ids, one per example (e.g. shape (batch, 1)).
    logits: class scores whose first axis is the batch axis.

  Returns:
    A scalar float32 tensor holding the mean top-k accuracy over the batch.
  """
  # Flatten explicitly instead of using an unrestricted tf.squeeze: squeeze
  # would also remove a batch (or class) axis of size 1 and break in_top_k.
  batch_size = tf.shape(logits)[0]
  predictions = tf.reshape(logits, [batch_size, -1])
  targets = tf.reshape(tf.cast(labels, tf.int32), [-1])
  in_top_k = tf.nn.in_top_k(predictions=predictions, targets=targets, k=k)
  return tf.reduce_mean(tf.cast(in_top_k, tf.float32))

### Instantiate Mobilenet, get number of parameters and compare with baseline

In [0]:
# Build the module inside a dedicated variable scope; the same scope name is
# used below to select the model's variables when counting parameters.
with tf.variable_scope("mobilenet_model"):
  # Use the notebook-level `num_classes` parameter instead of a hard-coded 10.
  mobilenet = Mobilenet(num_classes=num_classes)

# Connect the same module instance to the training and to the test inputs.
predictions_mobilenet = mobilenet(preprocess_fn_train(batch_train_images), is_training=True)
print(predictions_mobilenet)
test_predictions_mobilenet = mobilenet(preprocess_fn_test(batch_test_images), is_training=False)
print(test_predictions_mobilenet)

print('Number of parameters of Mobilenet is')
print(get_num_params("mobilenet_model"))

Tensor("mobilenet/linear/add:0", shape=(128, 10), dtype=float32)
Tensor("mobilenet_1/linear/add:0", shape=(?, 10), dtype=float32)
Number of parameters of Mobilenet is
1079050


### Create the optimizer: SGD with momentum

In [0]:
def get_optimizer(step, lr_init=0.1, lr_factor=0.1,
                  lr_schedule=(40e3, 60e3, 80e3)):
  """Get the optimizer used for training and its learning-rate tensor.

  The learning rate starts at `lr_init` and is multiplied by `lr_factor` once
  for every boundary in `lr_schedule` that `step` has reached.

  Args:
    step: training step tensor (e.g. the global step).
    lr_init: initial learning rate.
    lr_factor: multiplicative decay applied at each schedule boundary.
    lr_schedule: steps at which the learning rate is decayed.

  Returns:
    A tuple `(optimizer, lr)`: a momentum optimizer (momentum 0.9) driven by
    the scheduled learning rate, and the learning-rate tensor itself. The
    training cell unpacks both values so that it can fetch the current rate.
  """
  boundaries = tf.to_int64(lr_schedule)
  # Number of schedule boundaries already reached by `step`.
  num_decays = tf.reduce_sum(tf.to_float(step >= boundaries))
  lr = lr_init * lr_factor**num_decays

  optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)
  return optimizer, lr

In [0]:
# Upper bound on the number of images drawn by `gallery`.
MAX_IMAGES = 10
def gallery(seq, lbl, class_dict, title='Display input'):
  """Show the first images of a batch side by side, titled with their class.

  Args:
    seq: 4-D array of images; the first axis indexes the images and the last
      one holds the channels.
    lbl: per-image labels; `lbl[i][0]` is used to index `class_dict`.
    class_dict: mapping (or list) from label value to class name.
    title: currently unused.

  Returns:
    None. At most `MAX_IMAGES` images are drawn; nothing is drawn for an empty
    batch.
  """
  num_frames, _, _, num_channels = seq.shape
  num_frames = min(num_frames, MAX_IMAGES)
  if num_frames == 0:
    # A figure with zero columns cannot be created; there is nothing to show.
    return
  # squeeze=False always yields a 2-D array of axes, so indexing also works
  # when a single image is displayed.
  ff, axes = plt.subplots(1, num_frames,
                          figsize=(num_frames, 1),
                          subplot_kw={'xticks': [], 'yticks': []},
                          squeeze=False)
  axes = axes[0]
  for i in range(num_frames):
    if num_channels == 3:
      axes[i].imshow(np.squeeze(seq[i]))
    else:
      axes[i].imshow(np.squeeze(seq[i]), cmap='gray')
    axes[i].set_title(class_dict[lbl[i][0]])
    plt.setp(axes[i].get_xticklabels(), visible=False)
    plt.setp(axes[i].get_yticklabels(), visible=False)
  ff.subplots_adjust(wspace=0.1)
  plt.show()
  

### Set up the training

In [0]:
# Define number of training iterations and reporting intervals
# Total number of training steps; written as a float and converted with int()
# where it is used.
TRAIN_ITERS = 90e3 #@param
# Print the training loss every this many steps.
REPORT_TRAIN_EVERY = 100 #@param
# NOTE(review): not referenced by the code visible in this notebook.
PLOT_EVERY = 500 #@param
# Run an evaluation pass every this many steps.
REPORT_TEST_EVERY = 1000 #@param
# Number of test batches averaged in one evaluation pass.
TEST_ITERS = 10 #@param
# NOTE(review): not read by the code visible in this notebook; the learning
# rate schedule is defined inside get_optimizer.
lr_init = 0.1 #@param
# Intended to toggle the display of the input images during evaluation.
display_inputs = False #@param

# Class names indexed by CIFAR-10 label id.
class_mapping = [u'airplane', u'automobile', u'bird', u'cat', u'deer', u'dog', u'frog', u'horse', u'ship', u'truck']

In [0]:
# Write a function that takes a list of losses and plots them.
def plot_losses(loss_list, steps):
  """Refresh the notebook output and plot the losses against the steps.

  Args:
    loss_list: loss values (y axis).
    steps: training steps at which the losses were recorded (x axis).
  """
  # Replace the previous cell output instead of appending to it.
  display.clear_output(wait=True)
  # NOTE(review): the current figure is displayed before the new curve is
  # added to it -- confirm that this ordering is intended.
  display.display(pl.gcf())
  pl.plot(steps, loss_list)
  # Pause for one second after drawing.
  time.sleep(1.0)

### Training MobileNet


In [0]:
# define train and test loss functions

# Create a global step that is incremented during training; useful for e.g. learning rate annealing
global_step = tf.train.get_or_create_global_step()

# instantiate the optimizer; get_optimizer is expected to return the optimizer
# together with the learning-rate tensor, which is fetched at every step below
optimizer, lr_op = get_optimizer(global_step)

# Get training and test ops
# EXERCISE: the training loop below fetches `train_loss_op` and `training_op`,
# so both must be defined here before this cell can run.
# train_loss_op = 
# training_op = 
# update_ops = 
# training_op = tf.group(training_op, update_ops)

test_acc_op = top_k_accuracy(1, batch_test_labels, test_predictions_mobilenet)

# Create the session and initialize variables
sess = tf.Session()
# tf.initialize_all_variables() is deprecated in favour of this initializer.
sess.run(tf.global_variables_initializer())

# run training; at every k iterations, run evaluation too
# `display_inputs` is taken from the parameters cell and is deliberately not
# reassigned here, so that the form value is honoured.
# NOTE(review): `losses` and `steps` are never filled by this loop.
losses = []
steps = []
for train_iter in range(int(TRAIN_ITERS)):
  # One optimisation step; the batch that was consumed is fetched in the same
  # run call so that it can be displayed.
  _, train_loss, lr, img_inp, lbl_inp = sess.run([training_op, train_loss_op, lr_op, batch_train_images, batch_train_labels])

  if (train_iter % REPORT_TRAIN_EVERY) == 0:
    print('Train loss at iter {0:5d} out of {1:5d} is {2:.2f}'.format(int(train_iter), int(TRAIN_ITERS), train_loss))

  if (train_iter % REPORT_TEST_EVERY) == 0:
    # Average the top-1 accuracy over TEST_ITERS consecutive test batches.
    avg_acc = 0.0
    for test_iter in range(TEST_ITERS):
      acc = sess.run(test_acc_op)
      avg_acc += acc

    if display_inputs:
      gallery(img_inp, lbl_inp, class_mapping)
    avg_acc /= (TEST_ITERS)
    print('Test acc at iter {0:5d} out of {1:5d} is {2:.2f}'.format(int(train_iter), int(TRAIN_ITERS), avg_acc*100.0))