In [1]:
import tensorflow as tf
import cifar
# Raise TensorFlow's log threshold to WARN so the per-round evaluation prints
# further down are not buried under INFO-level messages.
tf.logging.set_verbosity(tf.logging.WARN)

## Download and extract the dataset

In [2]:
# Per the section heading this downloads and extracts CIFAR-10; the actual
# behavior lives in the local `cifar` helper module (not shown here).
cifar.prepare_cifar_10()
# NOTE(review): presumably the class-name lookup for the 10 categories; it is
# not referenced again in the visible cells -- confirm whether it is needed.
cifar10_labels = cifar.cifar10_labels()

## Define the model function
Here we define a single model function that is used for both the batch-normalized and the non-batch-normalized networks.

#### Batch Norm before or after activation
Whether to put batch norm before or after the activation is a long-standing debate. I have chosen to put it after the activation, or more specifically, at the input of each layer. This makes more sense to me, since batch norm was introduced to reduce the covariate shift in the input of each layer.

In [3]:
def model(features, labels, mode, params):
    """Shared Estimator ``model_fn`` for the plain and batch-normalized networks.

    Architecture: a stack of ReLU conv layers, a flatten, a stack of ReLU dense
    layers, then a 10-way linear logits layer. When ``params['with_bn']`` is
    truthy, batch normalization is applied to the *input* of every conv and
    dense layer and to the input of the logits layer.

    Args:
        features: dict holding the input tensor under the key ``'images'``.
        labels: integer class ids; not used in PREDICT mode.
        mode: a ``tf.estimator.ModeKeys`` value.
        params: dict with the keys ``'filters'``, ``'kern'``, ``'strides'``
            (zipped together, one entry per conv layer), ``'dense'`` (units per
            dense layer), ``'with_bn'`` and ``'print_shapes'``. An optional
            ``'bn_momentum'`` overrides the moving-average momentum of every
            batch-norm layer; it defaults to 0.99, which is also the default
            of ``tf.layers.batch_normalization``.

    Returns:
        A ``tf.estimator.EstimatorSpec``: class/score predictions in PREDICT
        mode, loss plus an accuracy metric in EVAL mode, and loss plus an Adam
        train op in TRAIN mode.
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    use_bn = params['with_bn']
    # NOTE(review): with momentum 0.99 the moving statistics used at eval time
    # adapt slowly; for short training runs a smaller value (e.g. 0.9) may give
    # more representative evaluation numbers -- confirm experimentally.
    bn_momentum = params.get('bn_momentum', 0.99)

    def show(tensor):
        # Print the static shape when requested and hand the tensor back.
        if params["print_shapes"]:
            print(tensor.shape)
        return tensor

    def maybe_bn(tensor):
        # Normalize the layer input only for the batch-norm variant.
        if not use_bn:
            return tensor
        return tf.layers.batch_normalization(
            tensor, momentum=bn_momentum, training=is_training)

    layer = show(features['images'])

    # Convolutional stack: (optional BN) -> conv + ReLU.
    for filt, kern, stride in zip(params['filters'], params['kern'], params['strides']):
        layer = tf.layers.conv2d(
            maybe_bn(layer), filt, kern, stride, activation=tf.nn.relu)
        show(layer)

    layer = show(tf.layers.flatten(layer))

    # Fully connected stack: (optional BN) -> dense + ReLU.
    for units in params['dense']:
        layer = tf.layers.dense(maybe_bn(layer), units, activation=tf.nn.relu)
        show(layer)

    logits = show(tf.layers.dense(maybe_bn(layer), 10))
    cls = tf.argmax(logits, -1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions={
            "class": cls,
            "score": tf.nn.softmax(logits)
        })

    loss = tf.losses.sparse_softmax_cross_entropy(labels, logits)

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops={
            "accuracy": tf.metrics.accuracy(labels, cls)
        })

    adam = tf.train.AdamOptimizer()
    # Batch norm registers its moving-average updates in UPDATE_OPS; they must
    # run with every train step. Without BN the collection is empty and the
    # control dependency is a no-op, so one code path serves both variants.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = adam.minimize(loss, global_step=tf.train.get_global_step())

    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

In [4]:
def _cifar10_dataset(batch_generator):
    """Wrap a ``cifar`` generator function in a ``tf.data.Dataset``.

    Each element is declared as a ``({"images": float32}, int64)`` pair. The
    image tensor has shape ``[None, 32, 32, 3]``; the label shape is left
    unspecified.
    NOTE(review): the leading ``None`` suggests the generator already yields
    whole batches -- confirm against the ``cifar`` module.
    """
    output_types = ({"images": tf.float32}, tf.int64)
    output_shapes = ({"images": tf.TensorShape([None, 32, 32, 3])},
                     tf.TensorShape(None))
    return tf.data.Dataset.from_generator(
        batch_generator, output_types, output_shapes)


def inp_fn():
    """Input function for training: dataset over ``cifar.cifar10_train``."""
    return _cifar10_dataset(cifar.cifar10_train)


def test_inp_fn():
    """Input function for evaluation: dataset over ``cifar.cifar10_test``."""
    return _cifar10_dataset(cifar.cifar10_test)

In [5]:
# Hyperparameters handed to `model` through the Estimator's `params` argument.
hparams = dict(
    filters=[30, 50, 60],                  # output channels of each conv layer
    kern=[[3, 3]] * 3,                     # 3x3 kernel for all three conv layers
    strides=[[2, 2], [1, 1], [1, 1]],      # only the first conv layer downsamples
    dense=[3500, 700],                     # units of each hidden dense layer
    with_bn=False,                         # start with the plain (no batch norm) network
    print_shapes=True,                     # print layer shapes while the graph is built
)

### View layer shapes

In [6]:
# Build the network once, purely to print every layer's static shape. A scratch
# graph keeps these preview ops out of the global default graph, so re-running
# the cell does not keep piling up layers there.
with tf.Graph().as_default():
    model({"images": tf.placeholder(tf.float32, (10, 32, 32, 3))},
          # `(10,)` is a real 1-tuple; the bare `(10)` is just the integer 10.
          tf.placeholder(tf.int32, (10,)), tf.estimator.ModeKeys.TRAIN, hparams)
# Shapes have been shown once; keep the estimators below from printing them
# again every time they rebuild the graph.
hparams["print_shapes"] = False

(10, 32, 32, 3)
(10, 15, 15, 30)
(10, 13, 13, 50)
(10, 11, 11, 60)
(10, 7260)
(10, 3500)
(10, 700)
(10, 10)


## Without BatchNorm (no dropout)

In [7]:
# Baseline estimator (no batch norm): checkpoints and summaries are written to
# 'wobn-ckpts', with a summary saved every 2 steps.
wobn = tf.estimator.Estimator(
    model_fn=model,
    model_dir='wobn-ckpts',
    config=tf.estimator.RunConfig(save_summary_steps=2),
    params=hparams,
)

#### Start Tensorboard

In [8]:
# Launch TensorBoard on the baseline run's checkpoint directory.
get_ipython().system_raw("start tensorboard --logdir wobn-ckpts") #Windows
# get_ipython().system_raw("tensorboard --logdir wobn-ckpts &") #Linux

In [9]:
# Twenty rounds of: one training pass over `inp_fn`, then an evaluation on the
# test input whose metrics are printed.
for round_idx in range(20):
    wobn.train(inp_fn)
    wobn_metrics = wobn.evaluate(test_inp_fn)
    print(wobn_metrics)

{'accuracy': 0.3997, 'loss': 1.6813979, 'global_step': 20}
{'accuracy': 0.4834, 'loss': 1.436868, 'global_step': 40}
{'accuracy': 0.5223, 'loss': 1.3160748, 'global_step': 60}
{'accuracy': 0.563, 'loss': 1.2359521, 'global_step': 80}
{'accuracy': 0.5723, 'loss': 1.1918161, 'global_step': 100}
{'accuracy': 0.5985, 'loss': 1.1358802, 'global_step': 120}
{'accuracy': 0.5969, 'loss': 1.1330194, 'global_step': 140}
{'accuracy': 0.6238, 'loss': 1.0888523, 'global_step': 160}
{'accuracy': 0.6115, 'loss': 1.1035516, 'global_step': 180}
{'accuracy': 0.6409, 'loss': 1.0914321, 'global_step': 200}
{'accuracy': 0.6447, 'loss': 1.0573525, 'global_step': 220}
{'accuracy': 0.6415, 'loss': 1.1172442, 'global_step': 240}
{'accuracy': 0.6298, 'loss': 1.1756989, 'global_step': 260}
{'accuracy': 0.6507, 'loss': 1.1496994, 'global_step': 280}
{'accuracy': 0.6569, 'loss': 1.2456324, 'global_step': 300}
{'accuracy': 0.5878, 'loss': 1.4401076, 'global_step': 320}
{'accuracy': 0.6286, 'loss': 1.2964361, 'globa

## With BatchNorm

In [10]:
# Switch the shared hyperparameters to the batch-normalized variant; everything
# else (layer sizes, strides) stays the same as in the baseline run.
hparams['with_bn'] = True

In [11]:
# Batch-normalized estimator: checkpoints and summaries are written to
# 'wbn-ckpts', with a summary saved every 2 steps.
wbn = tf.estimator.Estimator(
    model_fn=model,
    model_dir='wbn-ckpts',
    config=tf.estimator.RunConfig(save_summary_steps=2),
    params=hparams,
)

#### Start Tensorboard

In [12]:
# Launch a second TensorBoard for the batch-norm run on port 6007, so it can run
# alongside an instance started without an explicit port.
get_ipython().system_raw("start tensorboard --logdir wbn-ckpts --port 6007") #Windows
# get_ipython().system_raw("tensorboard --logdir wbn-ckpts --port 6007 &") #Linux

In [13]:
# Five rounds of: one training pass over `inp_fn`, then an evaluation on the
# test input whose metrics are printed.
for round_idx in range(5):
    wbn.train(inp_fn)
    wbn_metrics = wbn.evaluate(test_inp_fn)
    print(wbn_metrics)

{'accuracy': 0.1, 'loss': 2.5260954, 'global_step': 20}
{'accuracy': 0.1, 'loss': 2.61592, 'global_step': 40}
{'accuracy': 0.1, 'loss': 2.9976683, 'global_step': 60}
{'accuracy': 0.1, 'loss': 3.0407357, 'global_step': 80}
{'accuracy': 0.1177, 'loss': 3.8774574, 'global_step': 100}


## Results
#### Without BN
![Graph (Without BN)](images/wobn.png)
#### With BN
![Graph (with BN)](images/wbn.png)

As we can see, the training loss converges much faster in the version using Batch Normalization. The version without BN takes about 350 steps to reach a loss of 0.2, whereas the version with BN does it in just 70. We can also see that the loss reduction is gradual, without ups and downs. This is due to the reduction of internal covariate shift for each layer, as done by BN.

#### Overfitting
Here, we can see that the eval loss goes up during training - an obvious indication of overfitting. For now, we have concentrated only on how BN speeds up regular networks, so I didn't use any regularization.

BN also provides a weak form of regularization. But as we can see, it cannot prevent overfitting on its own. It should be combined with other regularization techniques, such as dropout.

Read more about Batch Normalization in the [original paper](https://arxiv.org/abs/1502.03167). You can try running this notebook on your local system or in [Google Colab](https://drive.google.com/file/d/1w7k4r7CQaqnK_f6596SbuON9xcaw5lNo/view?usp=sharing).