In [1]:
import tensorflow as tf
import cifar
tf.logging.set_verbosity(tf.logging.WARN)

## Download and extract the dataset

In [2]:
cifar.prepare_cifar_10()
cifar10_labels = cifar.cifar10_labels()

## Define the model function
Here we have defined a common model function which runs for both batch_normalized and non-batch_normalized networks.

#### Batch Norm before or after activation
This is a long-standing debate about Batch Normalization. To put batch norm before or after activation. I have chosen to put it after the activation. And more specifically, at the input of each layer. This makes more sense, since batch norm is introduced to reduce the covariant shift in input of each layer.

In [3]:
def model(features, labels, mode, params):
    layer = features['images']
    if params["print_shapes"]:
        print(layer.shape)

    for filt, kern, stride in zip(params['filters'], params['kern'], params['strides']):
        if params['with_bn']:
            layer = tf.layers.batch_normalization(
                layer, training=mode == tf.estimator.ModeKeys.TRAIN)
        layer = tf.layers.conv2d(
            layer, filt, kern, stride, activation=tf.nn.relu)
        if params["print_shapes"]: 
            print(layer.shape)

    layer = tf.layers.flatten(layer)
    if params["print_shapes"]:
        print(layer.shape)

    for units in params['dense']:
        if params['with_bn']:
            layer = tf.layers.batch_normalization(
                layer, training=mode == tf.estimator.ModeKeys.TRAIN)
        layer = tf.layers.dense(
            layer, units, activation=tf.nn.relu)
        if params["print_shapes"]:
            print(layer.shape)

    if params['with_bn']:
        layer = tf.layers.batch_normalization(
            layer, training=mode == tf.estimator.ModeKeys.TRAIN)

    logits = tf.layers.dense(layer, 10)
    if params["print_shapes"]:
        print(logits.shape)
    cls = tf.argmax(logits, -1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions={
            "class": cls,
            "score": tf.nn.softmax(logits)
        })

    loss = tf.losses.sparse_softmax_cross_entropy(labels, logits)

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops={
            "accuracy": tf.metrics.accuracy(labels, cls)
        })

    opt = tf.train.AdamOptimizer().minimize(
        loss, global_step=tf.train.get_global_step())

    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=opt)

In [4]:
def inp_fn():
    return tf.data.Dataset.from_generator(cifar.cifar10_train,({"images":tf.float32},tf.int64),
                                         ({"images":tf.TensorShape([None,32,32,3])},tf.TensorShape(None)))

def test_inp_fn():
    return tf.data.Dataset.from_generator(cifar.cifar10_test,({"images":tf.float32},tf.int64),
                                         ({"images":tf.TensorShape([None,32,32,3])},tf.TensorShape(None)))

## Without BatchNorm (no dropout)

In [5]:
wobn_params = {
    "filters": [30, 50,60],
    "kern": [[3, 3]]*3,
    "strides": [[2,2],[1,1],[1,1]],
    "dense": [3500,700],
    "with_bn": False,
    "print_shapes": True
}

### View layer shapes

In [6]:
model({"images":tf.placeholder(tf.float32,(10,32,32,3))},
      tf.placeholder(tf.int32,(10)),tf.estimator.ModeKeys.TRAIN,wobn_params)
wobn_params["print_shapes"] = False

(10, 32, 32, 3)
(10, 15, 15, 30)
(10, 13, 13, 50)
(10, 11, 11, 60)
(10, 7260)
(10, 3500)
(10, 700)
(10, 10)


In [7]:
wobn = tf.estimator.Estimator(model, 'wobn-ckpts', config=tf.estimator.RunConfig(save_summary_steps=2),
                              params=wobn_params)

Run tensorboard with:
```
tensorboard --logdir wobn-ckpts
```
in current directory

In [8]:
for i in range(20):
    wobn.train(inp_fn)
    print(wobn.evaluate(test_inp_fn))

{'accuracy': 0.3379, 'loss': 1.8762674, 'global_step': 20}
{'accuracy': 0.4551, 'loss': 1.4975078, 'global_step': 40}
{'accuracy': 0.5002, 'loss': 1.3790355, 'global_step': 60}
{'accuracy': 0.5239, 'loss': 1.3139155, 'global_step': 80}
{'accuracy': 0.5495, 'loss': 1.2388148, 'global_step': 100}
{'accuracy': 0.5889, 'loss': 1.1540627, 'global_step': 120}
{'accuracy': 0.5812, 'loss': 1.1657984, 'global_step': 140}
{'accuracy': 0.6214, 'loss': 1.0684103, 'global_step': 160}
{'accuracy': 0.6337, 'loss': 1.0501449, 'global_step': 180}
{'accuracy': 0.6292, 'loss': 1.0418706, 'global_step': 200}
{'accuracy': 0.6272, 'loss': 1.0928996, 'global_step': 220}
{'accuracy': 0.6459, 'loss': 1.0434248, 'global_step': 240}
{'accuracy': 0.6463, 'loss': 1.0748448, 'global_step': 260}
{'accuracy': 0.6411, 'loss': 1.119929, 'global_step': 280}
{'accuracy': 0.6517, 'loss': 1.1203771, 'global_step': 300}
{'accuracy': 0.6285, 'loss': 1.2333841, 'global_step': 320}
{'accuracy': 0.6551, 'loss': 1.1896093, 'glob

## With BatchNorm

In [9]:
wbn_params = {
    "filters": [30, 50,60],
    "kern": [[3, 3]]*3,
    "strides": [[2,2],[1,1],[1,1]],
    "dense": [3500,700],
    "with_bn": True,
    "print_shapes": True
}

### View layer shapes

In [10]:
model({"images":tf.placeholder(tf.float32,(10,32,32,3))},
      tf.placeholder(tf.int32,(10)),tf.estimator.ModeKeys.TRAIN,wbn_params)
wbn_params["print_shapes"] = False

(10, 32, 32, 3)
(10, 15, 15, 30)
(10, 13, 13, 50)
(10, 11, 11, 60)
(10, 7260)
(10, 3500)
(10, 700)
(10, 10)


In [11]:
wbn = tf.estimator.Estimator(model, 'wbn-ckpts', config=tf.estimator.RunConfig(save_summary_steps=2),
                              params=wbn_params)

Run tensorboard with:
```
tensorboard --logdir wbn-ckpts --port 6007
```
in current directory

In [12]:
for i in range(5): 
    wbn.train(inp_fn)
    print(wbn.evaluate(test_inp_fn))

{'accuracy': 0.1005, 'loss': 2.34123, 'global_step': 20}
{'accuracy': 0.1, 'loss': 2.3317132, 'global_step': 40}
{'accuracy': 0.1, 'loss': 2.3635697, 'global_step': 60}
{'accuracy': 0.1, 'loss': 2.3704066, 'global_step': 80}
{'accuracy': 0.1, 'loss': 2.3658943, 'global_step': 100}


## Results
#### Without BN
![Without BN](images/wobn.png)
#### With BN
![With BN](images/wbn.png)

As we can see, the loss converges much faster in the version using Batch Normalization. Version without BN takes about 370 steps to reach 0.2 loss, whereas version with BN does it in just 75. Also, we can see that the loss reduction is gradual. There are no ups and downs. This is due to the reduction of internal covariant shift for each layer as done by BN.

#### Overfitting
Here, we can see that the eval losses goes up during training - an obvious indication of overfitting. For now, we have concentrated only to know how the BN fastens up the regular networks. So, I didn't use any regularization.

BN also provides a weak form of regularization. But as we can see, it cannot prevent overfitting. It should used with any other regularization layers used normally.

Read more about Batch Normalization in the [original paper](https://arxiv.org/abs/1502.03167). You can try running this notebook in your local system or in [Google Colab](https://colab.research.google.com)