12. Implement a custom layer that performs Layer Normalization

In [3]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [58]:
class NormalizationLayer(keras.layers.Layer):

    def __init__(self, epsilon=0.001, **kwargs):
        super().__init__(**kwargs)
        self.epsilon = epsilon
    
    def build(self, batch_input_shape):
        self.alpha = self.add_weight(
            name="alpha",
            shape=batch_input_shape[-1:],
            dtype=tf.float32,
            trainable=True,
            initializer=tf.ones
            
        )    
        self.beta = self.add_weight(
            name="beta",
            shape=batch_input_shape[-1:],
            dtype=tf.float32,
            trainable=True,
            initializer=tf.zeros
            
        )    
        super().build(batch_input_shape)
        
        
    def call(self, inputs):
        mean, variance = tf.nn.moments(inputs, axes=-1, keepdims=True)
        std = tf.sqrt(variance + self.epsilon)    # derivative of sqrt(z) is undefined for z=0
        return self.alpha * (inputs - mean) / std + self.beta

In [54]:
data = tf.random.normal(shape=(5,), mean=5, stddev=1)
print(data)

tf.Tensor([4.7031407 3.0655727 7.487052  4.412018  4.158657 ], shape=(5,), dtype=float32)


In [55]:
layer = tf.keras.layers.LayerNormalization(axis=0)
output = layer(data)
print(output)

tf.Tensor([-0.04228865 -1.15658     1.8520375  -0.24038468 -0.41278526], shape=(5,), dtype=float32)


In [59]:
my_layer = NormalizationLayer()
my_output = my_layer(data)
print(my_output)

tf.Tensor([-0.04228865 -1.15658     1.8520375  -0.24038468 -0.41278526], shape=(5,), dtype=float32)


13. Train a model using a custom training loop

In [25]:
(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.mnist.load_data()

In [26]:
# Scale the features for Gradient Descent (is it really necessary?)
X_valid, X_train = X_train_full[:5000] / 255.0, X_train_full[5000:] / 255.0
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]
X_test = X_test / 255.0

In [27]:
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(keras.layers.Dense(200, activation="relu"))
model.add(keras.layers.Dense(200, activation="relu"))
model.add(keras.layers.Dense(10, activation="softmax"))

In [29]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 784)               0         
_________________________________________________________________
dense (Dense)                (None, 200)               157000    
_________________________________________________________________
dense_1 (Dense)              (None, 200)               40200     
_________________________________________________________________
dense_2 (Dense)              (None, 10)                2010      
Total params: 199,210
Trainable params: 199,210
Non-trainable params: 0
_________________________________________________________________


In [33]:
model.trainable_variables

[<tf.Variable 'dense/kernel:0' shape=(784, 200) dtype=float32, numpy=
 array([[ 0.07531695,  0.06428696,  0.06990725, ...,  0.055079  ,
          0.00066985, -0.05107821],
        [ 0.05469356,  0.05387262, -0.0416443 , ..., -0.03119045,
          0.0102316 ,  0.07232903],
        [-0.02286955,  0.05385177,  0.0756866 , ..., -0.02611945,
          0.02291939,  0.0380661 ],
        ...,
        [ 0.03041444,  0.07093401,  0.06709787, ...,  0.03425878,
         -0.01465827,  0.01852486],
        [-0.06564058,  0.04024745, -0.06805687, ...,  0.06872854,
          0.00028025, -0.06044695],
        [-0.0453898 , -0.02092795, -0.04542916, ..., -0.04152139,
          0.0341448 , -0.06919041]], dtype=float32)>,
 <tf.Variable 'dense/bias:0' shape=(200,) dtype=float32, numpy=
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [48]:
n_epochs = 5
batch_size = 32
n_steps = len(X_train) // batch_size
optimizer_lower = keras.optimizers.Nadam(learning_rate=0.01)
optimizer_upper = keras.optimizers.Adam(learning_rate=0.001)
loss_fn = keras.losses.sparse_categorical_crossentropy
mean_loss = keras.metrics.Mean()
mean_validation_loss = keras.metrics.Mean()
metrics = [keras.metrics.SparseCategoricalAccuracy()]
validation_metrics = [keras.metrics.SparseCategoricalAccuracy()]

In [35]:
def print_status_bar(iteration, total, loss, metrics=None):
    metrics = " - ".join(["{}: {:.4f}".format(m.name, m.result())
                         for m in [loss] + (metrics or [])])
    end = "" if iteration < total else "\n"
    print("\r{}/{} - ".format(iteration, total) + metrics,
          end=end)

In [60]:
for epoch in range(1, n_epochs + 1):
    print(f"Epoch {epoch} / {n_epochs}")
    for step in range(1, n_steps + 1):
        print(f"Step {step} / {n_steps}")
        X_batch = X_train[(step-1)*batch_size:(step*batch_size)]
        y_batch = y_train[(step-1)*batch_size:(step*batch_size)]
        with tf.GradientTape(persistent=True) as tape:
            y_pred = model(X_batch, training=True)
            loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
        gradients_lower = tape.gradient(loss, model.trainable_variables[:2])
        gradients_upper = tape.gradient(loss, model.trainable_variables[2:])
        del tape
        optimizer_lower.apply_gradients(zip(gradients_lower, model.trainable_variables[:2]))
        optimizer_upper.apply_gradients(zip(gradients_upper, model.trainable_variables[2:]))
        mean_loss(loss)

        
        for metric in metrics:
            metric(y_batch, y_pred)

            
        print_status_bar(step * batch_size, len(y_train), mean_loss, metrics)

    ### Validation  at the end of every epoch###
    y_pred_valid = model(X_valid)
    validation_loss = tf.reduce_mean(loss_fn(y_valid,y_pred_valid))
    mean_validation_loss(validation_loss)
    for metric in validation_metrics:
        metric(y_valid, y_pred_valid)
    
    print_status_bar(len(y_train), len(y_train), mean_loss, metrics)
    print_status_bar(len(y_valid), len(y_valid), mean_validation_loss, validation_metrics)

    for metric in [mean_loss] + metrics:
        metric.reset_states()
        
    for metric in [mean_validation_loss] + validation_metrics:
        metric.reset_states()

Epoch 1 / 5
Step 1 / 1718
32/55000 - mean: 0.0009 - sparse_categorical_accuracy: 1.0000Step 2 / 1718
64/55000 - mean: 0.0012 - sparse_categorical_accuracy: 1.0000Step 3 / 1718
96/55000 - mean: 0.0009 - sparse_categorical_accuracy: 1.0000Step 4 / 1718
128/55000 - mean: 0.0069 - sparse_categorical_accuracy: 1.0000Step 5 / 1718
160/55000 - mean: 0.0058 - sparse_categorical_accuracy: 1.0000Step 6 / 1718
192/55000 - mean: 0.0056 - sparse_categorical_accuracy: 1.0000Step 7 / 1718
224/55000 - mean: 0.0051 - sparse_categorical_accuracy: 1.0000Step 8 / 1718
256/55000 - mean: 0.0045 - sparse_categorical_accuracy: 1.0000Step 9 / 1718
288/55000 - mean: 0.0078 - sparse_categorical_accuracy: 0.9965Step 10 / 1718
320/55000 - mean: 0.0092 - sparse_categorical_accuracy: 0.9969Step 11 / 1718
352/55000 - mean: 0.0111 - sparse_categorical_accuracy: 0.9943Step 12 / 1718
384/55000 - mean: 0.0103 - sparse_categorical_accuracy: 0.9948Step 13 / 1718
416/55000 - mean: 0.0097 - sparse_categorical_accuracy: 0.995