In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

### The Vanishing / Exploding Gradients Problems

In [2]:
# Override the default kernel initializer ("glorot_uniform") with He normal
dense = tf.keras.layers.Dense(
    units=50,
    activation="relu",
    kernel_initializer="he_normal",
)

In [3]:
# VarianceScaling can express any variance-scaling initializer; this one uses
# scale=2 with the fan_avg mode and a uniform distribution.
he_avg_init = tf.keras.initializers.VarianceScaling(
    scale=2.0, mode="fan_avg", distribution="uniform"
)
dense = tf.keras.layers.Dense(
    50, kernel_initializer=he_avg_init, activation="sigmoid"
)

### Better Activation Functions

In [4]:
# LeakyReLU, passed to the Dense layer as its activation
leaky_relu = tf.keras.layers.LeakyReLU(alpha=0.2)  # defaults to alpha=0.3
dense = tf.keras.layers.Dense(
    50, kernel_initializer="he_normal", activation=leaky_relu
)

In [5]:
# LeakyReLU as a separate layer
model = tf.keras.models.Sequential()
# The Dense layer has no activation of its own ...
model.add(tf.keras.layers.Dense(50, kernel_initializer="he_normal"))
# ... because the activation is applied by the layer that follows it.
model.add(tf.keras.layers.LeakyReLU(alpha=0.2))

2024-04-08 11:30:07.669009: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Max
2024-04-08 11:30:07.669037: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2024-04-08 11:30:07.669050: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2024-04-08 11:30:07.669100: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-04-08 11:30:07.669122: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


### Batch Normalization

In [10]:
# Batch Normalization placed before each Dense layer (i.e. after the previous
# layer's activation), starting right after the flattened input.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=[28, 28]))
for n_units in (300, 100):
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Dense(n_units, activation="relu",
                                    kernel_initializer="he_normal"))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dense(10, activation="softmax"))

In [11]:
# Print the layer-by-layer summary (output shapes and parameter counts)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 784)               0         
                                                                 
 batch_normalization (Batch  (None, 784)               3136      
 Normalization)                                                  
                                                                 
 dense_5 (Dense)             (None, 300)               235500    
                                                                 
 batch_normalization_1 (Bat  (None, 300)               1200      
 chNormalization)                                                
                                                                 
 dense_6 (Dense)             (None, 100)               30100     
                                                                 
 batch_normalization_2 (Bat  (None, 100)              

In [17]:
# List the variables of the first BatchNormalization layer (model.layers[1])
# together with whether each one is trainable
[(var.name, var.trainable) for var in model.layers[1].variables]

[('batch_normalization/gamma:0', True),
 ('batch_normalization/beta:0', True),
 ('batch_normalization/moving_mean:0', False),
 ('batch_normalization/moving_variance:0', False)]

In [18]:
# Use the BN layer before the activation function: each hidden block is
# Dense (no activation, no bias) -> BatchNormalization -> ReLU.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=[28, 28]))
for n_units in (300, 100):
    # we can omit the bias term since BN also has one
    model.add(tf.keras.layers.Dense(n_units, kernel_initializer="he_normal",
                                    use_bias=False))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Activation("relu"))
model.add(tf.keras.layers.Dense(10, activation="softmax"))

### Gradient Clipping

In [19]:
# Gradient clipping by value: pass `clipvalue` (here 1.0) to the optimizer
optimizer = tf.keras.optimizers.SGD(clipvalue=1.0)



### Transfer Learning with Keras

In [14]:
# Load Fashion MNIST as ((train images, train labels), (test images, test labels))
fashion_mnist = tf.keras.datasets.fashion_mnist.load_data()
(X_train_full, y_train_full), (X_test, y_test) = fashion_mnist

# Hold out the last 5,000 training instances as the validation set
n_valid = 5000
X_train, X_valid = X_train_full[:-n_valid], X_train_full[-n_valid:]
y_train, y_valid = y_train_full[:-n_valid], y_train_full[-n_valid:]

# Rescale the pixel values by dividing by 255
X_train, X_valid, X_test = (images / 255 for images in (X_train, X_valid, X_test))

In [15]:
# Human-readable label names; a name's position in the list is its class id
class_names = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]

In [22]:
# extra code – split Fashion MNIST into tasks A and B, then train and save
#              model A to "my_model_A".

# Task B is a binary problem: "Pullover" is the positive class and
# "T-shirt/top" the negative one; their ids are their positions in class_names.
pos_class_id, neg_class_id = (
    class_names.index(label) for label in ("Pullover", "T-shirt/top")
)

def split_dataset(X, y):
    """Split a 10-class dataset into task A (8 classes) and task B (binary).

    Task B receives the instances whose label is ``pos_class_id`` or
    ``neg_class_id`` (module-level globals); task A receives all the others.

    Args:
        X: NumPy array of instances, aligned with ``y`` along axis 0.
        y: 1-D NumPy array of class ids taken from ``range(10)``.

    Returns:
        ``((X_A, y_A), (X_B, y_B))`` where ``y_A`` holds the remaining class
        ids renumbered to 0..7 (in increasing order of the original id, same
        dtype as ``y``) and ``y_B`` is float32 with 1. for the positive class
        and 0. for the negative class. ``y`` itself is not modified.
    """
    y_for_B = (y == pos_class_id) | (y == neg_class_id)  # classes 0 and 2
    # Boolean indexing returns a copy, so the caller's labels stay untouched.
    y_A_old = y[~y_for_B]
    # False -> 0. (negative class), True -> 1. (positive class)
    y_B = (y[y_for_B] == pos_class_id).astype(np.float32)
    # sorted() makes the old -> new mapping explicit instead of relying on
    # the iteration order of a set.
    old_class_ids = sorted(set(range(10)) - {neg_class_id, pos_class_id})
    y_A = y_A_old.copy()
    for new_class_id, old_class_id in enumerate(old_class_ids):
        # Match against the untouched labels so an already-renumbered entry
        # can never be renumbered a second time.
        y_A[y_A_old == old_class_id] = new_class_id
    return ((X[~y_for_B], y_A), (X[y_for_B], y_B))

# Apply the A/B split to the training, validation and test sets
train_A, train_B = split_dataset(X_train, y_train)
valid_A, valid_B = split_dataset(X_valid, y_valid)
test_A, test_B = split_dataset(X_test, y_test)
(X_train_A, y_train_A), (X_train_B, y_train_B) = train_A, train_B
(X_valid_A, y_valid_A), (X_valid_B, y_valid_B) = valid_A, valid_B
(X_test_A, y_test_A), (X_test_B, y_test_B) = test_A, test_B

# Keep only the first 200 training instances for task B
X_train_B, y_train_B = X_train_B[:200], y_train_B[:200]

tf.random.set_seed(42)

# Model A: flattened 28x28 input, three 100-unit ReLU hidden layers and an
# 8-way softmax output (one unit per task-A class).
model_A = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=[28, 28]),
    *[tf.keras.layers.Dense(100, activation="relu",
                            kernel_initializer="he_normal")
      for _ in range(3)],
    tf.keras.layers.Dense(8, activation="softmax"),
])

model_A.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
history = model_A.fit(
    X_train_A, y_train_A,
    validation_data=(X_valid_A, y_valid_A),
    epochs=20,
)
# Save the trained model so it can be reloaded later
model_A.save("my_model_A")



Epoch 1/20


2024-04-08 16:02:24.973818: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
2024-04-08 16:02:24.990725: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node SGD/AssignVariableOp.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
INFO:tensorflow:Assets written to: my_model_A/assets


INFO:tensorflow:Assets written to: my_model_A/assets


In [32]:
# Boolean mask over y_train: True where the label is one of task B's two classes
y_for_B = (y_train == pos_class_id) | (y_train == neg_class_id)
y_for_B

array([False,  True,  True, ..., False,  True,  True])

In [24]:
# Last three training labels
y_train[-3:]

array([9, 0, 2], dtype=uint8)

In [30]:
# Among the task-B instances, True where the label is the positive class
y_train[y_for_B] == pos_class_id

array([False, False, False, ...,  True, False,  True])

In [31]:
# Same comparison cast to float32: False becomes 0. and True becomes 1.
(y_train[y_for_B] == pos_class_id).astype(np.float32)

array([0., 0., 0., ..., 1., 0., 1.], dtype=float32)

In [48]:
# extra code – train and evaluate model B, without reusing model A

tf.random.set_seed(42)

# Same hidden architecture as model A, but with a single sigmoid output unit
# for the binary task B.
model_B = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=[28, 28]),
    *[tf.keras.layers.Dense(100, activation="relu",
                            kernel_initializer="he_normal")
      for _ in range(3)],
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

model_B.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
history = model_B.fit(
    X_train_B, y_train_B,
    validation_data=(X_valid_B, y_valid_B),
    epochs=20,
)
model_B.evaluate(X_test_B, y_test_B)



Epoch 1/20


2024-04-08 16:43:18.634324: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node SGD/AssignVariableOp.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[0.356483519077301, 0.8934999704360962]

In [33]:
# Create a new model based on model_A's layers: reuse every layer except
# model_A's output layer, then put a fresh binary output layer on top.
model_A = tf.keras.models.load_model("my_model_A")
model_B_on_A = tf.keras.Sequential(
    model_A.layers[:-1] + [tf.keras.layers.Dense(1, activation="sigmoid")]
)



In [34]:
# To avoid affecting model_A's layers while training model_B_on_A, clone
# model_A's architecture and then copy model_A's weights into the clone.
# The clone's layers are different objects, even though they have the same names.
model_A_clone = tf.keras.models.clone_model(model_A)
model_A_clone.set_weights(model_A.get_weights())

In [44]:
# First layer object of model_A and its name
model_A.layers[0], model_A.layers[0].name

(<keras.src.layers.reshaping.flatten.Flatten at 0x363636b60>, 'flatten_2')

In [45]:
# First layer object of the clone and its name, for comparison with model_A's
model_A_clone.layers[0], model_A_clone.layers[0].name

(<keras.src.layers.reshaping.flatten.Flatten at 0x3663b58d0>, 'flatten_2')

In [46]:
# Create model_B_on_A again, this time from the clone's layers, replacing the
# top (output) layer with a new single-unit sigmoid layer.
model_B_on_A = tf.keras.Sequential(
    model_A_clone.layers[:-1] + [tf.keras.layers.Dense(1, activation="sigmoid")]
)

In [47]:
# Freeze the reused (lower) layers so their weights are not wrecked during the
# first few epochs of training; only the new output layer stays trainable.
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = False

optimizer = tf.keras.optimizers.legacy.SGD(learning_rate=0.001)
model_B_on_A.compile(
    optimizer=optimizer,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [49]:
# Phase 1: train for a few epochs with the lower layers as they currently are.
history = model_B_on_A.fit(
    X_train_B, y_train_B,
    validation_data=(X_valid_B, y_valid_B),
    epochs=4,
)

# Phase 2: unfreeze the lower layers ...
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = True

# ... recompile with a smaller learning rate, and continue training.
optimizer = tf.keras.optimizers.legacy.SGD(learning_rate=0.0005)
model_B_on_A.compile(
    optimizer=optimizer,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
history = model_B_on_A.fit(
    X_train_B, y_train_B,
    validation_data=(X_valid_B, y_valid_B),
    epochs=16,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [50]:
# Evaluate the fine-tuned model on task B's test set (loss and accuracy)
model_B_on_A.evaluate(X_test_B, y_test_B)



[0.35216501355171204, 0.8855000138282776]

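Oops, no improvement: model_B_on_A reaches 88.55% test accuracy, versus 89.35% for model B trained from scratch, so reusing model A's layers did not help here. But that's okay :)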

### Faster Optimizers

In [2]:
# Momentum optimization: SGD with the momentum hyperparameter set to 0.9
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9)

2024-04-09 16:46:08.490508: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Max
2024-04-09 16:46:08.490533: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2024-04-09 16:46:08.490541: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2024-04-09 16:46:08.490579: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-04-09 16:46:08.490600: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [3]:
# Nesterov optimization: same as momentum SGD, with nesterov=True
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9, nesterov=True)



In [2]:
# RMSProp, with the rho hyperparameter set to 0.9
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)

2024-04-09 17:23:53.945756: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Max
2024-04-09 17:23:53.945776: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2024-04-09 17:23:53.945783: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2024-04-09 17:23:53.945837: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-04-09 17:23:53.946123: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [4]:
# Adam, with beta_1=0.9 and beta_2=0.999 given explicitly
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)



### Learning Rate Schedules

In [None]:
# Power scheduling: decay = 1/s, where s is a number of steps.
# The `decay` argument is only accepted by the legacy optimizers; the current
# tf.keras.optimizers.SGD rejects it, so the legacy class is used here (the
# same class this notebook uses to train model_B_on_A).
optimizer = tf.keras.optimizers.legacy.SGD(learning_rate=0.01, decay=1e-4)

In [5]:
# Exponential scheduling
def exponential_decay_fn(epoch):
    """Return the learning rate for `epoch`: 0.01 scaled by 0.1 ** (epoch / 20)."""
    eta_0, s = 0.01, 20
    return eta_0 * 0.1 ** (epoch / s)

In [7]:
# Avoid hardcoding eta_0 and s: the LearningRateScheduler callback is given a
# schedule function and offers no way to pass it lr0 and s, so a factory
# returns a closure that remembers them.
def exponential_decay(lr0, s):
    """Build a schedule function mapping an epoch to lr0 * 0.1 ** (epoch / s)."""
    def decay_at(epoch):
        return lr0 * 0.1 ** (epoch / s)
    return decay_at

exponential_decay_fn = exponential_decay(lr0=0.01, s=20)

In [8]:
# Wrap the schedule function in a callback; pass it to fit() via `callbacks`
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(exponential_decay_fn)
# history = model.fit(X_train, y_train, [...], callbacks=[lr_scheduler])

In [9]:
# The schedule function may also take the current learning rate as a second
# argument, in which case it returns that rate scaled by a constant factor.
def exponential_decay_fn(epoch, lr):
    """Return `lr` multiplied by 0.1 ** (1 / 20); `epoch` is not used."""
    per_epoch_factor = 0.1 ** (1 / 20)
    return lr * per_epoch_factor

In [11]:
# Piecewise constant scheduling
def piecewise_constant_fn(epoch):
    """Return 0.01 for epochs below 5, 0.005 for epochs below 15, else 0.001."""
    for upper_bound, rate in ((5, 0.01), (15, 0.005)):
        if epoch < upper_bound:
            return rate
    return 0.001

In [12]:
# Performance scheduling with the ReduceLROnPlateau callback
# (factor=0.5, patience=5); pass it to fit() via `callbacks`
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5)
# history = model.fit(X_train, y_train, [...], callbacks=[lr_scheduler])

In [16]:
# Learning rate scheduling with a class from tf.keras.optimizers.schedules.
# Note that this approach updates the learning rate at each step rather than
# at each epoch, so the schedule length is expressed in training steps.
import math

batch_size = 32
n_epochs = 25
steps_per_epoch = math.ceil(len(X_train) / batch_size)
n_steps = n_epochs * steps_per_epoch  # total number of training steps
scheduled_learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.01,
    decay_steps=n_steps,
    decay_rate=0.1,
)
optimizer = tf.keras.optimizers.SGD(learning_rate=scheduled_learning_rate)



### Regularization

In [17]:
# l2 regularization on the layer's kernel, with a factor of 0.01
l2_penalty = tf.keras.regularizers.l2(0.01)
layer = tf.keras.layers.Dense(
    100,
    activation="relu",
    kernel_initializer="he_normal",
    kernel_regularizer=l2_penalty,
)

In [18]:
# functools.partial pre-binds the arguments shared by every layer, so they do
# not have to be repeated; individual calls can still override them.
from functools import partial

dense_defaults = dict(
    activation="relu",
    kernel_initializer="he_normal",
    kernel_regularizer=tf.keras.regularizers.l2(0.01),
)
RegularizedDense = partial(tf.keras.layers.Dense, **dense_defaults)

model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=[28, 28]))
model.add(RegularizedDense(100))
model.add(RegularizedDense(100))
# The output layer overrides the default activation with softmax
model.add(RegularizedDense(10, activation="softmax"))

### Dropout

In [21]:
# Dropout (rate 0.2) applied before every Dense layer, including the output one
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=[28, 28]))
for _hidden in range(2):
    model.add(tf.keras.layers.Dropout(rate=0.2))
    model.add(tf.keras.layers.Dense(100, activation="elu",
                                    kernel_initializer="he_normal"))
model.add(tf.keras.layers.Dropout(rate=0.2))
model.add(tf.keras.layers.Dense(10, activation="softmax"))

In [22]:
# extra code – compile and train the model
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs=10,
)



Epoch 1/10
   9/1719 [..............................] - ETA: 11s - loss: 2.4548 - accuracy: 0.2049 

2024-04-10 11:03:33.350395: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node SGD/AssignVariableOp.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [23]:
# The training accuracy reported during fit() looks lower than the validation
# accuracy, but that is only because dropout is active during training.
# Evaluating the model on the training set after training (i.e., with dropout
# turned off) gives the "real" training accuracy:
model.evaluate(X_train, y_train)



[0.305176705121994, 0.8838363885879517]

In [30]:
# extra code – fix TensorFlow's random seed so the sampling below is reproducible
tf.random.set_seed(42)  # extra code – for reproducibility

In [31]:
# MC dropout: run the model 100 times with dropout enabled (training=True) and
# average the predictions. This is generally more reliable than a single
# prediction made with dropout disabled.
n_mc_samples = 100
y_probas = np.stack([model(X_test, training=True) for _ in range(n_mc_samples)])
y_proba = np.mean(y_probas, axis=0)  # average over the sampled predictions

In [32]:
# Standard (non-MC) prediction for the first test instance, rounded to 3 decimals
model.predict(X_test[:1]).round(3)



array([[0.   , 0.   , 0.   , 0.   , 0.   , 0.003, 0.   , 0.092, 0.   ,
        0.905]], dtype=float32)

In [33]:
# MC dropout prediction for the same first test instance: the averaged
# probabilities are a little less confident than the standard prediction
y_proba[0].round(3)

array([0.   , 0.   , 0.   , 0.   , 0.   , 0.061, 0.   , 0.195, 0.   ,
       0.744], dtype=float32)

In [34]:
# Standard deviation across the MC samples: how much the predicted
# probabilities vary between samples, shown here for the first test instance
y_std = np.std(y_probas, axis=0)
y_std[0].round(3)

array([0.   , 0.   , 0.   , 0.   , 0.   , 0.132, 0.   , 0.217, 0.002,
       0.244], dtype=float32)

In [35]:
# Accuracy of the MC dropout predictions on the test set
y_pred = np.argmax(y_proba, axis=1)       # most probable class per instance
n_correct = (y_pred == y_test).sum()      # number of correct predictions
accuracy = n_correct / len(y_test)
accuracy

0.8625

In [36]:
# Test-set loss and accuracy of the regular model, for comparison with the
# MC dropout accuracy
model.evaluate(X_test, y_test)



[0.37918326258659363, 0.8623999953269958]

In [37]:
# If the model contains other layers that behave differently during training
# and inference, calling it with training=True is not an option for MC dropout.
# Instead, subclass Dropout so that it alone always runs in training mode, and
# use this class in place of the Dropout layer when creating the model.
class MCDropout(tf.keras.layers.Dropout):
    """Dropout layer that always calls its parent with training=True."""

    def call(self, inputs, training=False):
        # The caller's `training` value is deliberately ignored.
        force_training = True
        return super().call(inputs, training=force_training)

### Max-Norm Regularization

In [38]:
# Max-norm regularization: attach a max_norm(1.) constraint to the layer's kernel
unit_max_norm = tf.keras.constraints.max_norm(1.)
dense = tf.keras.layers.Dense(
    100,
    activation="relu",
    kernel_initializer="he_normal",
    kernel_constraint=unit_max_norm,
)