# Notes on Chapter 12 of *Hands-On Machine Learning with Scikit-Learn, Keras, & TensorFlow*, 3rd edition, by Aurélien Géron

Reduce the amount of logging messages displayed by TensorFlow

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [2]:
import itertools
import time

import keras
from keras import layers
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf
import tensorflow.experimental.numpy as tnp

## Basic tensor operations

In [3]:
x = tf.constant([[1,2],[3,4],[5,6]], dtype=tf.float16)

In [4]:
x

<tf.Tensor: shape=(3, 2), dtype=float16, numpy=
array([[1., 2.],
       [3., 4.],
       [5., 6.]], dtype=float16)>

In [5]:
3 * x

<tf.Tensor: shape=(3, 2), dtype=float16, numpy=
array([[ 3.,  6.],
       [ 9., 12.],
       [15., 18.]], dtype=float16)>

In [6]:
x @ tf.transpose(x)

<tf.Tensor: shape=(3, 3), dtype=float16, numpy=
array([[ 5., 11., 17.],
       [11., 25., 39.],
       [17., 39., 61.]], dtype=float16)>

In [7]:
tf.constant(23)

<tf.Tensor: shape=(), dtype=int32, numpy=23>

Keras 3 also has a tensor library in keras.ops

```
y = keras.ops.array([[1,2],[3,4],[5,6]])
y
```

```
[[1,2],[3,4],[5,6]] * keras.ops.arange(2,4)
```

Note the tensorflow will not automatically perform implicit type promotion due to performance concerns:

In [8]:
try:
    tf.constant(3) * tf.constant(2.)
except Exception as e:
    print(type(e), ':', e)
else:
    assert False # (unreached)

<class 'tensorflow.python.framework.errors_impl.InvalidArgumentError'> : cannot compute Mul as input #1(zero-based) was expected to be a int32 tensor but is a float tensor [Op:Mul] name: 


Thus manual casts are required for this type of code:

In [9]:
tf.cast(tf.constant(3), tf.float32) * tf.constant(2.)

<tf.Tensor: shape=(), dtype=float32, numpy=6.0>

Tensorflow also includes a more comprehensive numpy emulation library in experimental. First one needs to enable Numpy-like behavior:

In [10]:
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

This enables automatic type promotion, in addition to adding more numpy-like member functions (e.g. ravel and reshape) to tf tensors. 

In [11]:
tf.constant(3) * tf.constant(2.) # previously an error

<tf.Tensor: shape=(), dtype=float64, numpy=6.0>

In [12]:
x = tnp.arange(12).reshape((3,4))
x

<tf.Tensor: shape=(3, 4), dtype=int64, numpy=
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])>

In [13]:
x[tnp.newaxis, 1, :]

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[4, 5, 6, 7]])>

Note that (unlike JAX) some things like direct assignment are still not allowed:

In [14]:
try:
    x[1,1] = 100
except Exception as e:
    print(type(e), ':', e)
else:
    assert False # (unreached)

<class 'TypeError'> : 'tensorflow.python.framework.ops.EagerTensor' object does not support item assignment


Mutible tensors need to be declared with tf.Variable:

In [15]:
x = tf.Variable([[1,2],[3,4]])
x

<tf.Variable 'Variable:0' shape=(2, 2) dtype=int32, numpy=
array([[1, 2],
       [3, 4]], dtype=int32)>

In [16]:
x.assign( 2*x )
x

<tf.Variable 'Variable:0' shape=(2, 2) dtype=int32, numpy=
array([[2, 4],
       [6, 8]], dtype=int32)>

In [17]:
x.scatter_nd_update([[0,0], [1,1,]], [23, 42])
x

<tf.Variable 'Variable:0' shape=(2, 2) dtype=int32, numpy=
array([[23,  4],
       [ 6, 42]], dtype=int32)>

### Custom objects

Custom objects such as loss functions can be used, but they can create complications when loading and saving models. In particular, you'll need to provide a dictionary at load time with the custom components, which can be either functions or objects that implement a get_config function.

In [18]:
m = keras.models.Sequential([
    layers.Input((8,)),
    layers.Dense(12),
    layers.Activation('softmax')
])

In [19]:
def my_loss(y_true, y_predicted):
    residual = y_true - y_predicted
    return tf.where(tf.abs(residual) > 1, residual, tf.square(residual)/2)

class MyLRSchedule(keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, initial_learning_rate, alpha, **kwargs):
        self.initial_learning_rate = initial_learning_rate
        self.alpha = alpha
        super().__init__(**kwargs) # kwargs not needed for this example

    def __call__(self, step):
        return self.initial_learning_rate / (step + 1)

    def get_config(self):
        base_config = {} #super().get_config()
        return {
            'initial_learning_rate': self.initial_learning_rate,
            'alpha': self.alpha,
            **base_config,
        }

In [20]:
m.compile(
    loss=my_loss,
    optimizer=keras.optimizers.SGD(learning_rate=MyLRSchedule(0.002, 0.999))
)
m.fit(tf.constant([[1,2,3,4,5,6,7,8]]), tf.constant([1]), verbose=0)
m.save('ch13_custom_objects.keras')

I0000 00:00:1711765640.283118  582241 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Load will fail without custom objects

In [21]:
try:
    m1 = keras.models.load_model('ch13_custom_objects.keras')
    assert False # unreached
except TypeError as e:
    print(type(e), e)

<class 'TypeError'> Cannot deserialize object of type `MyLRSchedule`. If `MyLRSchedule` is a custom class, please register it using the `@keras.saving.register_keras_serializable()` decorator.


Custom objects can be passed either as a context or an extra parameters on load

In [22]:
custom_objects = {
    'my_loss': my_loss,
    'MyLRSchedule': MyLRSchedule
}

m1 = tf.keras.models.load_model(
    'ch13_custom_objects.keras',
    custom_objects=custom_objects
)

with keras.saving.custom_object_scope(custom_objects):
    m2 = tf.keras.models.load_model('ch13_custom_objects.keras')

Custom layers

Simple layers without weights can be constructed with a lambda layer

In [23]:
square_layer = keras.layers.Lambda(lambda x : tf.square(x))

More complex layers can be implemented by deriving from the Layer class

In [24]:
class DenseAndFlatten(layers.Layer):
    """Same as the old Dense, but with less functionality"""
    
    def __init__(self, units, activation=None, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.activation = keras.activations.get(activation)

    # initialize the parameters etc the first time this instance is used.
    def build(self, batch_input_shape):
        self.W = self.add_weight(name="W", shape=(batch_input_shape[-1], self.units),
            initializer="glorot_normal")
        self.b = self.add_weight(name="b", shape=(self.units,),
            initializer="zeros")

    def call(self, X):
        return self.activation(X @ self.W + self.b)

    def get_config(self):
        # needed only if supporting load/save functionality
        return {
            "units": self.units,
            "activation": keras.activations.serialize(self.activation),
            **(super().get_config())
        }

Models are a subclass of layers and custom layers can be defined in a similar fashion. This can sometimes be useful when a loss function needs to contain a term from internal variables in the model, in which case you can call the Model.add_loss to add a term to the loss function (typically as part of the call function).

## GradientTape and Autodiff

The GradientTape class can be used to record autodiff-calculated gradients:

In [25]:
def f(x,y):
    return x - 2*y + x*y

x1 = tf.Variable(42.)
y1 = tf.Variable(23.)

with tf.GradientTape() as tape:
    z1 = f(x1, y1)

tape.gradient(z1, [x1, y1])

[<tf.Tensor: shape=(), dtype=float32, numpy=24.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=40.0>]

By default gradient tapes can only be queried once:

In [26]:
with tf.GradientTape() as tape:
    z1 = f(x1, y1)

print(tape.gradient(z1, x1))

try:
    print(tape.gradient(z1, y1))
except RuntimeError as e:
    print(type(e), ':', e)
else:
    assert False # (unreached)

tf.Tensor(24.0, shape=(), dtype=float32)
<class 'RuntimeError'> : A non-persistent GradientTape can only be used to compute one set of gradients (or jacobians)


This behavior can be controlled with the `persistent` parameter:

In [27]:
with tf.GradientTape(persistent=True) as tape:
    z1 = f(x1, y1)

print(tape.gradient(z1, x1))
print(tape.gradient(z1, y1))

tf.Tensor(24.0, shape=(), dtype=float32)
tf.Tensor(40.0, shape=(), dtype=float32)


Only variables are tracked by default:

In [28]:
c1 = tf.constant(7.)

with tf.GradientTape() as tape:
    z1 = c1 * f(x1, y1)

print(tape.gradient(z1, c1))

None


Things like constants can be tracked by adding them with the `watch` function:

In [29]:
with tf.GradientTape() as tape:
    tape.watch(c1)
    z1 = c1 * f(x1, y1)

print(tape.gradient(z1, c1))

tf.Tensor(962.0, shape=(), dtype=float32)


### Custom training loops

We can easily create custom training loops

In [30]:
(X_train, y_train), (X_test,y_test) = keras.datasets.mnist.load_data()

In [31]:
m = keras.Sequential([
    layers.Input(shape=(28,28)),
    layers.Reshape((28,28,1)),
    layers.Conv2D(8, (5,5), padding="same", kernel_initializer="he_uniform"),
    layers.Activation('leaky_relu'),
    layers.MaxPool2D(),
    layers.Conv2D(16, (5,5), padding="same", kernel_initializer="he_uniform"),
    layers.Activation('leaky_relu'),
    layers.MaxPool2D(),
    layers.Conv2D(32, (5,5), padding="same", kernel_initializer="he_uniform"),
    layers.Activation('leaky_relu'),
    layers.MaxPool2D(),
    layers.Conv2D(32, (5,5), padding="same", kernel_initializer="he_uniform"),
    layers.Activation('leaky_relu'),
    layers.MaxPool2D(),
    layers.Flatten(),
    layers.Dense(32, kernel_initializer="he_uniform"),
    layers.Activation('leaky_relu'),
    layers.Dense(10),
    layers.Activation('softmax'),
])

In [32]:
n_epochs = 5
batch_size = 128
optimizer = keras.optimizers.AdamW()
loss_fn = keras.losses.SparseCategoricalCrossentropy()
mean_loss_train = keras.metrics.Mean(name="mean_loss_train")
mean_loss_val = keras.metrics.Mean(name="mean_loss_val")
metrics_train = [keras.metrics.SparseCategoricalAccuracy()]
metrics_val = [keras.metrics.SparseCategoricalAccuracy()]

In [33]:
for epoch in range(n_epochs):
    for i in range(0, X_train.shape[0], batch_size):
        X_batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]
        with tf.GradientTape() as tape:
            y_hat_batch = m(X_batch, training=True)
            loss = tf.reduce_mean(loss_fn(y_batch, y_hat_batch))
        gradients = tape.gradient(loss, m.trainable_variables)
        optimizer.apply_gradients(zip(gradients, m.trainable_variables))
        mean_loss_train(loss)
        for metric in metrics_train:
            metric(y_batch, y_hat_batch)
    
    for i in range(0, X_test.shape[0], batch_size):
        X_batch = X_test[i:i+batch_size]
        y_batch = y_test[i:i+batch_size]
        y_hat_batch = m(X_batch)
        loss = tf.reduce_mean(loss_fn(y_batch, y_hat_batch))
        mean_loss_val(loss)
        for metric in metrics_val:
            metric(y_batch, y_hat_batch)
        
    
    print(
        f"Epoch:{epoch}"
        + f" loss_train:{mean_loss_train.result():.3f} "
        + " ".join([
                f"{metric.name}_train:{metric.result():.3f}" for metric in metrics_train
            ])
        + f" loss_val:{mean_loss_val.result():.3f} "
        + " ".join([
                f"{metric.name}_val:{metric.result():.3f}" for metric in metrics_val
            ])
    )
            
    for metric in metrics_train + metrics_val:
        metric.reset_states()

Epoch:0 loss_train:4.724 sparse_categorical_accuracy_train:0.757 loss_val:0.570 sparse_categorical_accuracy_val:0.875
Epoch:1 loss_train:2.558 sparse_categorical_accuracy_train:0.908 loss_val:0.429 sparse_categorical_accuracy_val:0.926
Epoch:2 loss_train:1.781 sparse_categorical_accuracy_train:0.940 loss_val:0.351 sparse_categorical_accuracy_val:0.947
Epoch:3 loss_train:1.374 sparse_categorical_accuracy_train:0.957 loss_val:0.301 sparse_categorical_accuracy_val:0.956
Epoch:4 loss_train:1.122 sparse_categorical_accuracy_train:0.967 loss_val:0.267 sparse_categorical_accuracy_val:0.964


### Tensorflow graphs

A function can be converted to a Tensorflow graph using tf.function:

In [34]:
def my_python_func(n):
    result = 0
    for i in range(n):
        result += (-n)**3
    return result

my_tf_func = tf.function(my_python_func)

In [35]:
my_python_func(8)

-4096

In [36]:
my_tf_func(8)

<tf.Tensor: shape=(), dtype=int32, numpy=-4096>

Underneath the hood, the function is first converted to a control-statement-free form:

In [37]:
print(tf.autograph.to_code(my_python_func))

def tf__my_python_func(n):
    with ag__.FunctionScope('my_python_func', 'fscope', ag__.ConversionOptions(recursive=True, user_requested=True, optional_features=(), internal_convert_user_code=True)) as fscope:
        do_return = False
        retval_ = ag__.UndefinedReturnValue()
        result = 0

        def get_state():
            return (result,)

        def set_state(vars_):
            nonlocal result
            (result,) = vars_

        def loop_body(itr):
            nonlocal result
            i = itr
            result = ag__.ld(result)
            result += (-n) ** 3
        i = ag__.Undefined('i')
        ag__.for_stmt(ag__.converted_call(ag__.ld(range), (ag__.ld(n),), None, fscope), None, loop_body, get_state, set_state, ('result',), {'iterate_names': 'i'})
        try:
            do_return = True
            retval_ = ag__.ld(result)
        except:
            do_return = False
            raise
        return fscope.ret(retval_, do_return)



## Exercises

### 12.1

Tensorflow is a framework for performing efficient calculations on multidimensional arrays with an extensive set of tools for building neural networks built on these arrays. Other similar popular libraries include PyTorch and JAX.

### 12.2

In most cases Tensorflow and Numpy can be used for the same tasks, but in many cases they do not use identical syntax so they are usually not drop-in replacements for one another. Tensorflow tends to be more performant but less widely supported. Numpy also lacks features such as autodiff and pre-built neural network abstractions.

### 12.3

In [38]:
tf.range(10)

<tf.Tensor: shape=(10,), dtype=int32, numpy=array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)>

In [39]:
tf.constant(np.arange(10))

<tf.Tensor: shape=(10,), dtype=int64, numpy=array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])>

Note the differences in default integer size, and also that the latter will not work properly with autograph or autodiff.

### 12.4

In [40]:
tf.Variable(4)

<tf.Variable 'Variable:0' shape=() dtype=int32, numpy=4>

In [41]:
tf.RaggedTensor.from_row_lengths(range(4), [3,1])

<tf.RaggedTensor [[0, 1, 2], [3]]>

In [42]:
tf.SparseTensor([[1,1],[2,3]], [7,8], [3,4])

SparseTensor(indices=tf.Tensor(
[[1 1]
 [2 3]], shape=(2, 2), dtype=int64), values=tf.Tensor([7 8], shape=(2,), dtype=int32), dense_shape=tf.Tensor([3 4], shape=(2,), dtype=int64))

In [43]:
tf.GradientTape()

<tensorflow.python.eager.backprop.GradientTape at 0x7fd9d5bc63b0>

In [44]:
tf.Graph()

<tensorflow.python.framework.ops.Graph at 0x7fd8167b7a30>

In [45]:
tf.TensorArray(tf.float32, size=0)

<tensorflow.python.ops.tensor_array_ops.TensorArray at 0x7fd816584ee0>

### 12.5

As is typical for callable objects, the choice between a function and a class is usually based on whether one need state that will persist across multiple calls to the callable object.

### 12.6

See 12.5.

### 12.7

Although custom models are a subclass of layers (and will work as one), it's best to reserve custom models for capturing the behavior of the model as a whole.

### 12.8

Models that require more detailed control than is available in the standard training loop, e.g. the use of different optimizers in different parts of the model.

### 12.9

They usually can contain arbitrary python code, but performance may be poor compared to code that could be converted by autograph.

### 12.10

- External library code may only be called during tracing, so avoid this if possible (and plan carefully when not)
- For loops will only run during tracing in most circumstances (exceptions include looping over tensors or datasets)
- Variable creation will only occur during tracing

### 12.11

A dynamic keras model may be needed when the model can change during training, e.g. adding or removing layers. One approach to this would use a custom training loop.

### 12.12

In [46]:
class LayerNormalization(layers.Layer):
    """Same as the old Dense, but with less functionality"""
    
    def __init__(self, epsilon=1e-3, **kwargs):
        super().__init__(**kwargs)
        self.epsilon = tf.constant(epsilon)

    # initialize the parameters etc the first time this instance is used.
    def build(self, batch_input_shape):
        self.alpha = self.add_weight(name="alpha", shape=batch_input_shape[-1:],
            initializer="ones")
        self.beta = self.add_weight(name="beta", shape=batch_input_shape[-1:],
            initializer="zeros")

    def call(self, X):
        mu, sigma_squared = tf.nn.moments(X, axes=-1, keepdims=True)
        sigma = tf.math.sqrt(sigma_squared)
        return self.alpha * (X - mu) / (sigma + self.epsilon) + self.beta

    def get_config(self):
        # needed only if supporting load/save functionality
        return {
            "epsilon": self.epsilon,
            **(super().get_config())
        }

In [47]:
l1 = LayerNormalization()
l2 = tf.keras.layers.LayerNormalization()

l1.build(tf.constant((2,3)))
l2.build(tf.constant((2,3)))

X = tf.constant([[1,2,3],[5,7,11]], dtype=float)
print(l1(X), l2(X))

tf.Tensor(
[[-1.2232468   0.          1.2232468 ]
 [-1.0686165  -0.26715407  1.3357707 ]], shape=(2, 3), dtype=float32) tf.Tensor(
[[-1.2238274   0.          1.2238274 ]
 [-1.0689592  -0.26723987  1.3361988 ]], shape=(2, 3), dtype=float32)


### 12.13
#### 12.13.a

In [48]:
(X_train, y_train), (X_test,y_test) = keras.datasets.fashion_mnist.load_data()

In [49]:
m = keras.Sequential([
    layers.Input(shape=(28,28)),
    layers.Reshape((28,28,1)),
    layers.Conv2D(8, (5,5), padding="same", kernel_initializer="he_uniform"),
    layers.Activation('leaky_relu'),
    layers.MaxPool2D(),
    layers.Conv2D(16, (5,5), padding="same", kernel_initializer="he_uniform"),
    layers.Activation('leaky_relu'),
    layers.MaxPool2D(),
    layers.Conv2D(32, (5,5), padding="same", kernel_initializer="he_uniform"),
    layers.Activation('leaky_relu'),
    layers.MaxPool2D(),
    layers.Conv2D(32, (5,5), padding="same", kernel_initializer="he_uniform"),
    layers.Activation('leaky_relu'),
    layers.MaxPool2D(),
    layers.Flatten(),
    layers.Dense(32, kernel_initializer="he_uniform"),
    layers.Activation('leaky_relu'),
    layers.Dense(10),
    layers.Activation('softmax'),
])

In [50]:
n_epochs = 5
batch_size = 128
optimizer = keras.optimizers.AdamW()
loss_fn = keras.losses.SparseCategoricalCrossentropy()
mean_loss_train = keras.metrics.Mean(name="mean_loss_train")
mean_loss_val = keras.metrics.Mean(name="mean_loss_val")
metrics_train = [keras.metrics.SparseCategoricalAccuracy()]
metrics_val = [keras.metrics.SparseCategoricalAccuracy()]

In [51]:
for epoch in range(n_epochs):
    for i in range(0, X_train.shape[0], batch_size):
        X_batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]
        with tf.GradientTape() as tape:
            y_hat_batch = m(X_batch, training=True)
            loss = tf.reduce_mean(loss_fn(y_batch, y_hat_batch))
        gradients = tape.gradient(loss, m.trainable_variables)
        optimizer.apply_gradients(zip(gradients, m.trainable_variables))
        mean_loss_train(loss)
        for metric in metrics_train:
            metric(y_batch, y_hat_batch)
        print(
            f"Epoch:{epoch}"
            + f" {100*i/X_train.shape[0]:5.2f}% "
            + f" loss_train:{mean_loss_train.result():5.3f} "
            + " ".join([
                f"{metric.name}_train:{metric.result():.3f}" for metric in metrics_train
            ]),
            end="\r"
        )
    
    for i in range(0, X_test.shape[0], batch_size):
        X_batch = X_test[i:i+batch_size]
        y_batch = y_test[i:i+batch_size]
        y_hat_batch = m(X_batch)
        loss = tf.reduce_mean(loss_fn(y_batch, y_hat_batch))
        mean_loss_val(loss)
        for metric in metrics_val:
            metric(y_batch, y_hat_batch)
        
    
    print(
        f"Epoch:{epoch}"
        + f" loss_train:{mean_loss_train.result():.3f} "
        + " ".join([
                f"{metric.name}_train:{metric.result():.3f}" for metric in metrics_train
            ])
        + f" loss_val:{mean_loss_val.result():.3f} "
        + " ".join([
                f"{metric.name}_val:{metric.result():.3f}" for metric in metrics_val
            ])
    )
            
    for metric in metrics_train + metrics_val:
        metric.reset_states()

Epoch:0 loss_train:2.939 sparse_categorical_accuracy_train:0.714 loss_val:0.686 sparse_categorical_accuracy_val:0.788
Epoch:1 loss_train:1.745 sparse_categorical_accuracy_train:0.816 loss_val:0.614 sparse_categorical_accuracy_val:0.817
Epoch:2 loss_train:1.313 sparse_categorical_accuracy_train:0.844 loss_val:0.571 sparse_categorical_accuracy_val:0.835
Epoch:3 loss_train:1.084 sparse_categorical_accuracy_train:0.859 loss_val:0.541 sparse_categorical_accuracy_val:0.844
Epoch:4 loss_train:0.940 sparse_categorical_accuracy_train:0.870 loss_val:0.520 sparse_categorical_accuracy_val:0.849


### 12.13
#### 12.13.b

In [52]:
(X_train, y_train), (X_test,y_test) = keras.datasets.fashion_mnist.load_data()

In [53]:
lower = keras.Sequential([
    layers.Conv2D(8, (5,5), padding="same", kernel_initializer="he_uniform"),
    layers.Activation('leaky_relu'),
    layers.MaxPool2D(),
    layers.Conv2D(16, (5,5), padding="same", kernel_initializer="he_uniform"),
    layers.Activation('leaky_relu'),
    layers.MaxPool2D(),
    layers.Conv2D(32, (5,5), padding="same", kernel_initializer="he_uniform"),
    layers.Activation('leaky_relu'),
    layers.MaxPool2D(),
    layers.Conv2D(32, (5,5), padding="same", kernel_initializer="he_uniform"),
    layers.Activation('leaky_relu'),
    layers.MaxPool2D(),
])

upper = keras.Sequential([
    layers.Dense(32, kernel_initializer="he_uniform"),
    layers.Activation('leaky_relu'),
    layers.Dense(10),
    layers.Activation('softmax'),
])

m = keras.Sequential([
    layers.Input(shape=(28,28)),
    layers.Reshape((28,28,1)),
    lower,
    layers.Flatten(),
    upper,
])

In [54]:
n_epochs = 5
batch_size = 128
optimizer_lower = keras.optimizers.AdamW()
optimizer_upper = keras.optimizers.Adam(learning_rate=0.002)
loss_fn = keras.losses.SparseCategoricalCrossentropy()
mean_loss_train = keras.metrics.Mean(name="mean_loss_train")
mean_loss_val = keras.metrics.Mean(name="mean_loss_val")
metrics_train = [keras.metrics.SparseCategoricalAccuracy()]
metrics_val = [keras.metrics.SparseCategoricalAccuracy()]

In [55]:
for epoch in range(n_epochs):
    for i in range(0, X_train.shape[0], batch_size):
        X_batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]
        with tf.GradientTape(persistent=True) as tape:
            y_hat_batch = m(X_batch, training=True)
            loss = tf.reduce_mean(loss_fn(y_batch, y_hat_batch))
        gradients_lower = tape.gradient(loss, lower.trainable_variables)
        optimizer_lower.apply_gradients(zip(gradients_lower, lower.trainable_variables))
        gradients_upper = tape.gradient(loss, upper.trainable_variables)
        optimizer_upper.apply_gradients(zip(gradients_upper, upper.trainable_variables))
        mean_loss_train(loss)
        for metric in metrics_train:
            metric(y_batch, y_hat_batch)
        print(
            f"Epoch:{epoch}"
            + f" {100*i/X_train.shape[0]:5.2f}% "
            + f" loss_train:{mean_loss_train.result():5.3f} "
            + " ".join([
                f"{metric.name}_train:{metric.result():.3f}" for metric in metrics_train
            ]),
            end="\r"
        )
    
    for i in range(0, X_test.shape[0], batch_size):
        X_batch = X_test[i:i+batch_size]
        y_batch = y_test[i:i+batch_size]
        y_hat_batch = m(X_batch)
        loss = tf.reduce_mean(loss_fn(y_batch, y_hat_batch))
        mean_loss_val(loss)
        for metric in metrics_val:
            metric(y_batch, y_hat_batch)
        
    
    print(
        f"Epoch:{epoch}"
        + f" loss_train:{mean_loss_train.result():.3f} "
        + " ".join([
                f"{metric.name}_train:{metric.result():.3f}" for metric in metrics_train
            ])
        + f" loss_val:{mean_loss_val.result():.3f} "
        + " ".join([
                f"{metric.name}_val:{metric.result():.3f}" for metric in metrics_val
            ])
    )
            
    for metric in metrics_train + metrics_val:
        metric.reset_states()

Epoch:0 loss_train:4.567 sparse_categorical_accuracy_train:0.680 loss_val:0.735 sparse_categorical_accuracy_val:0.761
Epoch:1 loss_train:2.579 sparse_categorical_accuracy_train:0.798 loss_val:0.657 sparse_categorical_accuracy_val:0.797
Epoch:2 loss_train:1.882 sparse_categorical_accuracy_train:0.829 loss_val:0.607 sparse_categorical_accuracy_val:0.817
Epoch:3 loss_train:1.519 sparse_categorical_accuracy_train:0.846 loss_val:0.571 sparse_categorical_accuracy_val:0.834
Epoch:4 loss_train:1.295 sparse_categorical_accuracy_train:0.860 loss_val:0.545 sparse_categorical_accuracy_val:0.843
