Builder's Guide: https://d2l.ai/chapter_builders-guide/model-construction.html

## 6.1. Layers and Modules

In [1]:
import tensorflow as tf

In [2]:
net = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation=tf.nn.relu),
    tf.keras.layers.Dense(10),
])

X = tf.random.uniform((2, 20))
net(X).shape

TensorShape([2, 10])

In [3]:
class MLP(tf.keras.Model):
    """Multilayer perceptron: a 256-unit ReLU hidden layer and a 10-unit output layer."""

    def __init__(self):
        # Let the parent class tf.keras.Model perform its own initialization
        # before the layers are attached
        super().__init__()
        self.hidden = tf.keras.layers.Dense(256, activation=tf.nn.relu)
        self.out = tf.keras.layers.Dense(10)

    def call(self, X):
        """Forward propagation: apply the hidden layer, then the output layer."""
        hidden_activations = self.hidden(X)
        return self.out(hidden_activations)

In [4]:
net = MLP()
net(X).shape

TensorShape([2, 10])

In [5]:
class MySequential(tf.keras.Model):
    """Minimal sequential container: applies the given modules one after another."""

    def __init__(self, *args):
        super().__init__()
        # The positional arguments are kept, in order, as the chain to apply
        self.modules = args

    def call(self, X):
        """Feed X through every stored module in order and return the result."""
        output = X
        for layer in self.modules:
            output = layer(output)
        return output

In [6]:
class FixedHiddenMLP(tf.keras.Model):
    """MLP combining a constant random projection, a Dense layer and Python control flow."""

    def __init__(self):
        super().__init__()
        self.flatten = tf.keras.layers.Flatten()
        # Wrapped in tf.constant, so these random weights are not updated
        # during training (i.e., constant parameters)
        self.rand_weight = tf.constant(tf.random.uniform((20, 20)))
        self.dense = tf.keras.layers.Dense(20, activation=tf.nn.relu)

    def call(self, inputs):
        """Return a scalar: the sum of the repeatedly halved activations."""
        flat = self.flatten(inputs)
        # Constant projection shifted by 1, followed by relu
        projected = tf.matmul(flat, self.rand_weight) + 1
        hidden = self.dense(tf.nn.relu(projected))
        # Control flow: keep halving while the sum of absolute values
        # exceeds 1
        while tf.reduce_sum(tf.math.abs(hidden)) > 1:
            hidden = hidden / 2
        return tf.reduce_sum(hidden)

In [7]:
net = FixedHiddenMLP()
net(X)

<tf.Tensor: shape=(), dtype=float32, numpy=0.72308606>

### 6.1.3. Executing Code in the Forward Propagation Method

In [8]:
class NestMLP(tf.keras.Model):
    """Model that nests a two-layer Sequential block ahead of one more Dense layer."""

    def __init__(self):
        super().__init__()
        self.net = tf.keras.Sequential()
        # Two ReLU layers of decreasing width make up the nested block
        for width in (64, 32):
            self.net.add(tf.keras.layers.Dense(width, activation=tf.nn.relu))
        self.dense = tf.keras.layers.Dense(16, activation=tf.nn.relu)

    def call(self, inputs):
        """Run the nested Sequential block, then the final Dense layer."""
        nested_out = self.net(inputs)
        return self.dense(nested_out)

# Stack heterogeneous blocks: the nested MLP, a plain Dense layer and the
# fixed-hidden MLP
chimera = tf.keras.Sequential([
    NestMLP(),
    tf.keras.layers.Dense(20),
    FixedHiddenMLP(),
])
chimera(X)

<tf.Tensor: shape=(), dtype=float32, numpy=0.5950629>

In [9]:
net = FixedHiddenMLP()
net(X)

<tf.Tensor: shape=(), dtype=float32, numpy=0.7176568>

## 6.2. Parameter Management

In [10]:
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(4, activation=tf.nn.relu),
    tf.keras.layers.Dense(1),
])

X = tf.random.uniform((2, 4))
net(X).shape

TensorShape([2, 1])

In [11]:
net.layers[2].weights

[<tf.Variable 'dense_12/kernel:0' shape=(4, 1) dtype=float32, numpy=
 array([[-0.7930276 ],
        [ 0.1335479 ],
        [ 0.7386651 ],
        [ 0.24774909]], dtype=float32)>,
 <tf.Variable 'dense_12/bias:0' shape=(1,) dtype=float32, numpy=array([0.], dtype=float32)>]

In [12]:
type(net.layers[2].weights[1]), tf.convert_to_tensor(net.layers[2].weights[1])

(tensorflow.python.ops.resource_variable_ops.ResourceVariable,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>)

In [13]:
net.get_weights()

[array([[-0.12317443,  0.00921631, -0.4517009 ,  0.5182318 ],
        [ 0.1324361 , -0.21518761,  0.46206373, -0.70800185],
        [-0.05851465, -0.32758492, -0.00445682,  0.5983378 ],
        [-0.41979283, -0.667422  ,  0.03833359,  0.8380889 ]],
       dtype=float32),
 array([0., 0., 0., 0.], dtype=float32),
 array([[-0.7930276 ],
        [ 0.1335479 ],
        [ 0.7386651 ],
        [ 0.24774909]], dtype=float32),
 array([0.], dtype=float32)]

In [14]:
# Parameter sharing: the same Dense instance is passed in twice. tf.keras
# behaves a bit differently here. It removes the duplicate layer
# automatically
shared = tf.keras.layers.Dense(4, activation=tf.nn.relu)
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    shared,
    shared,
    tf.keras.layers.Dense(1),
])

net(X)
# Four entries were passed in, but `shared` is listed only once, so
# `net.layers` holds three layers
print(len(net.layers) == 3)

True


## 6.3. Parameter Initialization

In [15]:
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(4, activation=tf.nn.relu),
    tf.keras.layers.Dense(1),
])

X = tf.random.uniform((2, 4))
net(X).shape

TensorShape([2, 1])

### 6.3.1. Built-in Initialization

In [16]:
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(
        4, activation=tf.nn.relu,
        kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.01),
        bias_initializer=tf.zeros_initializer()),
    tf.keras.layers.Dense(1)])

net(X)
net.weights[0], net.weights[1]

(<tf.Variable 'dense_17/kernel:0' shape=(4, 4) dtype=float32, numpy=
 array([[ 0.00470567, -0.00807384, -0.00109407,  0.01465958],
        [ 0.00696203,  0.00278478,  0.00436364,  0.00804104],
        [ 0.0110795 , -0.00353022,  0.0018249 , -0.00260229],
        [ 0.00412887,  0.00194994, -0.00630489, -0.0002423 ]],
       dtype=float32)>,
 <tf.Variable 'dense_17/bias:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>)

In [17]:
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(
        4, activation=tf.nn.relu,
        kernel_initializer=tf.keras.initializers.Constant(1),
        bias_initializer=tf.zeros_initializer()),
    tf.keras.layers.Dense(1),
])

net(X)
net.weights[0], net.weights[1]

(<tf.Variable 'dense_19/kernel:0' shape=(4, 4) dtype=float32, numpy=
 array([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]], dtype=float32)>,
 <tf.Variable 'dense_19/bias:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>)

In [18]:
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(
        4,
        activation=tf.nn.relu,
        kernel_initializer=tf.keras.initializers.GlorotUniform()),
    tf.keras.layers.Dense(
        1, kernel_initializer=tf.keras.initializers.Constant(42)),
])

net(X)
print(net.layers[1].weights[0])
print(net.layers[2].weights[0])

<tf.Variable 'dense_21/kernel:0' shape=(4, 4) dtype=float32, numpy=
array([[ 0.18770963, -0.22140336, -0.40790886,  0.42803973],
       [-0.10313588, -0.1775465 ,  0.7639758 , -0.27005482],
       [-0.73524314, -0.745402  , -0.6793747 , -0.2170139 ],
       [-0.6623129 , -0.5094018 ,  0.23806709, -0.6926675 ]],
      dtype=float32)>
<tf.Variable 'dense_22/kernel:0' shape=(4, 1) dtype=float32, numpy=
array([[42.],
       [42.],
       [42.],
       [42.]], dtype=float32)>


#### 6.3.1.1. Custom Initialization

In [19]:
class MyInit(tf.keras.initializers.Initializer):
    """Initializer that samples from U(-10, 10) and zeroes every entry with |w| < 5."""

    def __call__(self, shape, dtype=None, **kwargs):
        """Return the initial value for a weight of the given shape.

        Args:
            shape: Shape of the tensor to create.
            dtype: dtype of the returned tensor. When None, the Keras default
                float type (`tf.keras.backend.floatx()`) is used.
            **kwargs: Accepted and ignored, mirroring the signature of the
                base `Initializer.__call__`.

        Returns:
            A tensor of the requested shape and dtype whose entries are
            either 0 or have an absolute value of at least 5.
        """
        if dtype is None:
            # tf.random.uniform needs a concrete dtype
            dtype = tf.keras.backend.floatx()
        dtype = tf.dtypes.as_dtype(dtype)
        data = tf.random.uniform(shape, -10, 10, dtype=dtype)
        # Build the keep-mask in the dtype of `data`: a hard-coded float32
        # mask cannot be multiplied with samples of another dtype
        keep = tf.cast(tf.abs(data) >= 5, dtype)
        return data * keep

net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(
        4,
        activation=tf.nn.relu,
        kernel_initializer=MyInit()),
    tf.keras.layers.Dense(1),
])

net(X)
print(net.layers[1].weights[0])

<tf.Variable 'dense_23/kernel:0' shape=(4, 4) dtype=float32, numpy=
array([[ 8.6502   , -0.       , -0.       , -9.381149 ],
       [ 8.4265175,  5.957653 , -0.       ,  7.558634 ],
       [ 5.9635277,  7.2402744, -8.67532  ,  6.335335 ],
       [ 0.       , -8.746784 , -9.136968 ,  0.       ]], dtype=float32)>


In [20]:
# Add 1 to every entry of the kernel of net.layers[1], writing the result
# back into the variable through a full slice
net.layers[1].weights[0][:].assign(net.layers[1].weights[0] + 1)
# Overwrite the single entry at row 0, column 0
net.layers[1].weights[0][0, 0].assign(42)
net.layers[1].weights[0]

<tf.Variable 'dense_23/kernel:0' shape=(4, 4) dtype=float32, numpy=
array([[42.       ,  1.       ,  1.       , -8.381149 ],
       [ 9.4265175,  6.957653 ,  1.       ,  8.558634 ],
       [ 6.9635277,  8.240274 , -7.6753197,  7.335335 ],
       [ 1.       , -7.746784 , -8.136968 ,  1.       ]], dtype=float32)>

## 6.4. Lazy Initialization

In [21]:
net = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation=tf.nn.relu),
    tf.keras.layers.Dense(10),
])

In [22]:
# No input has been seen yet, so list what each layer currently holds
[layer.get_weights() for layer in net.layers]

[[], []]

In [23]:
X = tf.random.uniform((2, 20))
net(X)
[w.shape for w in net.get_weights()]

[(20, 256), (256,), (256, 10), (10,)]

## 6.5. Custom Layers

In [24]:
class CenteredLayer(tf.keras.Model):
    """Parameter-free layer that subtracts the mean of its input."""

    def call(self, X):
        """Return X shifted by its mean, taken over all elements."""
        mean = tf.reduce_mean(X)
        return X - mean

In [25]:
layer = CenteredLayer()
layer(tf.constant([1.0, 2, 3, 4, 5]))

<tf.Tensor: shape=(5,), dtype=float32, numpy=array([-2., -1.,  0.,  1.,  2.], dtype=float32)>

In [26]:
net = tf.keras.Sequential([tf.keras.layers.Dense(128), CenteredLayer()])

In [27]:
Y = net(tf.random.uniform((4, 8)))
tf.reduce_mean(Y)

<tf.Tensor: shape=(), dtype=float32, numpy=-1.8626451e-09>

### 6.5.2. Layers with Parameters

In [28]:
class MyDense(tf.keras.Model):
    """Fully connected layer with a ReLU activation whose parameters are created in build()."""

    def __init__(self, units):
        super().__init__()
        # Output width; the input width is read from the input shape in build()
        self.units = units

    def build(self, X_shape):
        """Create the weight matrix and the bias from the last input dimension."""
        in_features = X_shape[-1]
        self.weight = self.add_weight(
            name='weight',
            shape=[in_features, self.units],
            initializer=tf.random_normal_initializer())
        self.bias = self.add_weight(
            name='bias',
            shape=[self.units],
            initializer=tf.zeros_initializer())

    def call(self, X):
        """Return relu(X @ weight + bias)."""
        return tf.nn.relu(tf.matmul(X, self.weight) + self.bias)

In [29]:
dense = MyDense(3)
dense(tf.random.uniform((2, 5)))
dense.get_weights()

[array([[ 0.08063438, -0.02159081,  0.0818014 ],
        [ 0.1103104 , -0.11156829,  0.05035781],
        [ 0.04400355, -0.02747956,  0.04825045],
        [-0.04681975, -0.09091557,  0.11648601],
        [ 0.05192385, -0.0989032 , -0.05969038]], dtype=float32),
 array([0., 0., 0.], dtype=float32)]

In [30]:
dense(tf.random.uniform((2, 5)))

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[0.17001481, 0.        , 0.07705029],
       [0.05126169, 0.        , 0.12768671]], dtype=float32)>

In [31]:
net = tf.keras.models.Sequential([MyDense(8), MyDense(1)])
net(tf.random.uniform((2, 64)))

<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[0.00263962],
       [0.02497042]], dtype=float32)>

In [32]:
tf.random.uniform((2,64)).shape

TensorShape([2, 64])

## 6.6. File I/O

In [33]:
import numpy as np
import tensorflow as tf

### 6.6.1. Loading and saving tensors

In [34]:
x = tf.range(4)
np.save('x-file.npy', x)

In [35]:
x2 = np.load('x-file.npy', allow_pickle=True)
x2

array([0, 1, 2, 3], dtype=int32)

In [36]:
y = tf.zeros(4)
# Saving the list [x, y] stores one combined array, so the int32 `x` comes
# back as floats (see the output below)
np.save('xy-files.npy', [x, y])
# Unpacking the loaded array along its first axis yields the two rows
x2, y2 = np.load('xy-files.npy', allow_pickle=True)
(x2, y2)

(array([0., 1., 2., 3.]), array([0., 0., 0., 0.]))

In [37]:
mydict = {'x': x, 'y': y}
# A dict is not an array, so np.save stores it as a pickled Python object
np.save('mydict.npy', mydict)
# allow_pickle=True is needed to read the object back. Unpickling can run
# arbitrary code, so only do this with files you created yourself.
# The result is a 0-d object array wrapping the dict (see the output below),
# not the dict itself; presumably mydict2.item() unwraps it -- confirm
# before relying on it
mydict2 = np.load('mydict.npy', allow_pickle=True)
mydict2

array({'x': <tf.Tensor: shape=(4,), dtype=int32, numpy=array([0, 1, 2, 3], dtype=int32)>, 'y': <tf.Tensor: shape=(4,), dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>},
      dtype=object)

### 6.6.2. Loading and saving model parameters

In [38]:
# Note: this redefines the MLP class from section 6.1, adding a Flatten layer
class MLP(tf.keras.Model):
    """MLP for the save/load demo: flatten, a 256-unit ReLU layer, a 10-unit output."""

    def __init__(self):
        super().__init__()
        self.flatten = tf.keras.layers.Flatten()
        self.hidden = tf.keras.layers.Dense(256, activation=tf.nn.relu)
        self.out = tf.keras.layers.Dense(10)

    def call(self, inputs):
        """Flatten the inputs, then apply the hidden and the output layer."""
        return self.out(self.hidden(self.flatten(inputs)))

net = MLP()
X = tf.random.uniform((2, 20))
Y = net(X)

In [39]:
net.save_weights('mlp.params')

In [40]:
clone = MLP()
clone.load_weights('mlp.params')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f7f8806cb50>

In [41]:
Y_clone = clone(X)
Y_clone == Y

<tf.Tensor: shape=(2, 10), dtype=bool, numpy=
array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True]])>

## 6.7. GPUs

In [42]:
!nvidia-smi

Sat Jun 17 15:02:39 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.06    Driver Version: 450.51.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  TITAN RTX           On   | 00000000:21:00.0 Off |                  N/A |
| 40%   30C    P2    59W / 280W |  19329MiB / 24220MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  TITAN RTX           On   | 00000000:42:00.0 Off |                  N/A |
| 41%   31C    P2    60W / 280W |   5790MiB / 24220MiB |      0%      Default |
|       

In [43]:
import tensorflow as tf
from d2l import tensorflow as d2l

### 6.7.1. Computing Devices

In [47]:
def cpu():  #@save
    """Get the CPU device (a `tf.device` context for '/CPU:0')."""
    cpu_name = '/CPU:0'
    return tf.device(cpu_name)
def gpu(i=0):  #@save
    """Get the GPU device with index i (a `tf.device` context)."""
    gpu_name = f'/GPU:{i}'
    return tf.device(gpu_name)
cpu(), gpu(), gpu(1)

(<tensorflow.python.eager.context._EagerDeviceContext at 0x7f7f8801fa30>,
 <tensorflow.python.eager.context._EagerDeviceContext at 0x7f7f8801f520>,
 <tensorflow.python.eager.context._EagerDeviceContext at 0x7f7f8801faf0>)

In [48]:
def num_gpus():  #@save
    """Get the number of available GPUs.

    Returns:
        int: Number of physical GPU devices TensorFlow lists (0 if none).
    """
    # Stable API; `tf.config.experimental.list_physical_devices` is its
    # older experimental alias
    return len(tf.config.list_physical_devices('GPU'))
num_gpus()

2

In [85]:
def try_gpu(i=0):  #@save
    """Return gpu(i) if exists, otherwise return cpu()."""
    # GPU i exists only when at least i + 1 GPUs are available
    gpu_missing = i + 1 > num_gpus()
    return cpu() if gpu_missing else gpu(i)

def try_all_gpus():  #@save
    """Return all available GPUs, or [cpu(),] if no GPU exists.

    Returns:
        list: One device context per available GPU, or a single-element
        list holding the CPU device context when there is no GPU.
    """
    devices = [gpu(i) for i in range(num_gpus())]
    # Without this fallback a GPU-less machine would get an empty list,
    # contradicting the documented contract
    return devices or [cpu()]

try_gpu(), try_gpu(10), try_all_gpus()

(<tensorflow.python.eager.context._EagerDeviceContext at 0x7f8092bc85e0>,
 <tensorflow.python.eager.context._EagerDeviceContext at 0x7f8092bc8880>,
 [<tensorflow.python.eager.context._EagerDeviceContext at 0x7f8092bc8310>,
  <tensorflow.python.eager.context._EagerDeviceContext at 0x7f8092bc8df0>])

### 6.7.2. Tensors and GPUs

In [50]:
x = tf.constant([1, 2, 3])
x.device

'/job:localhost/replica:0/task:0/device:CPU:0'

In [89]:
with try_gpu():
    X = tf.ones((2, 3))
X, X.device

(<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
 array([[1., 1., 1.],
        [1., 1., 1.]], dtype=float32)>,
 '/job:localhost/replica:0/task:0/device:GPU:0')

In [92]:
with try_gpu(1):
    Y = tf.random.uniform((2, 3))
Y, Y.device

(<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
 array([[0.45257938, 0.8680155 , 0.58143866],
        [0.47831988, 0.5795398 , 0.2757933 ]], dtype=float32)>,
 '/job:localhost/replica:0/task:0/device:GPU:1')

In [93]:
X + Y

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[1.4525794, 1.8680155, 1.5814387],
       [1.4783199, 1.5795398, 1.2757933]], dtype=float32)>

In [94]:
# A plain `Z = X` would only bind a second name to the tensor that already
# lives on X's device. tf.identity runs inside the device scope, so Z is a
# copy of X placed on the device returned by try_gpu(1)
with try_gpu(1):
    Z = tf.identity(X)
print(X)
print(Z)

tf.Tensor(
[[1. 1. 1.]
 [1. 1. 1.]], shape=(2, 3), dtype=float32)
tf.Tensor(
[[1. 1. 1.]
 [1. 1. 1.]], shape=(2, 3), dtype=float32)


In [95]:
Y + Z

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[1.4525794, 1.8680155, 1.5814387],
       [1.4783199, 1.5795398, 1.2757933]], dtype=float32)>

In [96]:
# Plain assignment never copies: Z2 is just another name for the object that
# Z refers to, so the identity check below is True
with try_gpu(1):
    Z2 = Z
Z2 is Z

True

### 6.7.3. NNs and GPUs

In [111]:
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    net = tf.keras.models.Sequential([
        tf.keras.layers.Dense(1)])

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


In [113]:
net(X)

<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[1.4504831],
       [1.4504831]], dtype=float32)>

In [100]:
net.layers[0].weights[0].device, net.layers[0].weights[1].device

('/job:localhost/replica:0/task:0/device:GPU:0',
 '/job:localhost/replica:0/task:0/device:GPU:0')