In [1]:
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l

# 6.1. Layers and Modules

a network with one fully connected hidden layer with 256 units and ReLU activation, followed by a fully connected output layer with ten units (no activation function).

In [4]:
net = nn.Sequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))

X = torch.rand(2, 20)
net(X).shape

torch.Size([2, 10])

## 6.1.1. A Custom Module

an MLP with one hidden layer with 256 hidden units, and a 10-dimensional output layer.

In [5]:
class MLP(nn.Module):
    def __init__(self):
        # Call the constructor of the parent class nn.Module to perform
        # the necessary initialization
        super().__init__()
        self.hidden = nn.LazyLinear(256)
        self.out = nn.LazyLinear(10)

    # Define the forward propagation of the model, that is, how to return the
    # required model output based on the input X
    def forward(self, X):
        return self.out(F.relu(self.hidden(X)))

In [6]:
net = MLP()
net(X).shape

torch.Size([2, 10])

## 6.1.2. The Sequential Module

design a Sequential class and use it to create an MLP with one hidden layer with 256 hidden units, and a 10-dimensional output layer.

In [7]:
class MySequential(nn.Module):
    def __init__(self, *args):
        super().__init__()
        for idx, module in enumerate(args):
            self.add_module(str(idx), module)

    def forward(self, X):
        for module in self.children():
            X = module(X)
        return X

In [8]:
net = MySequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))
net(X).shape

torch.Size([2, 10])

## 6.1.3. Executing Code in the Forward Propagation Method

add a "fixed" layer after a fully connected layers

In [9]:
class FixedHiddenMLP(nn.Module):
    def __init__(self):
        super().__init__()
        # Random weight parameters that will not compute gradients and
        # therefore keep constant during training
        self.rand_weight = torch.rand((20, 20))
        self.linear = nn.LazyLinear(20)

    def forward(self, X):
        X = self.linear(X)
        X = F.relu(X @ self.rand_weight + 1)
        # Reuse the fully connected layer. This is equivalent to sharing
        # parameters with two fully connected layers
        X = self.linear(X)
        # Control flow
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()

In [10]:
net = FixedHiddenMLP()
net(X)

tensor(-0.1554, grad_fn=<SumBackward0>)

mix several modules together

In [11]:
class NestMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.LazyLinear(64), nn.ReLU(), nn.LazyLinear(32), nn.ReLU()
        )
        self.linear = nn.LazyLinear(16)

    def forward(self, X):
        return self.linear(self.net(X))


chimera = nn.Sequential(NestMLP(), nn.LazyLinear(20), FixedHiddenMLP())
chimera(X)

tensor(-0.2477, grad_fn=<SumBackward0>)

# 6.2. Parameter Management

a fully connected network with 8 neutrons in hidden layer

In [12]:
net = nn.Sequential(nn.LazyLinear(8), nn.ReLU(), nn.LazyLinear(1))

X = torch.rand(size=(2, 4))
net(X).shape



torch.Size([2, 1])

## 6.2.1. Parameter Access

In [13]:
net[2].state_dict()

OrderedDict([('weight',
              tensor([[ 0.0406, -0.1461, -0.2337, -0.3382, -0.2761, -0.2261, -0.3275, -0.2732]])),
             ('bias', tensor([-0.2971]))])

In [14]:
type(net[2].bias), net[2].bias.data

(torch.nn.parameter.Parameter, tensor([-0.2971]))

In [18]:
print(
    "first layer is: ",
    net[0],
    "\nsecond layer is: ",
    net[1],
    "\nlast layer is: ",
    net[-1],
)

first layer is:  Linear(in_features=4, out_features=8, bias=True) 
second layer is:  ReLU() 
last layer is:  Linear(in_features=8, out_features=1, bias=True)


In [19]:
net[2].weight.grad == None

True

access parameters of all layers at one time

In [20]:
[(name, param.shape) for name, param in net.named_parameters()]

[('0.weight', torch.Size([8, 4])),
 ('0.bias', torch.Size([8])),
 ('2.weight', torch.Size([1, 8])),
 ('2.bias', torch.Size([1]))]

## 6.2.2. Tied Parameters

share parameters between layers  
in this case is layer 2 and 4

In [21]:
# We need to give the shared layer a name so that we can refer to its
# parameters
shared = nn.LazyLinear(8)
net = nn.Sequential(
    nn.LazyLinear(8), nn.ReLU(), shared, nn.ReLU(), shared, nn.ReLU(), nn.LazyLinear(1)
)

net(X)
# Check whether the parameters are the same
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0] = 100
# Make sure that they are actually the same object rather than just having the
# same value
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])




# 6.3. Parameter Initialization

In [22]:
net = nn.Sequential(nn.LazyLinear(8), nn.ReLU(), nn.LazyLinear(1))
X = torch.rand(size=(2, 4))
net(X).shape

torch.Size([2, 1])

## 6.3.1. Built-in Initialization

initializes all weight parameters as Gaussian random variables with standard deviation 0.01, while bias parameters are cleared to zero

In [23]:
def init_normal(module):
    if type(module) == nn.Linear:
        nn.init.normal_(module.weight, mean=0, std=0.01)
        nn.init.zeros_(module.bias)


net.apply(init_normal)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([-0.0055,  0.0119, -0.0139,  0.0022]), tensor(0.))

initialize all the parameters to a given constant value (say, 1)

In [24]:
def init_constant(module):
    if type(module) == nn.Linear:
        nn.init.constant_(module.weight, 1)
        nn.init.zeros_(module.bias)


net.apply(init_constant)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

able to apply different initializers for certain blocks  
initialize the first layer with the Xavier initializer and initialize the second layer to a constant value of 42

In [25]:
def init_xavier(module):
    if type(module) == nn.Linear:
        nn.init.xavier_uniform_(module.weight)


def init_42(module):
    if type(module) == nn.Linear:
        nn.init.constant_(module.weight, 42)


net[0].apply(init_xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([-0.0758,  0.3287, -0.2625,  0.5050])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


self-defined initialization
\begin{split}\begin{aligned}
    w \sim \begin{cases}
        U(5, 10) & \textrm{ with probability } \frac{1}{4} \\
            0    & \textrm{ with probability } \frac{1}{2} \\
        U(-10, -5) & \textrm{ with probability } \frac{1}{4}
    \end{cases}
\end{aligned}\end{split}

In [26]:
def my_init(module):
    if type(module) == nn.Linear:
        print(
            "Init",
            *[(name, param.shape) for name, param in module.named_parameters()][0]
        )
        nn.init.uniform_(module.weight, -10, 10)
        module.weight.data *= module.weight.data.abs() >= 5


net.apply(my_init)
net[0].weight[:2]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[ 7.0704,  0.0000, -0.0000, -0.0000],
        [ 0.0000, -0.0000, -0.0000, -8.3834]], grad_fn=<SliceBackward0>)

setting parameters directly

In [27]:
net[0].weight.data[:] += 1
net[0].weight.data[0, 0] = 42
net[0].weight.data[0]

tensor([42.,  1.,  1.,  1.])

# 6.4. Lazy Initialization

In [2]:
net = nn.Sequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))



the network cannot possibly know the dimensions of the input layer’s weights because the input dimension remains unknown  
framework has not yet initialized any parameters

In [3]:
net[0].weight

<UninitializedParameter>

In [4]:
X = torch.rand(2, 20)
net(X)

net[0].weight.shape

torch.Size([256, 20])

The following method passes in dummy inputs through the network for a dry run to infer all parameter shapes and subsequently initializes the parameters.

In [5]:
@d2l.add_to_class(d2l.Module)  # @save
def apply_init(self, inputs, init=None):
    self.forward(*inputs)
    if init is not None:
        self.net.apply(init)

# 6.5. Custom Layers

## 6.5.1. Layers without Parameters

The following CenteredLayer class simply subtracts the mean from its input.  
To build it, we simply need to inherit from the base layer class and implement the forward propagation function.

In [6]:
class CenteredLayer(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, X):
        return X - X.mean()

In [7]:
layer = CenteredLayer()
layer(torch.tensor([1.0, 2, 3, 4, 5]))

tensor([-2., -1.,  0.,  1.,  2.])

In [8]:
net = nn.Sequential(nn.LazyLinear(128), CenteredLayer())

In [9]:
Y = net(torch.rand(4, 8))
Y.mean()

tensor(0., grad_fn=<MeanBackward0>)

## 6.5.2. Layers with Parameters

implement our own version of the fully connected layer with two parameters:  
- in_units, which denote the number of inputs
- units, which denote the number of outputs

In [10]:
class MyLinear(nn.Module):
    def __init__(self, in_units, units):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(
            torch.randn(
                units,
            )
        )

    def forward(self, X):
        linear = torch.matmul(X, self.weight.data) + self.bias.data
        return F.relu(linear)

In [18]:
linear = MyLinear(5, 3)
linear.weight

Parameter containing:
tensor([[ 0.6058,  1.8815, -0.0207],
        [ 0.0217, -0.1634, -0.2659],
        [ 0.7306,  0.9074, -0.7039],
        [-0.0488, -1.0025, -1.9870],
        [-1.4135,  0.0432, -0.9741]], requires_grad=True)

In [19]:
linear(torch.rand(2, 5))

tensor([[1.0193, 0.6143, 0.0000],
        [0.0000, 1.4747, 0.0000]])

construct models using custom layers

In [20]:
net = nn.Sequential(MyLinear(64, 8), MyLinear(8, 1))
net(torch.rand(2, 64))

tensor([[2.5166],
        [2.0084]])

# 6.6. File I/O

## 6.6.1. Loading and Saving Tensors

save and load individual tensors

In [21]:
x = torch.arange(4)
torch.save(x, "x-file")

In [22]:
x2 = torch.load("x-file")
x2

tensor([0, 1, 2, 3])

store a list of tensors and read them back into memory

In [23]:
y = torch.zeros(4)
torch.save([x, y], "x-files")
x2, y2 = torch.load("x-files")
(x2, y2)

(tensor([0, 1, 2, 3]), tensor([0., 0., 0., 0.]))

write and read a dictionary that maps from strings to tensors

In [24]:
mydict = {"x": x, "y": y}
torch.save(mydict, "mydict")
mydict2 = torch.load("mydict")
mydict2

{'x': tensor([0, 1, 2, 3]), 'y': tensor([0., 0., 0., 0.])}

## 6.6.2. Loading and Saving Model Parameters

In [25]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.LazyLinear(256)
        self.output = nn.LazyLinear(10)

    def forward(self, x):
        return self.output(F.relu(self.hidden(x)))


net = MLP()
X = torch.randn(size=(2, 20))
Y = net(X)

store the parameters of the model

In [26]:
torch.save(net.state_dict(), "mlp.params")

To recover the model, we instantiate a clone of the original MLP model. Instead of randomly initializing the model parameters, we read the parameters stored in the file directly

In [27]:
clone = MLP()
clone.load_state_dict(torch.load("mlp.params"))
clone.eval()

MLP(
  (hidden): LazyLinear(in_features=0, out_features=256, bias=True)
  (output): LazyLinear(in_features=0, out_features=10, bias=True)
)

Since both instances have the same model parameters, the computational result of the same input X should be the same

In [28]:
Y_clone = clone(X)
Y_clone == Y

tensor([[True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True]])

# 6.7. GPUs

## 6.7.1. Computing Devices

In [29]:
def cpu():  # @save
    """Get the CPU device."""
    return torch.device("cpu")


def gpu(i=0):  # @save
    """Get a GPU device."""
    return torch.device(f"cuda:{i}")


cpu(), gpu(), gpu(1)  # gpu() is equivalent to gpu(0)

(device(type='cpu'),
 device(type='cuda', index=0),
 device(type='cuda', index=1))

In [30]:
def num_gpus():  # @save
    """Get the number of available GPUs."""
    return torch.cuda.device_count()


num_gpus()

0

In [31]:
def try_gpu(i=0):  # @save
    """Return gpu(i) if exists, otherwise return cpu()."""
    if num_gpus() >= i + 1:
        return gpu(i)
    return cpu()


def try_all_gpus():  # @save
    """Return all available GPUs, or [cpu(),] if no GPU exists."""
    return [gpu(i) for i in range(num_gpus())]


try_gpu(), try_gpu(10), try_all_gpus()

(device(type='cpu'), device(type='cpu'), [])

## 6.7.2. Tensors and GPUs

By default, tensors are created on the CPU.

In [32]:
x = torch.tensor([1, 2, 3])
x.device

device(type='cpu')

It is important to note that whenever we want to operate on multiple terms, they need to be on the same device!

### 6.7.2.1. Storage on the GPU

In [33]:
X = torch.ones(2, 3, device=try_gpu())  # gpu(0)
X

tensor([[1., 1., 1.],
        [1., 1., 1.]])

In [35]:
Y = torch.rand(2, 3, device=try_gpu(1))  # on gpu(1)
Y

tensor([[0.5868, 0.3934, 0.3080],
        [0.4541, 0.9542, 0.6361]])

### 6.7.2.2. Copying

this part need two GPU to run. Don't have such a equipment.  
Asked professor about this and he agreed to just leave comments without running it.

In [None]:
Z = X.cuda(1)  # copy X from gpu(0) to gpu(1)
print(X)
print(Z)

In [None]:
Y + Z

In [None]:
Z.cuda(1) is Z

## 6.7.3. Neural Networks and GPUs


In [39]:
net = nn.Sequential(nn.LazyLinear(1))
net = net.to(device=try_gpu())  # puts the model parameters on the GPU

In [41]:
net(X)

tensor([[0.1404],
        [0.1404]], grad_fn=<AddmmBackward0>)

In [44]:
# when the input is a tensor on the GPU, the model will calculate the result on the same GPU.
net[0].weight.data.device

device(type='cpu')

In [43]:
@d2l.add_to_class(d2l.Trainer)  # @save
def __init__(self, max_epochs, num_gpus=0, gradient_clip_val=0):
    self.save_hyperparameters()
    self.gpus = [d2l.gpu(i) for i in range(min(num_gpus, d2l.num_gpus()))]


@d2l.add_to_class(d2l.Trainer)  # @save
def prepare_batch(self, batch):
    if self.gpus:
        batch = [a.to(self.gpus[0]) for a in batch]
    return batch


@d2l.add_to_class(d2l.Trainer)  # @save
def prepare_model(self, model):
    model.trainer = self
    model.board.xlim = [0, self.max_epochs]
    if self.gpus:
        model.to(self.gpus[0])
    self.model = model