In [None]:
import torch
from torch import nn
from torch.nn import functional as F

In [None]:
### Neurons, Layers, and Models

### Neural network module: nn.Module. A module can be a single layer, a component consisting of multiple layers, or the entire model itself.
# Individual layers can be modules. Many layers can comprise a module. Many modules can comprise a module.

In [None]:
# nn.Sequential defines a special kind of Module, It maintains an ordered list of constituent Modules.
## LazyLinear class is also a subclass of Module.

net = nn.Sequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))

X = torch.rand(2, 20)
net(X).shape



torch.Size([2, 10])

In [None]:
# A custom Module

class MLP(nn.Module):
    """Multilayer perceptron: a 256-unit hidden layer, ReLU, then a 10-unit output layer."""

    def __init__(self):
        # The parent constructor performs the initialization nn.Module needs
        # before any sub-module is assigned.
        super().__init__()
        self.hidden = nn.LazyLinear(256)
        self.out = nn.LazyLinear(10)

    def forward(self, X):
        """Compute the model output for input X (hidden layer -> ReLU -> output layer)."""
        hidden_activations = F.relu(self.hidden(X))
        return self.out(hidden_activations)

In [None]:
net = MLP()
net(X).shape

torch.Size([2, 10])

In [None]:
# The Sequential Module
class MySequential(nn.Module):
    """Minimal re-implementation of nn.Sequential: applies its sub-modules in the order given."""

    def __init__(self, *args):
        super().__init__()
        # Register each positional argument under its position ("0", "1", ...).
        position = 0
        for layer in args:
            self.add_module(str(position), layer)
            position += 1

    def forward(self, X):
        """Feed X through every registered child in turn and return the final result."""
        output = X
        for layer in self.children():
            output = layer(output)
        return output

In [None]:
net = MySequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))
net(X).shape

torch.Size([2, 10])

In [None]:
class FixedHiddenMLP(nn.Module):
    """MLP with a constant random hidden layer, a reused linear layer and data-dependent control flow.

    ``forward`` returns a scalar tensor (the sum of the final activations).
    """

    def __init__(self):
        super().__init__()
        # Random weights that receive no gradient and therefore stay constant
        # during training. They are registered as a buffer (instead of being
        # stored as a plain tensor attribute) so that they follow the module
        # through `.to(device)` / `.cuda()` / dtype conversions and are
        # included in `state_dict()`.
        self.register_buffer('rand_weight', torch.rand((20, 20)))
        self.linear = nn.LazyLinear(20)

    def forward(self, X):
        X = self.linear(X)

        # A hidden layer whose weights (self.rand_weight) are initialized
        # randomly at instantiation and are thereafter constant.
        X = F.relu(X @ self.rand_weight + 1)

        # Reuse the fully connected layer. This is equivalent to sharing
        # parameters between two fully connected layers.
        X = self.linear(X)
        # Control flow: halve X until the sum of absolute values is at most 1.
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()

net = FixedHiddenMLP()
net(X)

tensor(0.0664, grad_fn=<SumBackward0>)

In [None]:
# We can mix and match various ways of assembling modules together.

class NestMLP(nn.Module):
    """Module that nests an nn.Sequential stack (64 -> ReLU -> 32 -> ReLU) in front of a 16-unit layer."""

    def __init__(self):
        super().__init__()
        hidden_layers = [nn.LazyLinear(64), nn.ReLU(),
                         nn.LazyLinear(32), nn.ReLU()]
        self.net = nn.Sequential(*hidden_layers)
        self.linear = nn.LazyLinear(16)

    def forward(self, X):
        """Run X through the nested stack, then through the final linear layer."""
        features = self.net(X)
        return self.linear(features)

chimera = nn.Sequential(NestMLP(), nn.LazyLinear(20), FixedHiddenMLP())
chimera(X)

tensor(0.2779, grad_fn=<SumBackward0>)

========Parameter Management================

In [None]:
net = nn.Sequential(nn.LazyLinear(8),
                    nn.ReLU(),
                    nn.LazyLinear(1))

X = torch.rand(size=(2, 4))
net(X).shape

torch.Size([2, 1])

In [None]:
# When a model is defined via the Sequential class, we can first access any layer by indexing into the model as though it were a list.

net[2].state_dict()

OrderedDict([('weight',
              tensor([[ 0.0980, -0.0327, -0.1445,  0.1823,  0.1344, -0.1969,  0.2621, -0.3496]])),
             ('bias', tensor([0.1680]))])

In [None]:
# each parameter is represented as an instance of the parameter class.

type(net[2].bias), net[2].bias.data

(torch.nn.parameter.Parameter, tensor([0.1680]))

In [None]:
# Parameters are complex objects, containing values, gradients, and additional information. That is why we need to request the value explicitly.

# No backward pass has been run on this network yet, so no gradient is stored.
# Test for None with `is` (identity) rather than `==`.
net[2].weight.grad is None


True

In [None]:
[(name, param.shape) for name, param in net.named_parameters()]

[('0.weight', torch.Size([8, 4])),
 ('0.bias', torch.Size([8])),
 ('2.weight', torch.Size([1, 8])),
 ('2.bias', torch.Size([1]))]

In [None]:
# Tied/Shared Parameters

# We need to give the shared layer a name so that we can refer to its parameters
shared = nn.LazyLinear(8)
net = nn.Sequential(nn.LazyLinear(8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.LazyLinear(1))

net(X)
# Check whether the parameters are the same
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0] = 100
# Make sure that they are actually the same object rather than just having the same value
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


=====================Parameter Initialization=======================

In [None]:
net = nn.Sequential(nn.LazyLinear(8), nn.ReLU(), nn.LazyLinear(1))
X = torch.rand(size=(2, 4))
net(X).shape

torch.Size([2, 1])

In [None]:
# Built-in Initialization

def init_normal(module):
    """Initialize an nn.Linear layer with N(0, 0.01^2) weights and a zero bias.

    Modules whose type is not exactly nn.Linear are left untouched.
    """
    if type(module) is not nn.Linear:
        return
    # Gaussian weights with standard deviation 0.01
    nn.init.normal_(module.weight, mean=0, std=0.01)
    nn.init.zeros_(module.bias)

net.apply(init_normal)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([ 0.0131,  0.0030, -0.0118,  0.0016]), tensor(0.))

In [None]:
def init_constant(module):
    """Set every weight of an nn.Linear layer to 1 and its bias to 0.

    Modules whose type is not exactly nn.Linear are left untouched.
    """
    if type(module) is not nn.Linear:
        return
    nn.init.constant_(module.weight, 1)  # the given constant value
    nn.init.zeros_(module.bias)

net.apply(init_constant)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

In [None]:
def init_xavier(module):
    """Apply Xavier-uniform initialization to the weight of an nn.Linear layer.

    The bias is not modified; modules whose type is not exactly nn.Linear
    are left untouched.
    """
    if type(module) is not nn.Linear:
        return
    nn.init.xavier_uniform_(module.weight)

def init_42(module):
    """Set every weight of an nn.Linear layer to the constant 42.

    The bias is not modified; modules whose type is not exactly nn.Linear
    are left untouched.
    """
    if type(module) is not nn.Linear:
        return
    nn.init.constant_(module.weight, 42)

net[0].apply(init_xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([-0.0509, -0.5759, -0.5115,  0.5238])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


In [None]:
# Custom initialization

def my_init(module):
    """Custom initializer for nn.Linear layers.

    Prints the name and shape of the layer's first parameter, draws the
    weights uniformly from [-10, 10], then zeroes every weight whose
    magnitude is below 5. Modules whose type is not exactly nn.Linear are
    left untouched.
    """
    if type(module) is not nn.Linear:
        return
    first_name, first_param = list(module.named_parameters())[0]
    print("Init", first_name, first_param.shape)
    nn.init.uniform_(module.weight, -10, 10)
    # Boolean mask: True where the weight is kept, False where it is zeroed.
    keep_mask = module.weight.data.abs() >= 5
    module.weight.data *= keep_mask

net.apply(my_init)
net[0].weight[:2]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[ 8.5926,  0.0000, -8.8848,  7.0531],
        [-0.0000, -7.3913,  5.3600,  8.4965]], grad_fn=<SliceBackward0>)

In [None]:
print("Init", *("Hello", "World"))

Init Hello World


==================Lazy Initialization======================

In [None]:
import torch
from torch import nn
from d2l import torch as d2l

In [None]:
net = nn.Sequential(nn.LazyLinear(256), nn.ReLU(), nn.LazyLinear(10))

# At this point, the network cannot possibly know the dimensions of the input layer’s weights because the input dimension remains unknown.
# Consequently the framework has not yet initialized any parameters. We confirm by attempting to access the parameters below.

net[0].weight

<UninitializedParameter>

In [None]:
X = torch.rand(2, 20)
net(X)

net[0].weight.shape

torch.Size([256, 20])

In [None]:
# The following method passes in dummy inputs through the network for a dry run to infer all parameter shapes and subsequently initializes the
# parameters. It will be used later when default random initializations are not desired.


@d2l.add_to_class(d2l.Module)  #@save
def apply_init(self, inputs, init=None):
    """Dry-run the model on dummy inputs, then optionally apply an initializer.

    Args:
        inputs: Iterable of positional arguments unpacked into ``self.forward``.
        init: Optional callable handed to ``self.net.apply``; when None the
            parameters keep whatever values the forward pass left them with.
    """
    # The forward pass comes first so that parameter shapes are known before
    # `init` touches them.
    self.forward(*inputs)
    if init is None:
        return
    self.net.apply(init)

=================Custom Layers===================

In [None]:
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l

In [None]:
# Without Parameters


class CenteredLayer(nn.Module):
    """Parameter-free layer that subtracts the mean of all input elements from the input."""

    def forward(self, X):
        """Return X shifted by its overall (all-element) mean."""
        overall_mean = X.mean()
        return X - overall_mean

layer = CenteredLayer()
layer(torch.tensor([1.0, 2, 3, 4, 5]))

tensor([-2., -1.,  0.,  1.,  2.])

In [None]:
net = nn.Sequential(nn.LazyLinear(128), CenteredLayer())

Y = net(torch.rand(4, 8))
Y.mean(), Y.shape



(tensor(2.7940e-09, grad_fn=<MeanBackward0>), torch.Size([4, 128]))

In [None]:
# With Parameters

class MyLinear(nn.Module):
    """Fully connected layer with a built-in ReLU and explicitly declared parameters.

    Args:
        in_units: Number of input features.
        units: Number of output features.
    """

    def __init__(self, in_units, units):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units,))

    def forward(self, X):
        """Return relu(X @ weight + bias)."""
        # Use the Parameters themselves rather than their `.data`: going
        # through `.data` detaches the computation from autograd, so no
        # gradient would reach `weight` / `bias` (or flow back to earlier
        # layers) and the layer could not be trained.
        linear = torch.matmul(X, self.weight) + self.bias
        return F.relu(linear)

linear = MyLinear(5, 3)
linear.weight

Parameter containing:
tensor([[ 0.5466,  1.1332,  1.1790],
        [-1.2105, -0.5504, -0.6399],
        [-1.1234,  0.0551,  0.0035],
        [ 0.2254, -1.6338,  0.2674],
        [-0.7772, -0.6681,  1.1908]], requires_grad=True)

In [None]:
linear(torch.rand(2, 5))

tensor([[0.0487, 0.6646, 0.5094],
        [0.0000, 0.0000, 0.0000]])

In [None]:
net = nn.Sequential(MyLinear(64, 8), MyLinear(8, 1))
net(torch.rand(2, 64))

tensor([[0.0000],
        [0.0039]])

================File I/O==============

In [None]:
import torch
from torch import nn
from torch.nn import functional as F

In [None]:
x = torch.arange(4)
# NOTE(review): the path is written with Windows separators and is relative to
# the working directory; it presumably expects an existing ..\Data folder
# next to the notebook directory — confirm before running on another platform.
torch.save(x, '..\\Data\\x-file')

In [None]:
x2 = torch.load('..\\Data\\x-file')
x2

tensor([0, 1, 2, 3])

In [None]:
y = torch.zeros(4)
torch.save([x, y],'..\\Data\\x-file')
x2, y2 = torch.load('..\\Data\\x-file')
(x2, y2)

(tensor([0, 1, 2, 3]), tensor([0., 0., 0., 0.]))

In [None]:
mydict = {'x': x, 'y': y}
torch.save(mydict, 'mydict')
mydict2 = torch.load('mydict')
mydict2

{'x': tensor([0, 1, 2, 3]), 'y': tensor([0., 0., 0., 0.])}

In [None]:
class MLP(nn.Module):
    """Multilayer perceptron used for the save/load demo: 256-unit hidden layer, ReLU, 10-unit output."""

    def __init__(self):
        super().__init__()
        self.hidden = nn.LazyLinear(256)
        self.output = nn.LazyLinear(10)

    def forward(self, x):
        """Compute output(relu(hidden(x)))."""
        hidden_activations = F.relu(self.hidden(x))
        return self.output(hidden_activations)

net = MLP()
X = torch.randn(size=(2, 20))
Y = net(X)

In [None]:
torch.save(net.state_dict(), '..\\Data\\mlp.params')
# net.state_dict()

In [None]:
# To recover the model, we instantiate a clone of the original MLP model. Instead of randomly initializing the model parameters, we read the parameters stored in the file directly.

clone = MLP()
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict contains; it avoids executing arbitrary pickled
# code if the parameter file was tampered with.
clone.load_state_dict(torch.load('..\\Data\\mlp.params', weights_only=True))
clone.eval()

MLP(
  (hidden): LazyLinear(in_features=0, out_features=256, bias=True)
  (output): LazyLinear(in_features=0, out_features=10, bias=True)
)

In [None]:
Y_clone = clone(X)
# Element-wise comparison with the output Y of the original `net` on the same X.
# NOTE(review): the recorded output below is all False, i.e. the clone did not
# reproduce Y in that run. Presumably the parameter file, `net`, or `Y` came
# from different executions (stale kernel state) — confirm with
# Restart Kernel -> Run All.
Y_clone == Y

tensor([[False, False, False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False, False, False]])

In [None]:
X2 = torch.randn(size=(5, 20))
clone(X2) # works!

X3 = torch.randn(size=(5, 21))
# clone(X3) # Error:  mat1 and mat2 shapes cannot be multiplied (5x21 and 20x256)

=========================GPUs=========================

In [1]:
# The cells below run under Google Colab (Python 3.12)

%pip install numpy==1.26.4
# %pip install torch torchvision
%pip install d2l==1.0.3 --no-deps



In [2]:
import torch
from torch import nn
from d2l import torch as d2l

In [3]:
# In PyTorch, every array has a device; we often refer to it as a context. So far, by default, all variables and associated computation have been assigned to the CPU. Typically, other contexts might be various GPUs.

# The cpu device means all physical CPUs and memory. This means that PyTorch's calculations will try to use all CPU cores.
# A gpu device only represents one card and the corresponding memory. If there are multiple GPUs, we use torch.device(f'cuda:{i}') to represent the i-th GPU (i starts at 0). Also, gpu(0) and gpu() are equivalent.
def cpu():
    """Return the torch.device object that denotes the CPU."""
    cpu_device = torch.device('cpu')
    return cpu_device

def gpu(i=0):
    """Return the torch.device for CUDA device number ``i`` (default 0)."""
    device_spec = f'cuda:{i}'
    return torch.device(device_spec)

cpu(), gpu(), gpu(1)


(device(type='cpu'),
 device(type='cuda', index=0),
 device(type='cuda', index=1))

In [4]:
def num_gpus():
    """Return how many CUDA devices torch reports as available."""
    device_count = torch.cuda.device_count()
    return device_count

num_gpus()

1

In [5]:
import torch

print("CUDA available:", torch.cuda.is_available())
print("GPU count:", torch.cuda.device_count())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")


CUDA available: True
GPU count: 1
GPU name: Tesla T4


In [6]:
def try_gpu(i=0):
    """Return gpu(i) when at least i + 1 GPUs are available, otherwise cpu()."""
    enough_gpus = num_gpus() >= i + 1
    return gpu(i) if enough_gpus else cpu()

def try_all_gpus():
    """Return all available GPUs, or [cpu(),] if no GPU exists."""
    devices = [gpu(i) for i in range(num_gpus())]
    # Fall back to the CPU so that callers always receive a non-empty device
    # list, as the docstring promises (previously an empty list was returned
    # on machines without a GPU).
    return devices if devices else [cpu()]

try_gpu(), try_gpu(10), try_all_gpus()

(device(type='cuda', index=0),
 device(type='cpu'),
 [device(type='cuda', index=0)])

In [7]:
# By default, tensors are created on the CPU. We can query the device where the tensor is located.

x = torch.tensor([1, 2, 3])
x.device

# It is important to note that whenever we want to operate on multiple terms, they need to be on the same device. For instance, if we sum two tensors, we need to make sure that both arguments live on the same device.
# otherwise the framework would not know where to store the result or even how to decide where to perform the computation.


device(type='cpu')

In [12]:
X = torch.ones(2, 3, device=try_gpu())
X

tensor([[1., 1., 1.],
        [1., 1., 1.]], device='cuda:0')

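If we want to compute X + Y, we need to decide where to perform this operation. For instance, as shown in Fig. 6.7.1, we can transfer X to the second GPU and perform the operation there. Do not simply add X and Y, since this will result in an exception. The runtime engine would not know what to do: it cannot find data on the same device and it fails. Since Y lives on the second GPU, we need to move X there before we can add the two. (Note: this notebook was executed on a machine with a single GPU, as the `num_gpus()` output above shows, so no tensor Y on a second GPU is actually created here; the paragraph describes the multi-GPU situation illustrated in the figure.)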

![img](https://github.com/huiminyan2017/pytorch-basics/blob/main/notebooks/Images/Gpu1.png?raw=1)

In [14]:
X.cuda(0) is X

True

People use GPUs to do machine learning because they expect them to be fast. But transferring variables between devices is slow: much slower than computation. So we want you to be 100% certain that you want to do something slow before we let you do it. If the deep learning framework just did the copy automatically without crashing then you might not realize that you had written some slow code.

Transferring data is not only slow, it also makes parallelization a lot more difficult, since we have to wait for data to be sent (or rather to be received) before we can proceed with more operations. This is why copy operations should be taken with great care. As a rule of thumb, many small operations are much worse than one big operation. Moreover, several operations at a time are much better than many single operations interspersed in the code unless you know what you are doing. This is the case since such operations can block if one device has to wait for the other before it can do something else. It is a bit like ordering your coffee in a queue rather than pre-ordering it by phone and finding out that it is ready when you are.

Last, when we print tensors or convert tensors to the NumPy format, if the data is not in the main memory, the framework will copy it to the main memory first, resulting in additional transmission overhead. Even worse, it is now subject to the dreaded global interpreter lock that makes everything wait for Python to complete.

In [16]:
# puts the model parameters on the GPU.
net = nn.Sequential(nn.LazyLinear(1))
net = net.to(device=try_gpu())

In [18]:
# when the input is a tensor on the GPU, the model will calculate the result on the same GPU.
net(X)

tensor([[-0.6935],
        [-0.6935]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [19]:
# the model parameters are stored on the same GPU.
net[0].weight.data.device

device(type='cuda', index=0)

In [22]:
@d2l.add_to_class(d2l.Trainer)
def __init__(self, max_epochs, num_gpus=0, gradient_clip_val=0):
    """Configure the trainer and choose the GPUs it may use.

    Args:
        max_epochs: Number of training epochs.
        num_gpus: Upper bound on how many GPUs to use; capped at the count
            reported by ``d2l.num_gpus()``. The default 0 selects none.
        gradient_clip_val: Not used directly in this method.
    """
    # Presumably stores the constructor arguments as attributes of `self`
    # (prepare_model later reads self.max_epochs) — confirm against d2l.
    # NOTE(review): keep this call first; it may inspect the caller's local
    # variables — TODO confirm.
    self.save_hyperparameters()
    # Devices gpu(0) .. gpu(k-1) with k = min(num_gpus, available GPUs).
    # An empty list makes prepare_batch / prepare_model skip device transfer.
    self.gpus = [d2l.gpu(i) for i in range(min(num_gpus, d2l.num_gpus()))]

@d2l.add_to_class(d2l.Trainer)
def prepare_batch(self, batch):
    """Move every element of ``batch`` to the first selected GPU, if there is one.

    Returns the batch unchanged when ``self.gpus`` is empty; otherwise a new
    list holding each element after ``.to(self.gpus[0])``.
    """
    if not self.gpus:
        return batch
    target_device = self.gpus[0]
    return [item.to(target_device) for item in batch]

@d2l.add_to_class(d2l.Trainer)
def prepare_model(self, model):
    """Attach ``model`` to this trainer and place it on the first selected GPU, if any.

    Sets ``model.trainer``, sets the x-range of ``model.board`` to
    ``[0, self.max_epochs]``, and stores the model as ``self.model``.
    """
    model.trainer = self
    model.board.xlim = [0, self.max_epochs]
    selected_gpus = self.gpus
    if selected_gpus:
        # Only the first selected device is used.
        model.to(selected_gpus[0])
    self.model = model

# In short, as long as all data and parameters are on the same device, we can learn models efficiently.