In [1]:
import torch
import numpy as np
from torch import optim, nn

#### In this chapter, we will:
- briefly review the steps of gradient descent (optional)
- use gradient descent to implement a linear regression in Numpy
- create tensors in PyTorch (finally!)
- understand the difference between CPU and GPU tensors
- understand PyTorch’s main feature, autograd, to perform automatic
differentiation
- create a loss function
- define an optimizer
- implement our own model class
- implement nested and sequential models, using PyTorch’s layers
- organize our code into three parts: data preparation, model configuration and
model training

In [3]:
# One tensor per rank: 0-D scalar, 1-D vector, 2-D matrix and 3-D tensor
scalar = torch.tensor(3.14159)
vector = torch.tensor([1, 2, 3])
matrix = torch.ones(2, 3, dtype=torch.float32)
tensor = torch.randn(2, 3, 4, dtype=torch.float32)

# Shows them in order of increasing rank, one per line
print(scalar, vector, matrix, tensor, sep="\n")

tensor(3.1416)
tensor([1, 2, 3])
tensor([[1., 1., 1.],
        [1., 1., 1.]])
tensor([[[ 1.2434,  0.8843, -0.1261,  0.3930],
         [-0.8288, -0.2430, -1.0502, -1.0295],
         [-0.6323, -0.3248, -0.5428,  1.4598]],

        [[-0.1730, -0.9990,  0.0602,  1.0524],
         [-0.0621,  0.3701, -0.4136, -0.7180],
         [ 0.0387,  1.8177, -1.1853,  0.2027]]])


In [5]:
# size() and shape report the same torch.Size for a tensor;
# the 0-D scalar has an empty one
for shown in (tensor, scalar):
    print(shown.size(), shown.shape)

torch.Size([2, 3, 4]) torch.Size([2, 3, 4])
torch.Size([]) torch.Size([])


In [6]:
# view() gives us a tensor with a different shape, but it is still
# the SAME tensor underneath - not a copy
same_matrix = matrix.view(1, 6)
# If we change one of its elements...
same_matrix[0, 1] = 2.
# ...the change shows up in both variables: matrix and same_matrix
print(matrix)
print(same_matrix)

tensor([[1., 2., 1.],
        [1., 1., 1.]])
tensor([[1., 2., 1., 1., 1., 1.]])


In [7]:
# We can use the "new_tensor" method to REALLY copy it into a new one
different_matrix = matrix.new_tensor(matrix.view(1, 6))
# Now, if we change one of its elements...
different_matrix[0, 1] = 3.
# ...the original tensor (matrix) is left untouched!
# But we get a warning from PyTorch telling us
# to use "clone()" instead!
print(matrix)
print(different_matrix)

tensor([[1., 2., 1.],
        [1., 1., 1.]])
tensor([[1., 3., 1., 1., 1., 1.]])


  different_matrix = matrix.new_tensor(matrix.view(1, 6))


In [8]:
# Let's follow PyTorch's suggestion and use the "clone" method
# (followed by "detach") to copy the reshaped tensor
another_matrix = matrix.view(1, 6).clone().detach()
# Again, if we change one of its elements...
another_matrix[0, 1] = 4.
# ...the original tensor (matrix) is left untouched!
print(matrix)
print(another_matrix)

tensor([[1., 2., 1.],
        [1., 1., 1.]])
tensor([[1., 4., 1., 1., 1., 1.]])


In [53]:
# Synthetic data for a simple linear regression:
# y = true_b + true_w * x + epsilon

# Fixes NumPy's seed so the points and the shuffle below are the same
# on every Restart & Run All
np.random.seed(42)

N = 80
x = np.random.rand(N, 1)
true_b = 1
true_w = 2
# NOTE(review): epsilon is a constant offset, not random noise, so the
# best-fit intercept is true_b + epsilon = 1.01 - confirm this is intended
epsilon = 0.01
y = true_b + true_w * x + epsilon

# Shuffles the indices
idx = np.arange(N)
np.random.shuffle(idx)
# Uses the first 80% of the shuffled indices for train
n_train = int(N * .8)
train_idx = idx[:n_train]
# Uses the remaining indices for validation
val_idx = idx[n_train:]
# Generates train and validation sets
x_train, y_train = x[train_idx], y[train_idx]
x_val, y_val = x[val_idx], y[val_idx]

# as_tensor() keeps NumPy's dtype, so we get a float64 tensor
x_train_tensor = torch.as_tensor(x_train)
x_train.dtype, x_train_tensor.dtype

(dtype('float64'), torch.float64)

In [22]:
# float() casts the float64 tensor to float32
float_tensor = x_train_tensor.float()
float_tensor.dtype

torch.float32

In [23]:
dummy_array = np.array([1, 2, 3])
# as_tensor() does not copy here: array and tensor share the same data
dummy_tensor = torch.as_tensor(dummy_array)
# Modifies the numpy array
dummy_array[1] = 0
# Tensor gets modified too...
dummy_tensor

tensor([1, 0, 3], dtype=torch.int32)

In [24]:
dummy_tensor.numpy()

array([1, 0, 3])

In [54]:
# Picks the GPU when CUDA is available and falls back to the CPU otherwise
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [28]:
# Number of CUDA devices PyTorch can see
n_cudas = torch.cuda.device_count()
print(n_cudas)

# Prints the name of each of them
for i in range(n_cudas):
    print(torch.cuda.get_device_name(i))

1
NVIDIA GeForce GTX 1070 Ti


In [30]:
# to(device) sends the tensor to the chosen device; no cast is applied,
# so it keeps the float64 dtype it got from the NumPy array
gpu_tensor = torch.as_tensor(x_train).to(device)
gpu_tensor[0]

tensor([0.7470], device='cuda:0', dtype=torch.float64)

In [55]:
# Uses the GPU if one is available, otherwise falls back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Both NumPy arrays go through the same three steps: wrap them as
# tensors, cast to float32 and send them to the chosen device
x_train_tensor, y_train_tensor = (
    torch.as_tensor(array).float().to(device)
    for array in (x_train, y_train)
)

In [32]:
# Here we can see the difference - notice that .type() is more
# useful since it also tells us WHERE the tensor is (device)
print(type(x_train), type(x_train_tensor), x_train_tensor.type())

<class 'numpy.ndarray'> <class 'torch.Tensor'> torch.cuda.FloatTensor


In [33]:
# A GPU tensor cannot be converted to NumPy directly: numpy() raises a
# TypeError. We catch and show it so Restart & Run All does not stop here.
try:
    back_to_numpy = x_train_tensor.numpy()
except TypeError as err:
    print(f'TypeError: {err}')

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [34]:
# cpu() copies the tensor to host memory first; then numpy() works
back_to_numpy = x_train_tensor.cpu().numpy()

In [35]:
# FIRST
# Initializes parameters "b" and "w" randomly, ALMOST as we
# did in Numpy. Since we want to apply gradient descent on
# these parameters, we need to set REQUIRES_GRAD = TRUE
torch.manual_seed(42)
b = torch.randn(1, requires_grad=True, dtype=torch.float)
w = torch.randn(1, requires_grad=True, dtype=torch.float)
print(b, w)

tensor([0.3367], requires_grad=True) tensor([0.1288], requires_grad=True)


In [36]:
# SECOND
# But what if we want to run it on a GPU? We could just
# send them to device, right?
torch.manual_seed(42)
b = torch.randn(1, requires_grad=True, dtype=torch.float).to(device)
w = torch.randn(1, requires_grad=True, dtype=torch.float).to(device)
print(b, w)
# Sorry, but NO! On the GPU, the tensors returned by to(device) show a
# grad_fn instead of requires_grad=True (see the printed output)

tensor([0.3367], device='cuda:0', grad_fn=<ToCopyBackward0>) tensor([0.1288], device='cuda:0', grad_fn=<ToCopyBackward0>)


We succeeded in sending them to another device, but we “lost” the gradients
somehow: the output no longer shows requires_grad=True, only a grad_fn (the
tensors returned by to(device) are the result of a copy operation, not the
parameters we created). Clearly, we need to do better…
In the third chunk, we first send our tensors to the device and then use the
requires_grad_() method to set their requires_grad attribute to True in place.

In [37]:
# THIRD
# We can either create regular tensors and send them to
# the device (as we did with our data)
torch.manual_seed(42)
b = torch.randn(1, dtype=torch.float).to(device)
w = torch.randn(1, dtype=torch.float).to(device)
# and THEN set them as requiring gradients, in place
# (note the trailing underscore in requires_grad_)
b.requires_grad_()
w.requires_grad_()
print(b, w)

tensor([0.3367], device='cuda:0', requires_grad=True) tensor([0.1288], device='cuda:0', requires_grad=True)


In PyTorch, every method that ends with an underscore ( _ ), like
the requires_grad_() method above, makes changes in-place,
meaning, they will modify the underlying variable.

In [40]:
# FINAL
# We can specify the device at the moment of creation
# RECOMMENDED!
# Step 0 - Initializes parameters "b" and "w" randomly
# NOTE(review): in the recorded output the values differ from the
# previous cells despite the same seed - presumably because the numbers
# are now generated directly on the GPU; confirm
torch.manual_seed(42)
b = torch.randn(1, requires_grad=True, \
                dtype=torch.float, device=device)
w = torch.randn(1, requires_grad=True, \
                dtype=torch.float, device=device)
print(b, w)

tensor([0.1940], device='cuda:0', requires_grad=True) tensor([0.1391], device='cuda:0', requires_grad=True)


#### Autograd
Autograd is PyTorch’s automatic differentiation package. Thanks to it, we don’t need
to worry about partial derivatives, chain rule, or anything like it.
#### backward
So, how do we tell PyTorch to do its thing and compute all gradients? That’s the
role of the backward() method. It will compute gradients for all (requiring gradient)
tensors involved in the computation of a given variable.
Do you remember the starting point for computing the gradients? It was the loss,
as we computed its partial derivatives w.r.t. our parameters. Hence, we need to
invoke the backward() method from the corresponding Python variable:
`loss.backward()`.

In [42]:
# Step 1 - Computes our model's predicted output - forward pass
yhat = b + w * x_train_tensor
# Step 2 - Computes the loss
# We are using ALL data points, so this is BATCH gradient
# descent. How wrong is our model? That's the error!
error = (yhat - y_train_tensor)
# It is a regression, so it computes mean squared error (MSE)
loss = (error ** 2).mean()
# Step 3 - Computes gradients for both "b" and "w" parameters
# No more manual computation of gradients like these:
# b_grad = 2 * error.mean()
# w_grad = 2 * (x_train_tensor * error).mean()
# backward() stores the results in b.grad and w.grad
loss.backward()

In [43]:
# Tensors computed from "b" and "w" require gradients, just like the
# parameters themselves...
print(
    error.requires_grad, yhat.requires_grad,
    b.requires_grad, w.requires_grad,
)
# ...while the data tensors do not
print(y_train_tensor.requires_grad, x_train_tensor.requires_grad)

True True True True
False False


In [44]:
print(b.grad, w.grad)

tensor([-7.0809], device='cuda:0') tensor([-4.2031], device='cuda:0')


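By default, PyTorch accumulates gradients: every call to backward() adds the
newly computed gradients to whatever is already stored in the grad attribute.
OK, but that is actually a problem: we need to use the gradients corresponding to
the current loss to perform the parameter update. We should NOT use
accumulated gradients.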
"If accumulating gradients is a problem, why does PyTorch do it by default?"

It turns out this behavior can be useful to circumvent hardware limitations.
During the training of large models, the necessary number of data points in a mini-
batch may be too big to fit in memory (of the graphics card). How to solve this,
other than buying more expensive hardware?
One can split a mini-batch into "sub-mini-batches" (horrible name, I know, don’t
quote me on this!), compute the gradients for those "sub" and accumulate them to
achieve the same result of computing the gradients on the full mini-batch.

#### zero_
Every time we use the gradients to update the parameters, we need to zero the
gradients afterward. And that’s what zero_() is good for.

In [46]:
# This code will be placed _after_ Step 4
# (updating the parameters)
# zero_() resets each gradient in place; the cell displays both
# zeroed tensors
b.grad.zero_(), w.grad.zero_()

(tensor([0.], device='cuda:0'), tensor([0.], device='cuda:0'))

In [56]:
# Sets learning rate - this is "eta" ~ the "n"-like Greek letter
lr = 0.1

# Step 0 - Initializes parameters "b" and "w" randomly
torch.manual_seed(42)
b = torch.randn(1, requires_grad=True, \
                dtype=torch.float, device=device)
w = torch.randn(1, requires_grad=True, \
                dtype=torch.float, device=device)

# Defines number of epochs
n_epochs = 1000

for epoch in range(n_epochs):
    # Step 1 - Computes model's predicted output - forward pass
    yhat = b + w * x_train_tensor

    # Step 2 - Computes the loss
    # We are using ALL data points, so this is BATCH gradient
    # descent. How wrong is our model? That's the error!
    error = (yhat - y_train_tensor)
    # It is a regression, so it computes mean squared error (MSE)
    loss = (error ** 2).mean()
    
    # Step 3 - Computes gradients for both "b" and "w"
    # parameters. No more manual computation of gradients!
    # b_grad = 2 * error.mean()
    # w_grad = 2 * (x_train_tensor * error).mean()
    # We just tell PyTorch to work its way BACKWARDS
    # from the specified loss!
    loss.backward()
    
    # Step 4 - Updates parameters using gradients and
    # the learning rate. But not so fast...
    # FIRST ATTEMPT - just using the same code as before.
    # Re-assigning "b" and "w" replaces them with new tensors and
    # the zero_() calls below then fail with:
    # AttributeError: 'NoneType' object has no attribute 'zero_'
    # b = b - lr * b.grad
    # w = w - lr * w.grad
    # print(b)
    
    # SECOND ATTEMPT - using in-place Python assignment
    # RuntimeError: a leaf Variable that requires grad
    # has been used in an in-place operation.
    # b -= lr * b.grad
    # w -= lr * w.grad
    
    # THIRD ATTEMPT - NO_GRAD for the win!
    # We need to use NO_GRAD to keep the update out of
    # the gradient computation. Why is that? It boils
    # down to the DYNAMIC GRAPH that PyTorch uses...
    with torch.no_grad():
        b -= lr * b.grad
        w -= lr * w.grad
    
    # PyTorch is "clingy" to its computed gradients, we
    # need to tell it to let it go...
    b.grad.zero_()
    w.grad.zero_()

print(b, w)

tensor([1.0100], device='cuda:0', requires_grad=True) tensor([2.0000], device='cuda:0', requires_grad=True)


### Adding Optimizer

In [60]:
# Learning rate - this is "eta" ~ the "n"-like Greek letter
lr = 0.1

# Step 0 - random initialization of parameters "b" and "w"
torch.manual_seed(42)
b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
w = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)

# The SGD optimizer is now in charge of updating both parameters
optimizer = optim.SGD([b, w], lr=lr)

# Number of epochs
n_epochs = 1000

for epoch in range(n_epochs):
    # Step 1 - forward pass: the model's predicted output
    yhat = b + w * x_train_tensor

    # Step 2 - loss: mean squared error (MSE) over ALL data points,
    # so this is BATCH gradient descent
    error = yhat - y_train_tensor
    loss = error.pow(2).mean()

    # Step 3 - gradients for both "b" and "w"
    loss.backward()

    # Step 4 - the optimizer updates the parameters; it replaces the
    # manual "b -= lr * b.grad" / "w -= lr * w.grad" under no_grad()
    optimizer.step()

    # ...and it also replaces b.grad.zero_() / w.grad.zero_()
    optimizer.zero_grad()

print(b, w)

tensor([1.0100], device='cuda:0', requires_grad=True) tensor([2.0000], device='cuda:0', requires_grad=True)


In [67]:
# Defines an MSE loss function; the reduction argument can be
# 'mean' or 'sum'
loss_fn = nn.MSELoss(reduction='mean') #'mean' or 'sum'
loss_fn

MSELoss()

In [68]:
# This is a random example to illustrate the loss function:
# ((0.5 - 2.0)**2 + (1.0 - 1.3)**2) / 2 = (2.25 + 0.09) / 2 = 1.17
predictions = torch.tensor([0.5, 1.0])
labels = torch.tensor([2.0, 1.3])
loss_fn(predictions, labels)

tensor(1.1700)

### Using loss function

In [74]:
# Learning rate - this is "eta" ~ the "n"-like Greek letter
lr = 0.1

# Step 0 - random initialization of parameters "b" and "w"
torch.manual_seed(42)
b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
w = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)

# SGD optimizer in charge of updating the parameters
optimizer = optim.SGD([b, w], lr=lr)

# MSE loss function
loss_fn = nn.MSELoss(reduction='mean')

# Number of epochs
n_epochs = 1000

for epoch in range(n_epochs):
    # Step 1 - forward pass: the model's predicted output
    yhat = w * x_train_tensor + b

    # Step 2 - the loss function replaces the manual computation
    # "(yhat - y_train_tensor) ** 2" followed by "mean()"
    loss = loss_fn(yhat, y_train_tensor)

    # Step 3 - gradients for both "b" and "w"
    loss.backward()

    # Step 4 - parameter update, then the gradients are reset
    optimizer.step()
    optimizer.zero_grad()

print(b, w)

tensor([1.0100], device='cuda:0', requires_grad=True) tensor([2.0000], device='cuda:0', requires_grad=True)


In [71]:
# The loss requires grad, so numpy() raises a RuntimeError.
# We catch and show it so Restart & Run All does not stop here.
try:
    loss.cpu().numpy()
except RuntimeError as err:
    print(f'RuntimeError: {err}')

RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.

In [72]:
# detach() first, as the error message suggests; then cpu() and numpy()
loss.detach().cpu().numpy()

array(5.063061e-12, dtype=float32)

In [77]:
# Or, for a tensor with a single element, use item() or tolist()
# to get its value as a Python number
print(loss.item(), loss.tolist())

5.063061081500564e-12 5.063061081500564e-12


### Model
In PyTorch, a model is represented by a regular Python class that inherits from the
Module class.

In [80]:
class ManualLinearRegression(nn.Module):
    """Simple linear regression with hand-made parameters "b" and "w"."""

    def __init__(self):
        super().__init__()
        # Wrapping the tensors with nn.Parameter is what turns "b" and
        # "w" into real parameters of the model.
        # "b" is drawn first and "w" second.
        bias_init = torch.randn(1, requires_grad=True, dtype=torch.float)
        self.b = nn.Parameter(bias_init)
        weight_init = torch.randn(1, requires_grad=True, dtype=torch.float)
        self.w = nn.Parameter(weight_init)

    def forward(self, x):
        """Computes the outputs / predictions: b + w * x."""
        scaled_input = self.w * x
        return self.b + scaled_input

In [83]:
torch.manual_seed(42)
# Creates a "dummy" instance of our ManualLinearRegression model
dummy = ManualLinearRegression()

list(dummy.parameters())

[Parameter containing:
 tensor([0.3367], requires_grad=True),
 Parameter containing:
 tensor([0.1288], requires_grad=True)]

We can use our model’s parameters() method to retrieve an iterator over all
of the model’s parameters, including the parameters of nested models.

### state_dict
Moreover, we can get the current values of all parameters using our model’s
state_dict() method.

In [84]:
dummy.state_dict()

OrderedDict([('b', tensor([0.3367])), ('w', tensor([0.1288]))])

The state_dict() of a given model is simply a Python dictionary that maps each
attribute/parameter to its corresponding tensor. But only ```learnable``` parameters
are included, as its purpose is to keep track of parameters that are going to be
updated by the optimizer.

By the way, the optimizer itself has a state_dict() too, which contains its internal
state, as well as other hyper-parameters. Let’s take a quick look at it:

In [85]:
optimizer.state_dict()

{'state': {0: {'momentum_buffer': None}, 1: {'momentum_buffer': None}},
 'param_groups': [{'lr': 0.1,
   'momentum': 0,
   'dampening': 0,
   'weight_decay': 0,
   'nesterov': False,
   'maximize': False,
   'foreach': None,
   'differentiable': False,
   'params': [0, 1]}]}

IMPORTANT: we need to send our model to the same device
where the data is. If our data is made of GPU tensors, our model
must “live” inside the GPU as well.

In [86]:
torch.manual_seed(42)
# Creates a "dummy" instance of our ManualLinearRegression model
# and sends it to the device
dummy = ManualLinearRegression().to(device)

DO NOT call model.forward(x)!
Otherwise, your model’s hooks will not work (if you have them).
Call the model itself instead, as in model(x).

In [88]:
# Sets learning rate - this is "eta" ~ the "n" like
# Greek letter
lr = 0.1

# Step 0 - Initializes parameters "b" and "w" randomly
# (it happens inside the model's constructor, hence the seed)
torch.manual_seed(42)
# Now we can create a model and send it at once to the device
model = ManualLinearRegression().to(device)

# Defines a SGD optimizer to update the parameters
# (now retrieved directly from the model)
optimizer = optim.SGD(model.parameters(), lr=lr)

# Defines a MSE loss function
loss_fn = nn.MSELoss(reduction='mean')

# Defines number of epochs
n_epochs = 1000

for epoch in range(n_epochs):
    # Sets model to TRAIN mode
    model.train()
    
    # Step 1 - Computes model's predicted output - forward pass
    # No more manual prediction! We call the model itself,
    # not its forward() method
    yhat = model(x_train_tensor)
    # Step 2 - Computes the loss
    loss = loss_fn(yhat, y_train_tensor)
    
    # Step 3 - Computes gradients for both "b" and "w" parameters
    loss.backward()
    
    # Step 4 - Updates parameters using gradients and
    # the learning rate
    optimizer.step()
    optimizer.zero_grad()
    
# We can also inspect its parameters using its state_dict
print(model.state_dict())

OrderedDict([('b', tensor([1.0100], device='cuda:0')), ('w', tensor([2.0000], device='cuda:0'))])


### Nested Models

In [91]:
linear = nn.Linear(1, 1)
linear

Linear(in_features=1, out_features=1, bias=True)

In [92]:
linear.state_dict()

OrderedDict([('weight', tensor([[-0.4869]])), ('bias', tensor([0.5873]))])

#### Building Model using pytorch

In [93]:
class MyLinearRegression(nn.Module):
    """Linear regression built on top of a single nn.Linear layer."""

    def __init__(self):
        super().__init__()
        # Instead of our custom parameters, we use a Linear model
        # with a single input and a single output
        self.linear = nn.Linear(1, 1)

    def forward(self, x):
        """Computes the predictions for ``x`` using the nested layer.

        Args:
            x: input tensor, passed as-is to the nested Linear layer.

        Returns:
            The output of ``self.linear(x)``.
        """
        # Now it only takes a call - and its result MUST be returned,
        # otherwise the model outputs None
        return self.linear(x)

In [95]:
torch.manual_seed(42)
dummy = MyLinearRegression().to(device)
list(dummy.parameters())

[Parameter containing:
 tensor([[0.7645]], device='cuda:0', requires_grad=True),
 Parameter containing:
 tensor([0.8300], device='cuda:0', requires_grad=True)]

In [96]:
dummy.state_dict()

OrderedDict([('linear.weight', tensor([[0.7645]], device='cuda:0')),
             ('linear.bias', tensor([0.8300], device='cuda:0'))])

### Sequential Models

In [97]:
torch.manual_seed(42)
# Alternatively, you can use a Sequential model
model = nn.Sequential(nn.Linear(1, 1)).to(device)
model.state_dict()

OrderedDict([('0.weight', tensor([[0.7645]], device='cuda:0')),
             ('0.bias', tensor([0.8300], device='cuda:0'))])

### layers

In [98]:
torch.manual_seed(42)
# Two linear layers in sequence: 3 inputs -> 5 outputs -> 1 output
model = nn.Sequential(nn.Linear(3, 5), nn.Linear(5, 1)).to(device)
model.state_dict()

OrderedDict([('0.weight',
              tensor([[ 0.4414,  0.4792, -0.1353],
                      [ 0.5304, -0.1265,  0.1165],
                      [-0.2811,  0.3391,  0.5090],
                      [-0.4236,  0.5018,  0.1081],
                      [ 0.4266,  0.0782,  0.2784]], device='cuda:0')),
             ('0.bias',
              tensor([-0.0815,  0.4451,  0.0853, -0.2695,  0.1472], device='cuda:0')),
             ('1.weight',
              tensor([[-0.2060, -0.0524, -0.1816,  0.2967, -0.3530]], device='cuda:0')),
             ('1.bias', tensor([-0.2062], device='cuda:0'))])

In [99]:
# You can also use a model's add_module() method to be able to name
# the layers:

torch.manual_seed(42)
# Same two linear layers (3 -> 5 -> 1), now named "layer1" and "layer2"
model = nn.Sequential()
model.add_module('layer1', nn.Linear(3, 5))
model.add_module('layer2', nn.Linear(5, 1))
# Sends the model to the device; the cell displays the model
model.to(device)

Sequential(
  (layer1): Linear(in_features=3, out_features=5, bias=True)
  (layer2): Linear(in_features=5, out_features=1, bias=True)
)

#### There are MANY different layers that can be used in PyTorch:
- Convolution Layers
- Pooling Layers
- Padding Layers
- Non-linear Activations
- Normalization Layers
- Recurrent Layers
- Transformer Layers
- Linear Layers
- Dropout Layers
- Sparse Layers (embeddings)
- Vision Layers
- DataParallel Layers (multi-GPU)
- Flatten Layer

# Putting It All Together

It is time to put it all together and organize our code so far into three fundamental
parts, namely:
- #### data preparation (not data generation!)
- #### model configuration
- #### model training

#### Data Preparation

In [2]:
%%writefile data_preparation/v0.py
# Data preparation: builds the synthetic dataset, splits it into train
# and validation sets and sends the training set to the chosen device.
# "np" and "torch" come from the notebook's namespace (%run -i).

# Fixes NumPy's seed so every run produces the same points and split
np.random.seed(42)

N = 80
x = np.random.rand(N, 1)
true_b = 1
true_w = 2
# NOTE(review): unlike the data-generation cell earlier in the notebook,
# no epsilon offset is added here - confirm this is intended
y = true_b + true_w * x

# Shuffles the indices
idx = np.arange(N)
np.random.shuffle(idx)
# Uses the first 80% of the shuffled indices for train
n_train = int(N * .8)
train_idx = idx[:n_train]
# Uses the remaining indices for validation
val_idx = idx[n_train:]
# Generates train and validation sets
x_train, y_train = x[train_idx], y[train_idx]
x_val, y_val = x[val_idx], y[val_idx]

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Our data was in Numpy arrays, but we need to transform them
# into PyTorch's Tensors and then we send them to the
# chosen device
x_train_tensor = torch.as_tensor(x_train).float().to(device)
y_train_tensor = torch.as_tensor(y_train).float().to(device)

Overwriting data_preparation/v0.py


In [110]:
%run -i data_preparation/v0.py

- %%writefile:
  
    as its name says, it writes the contents of the cell to a file,
    but it does not run it, so we need to use yet another magic…

- %run:
  
    it runs the named file inside the notebook as a program - but
    independently from the rest of the notebook, so we need to use the -i
    option to make all variables available, both from the notebook and the
    file (technically speaking, the file is executed in IPython’s namespace).

In a nutshell, a cell containing one of our three fundamental parts will be
written to a versioned file inside the folder corresponding to that part.
In the example above, we write the cell to the data_preparation folder,
name it v0.py and then execute it using the %run -i magic.

All the commands to run, in order:

- %run -i data_preparation/v0.py
- %run -i model_configuration/v0.py
- %run -i model_training/v0.py

#### Model Configuration

- a model
- a loss function (which needs to be chosen according to your model)
- an optimizer (although some people may disagree with this choice, it makes it
easier for further organizing the code…)

In [112]:
%%writefile model_configuration/v0.py

# Redundant for now, but it won't be once Datasets are introduced
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Learning rate - this is "eta" ~ the "n"-like Greek letter
lr = 0.1

# Seeds first, since the layer's parameters are initialized randomly
torch.manual_seed(42)
# Creates the model (one linear layer, single input and single output)
# and then sends it to the device
model = nn.Sequential(nn.Linear(1, 1))
model = model.to(device)

# SGD optimizer, updating the parameters retrieved from the model
optimizer = optim.SGD(params=model.parameters(), lr=lr)

# MSE loss function
loss_fn = nn.MSELoss(reduction='mean')

Writing model_configuration/v0.py


In [113]:
%run -i model_configuration/v0.py

#### Model Training

- Step 1: compute model’s predictions
- Step 2: compute the loss
- Step 3: compute the gradients
- Step 4: update the parameters

This sequence is repeated over and over until the number of epochs is reached.

What happened to the random initialization step?

Since we are not manually creating parameters anymore, the initialization is
handled inside each layer during model creation.

In [115]:
%%writefile model_training/v0.py

# Model training: uses "model", "loss_fn", "optimizer" and the training
# tensors already present in the namespace (%run -i)

# Defines number of epochs
n_epochs = 1000

for epoch in range(n_epochs):
    # Sets model to TRAIN mode
    model.train()
    
    # Step 1 - Computes model's predicted output - forward pass
    yhat = model(x_train_tensor)
    
    # Step 2 - Computes the loss
    loss = loss_fn(yhat, y_train_tensor)
    
    # Step 3 - Computes gradients for the model's parameters
    loss.backward()
    
    # Step 4 - Updates parameters using gradients and
    # the learning rate, then resets the gradients
    optimizer.step()
    optimizer.zero_grad()

Writing model_training/v0.py


In [116]:
%run -i model_training/v0.py

In [117]:
print(model.state_dict())

OrderedDict([('0.weight', tensor([[2.0000]], device='cuda:0')), ('0.bias', tensor([1.0100], device='cuda:0'))])
