In [32]:
# Laurent LEQUIEVRE
# Research Engineer, CNRS (France)
# Institut Pascal UMR6602
# laurent.lequievre@uca.fr

In [166]:
# Imports used by this cell and the next ones. They live in the first code cell
# so that the notebook works after "Restart Kernel -> Run All"; cells further
# down that import the same modules again are harmless.
import torch
import torch.nn as nn

# Use the GPU when CUDA is available, otherwise fall back to the CPU so the
# notebook also runs on machines without a GPU.
my_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
my_dtype = torch.float

# Create random tensors.
#
# torch.randn(3, 5) returns a tensor of size 3 x 5 filled with values drawn
# from the standard normal distribution (mean 0, variance 1).
# requires_grad=True asks autograd to compute the gradient with respect to
# this tensor during the backward pass.
# NOTE: the name `input` shadows the Python builtin; it is kept because the
# following cells refer to it.
input = torch.randn(3, 5, device=my_device, dtype=my_dtype, requires_grad=True)

# The target is created without requires_grad: no gradient is computed for it.
target = torch.randn(3, 5, device=my_device, dtype=my_dtype)

print('input: ', input)
print('target: ', target)

input:  tensor([[-0.9320,  1.6002,  0.3595,  1.8899, -0.1184],
        [-0.8379,  0.0553,  0.8553, -0.3518,  0.8582],
        [-0.7784, -0.9570, -1.0201,  1.6142,  2.0713]], device='cuda:0',
       requires_grad=True)
target:  tensor([[-2.8785e-01,  3.5644e-04,  1.3720e-01,  1.8768e+00,  4.2386e-01],
        [ 9.7907e-02,  1.8768e+00, -3.8424e-01,  1.6855e+00,  1.0096e+00],
        [ 6.5938e-01, -1.1361e-01, -2.6892e-01, -1.4951e+00,  6.3044e-01]],
       device='cuda:0')


In [167]:
# Report the shape of both tensors (they must match for the loss computed later)
print(f"input size={input.shape}, target size={target.shape}")

input size=torch.Size([3, 5]), target size=torch.Size([3, 5])


In [168]:
# If x is a Tensor created with requires_grad=True, then x.grad is another Tensor
# holding the gradient of some scalar value with respect to x.
# No backward pass has been run yet, so there is no gradient value to show here.
input.grad

In [169]:
# Loss function: Mean Absolute Error (MAE), i.e. the mean of |input - target|
mae_loss = nn.L1Loss()
# Evaluate the loss on the two tensors; the result is a scalar tensor
loss = mae_loss(input, target)
# !s forces str(), so the tensor is shown as tensor(...) rather than as a bare number
print(f"loss value :  {loss!s}")

loss value :  tensor(1.1193, device='cuda:0', grad_fn=<L1LossBackward>)


loss.backward() computes dloss/dx for every parameter x which has requires_grad=True. 
These are accumulated into x.grad for every parameter x. In pseudo-code: x.grad += dloss/dx

optimizer.step() updates the value of each parameter x using its gradient x.grad. 
For example, the SGD optimizer performs:
(lr = learning rate)

x += -lr * x.grad

optimizer.zero_grad() clears x.grad for every parameter x in the optimizer. 

It’s important to call this before loss.backward(), otherwise you’ll accumulate the gradients from multiple passes.

If you have multiple losses (loss1, loss2) you can sum them and then call backward() once:

loss3 = loss1 + loss2
loss3.backward()

In [170]:
# Backpropagate: compute d(loss)/d(input) and accumulate it into input.grad
loss.backward()

In [171]:
# Get the gradient of the loss with respect to the tensor input.
# The loss is the mean of |input - target| over the 3 x 5 = 15 elements, so each
# entry is sign(input - target) / 15, i.e. +/- 0.0667.
input.grad

tensor([[-0.0667,  0.0667,  0.0667,  0.0667, -0.0667],
        [-0.0667, -0.0667,  0.0667, -0.0667, -0.0667],
        [-0.0667, -0.0667, -0.0667,  0.0667,  0.0667]], device='cuda:0')

In [172]:
# backward() accumulates into .grad, so reset it before another backward pass.
# The trailing underscore means the operation is done in place.
input.grad.zero_() # Manually zero the input gradient

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]], device='cuda:0')

In [173]:
# Check that the gradient has been reset: every entry is now 0
input.grad

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]], device='cuda:0')

In [174]:
# A minimal example of SGD (Stochastic Gradient Descent)
# without a Neural Network

import torch
import torch.nn as nn
import torch.optim as optim

N = 64  # number of values in each tensor

# x and y are the same tensor: the model y = x * A + b has to learn the
# identity mapping, i.e. A -> 1 and b -> 0.
x0 = torch.randn(N, 1, requires_grad=False)
x = y = x0

# Model parameters to optimize
A = torch.randn(1, 1, requires_grad=True)  # weight, torch.Size([1, 1])
b = torch.randn(1, requires_grad=True)  # bias, torch.Size([1])

# Plain SGD over the two parameters with a learning rate of 0.1
optimizer = optim.SGD([A, b], lr=1e-1)

for t in range(20):
    print('-' * 50)
    # Reset the gradients of A and b: backward() accumulates into .grad
    optimizer.zero_grad()
    # Forward pass: prediction of the linear model, (x @ A) + b
    y_pred = x @ A + b
    # Mean squared error between the prediction and the actual value y
    loss = (y_pred - y).pow(2).mean()
    # item() extracts the Python number held by the scalar tensor
    print(f"{t} -> {loss.item()}")
    # Backward pass: compute d(loss)/dA and d(loss)/db
    loss.backward()
    # Update A and b from their gradients and the learning rate
    optimizer.step()
    print(f"A={A}, b={b}")


--------------------------------------------------
0 -> 1.555985927581787
A=tensor([[0.7793]], requires_grad=True), b=tensor([0.8546], requires_grad=True)
--------------------------------------------------
1 -> 0.8872073888778687
A=tensor([[0.8757]], requires_grad=True), b=tensor([0.6718], requires_grad=True)
--------------------------------------------------
2 -> 0.513855516910553
A=tensor([[0.9402]], requires_grad=True), b=tensor([0.5308], requires_grad=True)
--------------------------------------------------
3 -> 0.30288925766944885
A=tensor([[0.9823]], requires_grad=True), b=tensor([0.4214], requires_grad=True)
--------------------------------------------------
4 -> 0.1819668561220169
A=tensor([[1.0090]], requires_grad=True), b=tensor([0.3362], requires_grad=True)
--------------------------------------------------
5 -> 0.11151330918073654
A=tensor([[1.0249]], requires_grad=True), b=tensor([0.2694], requires_grad=True)
--------------------------------------------------
6 -> 0.069713

In [175]:
# Check the gradient computed by autograd against a derivation done by hand.
x = torch.ones(2, 2, requires_grad=True)  # 2 x 2, every entry is 1
y = x + 2  # 2 x 2, every entry is 3
z = 2 * (y * y)  # 2 x 2, every entry is 18

out = z.mean()  # the mean is 18: (18 * 4) / 4
print(f"mean={out}")

# Hand derivation (i and j run over the 4 entries of x):
#   out            = 1/4 * sum_j z(j) = 1/4 * sum_j 2 * (x(j) + 2)^2
#   d.out / d.x(i) = 1/4 * sum_j 4 * (x(j) + 2) * d.x(j)/d.x(i)
#                  = x(i) + 2          since d.x(j)/d.x(i) = 0 when i != j
#                  = 3                 since every x(i) is 1
out.backward()  # fills x.grad with d.out / d.x
print(f"x grad={x.grad}")

mean=18.0
x grad=tensor([[3., 3.],
        [3., 3.]])


In [185]:
# Check the shape produced by a matrix product
tensor1 = torch.randn(64, 1)  # torch.Size([64, 1])
tensor2 = torch.randn(1, 1)  # torch.Size([1, 1])

# (64 x 1) @ (1 x 1) -> (64 x 1); only the shape of the product is kept
product = tensor1 @ tensor2
result = product.shape
print(f"result={result}")
print(f"nb elem in result={result.numel()}")

result=torch.Size([64, 1])
nb elem in result=64


In [199]:
# A minimal example of SGD (Stochastic Gradient Descent)
# with a Neural Network
import torch
import torch.nn as nn
import torch.optim as optim
from torchsummary import summary  # pip install torchsummary

# Synthetic data for a linear regression: target = A * input + B + noise
A = 3.1415926
B = 2.7189351
error = 0.1  # scale applied to the standard normal noise
N = 100  # number of data points

# Inputs
input_data = torch.randn(N, 1)

# Noisy target values that the model has to learn
noise = torch.randn(N, 1) * error
target = A * input_data + B + noise

# Model: a single linear layer with in_features=1 and out_features=1.
# Its weight has shape (out_features, in_features) = (1, 1) and its bias has
# shape (out_features,) = (1,); both are Parameters with requires_grad=True.
model = nn.Linear(1, 1)

# Print the model parameters
for name, param in model.named_parameters():
    print('name: ', name)
    print(type(param))
    print('param.shape: ', param.shape)
    print('param.requires_grad: ', param.requires_grad)
    print('=====')

print(f"model weight={model.weight}, model bias={model.bias}")

# Optimizer and loss: plain SGD, mean squared error (squared L2 norm)
optimizer = optim.SGD(model.parameters(), lr=0.05)
loss_fn = nn.MSELoss()

# Run training
nb_iter = 50
for _ in range(nb_iter):
    # Clear the gradients of the model parameters
    optimizer.zero_grad()
    # Forward pass: predictions for all inputs, torch.Size([100, 1])
    predictions = model(input_data)
    # Loss between the predictions and the targets
    loss = loss_fn(predictions, target)
    # Backward pass: compute the gradients of the loss
    loss.backward()
    # Update the model parameters from their gradients
    optimizer.step()

    # The loss shown was computed before this step's update; the weight and
    # bias shown are the values after it.
    print("-" * 50)
    print(f"error = {loss.item()}")
    print(f"learned weight = {model.weight.item()}")
    print(f"learned bias = {model.bias.item()}")
    

name:  weight
<class 'torch.nn.parameter.Parameter'>
param.shape:  torch.Size([1, 1])
param.requires_grad:  True
=====
name:  bias
<class 'torch.nn.parameter.Parameter'>
param.shape:  torch.Size([1])
param.requires_grad:  True
=====
model weight=Parameter containing:
tensor([[0.9866]], requires_grad=True), model bias=Parameter containing:
tensor([0.6161], requires_grad=True)
--------------------------------------------------
error = 9.865514755249023
learned weight = 1.2502708435058594
learned bias = 0.8141689896583557
--------------------------------------------------
error = 7.811840057373047
learned weight = 1.4815841913223267
learned bias = 0.9938012361526489
--------------------------------------------------
error = 6.192173957824707
learned weight = 1.6845104694366455
learned bias = 1.1566535234451294
--------------------------------------------------
error = 4.913486003875732
learned weight = 1.8625601530075073
learned bias = 1.3042585849761963
----------------------------------