In [67]:
# Laurent LEQUIEVRE
# Research Engineer, CNRS (France)
# Institut Pascal UMR6602
# laurent.lequievre@uca.fr

In [68]:
# torch.nn contains loss functions

import torch
import torch.nn as nn

In [69]:
import sys

# Report the interpreter / library versions and the CUDA setup of this machine.

# print python version
print('__python version:', sys.version)

# print torch version
print('__torch version:', torch.__version__)

# print if cuda is available (True or False)
cuda_available = torch.cuda.is_available()
print('__cuda is available:', cuda_available)

# print cuda information
# torch.version.cuda is the CUDA version torch was built with (None for CPU-only builds)
print('__cuda version :', torch.version.cuda)
print('__number cuda devices:', torch.cuda.device_count())
print ('__available devices ', torch.cuda.device_count())

# torch.cuda.current_device() initialises CUDA and raises when no usable GPU
# exists, so only query it when CUDA is actually available.
if cuda_available:
    print ('__current cuda device ', torch.cuda.current_device())
else:
    print ('__current cuda device ', None)

__python version: 3.8.10 (default, Jun  2 2021, 10:49:15) 
[GCC 9.4.0]
__torch version: 1.9.0+cu102
__cuda is available: True
__cuda version : 10.2
__number cuda devices: 1
__available devices  1
__current cuda device  0


In [70]:
# Use the GPU when one is usable, otherwise fall back to the CPU so that the
# notebook still runs on machines without CUDA.
my_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
my_dtype = torch.float

# Create random Tensors

# This returns a tensor of size 3 × 5
# filled with values from the standard normal distribution (mean 0, variance 1).
# Setting requires_grad=True indicates that we want to compute gradients during the backward pass.
# NOTE: the name `input` shadows the Python builtin; it is kept because the following cells use it.
input = torch.randn(3, 5, device=my_device, dtype=my_dtype, requires_grad=True)

# The target is a constant for the loss, so it does not need a gradient.
target = torch.randn(3, 5, device=my_device, dtype=my_dtype)

print('input: ', input)
print('target: ', target)

input:  tensor([[-0.7403, -0.3996,  1.1494, -2.1318,  0.0144],
        [-0.6482, -1.0157,  0.1490,  1.5103, -0.3767],
        [-0.5904, -0.3296,  0.9054, -0.2747,  0.5741]], device='cuda:0',
       requires_grad=True)
target:  tensor([[-0.4404, -2.3333, -0.3882,  0.3560,  1.2640],
        [ 1.1103, -1.6017,  1.5992, -0.9283,  0.1860],
        [-0.5014, -0.3676, -1.1072,  0.5219,  0.2854]], device='cuda:0')


In [71]:
# Show the shape of both tensors (input and target)
print(f"input size={input.shape}, target size={target.shape}")

input size=torch.Size([3, 5]), target size=torch.Size([3, 5])


In [72]:
# If x is a Tensor that has x.requires_grad=True then x.grad is another Tensor holding the gradient of x
# with respect to some scalar value.
# backward() has not been called yet, so there is no gradient to display at this point.
input.grad

In [73]:
# Loss function: the Mean Absolute Error (MAE), averaged over all elements
mae_loss = nn.L1Loss(reduction='mean')

# Evaluate the loss between input and target
loss = mae_loss(input, target)
print('loss value : ', loss)

loss value :  tensor(1.1686, device='cuda:0', grad_fn=<L1LossBackward>)


loss.backward() computes dloss/dx for every parameter x which has requires_grad=True. 
These are accumulated into x.grad for every parameter x. In pseudo-code: x.grad += dloss/dx

optimizer.step() updates the value of x using the gradient x.grad.
For example, the SGD optimizer performs:
(lr = learning rate)

x += -lr * x.grad

optimizer.zero_grad() clears x.grad for every parameter x in the optimizer. 

It’s important to call this before loss.backward(), otherwise you’ll accumulate the gradients from multiple passes.

If you have multiple losses (loss1, loss2), you can sum them and then call backward() once:

loss3 = loss1 + loss2
loss3.backward()

In [74]:
# Back-propagate the loss: computes d(loss)/d(input) and stores it in input.grad
# (input is the only tensor here created with requires_grad=True).
loss.backward()

In [75]:
# The gradient computed by loss.backward() is saved on the tensor itself, in the field named grad.
# For the mean absolute error every entry is sign(input - target) / 15 (15 = 3 x 5 elements),
# which is why the values shown are all +/-0.0667.
input.grad

tensor([[-0.0667,  0.0667,  0.0667, -0.0667, -0.0667],
        [-0.0667,  0.0667, -0.0667,  0.0667, -0.0667],
        [-0.0667,  0.0667,  0.0667, -0.0667,  0.0667]], device='cuda:0')

In [76]:
# Manually reset the input gradient to zero, in place (trailing underscore = in-place operation).
# Without this, the next backward() call would add to the existing values.
input.grad.zero_()

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]], device='cuda:0')

In [77]:
# Check that the gradient tensor is now filled with zeros
input.grad

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]], device='cuda:0')

In [78]:
# Quick check of torch.matmul shapes and of the broadcast addition of a bias
# Background: The difference between torch.mul, torch.mm, torch.bmm, torch.matmul
# https://www.programmersought.com/article/80074664295/

import torch

x = torch.randn(2, 1)  # column of 2 values -> torch.Size([2, 1])
A = torch.randn(1, 1)  # 1 x 1 matrix       -> torch.Size([1, 1])
b = torch.randn(1)     # 1-D tensor         -> torch.Size([1])

# Both operands are 2-dimensional, so matmul returns the matrix-matrix product.
result = x.matmul(A)  # torch.Size([2, 1])

print(f"x={x}, A={A}")
print(f"torch.matmul(x,A)={result}, nb elem={result.numel()}")
print(f"b={b}")

# b is broadcast: the same value is added to every row of result
result2 = torch.add(result, b)  # torch.Size([2, 1])
print(f"torch.matmul(x,A)+b={result2}")

x=tensor([[-0.7729],
        [ 0.6471]]), A=tensor([[-1.0801]])
torch.matmul(x,A)=tensor([[ 0.8348],
        [-0.6989]]), nb elem=2
b=tensor([-0.7126])
torch.matmul(x,A)+b=tensor([[ 0.1222],
        [-1.4115]])


In [79]:
# A minimal example of SGD (Stochastic Gradient Descent)
# without a Neural Network.
# The model y_pred = x*A + b is fitted to the target y = x,
# so the values to be learned are A = 1 and b = 0.

import torch
import torch.nn as nn
import torch.optim as optim

N = 64 # amount of values inside the tensors
learning_rate = 1e-1 # step size, defined once and used by both the optimizer and the printout

# x (input) and y (target) are the same tensor of size torch.Size([64, 1])
x0 = torch.randn(N, 1, requires_grad=False)
x = x0
y = x0

# Define the model parameters to optimize
A = torch.randn(1, 1, requires_grad=True) # size 1 x 1 (weights of the model) -> torch.Size([1, 1])
b = torch.randn(1, requires_grad=True) # Size 1 (biases of the model) -> torch.Size([1])

print("Initial values of A={}, b={}".format(A,b))

optimizer = optim.SGD([A, b], lr=learning_rate) # Define SGD with [A, b] as parameters

print("Learning rate={}".format(learning_rate))

for t in range(20):
    print('-' * 50)
    optimizer.zero_grad() # Clean gradients of A and b (because they have requires_grad to True)
    # torch matmul -> If both arguments are 2-dimensional, the matrix-matrix product is returned.
    y_pred = torch.matmul(x, A) + b # Predicted value -> (x * A) + b, b is broadcast over the rows
    loss = ((y_pred - y) ** 2).mean() # Mean squared error (y is the actual value)
    print("{} -> loss={}".format(t, loss.item())) # item() extracts the Python float from the 0-dim tensor
    loss.backward() # Compute the gradient of the loss with respect to A and b
    print("A.grad={},b.grad={}".format(A.grad,b.grad))
    optimizer.step() # Update values of A and b by taking account of their gradient and the learning rate
    # A += -lr * A.grad  ; b += -lr * b.grad
    print("A={}, b={}".format(A,b))


Initial values of A=tensor([[-0.6964]], requires_grad=True), b=tensor([-1.7002], requires_grad=True)
Learning rate=0.1
--------------------------------------------------
0 -> loss=5.714437007904053
A.grad=tensor([[-3.4731]]),b.grad=tensor([-3.2568])
A=tensor([[-0.3491]], requires_grad=True), b=tensor([-1.3745], requires_grad=True)
--------------------------------------------------
1 -> loss=3.6726291179656982
A.grad=tensor([[-2.7602]]),b.grad=tensor([-2.6348])
A=tensor([[-0.0731]], requires_grad=True), b=tensor([-1.1110], requires_grad=True)
--------------------------------------------------
2 -> loss=2.3610503673553467
A.grad=tensor([[-2.1940]]),b.grad=tensor([-2.1312])
A=tensor([[0.1463]], requires_grad=True), b=tensor([-0.8979], requires_grad=True)
--------------------------------------------------
3 -> loss=1.5182826519012451
A.grad=tensor([[-1.7442]]),b.grad=tensor([-1.7235])
A=tensor([[0.3207]], requires_grad=True), b=tensor([-0.7255], requires_grad=True)
------------------------

In [80]:
# A small gradient example that can be verified by hand:
# out = mean(2 * (x + 2)^2)  =>  d(out)/dx = 4 * (x + 2) / 4 = 3 when x = 1

x = torch.ones(2, 2, requires_grad=True)  # 2 x 2, every value is 1
y = x + 2                                 # 2 x 2, every value is 3
z = 2 * y.pow(2)                          # 2 x 2, every value is 18

out = z.mean()  # (18 * 4) / 4 = 18
print(f"mean={out}")

# Back-propagate from the scalar out down to x
out.backward()

print(f"x grad={x.grad}")

mean=18.0
x grad=tensor([[3., 3.],
        [3., 3.]])


<img src="math_demo.png">

In [81]:
# A minimal example of SGD (Stochastic Gradient Descent)
# with a Neural Network

import torch
import torch.nn as nn
import torch.optim as optim
# Optional: torchsummary (pip install torchsummary) can print a summary of the model.

# Let's make some data for a linear regression.
A = 3.1415926  # slope used to generate the data (value the weight should converge to)
B = 2.7189351  # intercept used to generate the data (value the bias should converge to)
error = 0.1    # scale of the gaussian noise added to the targets
N = 100 # number of data points

# Data
input_data = torch.randn(N, 1)

# (noisy) Target values that we want to learn.
target = A * input_data + B + (torch.randn(N, 1) * error)

# Creating a model, making the optimizer, defining loss
model = nn.Linear(1, 1)
# create a Neural Network Linear with in_features=1 and out_features=1 (nn.Linear(in_features, out_features))
# Applies a linear transformation to the incoming x data: y = x*W^T + b
# Note that the weights W have shape (out_features, in_features) and biases b have shape (out_features).

# Verify model tensors parameters :
# expected -> weight: torch.Size([1, 1]), bias: torch.Size([1]), both with requires_grad=True
for name, param in model.named_parameters():
    print('name: ', name)
    print(type(param))
    print('param.shape: ', param.shape)
    print('param.requires_grad: ', param.requires_grad)
    print('=====')

print("model weight={}, model bias={}".format(model.weight,model.bias))

optimizer = optim.SGD(model.parameters(), lr=0.05)
loss_fn = nn.MSELoss()  # the mean squared error (squared L2 norm)

# Run training
nb_iter = 50
for _ in range(nb_iter):
    optimizer.zero_grad() # Clear gradient values of model parameters
    predictions = model(input_data)
    # forward input_data into the model: predictions = input_data * W^T + b
    # Sizes are : input_data -> torch.Size([100, 1]) ; weight^T -> torch.Size([1, 1])
    # Both operands are 2-dimensional, so this is a plain matrix-matrix product
    # (no batched matmul involved) -> torch.Size([100, 1])
    # Then the bias tensor of size torch.Size([1]) is broadcast-added to every row
    # and we get a prediction -> torch.Size([100, 1])
    loss = loss_fn(predictions, target) # calculate loss value
    loss.backward() # calculate gradient values
    optimizer.step() # update model parameters with gradient values

    print("-" * 50)
    print("error = {}".format(loss.item()))
    # Read the parameters by name and convert with item(); this avoids the
    # discouraged `.data` attribute and the positional list(model.parameters())[i].
    print("learned weight = {}".format(model.weight.item()))
    print("learned bias = {}".format(model.bias.item()))
    

name:  weight
<class 'torch.nn.parameter.Parameter'>
param.shape:  torch.Size([1, 1])
param.requires_grad:  True
=====
name:  bias
<class 'torch.nn.parameter.Parameter'>
param.shape:  torch.Size([1])
param.requires_grad:  True
=====
model weight=Parameter containing:
tensor([[-0.5860]], requires_grad=True), model bias=Parameter containing:
tensor([-0.1954], requires_grad=True)
--------------------------------------------------
error = 23.932031631469727
learned weight = -0.17585963010787964
learned bias = 0.0979558527469635
--------------------------------------------------
error = 19.118099212646484
learned weight = 0.1892572045326233
learned bias = 0.3618992567062378
--------------------------------------------------
error = 15.274614334106445
learned weight = 0.5143064856529236
learned bias = 0.5994154214859009
--------------------------------------------------
error = 12.205626487731934
learned weight = 0.8036851286888123
learned bias = 0.8131507039070129
--------------------------

In [82]:
# Another test with nn.Linear, to verify sizes

input_size = 8
output_size = 14
batch_size = 64

# torch.FloatTensor(rows, cols) only allocates memory and leaves it uninitialized
# (arbitrary values, possibly NaN/inf). Use torch.randn to get well-defined data.
# NOTE: the name `input` shadows the Python builtin; it is kept for consistency with the cells above.
input = torch.randn(batch_size, input_size) # torch.Size([64, 8])

net = nn.Linear(input_size, output_size) # in_features=8, out_features=14
# net parameters -> W (weight) shape is (out_features, in_features) and b (biases) shape is (out_features)
# W=(14x8), W^T=(8x14), b=(14), input(64x8)
# output = input*W^T + b
# output=(64x14)
# note: operator + adds tensor b to each row of tensor input*W^T (broadcasting)

# expected -> weight: torch.Size([14, 8]), bias: torch.Size([14]), both with requires_grad=True
for name, param in net.named_parameters():
    print('name: ', name)
    print(type(param))
    print('param.shape: ', param.shape)
    print('param.requires_grad: ', param.requires_grad)
    print('=====')

output = net(input)  # forward input into net

print("Output size:", output.size()) # torch.Size([64, 14])

name:  weight
<class 'torch.nn.parameter.Parameter'>
param.shape:  torch.Size([14, 8])
param.requires_grad:  True
=====
name:  bias
<class 'torch.nn.parameter.Parameter'>
param.shape:  torch.Size([14])
param.requires_grad:  True
=====
Output size: torch.Size([64, 14])


In [83]:
# Broadcasting demo: a tensor of size (5) added to a tensor of size (4, 5)
# is added to every row of the larger tensor.
tensor1 = torch.randn(4, 5)
tensor2 = torch.randn(5)
print(tensor1, tensor2, sep="\n")

r = torch.add(tensor1, tensor2)
print(r)

tensor([[ 1.6590, -1.1129, -0.5345,  0.7154,  1.3856],
        [-0.1057, -0.6941,  0.3661,  0.2485,  0.4170],
        [-0.8194, -0.0582, -0.4577, -0.9523, -0.1293],
        [-0.2390, -0.3480,  0.3054, -0.7721, -0.4149]])
tensor([-0.4298, -1.9213,  0.1027,  0.4520, -1.4254])
tensor([[ 1.2292, -3.0342, -0.4318,  1.1674, -0.0398],
        [-0.5356, -2.6154,  0.4688,  0.7005, -1.0085],
        [-1.2492, -1.9795, -0.3551, -0.5003, -1.5548],
        [-0.6688, -2.2693,  0.4081, -0.3201, -1.8404]])


In [98]:
import torch
import torch.nn as nn

# Batch of 3 samples (rows), each one made of 2 features (columns)
x = torch.tensor([
    [1.0, -1.0],
    [0.0, 1.0],
    [0.0, 0.0],
])

print(f"x-> shape[0] = {x.size(0)}, shape[1] = {x.size(1)}")

# x contains 3 inputs (i.e. the batch size is 3), x[0], x[1] and x[2], each of size 2

in_features = x.size(1)  # number of columns of x -> 2
out_features = 2

net = nn.Linear(in_features=in_features, out_features=out_features)


x-> shape[0] = 3, shape[1] = 2


![Network Linear 2 x 2](nn_Linear_2_x_2.png)

In [99]:
# Weight matrix W of the layer, of shape (out_features, in_features) = (2, 2)
net.weight

Parameter containing:
tensor([[-0.5094, -0.2481],
        [-0.1743,  0.2661]], requires_grad=True)

In [100]:
# Transposed weight matrix W^T, the form that appears in y = x*W^T + b
net.weight.t()

tensor([[-0.5094, -0.1743],
        [-0.2481,  0.2661]], grad_fn=<TBackward>)

In [101]:
# Bias vector b of the layer, one value per output feature (2 values here)
net.bias

Parameter containing:
tensor([ 0.5107, -0.0022], requires_grad=True)

In [102]:
# x contains 3 inputs (i.e. the batch size is 3), x[0], x[1] and x[2], each of size 2
# the output is going to be of shape (batch size, out_features) = (3, 2)

# Forward pass: calling the layer applies y = x*W^T + b to every row of x
predict = net(x)

predict

tensor([[ 0.2495, -0.4427],
        [ 0.2626,  0.2639],
        [ 0.5107, -0.0022]], grad_fn=<AddmmBackward>)

In [103]:
# Same computation written by hand: y = x*W^T + b (the bias is broadcast over the rows)
y = x @ net.weight.t() + net.bias

y

tensor([[ 0.2495, -0.4427],
        [ 0.2626,  0.2639],
        [ 0.5107, -0.0022]], grad_fn=<AddBackward0>)

In [90]:
# predict is computed like this:
# predict = x.matmul(net.weight.t()) + net.bias  # y = x*W^T + b

# where i is in interval [0, batch_size) and j in [0, out_features).
# predict[i,j] == x[i,0] * net.weight[j,0] + x[i,1] * net.weight[j,1] + net.bias[j]

# i = 0
# j = 0
# predict[0,0] = x[0,0] * net.weight[0,0] + x[0,1] * net.weight[0,1] + net.bias[0]
# j = 1
# predict[0,1] = x[0,0] * net.weight[1,0] + x[0,1] * net.weight[1,1] + net.bias[1]

# i = 1
# j = 0
# predict[1,0] = x[1,0] * net.weight[0,0] + x[1,1] * net.weight[0,1] + net.bias[0]
# j = 1
# predict[1,1] = x[1,0] * net.weight[1,0] + x[1,1] * net.weight[1,1] + net.bias[1]


# Weight   ( [0,0], [0,1]
#            [1,0], [1,1] )

# Weight^T ( [0,0], [1,0]
#            [0,1], [1,1] )



In [149]:
# Second layer: 3 input features -> 2 output features
in_features, out_features = 3, 2
net2 = nn.Linear(in_features=in_features, out_features=out_features)

![Network Linear 3 x 2](nn_Linear_3_x_2.png)

In [150]:
# Weight matrix of net2, of shape (out_features, in_features) = (2, 3)
net2.weight

Parameter containing:
tensor([[-0.1512, -0.1415,  0.0083],
        [ 0.4374,  0.5671,  0.2462]], requires_grad=True)

In [151]:
# Bias vector of net2, one value per output feature (2 values here)
net2.bias

Parameter containing:
tensor([-0.3472,  0.4851], requires_grad=True)

In [152]:
# Transposed weight matrix W^T of net2, of shape (in_features, out_features) = (3, 2)
net2.weight.t()

tensor([[-0.1512,  0.4374],
        [-0.1415,  0.5671],
        [ 0.0083,  0.2462]], grad_fn=<TBackward>)

In [153]:
# Batch of 4 samples (rows), each one made of 3 features (columns)
x2 = torch.tensor([
    [1.0, -1.0, 2.0],
    [0.0, 1.0, -5.0],
    [0.0, 0.0, 1.0],
    [-2.0, 1.0, 0.0],
])

# x2 contains 4 inputs (i.e. the batch size is 4), x2[0], x2[1], x2[2] and x2[3], each of size 3
# the output is going to be of shape (batch size = 4, out_features) = (4, 2)

In [155]:
# Forward pass of the 4 x 3 batch x2 through net2; the result has shape (4, 2)
predict2 = net2(x2)

print(predict2)

tensor([[-0.3402,  0.8478],
        [-0.5304, -0.1786],
        [-0.3389,  0.7313],
        [-0.1864,  0.1774]], grad_fn=<AddmmBackward>)


In [157]:
# Recompute the prediction by hand: preparation step

# W^T of net2
net2_wt = net2.weight.t()
print(net2_wt)

# Accumulator for the result: one row per sample of x2, one column per output feature
y = torch.zeros(x2.size(0), out_features)
print(y)

tensor([[-0.1512,  0.4374],
        [-0.1415,  0.5671],
        [ 0.0083,  0.2462]], grad_fn=<TBackward>)
tensor([[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]])


In [158]:
# Naive triple-loop version of y = x2*W^T + b, printing every accumulation step.
# The accumulator is reset here so that re-running this cell always gives the
# same result instead of adding onto the values left by a previous run.
y = torch.zeros(x2.shape[0], net2_wt.shape[1])

for i in range(x2.shape[0]): # each row of x2 (one sample)
    for j in range(net2_wt.shape[1]): # each column of W^T (one output feature)
        for k in range(net2_wt.shape[0]): # each row of W^T (one input feature)
            y[i,j] += x2[i,k] * net2_wt[k,j]
            print("y[{},{}] += x2[{},{}]*wt[{},{}]".format(i,j,i,k,k,j))

        # the bias is added once per output value, after the dot product
        y[i,j] += net2.bias[j]
        print("y[{},{}] += bias[{}]".format(i,j,j))

# The hand-made result must match the layer's own forward pass
# (small tolerance: the summation order differs from the built-in implementation).
assert torch.allclose(y, predict2, atol=1e-5), "manual computation differs from net2(x2)"

y

y[0,0] += x2[0,0]*wt[0,0]
y[0,0] += x2[0,1]*wt[1,0]
y[0,0] += x2[0,2]*wt[2,0]
y[0,0] += bias[0]
y[0,1] += x2[0,0]*wt[0,1]
y[0,1] += x2[0,1]*wt[1,1]
y[0,1] += x2[0,2]*wt[2,1]
y[0,1] += bias[1]
y[1,0] += x2[1,0]*wt[0,0]
y[1,0] += x2[1,1]*wt[1,0]
y[1,0] += x2[1,2]*wt[2,0]
y[1,0] += bias[0]
y[1,1] += x2[1,0]*wt[0,1]
y[1,1] += x2[1,1]*wt[1,1]
y[1,1] += x2[1,2]*wt[2,1]
y[1,1] += bias[1]
y[2,0] += x2[2,0]*wt[0,0]
y[2,0] += x2[2,1]*wt[1,0]
y[2,0] += x2[2,2]*wt[2,0]
y[2,0] += bias[0]
y[2,1] += x2[2,0]*wt[0,1]
y[2,1] += x2[2,1]*wt[1,1]
y[2,1] += x2[2,2]*wt[2,1]
y[2,1] += bias[1]
y[3,0] += x2[3,0]*wt[0,0]
y[3,0] += x2[3,1]*wt[1,0]
y[3,0] += x2[3,2]*wt[2,0]
y[3,0] += bias[0]
y[3,1] += x2[3,0]*wt[0,1]
y[3,1] += x2[3,1]*wt[1,1]
y[3,1] += x2[3,2]*wt[2,1]
y[3,1] += bias[1]


tensor([[-0.3402,  0.8478],
        [-0.5304, -0.1786],
        [-0.3389,  0.7313],
        [-0.1864,  0.1774]], grad_fn=<CopySlices>)