In [30]:
# Laurent LEQUIEVRE
# Research Engineer, CNRS (France)
# Institut Pascal UMR6602
# laurent.lequievre@uca.fr

REMINDER
========

CLASS torch.nn.Linear(in_features, out_features, bias=True)

Applies a linear transformation to the incoming data: y = x*W^T + b

Parameters:

in_features -> size of each input sample (i.e. size of x)

out_features -> size of each output sample (i.e. size of y)

bias -> If set to False, the layer will not learn an additive bias. Default: True
    
Note that the weights W have shape (out_features, in_features) and biases b have shape (out_features). 
They are initialized randomly and can be changed later 
(e.g. during the training of a Neural Network they are updated by some optimization algorithm).

In [31]:
import torch
import torch.nn as nn

In [32]:
# Worked example of nn.Linear.
#
# x is a batch of three input samples -- x[0], x[1] and x[2] -- each of size 2,
# so x has shape (batch_size, in_features) = (3, 2).
x = torch.tensor(
    [
        [1.0, -1.0],
        [0.0, 1.0],
        [0.0, 0.0],
    ]
)

# The layer's input width must match the size of one sample (one row of x).
in_features = x.size(1)  # = 2
out_features = 2

# Fully connected linear layer. It takes an input of shape
# (batch_size, in_features) -- batch_size being the number of samples handed
# to the network at once as a single tensor -- and maps it through
# y = x*W^T + b to an output y of shape (batch_size, out_features).
# Here the output y = m(x) will therefore have shape (3, 2).
#
# Shapes of the internal parameters:
# -> weights W: (out_features, in_features)
# -> biases b:  (out_features)
m = nn.Linear(in_features=in_features, out_features=out_features)

In [33]:
# m.weight (W) and m.bias (b) were initialized randomly when the layer was
# created, so the printed values will generally differ from run to run
# (seed the RNG beforehand, e.g. with torch.manual_seed, for repeatable values).
# W has shape (out_features, in_features) = (2, 2).
print(m.weight)

Parameter containing:
tensor([[ 0.2184,  0.4888],
        [ 0.1568, -0.4866]], requires_grad=True)


In [34]:
# The bias b holds one value per output feature: shape (out_features) = (2).
print(m.bias)

Parameter containing:
tensor([-0.0847, -0.3124], requires_grad=True)


In [35]:
# Forward pass: calling the layer on x applies y = x*W^T + b to each row of x.
# The result y has shape (batch_size, out_features) = (3, 2).
y = m(x)
print(y)

tensor([[-0.3550,  0.3311],
        [ 0.4042, -0.7990],
        [-0.0847, -0.3124]], grad_fn=<AddmmBackward>)


and (behind the scenes) it is computed as:

y = x.matmul(m.weight.t()) + m.bias  # y = x*W^T + b

i.e.

y[i,j] == x[i,0] * m.weight[j,0] + x[i,1] * m.weight[j,1] + m.bias[j]
where i is in the interval [0, batch_size) and j is in the interval [0, out_features).