# Mini tutorial to understand how a layer in PyTorch operates

This tutorial explains how a layer in PyTorch works and how tensor dimensions behave when the layer is applied.

In [1]:
import torch

### Dimensions of the weight matrix and the bias of a NN layer

In [5]:
# A fully connected layer that maps 3 input features to 4 output features.
layer = torch.nn.Linear(in_features=3, out_features=4, bias=True)

# Shapes first: the weight is stored as [out_features, in_features] = [4, 3],
# the bias as [out_features] = [4].
print(layer.weight.shape, layer.bias.shape, sep="\n")

# Then the (randomly initialised) parameter values themselves.
print(layer.weight, layer.bias, sep="\n")

torch.Size([4, 3])
torch.Size([4])
Parameter containing:
tensor([[-0.3120, -0.0455,  0.0326],
        [-0.3490,  0.4752, -0.1979],
        [-0.0791,  0.5161, -0.0465],
        [ 0.2355, -0.3889,  0.5403]], requires_grad=True)
Parameter containing:
tensor([ 0.1351, -0.0136,  0.2817, -0.2702], requires_grad=True)


### Apply the layer

In [31]:
# Apply the 3 -> 4 `layer` created in the previous cell to a single 1-D input.
X = torch.rand(3)

print(X.shape)
print(layer.weight.shape)

# The generic way to apply any layer
Y = layer(X)

# What torch.nn.Linear does internally: Y = X @ W^T + b
Y_manual = X @ layer.weight.transpose(0, 1) + layer.bias

# For a single 1-D input the same product can also be written as W @ X.
# It gets its own name so that it does not overwrite Y_manual.
Y_manual_alt = torch.matmul(layer.weight, X) + layer.bias

# All three results must agree (up to floating-point rounding).
assert torch.allclose(Y, Y_manual, atol=1e-6)
assert torch.allclose(Y, Y_manual_alt, atol=1e-6)

print(X)
print(X.shape)
print(Y)
print(Y.shape)
print(Y_manual)
print(Y_manual.shape)


torch.Size([3])
torch.Size([4, 3])
tensor([0.9273, 0.1604, 0.7790])
torch.Size([3])
tensor([-0.5268,  0.6297, -0.2435, -0.8854], grad_fn=<ViewBackward0>)
torch.Size([4])
tensor([-0.5268,  0.6297, -0.2435, -0.8854], grad_fn=<AddBackward0>)
torch.Size([4])


### Broadcasting

In [25]:
t1 = torch.tensor([1, 2, 3])          # 1-D tensor, shape [3]
t2 = torch.tensor([[10, 20, 30],
                   [40, 50, 60]])     # 2-D matrix, shape [2, 3]

# t1 is broadcast along the first dimension of t2, i.e. it is added to every row.
s = torch.add(t1, t2)

# Prints [[11, 22, 33], [41, 52, 63]] followed by the shape [2, 3]  <--- 2D matrix
print(s, s.shape, sep="\n")

tensor([[11, 22, 33],
        [41, 52, 63]])
torch.Size([2, 3])


In [None]:
# Step-by-step shape walkthrough of the manual computation for a single 1-D input.
layer = torch.nn.Linear(in_features=3, out_features=4, bias=True)

X = torch.rand(3)                  # Shape    [3]
Wt = layer.weight.transpose(0, 1)  # Shape [3, 4]
B = layer.bias                     # Shape    [4]

Y_manual =        X @ Wt     + B
# Shapes:       [3] @ [3, 4] + [4]
# Shapes:    [1, 3] @ [3, 4] + [4]  (matmul temporarily prepends a 1 to the 1-D X)
# Shapes:         [1, 4]     + [4]  (after the matrix multiply)
# Shapes:           [4]      + [4]  (matmul removes the prepended dimension again)
# Shapes:                [4]        (after adding B; the shapes already match, so B is not broadcast)

# The result is a 1-D tensor with one value per output feature.
assert Y_manual.shape == torch.Size([4])

### Apply the layer to a lot of batches

In [30]:
X = torch.rand((10, 2, 3))  # Shape [10, 2, 3]

# Like before, this has a [4, 3] weight tensor and a [4] bias tensor
layer = torch.nn.Linear(in_features=3, out_features=4, bias=True)

# Both computations give the same result, with shape [10, 2, 4]:
# the layer acts on the last dimension, the leading dimensions are kept as they are.
Y = layer(X)
Y_manual = torch.matmul(X, layer.weight.transpose(0, 1)) + layer.bias

print(X)
print(X.shape)
# print(Y)         # uncomment to inspect the values
print(Y.shape)
# print(Y_manual)  # uncomment to inspect the values
print(Y_manual.shape)

# Compare with a small tolerance instead of exact `==`: the two code paths may
# round floating-point operations differently.
outputs_match = torch.allclose(Y, Y_manual, atol=1e-6)
print(outputs_match)  # Y and Y_manual agree

tensor([[[0.1013, 0.4577, 0.4679],
         [0.7341, 0.2850, 0.9302]],

        [[0.3686, 0.1674, 0.7011],
         [0.0015, 0.2222, 0.3001]],

        [[0.3607, 0.6911, 0.8464],
         [0.1292, 0.0443, 0.7671]],

        [[0.1531, 0.4537, 0.7685],
         [0.1254, 0.4405, 0.7585]],

        [[0.3561, 0.7914, 0.4904],
         [0.6081, 0.3517, 0.2927]],

        [[0.6258, 0.3477, 0.8748],
         [0.1450, 0.4207, 0.4621]],

        [[0.5434, 0.5358, 0.7982],
         [0.6219, 0.0922, 0.9663]],

        [[0.0404, 0.7370, 0.0628],
         [0.1872, 0.6405, 0.3090]],

        [[0.8970, 0.1372, 0.4719],
         [0.2629, 0.2618, 0.8036]],

        [[0.1099, 0.8973, 0.4023],
         [0.7942, 0.0454, 0.8940]]])
torch.Size([10, 2, 3])
torch.Size([10, 2, 4])
torch.Size([10, 2, 4])
True


### Batched 2-D matrix multiplication with broadcasting: think of the leading dimensions as channels that say how many matrix multiplications are performed

In [32]:
# Two stacked [2, 3] matrices -> shape [2, 2, 3]
A = torch.tensor([
    [[1, 2, 3], [4, 5, 6]],
    [[7, 8, 9], [10, 11, 12]],
])

# Two stacked (identical) [3, 4] matrices -> shape [2, 3, 4]
B = torch.tensor([
    [[2, 2, 2, 2], [3, 3, 3, 3], [4, 4, 4, 4]],
    [[2, 2, 2, 2], [3, 3, 3, 3], [4, 4, 4, 4]],
])

# Batched matrix product -> shape [2, 2, 4]
C = torch.matmul(A, B)
# C corresponds to torch.tensor([
#    [[ 20,  20,  20,  20],
#     [ 47,  47,  47,  47]],
#
#    [[ 74,  74,  74,  74],
#     [101, 101, 101, 101]]
# ])

# Every slice of C along the first dimension is an ordinary 2-D matrix product.
assert all(torch.equal(C[i], A[i] @ B[i]) for i in range(2))

In [34]:
# Pairs of tensor shapes (A, B) to multiply. The last two dimensions are the
# matrix dimensions; all leading (batch) dimensions are broadcast against each
# other, aligned from the right.
shape_pairs = [
    # shape of A    shape of B
    ((   2, 2, 3), (   2, 3, 4)),  # Ok
    ((   2, 2, 3), (      3, 4)),  # Ok
    ((   2, 2, 3), (   1, 3, 4)),  # Ok
    ((1, 2, 2, 3), (   1, 3, 4)),  # Ok
    ((1, 2, 2, 3), (4, 1, 3, 4)),  # Ok
    # ((2, 2, 2, 3), (4, 1, 3, 4)),  # Error: batch dims (2, 2) and (4, 1) cannot be broadcast
    # ((1, 2, 2, 3), (   4, 3, 4)),  # Error: batch dims (1, 2) and (4,) cannot be broadcast
    ((1, 2, 2, 3), (      3, 4)),  # Ok
]

for shape_a, shape_b in shape_pairs:
    A = torch.rand(shape_a)
    B = torch.rand(shape_b)
    C = A @ B
    print(C)
    print(C.shape)

tensor([[[0.7221, 1.0402, 0.4207, 0.6766],
         [0.9258, 0.7155, 0.6025, 0.8515]],

        [[1.4024, 0.6454, 1.0943, 1.1515],
         [0.5777, 0.3426, 0.2367, 0.4478]]])
torch.Size([2, 2, 4])
tensor([[[0.9812, 0.7265, 1.4551, 0.4005],
         [0.1903, 0.6345, 0.8130, 0.4009]],

        [[0.9705, 0.6698, 1.4406, 0.3796],
         [0.5633, 0.3919, 0.9306, 0.2486]]])
torch.Size([2, 2, 4])
tensor([[[0.4832, 0.9160, 1.1869, 0.5524],
         [0.5347, 0.8007, 1.0764, 0.8938]],

        [[0.5931, 0.9079, 1.1486, 0.4924],
         [0.3466, 0.8789, 1.1557, 0.5042]]])
torch.Size([2, 2, 4])
tensor([[[[0.5672, 1.0865, 0.7029, 0.6031],
          [0.9190, 1.4110, 0.9768, 0.8817]],

         [[1.0862, 1.0324, 0.8355, 1.0679],
          [1.1707, 1.2931, 0.9848, 1.1950]]]])
torch.Size([1, 2, 2, 4])
tensor([[[[0.8851, 0.7248, 0.1796, 1.0206],
          [0.9677, 0.4719, 0.1712, 1.2509]],

         [[0.5841, 0.6484, 0.1916, 0.8656],
          [0.7559, 0.2884, 0.1176, 0.9676]]],


        [[[0.8938,