In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [4]:
# Choose the compute device for training: CUDA first, then Apple MPS,
# falling back to the CPU when neither accelerator is available.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(device)

cuda


In [5]:
# define neural network by subclassing nn.Module
class NeuralNetwork(nn.Module):
    """Fully connected classifier: Linear -> ReLU -> Linear -> ReLU -> Linear.

    Args:
        in_features: Number of values per sample once the input has been
            flattened. Defaults to 28*28.
        hidden_features: Width of both hidden layers. Defaults to 512.
        num_classes: Number of output scores per sample. Defaults to 10.

    Calling ``NeuralNetwork()`` with no arguments builds exactly the
    784 -> 512 -> 512 -> 10 network, with the same submodule names
    (``flatten``, ``linear_relu_stack``) and therefore the same
    ``state_dict`` keys.
    """

    def __init__(self, in_features=28 * 28, hidden_features=512, num_classes=10):
        super().__init__()
        # Collapses every dimension after the first into one, so each sample
        # becomes a flat vector.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            # No activation after the last layer: the outputs are raw scores.
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """Return the raw (un-normalised) class scores for ``x``.

        Args:
            x: Tensor whose dimensions after the first multiply out to
                ``in_features``.

        Returns:
            Tensor with ``num_classes`` scores per sample along the last
            dimension.
        """
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [6]:
# Build the network, then move its parameters onto the selected device.
model = NeuralNetwork()
model = model.to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [72]:
# Push one random 28x28 input through the model and reduce the scores
# to a predicted class index.
X = torch.rand(1, 28, 28, device=device)
logits = model(X)
# Normalise across the class dimension (dim=1).
pred_probab = nn.Softmax(dim=1)(logits)
# Index of the largest probability in each row.
y_pred = pred_probab.argmax(dim=1)

In [73]:
# Raw model outputs for the random input, before softmax is applied.
logits

tensor([[ 0.0162,  0.0550, -0.0485, -0.1115, -0.0713,  0.1006,  0.0743, -0.0884,
          0.0174,  0.0422]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [74]:
# Softmax-normalised scores computed from `logits`.
pred_probab

tensor([[0.1015, 0.1055, 0.0952, 0.0894, 0.0930, 0.1105, 0.1076, 0.0915, 0.1017,
         0.1042]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [75]:
# Index of the highest-probability class for the single input sample.
y_pred

tensor([5], device='cuda:0')

In [79]:
# List every learnable parameter of the model as (name, Parameter) pairs.
# NOTE(review): this renders the full weight tensors; showing only names and
# shapes would keep the output short.
list(model.named_parameters())

[('linear_relu_stack.0.weight',
  Parameter containing:
  tensor([[-0.0266, -0.0027,  0.0121,  ...,  0.0248, -0.0183, -0.0178],
          [-0.0100,  0.0170, -0.0274,  ..., -0.0066, -0.0259,  0.0235],
          [ 0.0285, -0.0312, -0.0129,  ..., -0.0282, -0.0305, -0.0258],
          ...,
          [-0.0227, -0.0184, -0.0150,  ...,  0.0106,  0.0193, -0.0041],
          [-0.0197, -0.0225, -0.0007,  ...,  0.0077,  0.0291, -0.0277],
          [-0.0219,  0.0295, -0.0223,  ...,  0.0316, -0.0083,  0.0128]],
         device='cuda:0', requires_grad=True)),
 ('linear_relu_stack.0.bias',
  Parameter containing:
  tensor([-3.0015e-02, -1.6832e-04, -5.6299e-03,  8.2852e-03,  3.8866e-03,
           2.6253e-02,  1.7013e-02,  9.3673e-04, -1.2860e-02,  3.2772e-02,
           1.3723e-02, -9.5581e-03,  3.2115e-02, -1.9888e-02, -1.1253e-03,
          -2.6112e-03,  2.0083e-02, -2.1564e-02, -1.1684e-03,  3.9821e-04,
          -3.4614e-02,  2.6544e-02, -3.4822e-02, -1.7542e-02, -2.5858e-02,
          -2.9467e-

Let's go through the transformations manually and see what happens at each step.

In [39]:
# Reproduce the first layers by hand on a mini-batch of three random
# 28x28 inputs, keeping every intermediate result for inspection.
input_image = torch.rand(3, 28, 28)

# Parameter-free modules: flattening and the ReLU non-linearity.
flatten, relu = nn.Flatten(), nn.ReLU()

# (3, 28, 28) -> (3, 784)
flat_image = flatten(input_image)

# Linear projection from the 784 flattened values down to 20 features.
layer1 = nn.Linear(28 * 28, 20)
hidden1 = layer1(flat_image)

# Zero out the negative activations.
hidden1_relu = relu(hidden1)

In [40]:
# Show the random input batch together with its shape.
input_image, input_image.shape

(tensor([[[0.4047, 0.3265, 0.9336,  ..., 0.9449, 0.2127, 0.5980],
          [0.1530, 0.2212, 0.6325,  ..., 0.8357, 0.7831, 0.6753],
          [0.6460, 0.1714, 0.4423,  ..., 0.8541, 0.2490, 0.9009],
          ...,
          [0.7913, 0.8038, 0.2167,  ..., 0.0418, 0.6079, 0.9968],
          [0.6320, 0.1235, 0.1826,  ..., 0.5830, 0.2637, 0.2465],
          [0.0547, 0.5544, 0.8689,  ..., 0.1137, 0.7727, 0.9784]],
 
         [[0.9554, 0.1592, 0.2311,  ..., 0.1519, 0.8800, 0.5123],
          [0.2088, 0.0072, 0.0413,  ..., 0.6915, 0.0325, 0.2674],
          [0.2771, 0.5601, 0.5384,  ..., 0.6019, 0.4632, 0.0047],
          ...,
          [0.5246, 0.6885, 0.6237,  ..., 0.6351, 0.1440, 0.3297],
          [0.8154, 0.7596, 0.3202,  ..., 0.2589, 0.9657, 0.6551],
          [0.8759, 0.0815, 0.3772,  ..., 0.0349, 0.7726, 0.5019]],
 
         [[0.1588, 0.3539, 0.5743,  ..., 0.1617, 0.4004, 0.7798],
          [0.6482, 0.7424, 0.2814,  ..., 0.0768, 0.1063, 0.1109],
          [0.8949, 0.9843, 0.9782,  ...,

In [41]:
# After nn.Flatten: each 28x28 sample is collapsed into a row of 784 values.
flat_image, flat_image.shape

(tensor([[0.4047, 0.3265, 0.9336,  ..., 0.1137, 0.7727, 0.9784],
         [0.9554, 0.1592, 0.2311,  ..., 0.0349, 0.7726, 0.5019],
         [0.1588, 0.3539, 0.5743,  ..., 0.3773, 0.9325, 0.9361]]),
 torch.Size([3, 784]))

In [42]:
# Output of the linear layer: 20 features per sample, which may be negative.
hidden1, hidden1.shape

(tensor([[ 0.2594, -0.0765, -0.7092, -0.1715,  0.0937, -0.1292, -0.2596, -0.0565,
          -0.0399,  0.3109,  0.3510, -0.0388, -0.1398, -0.2954,  0.0936,  0.1403,
          -0.1770, -0.2248,  0.1037,  0.5428],
         [ 0.1227, -0.2223,  0.0086, -0.2165,  0.5908, -0.2631, -0.5805, -0.1244,
          -0.0355,  0.5415,  0.0993, -0.3688, -0.0882, -0.4390,  0.3960,  0.2425,
          -0.2355, -0.0672,  0.1070,  0.5454],
         [ 0.0835,  0.2384, -0.7010,  0.0251,  0.6562, -0.1876, -0.4529, -0.2066,
           0.2494,  0.7765,  0.1737,  0.2156,  0.1488, -0.3757,  0.4363,  0.3504,
          -0.3304,  0.0879, -0.1116,  0.6818]], grad_fn=<AddmmBackward0>),
 torch.Size([3, 20]))

In [43]:
# After ReLU: negative entries become 0, the shape is unchanged.
hidden1_relu, hidden1_relu.shape

(tensor([[0.2594, 0.0000, 0.0000, 0.0000, 0.0937, 0.0000, 0.0000, 0.0000, 0.0000,
          0.3109, 0.3510, 0.0000, 0.0000, 0.0000, 0.0936, 0.1403, 0.0000, 0.0000,
          0.1037, 0.5428],
         [0.1227, 0.0000, 0.0086, 0.0000, 0.5908, 0.0000, 0.0000, 0.0000, 0.0000,
          0.5415, 0.0993, 0.0000, 0.0000, 0.0000, 0.3960, 0.2425, 0.0000, 0.0000,
          0.1070, 0.5454],
         [0.0835, 0.2384, 0.0000, 0.0251, 0.6562, 0.0000, 0.0000, 0.0000, 0.2494,
          0.7765, 0.1737, 0.2156, 0.1488, 0.0000, 0.4363, 0.3504, 0.0000, 0.0879,
          0.0000, 0.6818]], grad_fn=<ReluBackward0>),
 torch.Size([3, 20]))

In [44]:
# Chain the same three modules so one call applies flatten -> layer1 -> relu.
# The stack ends in ReLU, so `logits` holds non-negative activations here.
seq_modules = nn.Sequential(flatten, layer1, relu)
logits = seq_modules(input_image)

In [45]:
# Same values as hidden1_relu: the Sequential container reuses the same
# flatten, layer1 and relu modules in order.
logits, logits.shape

(tensor([[0.2594, 0.0000, 0.0000, 0.0000, 0.0937, 0.0000, 0.0000, 0.0000, 0.0000,
          0.3109, 0.3510, 0.0000, 0.0000, 0.0000, 0.0936, 0.1403, 0.0000, 0.0000,
          0.1037, 0.5428],
         [0.1227, 0.0000, 0.0086, 0.0000, 0.5908, 0.0000, 0.0000, 0.0000, 0.0000,
          0.5415, 0.0993, 0.0000, 0.0000, 0.0000, 0.3960, 0.2425, 0.0000, 0.0000,
          0.1070, 0.5454],
         [0.0835, 0.2384, 0.0000, 0.0251, 0.6562, 0.0000, 0.0000, 0.0000, 0.2494,
          0.7765, 0.1737, 0.2156, 0.1488, 0.0000, 0.4363, 0.3504, 0.0000, 0.0879,
          0.0000, 0.6818]], grad_fn=<ReluBackward0>),
 torch.Size([3, 20]))

In [46]:
# Softmax over dim=1 rescales each row of `logits` into values that sum to 1.
softmax = nn.Softmax(dim=1)
pred_probab = softmax(logits)