In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [3]:
# Choose the compute backend in priority order: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(device)

cuda


In [4]:
# define neural network by subclassing nn.Module
class NeuralNetwork(nn.Module):
    """Fully connected classifier: Flatten -> Linear -> ReLU -> Linear -> ReLU -> Linear.

    Args:
        in_features: Number of values per sample after flattening
            (default ``28 * 28``).
        hidden_features: Width of both hidden layers (default ``512``).
        num_classes: Number of output scores per sample (default ``10``).

    Calling ``NeuralNetwork()`` with no arguments builds exactly the
    784 -> 512 -> 512 -> 10 network.
    """

    def __init__(
        self,
        in_features: int = 28 * 28,
        hidden_features: int = 512,
        num_classes: int = 10,
    ) -> None:
        super().__init__()
        # Collapses every dimension after the first into a single feature axis.
        self.flatten = nn.Flatten()
        # Layers are created in a fixed order so parameter names
        # (linear_relu_stack.0, .2, .4) and initialization order stay stable.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """Flatten ``x`` and return the raw (unnormalized) class scores.

        Args:
            x: Input whose non-leading dimensions multiply to ``in_features``.

        Returns:
            Tensor with one row per sample and ``num_classes`` columns.
        """
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [5]:
# Build the network, move its parameters to the selected device, and print its layers.
model = NeuralNetwork()
model = model.to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [6]:
# Run one random 1x28x28 input through the model and derive a class prediction.
X = torch.rand((1, 28, 28), device=device)
logits = model(X)
# Normalize the scores along dim 1, then take the index of the largest one.
pred_probab = torch.softmax(logits, dim=1)
y_pred = pred_probab.argmax(dim=1)

In [7]:
# Display the raw model output for X (unnormalized scores).
logits

tensor([[-0.1029, -0.0239, -0.0720, -0.0344,  0.0598,  0.0150, -0.0353, -0.0618,
         -0.0971, -0.0480]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [8]:
# Display the softmax of the logits taken along dim 1.
pred_probab

tensor([[0.0938, 0.1015, 0.0967, 0.1005, 0.1104, 0.1055, 0.1004, 0.0977, 0.0943,
         0.0991]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [9]:
# Display the argmax over dim 1 of the probabilities (the predicted class index).
y_pred

tensor([4], device='cuda:0')

In [10]:
# List every (name, parameter) pair registered on the model.
# NOTE(review): this renders the full weight tensors; showing only names and
# shapes would keep the output short.
list(model.named_parameters())

[('linear_relu_stack.0.weight',
  Parameter containing:
  tensor([[-0.0336, -0.0124, -0.0040,  ...,  0.0144, -0.0093,  0.0078],
          [ 0.0340, -0.0029,  0.0064,  ..., -0.0232, -0.0142,  0.0190],
          [ 0.0275,  0.0288,  0.0353,  ..., -0.0254,  0.0273, -0.0191],
          ...,
          [ 0.0283, -0.0316,  0.0040,  ...,  0.0097, -0.0102,  0.0157],
          [ 0.0130, -0.0022, -0.0208,  ...,  0.0178,  0.0060, -0.0230],
          [ 0.0014,  0.0119, -0.0252,  ..., -0.0269,  0.0284,  0.0168]],
         device='cuda:0', requires_grad=True)),
 ('linear_relu_stack.0.bias',
  Parameter containing:
  tensor([ 5.2250e-03,  1.9070e-02,  1.3119e-02, -3.0882e-02, -1.1450e-02,
          -3.4203e-03,  3.1625e-02,  2.1947e-02, -2.4132e-03, -1.8640e-02,
          -1.1260e-02, -2.1287e-02, -1.8879e-02, -1.0992e-02,  3.5017e-02,
          -2.2926e-02,  8.4331e-03, -2.8174e-02, -1.7430e-02,  2.8775e-02,
           1.7894e-02, -4.8709e-03,  2.5710e-02, -1.2642e-02,  2.5030e-02,
          -1.4120e-

Let's go through the transformations manually and see what happens.

In [11]:
# Walk a batch of three random 28x28 inputs through each layer by hand.
input_image = torch.rand((3, 28, 28))

# Flatten: keep dim 0 and merge the remaining dims -> (3, 784).
flatten = nn.Flatten(start_dim=1, end_dim=-1)
flat_image = flatten(input_image)

# Linear: 784 input features -> 20 output features per sample.
layer1 = nn.Linear(28 * 28, 20)
hidden1 = layer1(flat_image)

# ReLU: replace negative entries with zero.
relu = nn.ReLU(inplace=False)
hidden1_relu = relu(hidden1)

In [12]:
# Show the random input batch together with its shape.
input_image, input_image.shape

(tensor([[[0.0690, 0.8257, 0.5937,  ..., 0.5465, 0.7191, 0.8144],
          [0.9796, 0.7266, 0.6807,  ..., 0.0358, 0.6888, 0.6597],
          [0.3417, 0.3234, 0.4264,  ..., 0.2947, 0.8082, 0.7677],
          ...,
          [0.4933, 0.0857, 0.8396,  ..., 0.8034, 0.5075, 0.4573],
          [0.3017, 0.8600, 0.5950,  ..., 0.0429, 0.5823, 0.7499],
          [0.1743, 0.5589, 0.8669,  ..., 0.0474, 0.8600, 0.4481]],
 
         [[0.3089, 0.4701, 0.3180,  ..., 0.4845, 0.9386, 0.5876],
          [0.1881, 0.1064, 0.0636,  ..., 0.2790, 0.0440, 0.3621],
          [0.9704, 0.2656, 0.1733,  ..., 0.2421, 0.8065, 0.5675],
          ...,
          [0.6808, 0.6332, 0.5332,  ..., 0.4538, 0.5910, 0.7088],
          [0.9253, 0.2610, 0.1641,  ..., 0.3909, 0.5281, 0.8140],
          [0.4575, 0.2909, 0.3510,  ..., 0.3246, 0.0381, 0.0498]],
 
         [[0.6313, 0.6158, 0.5119,  ..., 0.4923, 0.6867, 0.5002],
          [0.2636, 0.1009, 0.5857,  ..., 0.3952, 0.3202, 0.5580],
          [0.3352, 0.1677, 0.4544,  ...,

In [13]:
# Show the flattened batch: dim 0 is kept, the 28x28 grid becomes 784 features.
flat_image, flat_image.shape

(tensor([[0.0690, 0.8257, 0.5937,  ..., 0.0474, 0.8600, 0.4481],
         [0.3089, 0.4701, 0.3180,  ..., 0.3246, 0.0381, 0.0498],
         [0.6313, 0.6158, 0.5119,  ..., 0.2462, 0.4016, 0.8970]]),
 torch.Size([3, 784]))

In [14]:
# Show the linear layer's output (20 features per sample) and its shape.
hidden1, hidden1.shape

(tensor([[-0.1041, -0.1315, -0.3719,  0.1057,  0.2584,  0.1116,  0.3671, -0.0408,
           0.0544, -0.1050,  0.2746, -0.3360,  0.3004,  0.2739,  0.1728,  0.5654,
          -0.3783,  0.3118, -0.2582, -0.0245],
         [-0.0397, -0.0838,  0.0648,  0.1285,  0.1357, -0.0414, -0.1033,  0.1602,
           0.0265,  0.0150, -0.2368, -0.3880,  0.2613,  0.1556,  0.4028,  0.5191,
          -0.2246,  0.4836, -0.4123,  0.2709],
         [-0.2173, -0.2504, -0.3125,  0.4368, -0.0740, -0.2800, -0.3058,  0.3395,
          -0.1577,  0.0284,  0.2190, -0.0389,  0.4493,  0.0275,  0.4807,  0.2330,
          -0.3482,  0.1136, -0.5696, -0.0381]], grad_fn=<AddmmBackward0>),
 torch.Size([3, 20]))

In [15]:
# Show the activations after ReLU (negative entries replaced by zero) and their shape.
hidden1_relu, hidden1_relu.shape

(tensor([[0.0000, 0.0000, 0.0000, 0.1057, 0.2584, 0.1116, 0.3671, 0.0000, 0.0544,
          0.0000, 0.2746, 0.0000, 0.3004, 0.2739, 0.1728, 0.5654, 0.0000, 0.3118,
          0.0000, 0.0000],
         [0.0000, 0.0000, 0.0648, 0.1285, 0.1357, 0.0000, 0.0000, 0.1602, 0.0265,
          0.0150, 0.0000, 0.0000, 0.2613, 0.1556, 0.4028, 0.5191, 0.0000, 0.4836,
          0.0000, 0.2709],
         [0.0000, 0.0000, 0.0000, 0.4368, 0.0000, 0.0000, 0.0000, 0.3395, 0.0000,
          0.0284, 0.2190, 0.0000, 0.4493, 0.0275, 0.4807, 0.2330, 0.0000, 0.1136,
          0.0000, 0.0000]], grad_fn=<ReluBackward0>),
 torch.Size([3, 20]))

In [16]:
# Chain the existing flatten, layer1 and relu module instances so that one call
# applies them in order to the input batch.
# NOTE(review): the result is bound to `logits`, but the final stage is the ReLU
# over the 20-unit layer, so these are hidden activations rather than class scores.
seq_modules = nn.Sequential(flatten, layer1, relu)
logits = seq_modules(input_image)

In [17]:
# Show the output of seq_modules and its shape; it applies the same flatten,
# layer1 and relu instances to input_image, so it matches hidden1_relu.
logits, logits.shape

(tensor([[0.0000, 0.0000, 0.0000, 0.1057, 0.2584, 0.1116, 0.3671, 0.0000, 0.0544,
          0.0000, 0.2746, 0.0000, 0.3004, 0.2739, 0.1728, 0.5654, 0.0000, 0.3118,
          0.0000, 0.0000],
         [0.0000, 0.0000, 0.0648, 0.1285, 0.1357, 0.0000, 0.0000, 0.1602, 0.0265,
          0.0150, 0.0000, 0.0000, 0.2613, 0.1556, 0.4028, 0.5191, 0.0000, 0.4836,
          0.0000, 0.2709],
         [0.0000, 0.0000, 0.0000, 0.4368, 0.0000, 0.0000, 0.0000, 0.3395, 0.0000,
          0.0284, 0.2190, 0.0000, 0.4493, 0.0275, 0.4807, 0.2330, 0.0000, 0.1136,
          0.0000, 0.0000]], grad_fn=<ReluBackward0>),
 torch.Size([3, 20]))

In [18]:
# Softmax along dim 1 rescales each row into non-negative values that sum to 1.
softmax = nn.Softmax(dim=1)
pred_probab = softmax(logits)

In [19]:
# Display the softmax-normalized values (each row sums to 1).
pred_probab

tensor([[0.0429, 0.0429, 0.0429, 0.0477, 0.0555, 0.0479, 0.0619, 0.0429, 0.0453,
         0.0429, 0.0564, 0.0429, 0.0579, 0.0564, 0.0510, 0.0755, 0.0429, 0.0586,
         0.0429, 0.0429],
        [0.0432, 0.0432, 0.0461, 0.0491, 0.0495, 0.0432, 0.0432, 0.0507, 0.0444,
         0.0439, 0.0432, 0.0432, 0.0561, 0.0505, 0.0646, 0.0726, 0.0432, 0.0701,
         0.0432, 0.0567],
        [0.0438, 0.0438, 0.0438, 0.0678, 0.0438, 0.0438, 0.0438, 0.0615, 0.0438,
         0.0451, 0.0545, 0.0438, 0.0687, 0.0450, 0.0709, 0.0553, 0.0438, 0.0491,
         0.0438, 0.0438]], grad_fn=<SoftmaxBackward0>)

In [20]:
# For each row, take the index of the largest probability along dim 1.
y_pred = torch.argmax(pred_probab, dim=1)

In [21]:
# Display the per-row index of the largest probability.
y_pred

tensor([15, 15, 14])