In [1]:
import torch
import torch.utils
import torchvision
from torchvision import transforms, datasets

# Configuration a reader may want to tune.
SEED = 0
BATCH_SIZE = 16

# Seed torch's global RNG so the training shuffle order (and any later weight
# initialisation) is the same on every Restart & Run All.
torch.manual_seed(SEED)

# ToTensor is the only transform applied, so it is shared by both datasets
# instead of being wrapped in a one-element Compose twice.
to_tensor = transforms.ToTensor()

# root='' stores / looks up the dataset relative to the current working directory.
train = datasets.MNIST(
    root='',
    train=True,
    download=True,
    transform=to_tensor
)

test = datasets.MNIST(
    root='',
    train=False,
    download=True,
    transform=to_tensor
)

# Training batches are reshuffled every epoch.
trainset = torch.utils.data.DataLoader(
    train,
    batch_size=BATCH_SIZE,
    shuffle=True
)

# Evaluation does not benefit from shuffling; a fixed order keeps test runs
# directly comparable.
testset = torch.utils.data.DataLoader(
    test,
    batch_size=BATCH_SIZE,
    shuffle=False
)

In [2]:
import torch.nn as nn # class
import torch.nn.functional as F # function
# nn provides layers as classes: they are initialised once and hold their own parameters.
# F provides the same operations as plain functions: any parameters are passed in on each call.
# The two are largely interchangeable; we usually use the nn classes, but F is handy for a one-off function call.

# Choose whichever fits your case.

In [3]:
class Net(nn.Module):
    """Four fully connected layers applied back to back, with no activation.

    Layer sizes are 784 (28 * 28) -> 64 -> 64 -> 64 -> 10.
    """

    def __init__(self) -> None:
        super().__init__()
        n_inputs, n_hidden, n_outputs = 28 * 28, 64, 10
        # Layers are created in input-to-output order.
        self.fc1 = nn.Linear(n_inputs, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, n_hidden)
        self.fc4 = nn.Linear(n_hidden, n_outputs)

    def forward(self, x):
        """Pass ``x`` through fc1..fc4 in sequence and return the raw output."""
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            x = layer(x)
        return x
    

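# The net is defined!
# But note that it won't learn the way we want: there is no activation function between the layers,
# so their outputs are never rescaled and the four stacked linear layers amount to one single linear map.
# It runs as it is, but it won't work as intended until activation functions are added.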

In [4]:
class Net(nn.Module):
    """Fully connected classifier: 784 -> 64 -> 64 -> 64 -> 10.

    ReLU follows each of the first three layers; the last layer's output is
    turned into log-probabilities over the 10 classes.
    """

    def __init__(self) -> None:
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        self.fc4 = nn.Linear(64, 10)

    def forward(self, x):
        """Return log-probabilities of shape (batch, 10).

        Args:
            x: Rows that are already flattened, shape (batch, 784), or a batch
                with extra trailing dimensions such as (batch, 1, 28, 28),
                which is flattened to (batch, 784) here. A 2-D input is always
                treated as (batch, features) and is never reshaped.

        Raises:
            RuntimeError: If the number of features per row does not match
                what fc1 expects (for example a bare 28x28 tensor).
        """
        # Only inputs with more than two dimensions whose last dimension is not
        # already the feature axis are flattened; the batch dimension is kept.
        # Every input accepted before this check is processed unchanged.
        if x.dim() > 2 and x.shape[-1] != self.fc1.in_features:
            x = x.flatten(start_dim=1)

        x = F.relu(self.fc1(x))  # the same activation is used for every hidden layer
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        # dim plays the role of axis= in pandas: dim=1 normalises across the 10
        # class scores of each row, not across the rows of the batch.
        x = F.log_softmax(self.fc4(x), dim=1)
        return x
    
# The activation function is mainly used to keep the numbers from exploding; it also makes the network non-linear.

In [5]:
# Printing a module shows its registered layers.
net = Net() # net is an instance (object) of the Net class
print(net)

Net(
  (fc1): Linear(in_features=784, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=10, bias=True)
)


In [6]:
# Try out the network on a random 28x28 tensor (values drawn uniformly from [0, 1)).
X = torch.rand(28, 28)
X

tensor([[0.1995, 0.5587, 0.4291, 0.1885, 0.0928, 0.1360, 0.6073, 0.6862, 0.0127,
         0.0463, 0.3490, 0.4747, 0.3312, 0.6269, 0.7363, 0.9269, 0.0083, 0.4318,
         0.0078, 0.3723, 0.6368, 0.3397, 0.1478, 0.3539, 0.7702, 0.6180, 0.9213,
         0.0522],
        [0.3617, 0.5986, 0.4816, 0.6594, 0.0240, 0.7190, 0.1491, 0.4719, 0.9932,
         0.6130, 0.0106, 0.1370, 0.3656, 0.2042, 0.6430, 0.5918, 0.8912, 0.8170,
         0.5593, 0.8446, 0.9371, 0.8863, 0.3228, 0.5213, 0.2010, 0.5730, 0.5018,
         0.0931],
        [0.1573, 0.0876, 0.0186, 0.9652, 0.1592, 0.7263, 0.0763, 0.6054, 0.4710,
         0.1383, 0.8548, 0.9172, 0.0116, 0.4027, 0.8768, 0.8957, 0.8236, 0.1566,
         0.0445, 0.2059, 0.9920, 0.2170, 0.7839, 0.6346, 0.6259, 0.9942, 0.1023,
         0.8088],
        [0.9661, 0.0274, 0.3696, 0.2738, 0.3659, 0.0612, 0.4967, 0.3157, 0.4349,
         0.8020, 0.5652, 0.8290, 0.0742, 0.9666, 0.7791, 0.0469, 0.3069, 0.5598,
         0.7079, 0.8601, 0.4695, 0.3197, 0.1431, 0.7987

In [7]:
# A raw 28x28 tensor has rows of 28 values, not the 784 features fc1 expects,
# so this call is rejected. The error is the point of this cell: catch and show
# it so that "Run All" carries on to the next cell instead of stopping here.
try:
    output = net(X)
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: mat1 and mat2 shapes cannot be multiplied (28x28 and 784x64)

In [8]:
# Flatten to rows of 784 features (-1 lets torch infer the row count), then run the net.
X = X.view(-1, 28 * 28)
output = net(X)
output

tensor([[-2.3224, -2.2490, -2.3370, -2.4818, -2.2935, -2.2039, -2.3357, -2.2374,
         -2.3684, -2.2269]], grad_fn=<LogSoftmaxBackward0>)