In [1]:
import torch, torchvision
from torchvision import transforms
import matplotlib.pyplot as plt

from torch import nn, optim

In [2]:
# Load the MNIST training split (train=True); see the earlier notebooks for details.
# ToTensor is the only preprocessing step applied to each image.
transform = transforms.Compose([
    transforms.ToTensor(),
])
trainset = torchvision.datasets.MNIST(
    root="./data",
    train=True,
    download=True,
    transform=transform,
)
# Iterate over the training set in shuffled mini-batches of 64 samples.
trainloader = torch.utils.data.DataLoader(
    trainset,
    shuffle=True,
    batch_size=64,
)

In [3]:
class Net(nn.Module):
    """Fully connected classifier with one sigmoid-activated hidden layer.

    With the default sizes, a 784-feature input (a flattened 28x28 image) is
    mapped through a 512-unit hidden layer to log-probabilities over 10
    classes.

    Args:
        in_features: number of input features per sample (default 28*28).
        hidden_features: width of the hidden layer (default 512).
        num_classes: number of output classes (default 10).
    """

    def __init__(self, in_features=28*28, hidden_features=512, num_classes=10):
        super().__init__()
        # Two linear layers; the output layer's input size must match the
        # hidden layer's output size.
        self.hidden = nn.Linear(in_features, hidden_features)
        self.output = nn.Linear(hidden_features, num_classes)

        # Activation applied to the hidden layer's output.
        self.sigmoid = nn.Sigmoid()

        # NOTE: despite the attribute name this is a LogSoftmax, so the
        # forward pass returns log-probabilities normalized over dim=1
        # (the class dimension of a (batch, classes) tensor). Pair it with
        # nn.NLLLoss rather than nn.CrossEntropyLoss.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Compute per-class log-probabilities for a batch of samples.

        Args:
            x: tensor of shape (batch, in_features). The input is not
                flattened here; callers must flatten images first.

        Returns:
            Tensor of shape (batch, num_classes); exp() of each row sums to 1.
        """
        # hidden linear -> sigmoid -> output linear -> log-softmax
        x = self.hidden(x)
        x = self.sigmoid(x)
        x = self.output(x)
        return self.softmax(x)

In [5]:
# Instantiate the classifier; this is the model whose parameters are optimized in the training cell.
model = Net()

In [6]:
# A bare expression on the last line of a cell displays its repr: here, the module's layer summary.
model

Net(
  (hidden): Linear(in_features=784, out_features=512, bias=True)
  (output): Linear(in_features=512, out_features=10, bias=True)
  (sigmoid): Sigmoid()
  (softmax): LogSoftmax(dim=1)
)

In [8]:
# Loss function (named `criterion` by PyTorch convention). NLLLoss expects
# log-probabilities, which is what the model's final LogSoftmax layer produces.
criterion = nn.NLLLoss()
# The optimizer updates the model's parameters; lr is the learning rate.
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Train for 5 epochs (one epoch = one full pass over trainloader).
for epoch in range(5):
    # Accumulates the per-batch loss values for this epoch.
    running_loss = 0
    for images, labels in trainloader:
        # Flatten every image into a vector: (batch, ...) -> (batch, features).
        images = images.view(images.shape[0], -1)

        # backward() ADDS to each parameter's .grad instead of overwriting it,
        # so the gradients left over from the previous batch must be cleared
        # before computing new ones; otherwise every step would use the sum
        # of the gradients of all batches seen so far.
        optimizer.zero_grad()

        # Forward pass. Despite the variable name, the model ends in
        # LogSoftmax, so these are log-probabilities rather than raw logits.
        logits = model(images)
        loss = criterion(logits, labels)

        # Backward pass: populate .grad for every parameter.
        loss.backward()
        # Update the weights from the gradients that were just computed.
        optimizer.step()

        # loss.item() is the Python float value of this batch's loss.
        running_loss += loss.item()

    # Runs once per epoch, after the batch loop finishes: report the mean of
    # the per-batch losses.
    print('The running loss is: {}'.format(running_loss/len(trainloader)))

# The reported loss should drop from epoch to epoch, which is the expected
# behaviour. How quickly it drops (how fast the model learns) depends on the
# learning rate passed to the optimizer above (lr=0.01).

The running loss is: 2.0915346334992186
The running loss is: 1.4585676632964535
The running loss is: 0.9606050949361025
The running loss is: 0.7250438460281917
The running loss is: 0.6050712338833413
