In [62]:
# Check GPU

import torch
# Prefer the CUDA device when PyTorch reports one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [63]:
# Load Dataset
# Note: this is a workaround to bypass known issue:
# Failed to download (trying next): <urlopen error [Errno 110] Connection timed out>
import torchvision.datasets as datasets
import socket

# Default timeout (in seconds) for sockets created from here on; this is the
# download workaround described in the note above.
socket.setdefaulttimeout(5)

# Root directory handed to the MNIST dataset objects.
data_root = './data'

# Raw MNIST training split, created without any transform.
train_set0 = datasets.MNIST(data_root, train=True, download=True)

In [64]:
# Preprocess Dataset
import torchvision.transforms as transforms

# Per-sample preprocessing pipeline; the steps run in the order listed.
transform = transforms.Compose([
    # PIL.Image -> FloatTensor, pixel values rescaled to 0.0 ... 1.0
    transforms.ToTensor(),
    # (value - mean) / std with mean = std = 0.5,
    # so an input range of 0.0 ... 1.0 becomes -1 ... 1
    transforms.Normalize(mean=0.5, std=0.5),
    # collapse to a 1-D vector because the first layer (l1) is fully connected
    transforms.Lambda(lambda img: img.view(-1)),
])

In [65]:
# MNIST training split with the preprocessing pipeline `transform` attached
train_set = datasets.MNIST(data_root, train=True, transform=transform, download=True)

In [66]:
# MNIST test split with the same preprocessing pipeline `transform` attached
test_set = datasets.MNIST(data_root, train=False, transform=transform, download=True)

In [67]:
# Split the train and test datasets into mini-batches.
# Mini-batches add randomness to each gradient update, which can help
# gradient descent escape poor local minima. Shuffling additionally keeps
# the model from learning artifacts of the dataset order.

from torch.utils.data import DataLoader

# Number of samples per mini-batch
batch_size = 500

# Training batches are shuffled so the model does not learn
# noise from the order of the dataset.
train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)

# No shuffling for the test data: the model parameters are not
# updated while validating, so the order is kept as-is.
test_loader = DataLoader(dataset=test_set, batch_size=batch_size, shuffle=False)

In [68]:
# set the input, output, hidden sizes
import numpy as np
# Use the first (already flattened) training sample to get the input length.
image, _ = train_set[0]
n_input = image.size(0)
# One output per class label listed by the dataset.
n_output = len(train_set0.classes)
# Width of the hidden layer.
n_hidden = 128

In [69]:
# define the NN model (FC Layer with ReLU)

import torch.nn as nn

class Net(nn.Module):
    """Fully connected network: Linear -> ReLU -> Linear.

    Args:
        n_input: number of features in each input vector.
        n_output: number of values produced per sample.
        n_hidden: number of units in the hidden layer.
    """

    def __init__(self, n_input, n_output, n_hidden):
        super().__init__()
        # Sub-modules are registered in this order: l1, l2, relu.
        self.l1 = nn.Linear(in_features=n_input, out_features=n_hidden)
        self.l2 = nn.Linear(in_features=n_hidden, out_features=n_output)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Apply l1, then ReLU, then l2 to ``x`` and return the result."""
        hidden = self.relu(self.l1(x))
        return self.l2(hidden)

In [70]:
# Build the network, then move it to the selected device

net = Net(n_input, n_output, n_hidden)
net = net.to(device)

In [71]:
# Show the network's structure (its registered sub-modules)
print(net)

Net(
  (l1): Linear(in_features=784, out_features=128, bias=True)
  (l2): Linear(in_features=128, out_features=10, bias=True)
  (relu): ReLU(inplace=True)
)


In [72]:
# Print each (name, parameter) pair of the network as a tuple
for name, tensor in net.named_parameters():
    print((name, tensor))

('l1.weight', Parameter containing:
tensor([[-0.0111,  0.0211,  0.0067,  ...,  0.0282,  0.0356, -0.0220],
        [-0.0291, -0.0093, -0.0304,  ...,  0.0054, -0.0076,  0.0129],
        [ 0.0084,  0.0202, -0.0305,  ...,  0.0146,  0.0057, -0.0161],
        ...,
        [-0.0174,  0.0172, -0.0101,  ...,  0.0084,  0.0011,  0.0103],
        [-0.0299, -0.0187, -0.0180,  ...,  0.0122, -0.0326, -0.0079],
        [ 0.0175, -0.0126,  0.0212,  ..., -0.0032, -0.0081,  0.0093]],
       device='cuda:0', requires_grad=True))
('l1.bias', Parameter containing:
tensor([ 0.0092, -0.0268, -0.0168,  0.0035,  0.0045,  0.0210, -0.0231, -0.0174,
         0.0137,  0.0195,  0.0066, -0.0076,  0.0289, -0.0040, -0.0153,  0.0230,
         0.0335, -0.0096, -0.0021, -0.0020,  0.0082,  0.0120, -0.0273, -0.0338,
        -0.0220,  0.0289, -0.0003,  0.0140, -0.0059, -0.0146, -0.0156,  0.0345,
        -0.0028, -0.0109,  0.0269,  0.0259, -0.0119,  0.0266, -0.0074,  0.0159,
         0.0184,  0.0218, -0.0014,  0.0277, -0.0124

In [73]:
# set parameters for training, validating

from torch import optim

# Per-epoch log; starts with zero rows and five columns
history = np.zeros(shape=(0, 5))
# Number of passes over the training data
num_epochs = 50
# Learning rate for the optimizer
lr = 0.01
# Loss function used for both training and validation
criterion = nn.CrossEntropyLoss()
# Stochastic gradient descent over all parameters of the network
optimizer = optim.SGD(params=net.parameters(), lr=lr)

In [74]:
# train and validate
import numpy as np
from tqdm.notebook import tqdm

# Each epoch makes one pass over the training data (parameters are updated)
# and then one pass over the test data (evaluation only).
# With batch_size=500: 60,000 / 500 = 120 training batches and
# 10,000 / 500 = 20 test batches, i.e. 500 x (120 + 20) = 70,000 samples.
for epoch in range(num_epochs):
    # Running totals for this epoch. The losses are sums of the per-batch
    # loss values, so train_loss and val_loss are accumulated over a
    # different number of batches and are not on the same scale.
    train_correct, train_loss = 0, 0
    val_correct, val_loss = 0, 0
    n_train, n_test = 0, 0

    # set the model to training mode
    net.train()
    # the main difference between training and validation is whether
    # the model parameters are updated or not
    for inputs, labels in tqdm(train_loader):
        # number of training samples seen so far in this epoch
        n_train += len(labels)

        # load to the selected device (GPU when available)
        inputs = inputs.to(device)
        labels = labels.to(device)

        # reset the gradients to zero for every step
        optimizer.zero_grad()

        # forward pass defined in Net: l1 -> relu -> l2
        outputs = net(inputs)
        # loss between the predicted scores and the true labels
        loss = criterion(outputs, labels)
        # backward propagation computes the gradients
        loss.backward()
        # update the parameters using the gradients
        optimizer.step()
        # only keep the index of the max value (the predicted class)
        predicted = torch.max(outputs, 1)[1]

        train_loss += loss.item()
        train_correct += (predicted == labels).sum().item()

    # set the model to evaluation mode
    net.eval()
    # torch.no_grad disables gradient calculation for validation, so there
    # are no gradients to reset and no optimizer calls are needed here
    with torch.no_grad():
        for inputs_test, labels_test in tqdm(test_loader):
            # Count the samples of the current TEST batch. Using the
            # training-loop variable `labels` here would only be correct
            # when both loaders happen to yield equally sized batches.
            n_test += len(labels_test)

            inputs_test = inputs_test.to(device)
            labels_test = labels_test.to(device)

            outputs_test = net(inputs_test)
            loss_test = criterion(outputs_test, labels_test)

            predicted_test = torch.max(outputs_test, 1)[1]

            val_loss += loss_test.item()
            val_correct += (predicted_test == labels_test).sum().item()

    # accuracy = correctly classified samples / samples seen
    train_acc = train_correct / n_train
    val_acc = val_correct / n_test

    print(f"Epoch [{epoch+1}/{num_epochs}], train_loss: {train_loss: .5f}, \
            val_loss: {val_loss: .5f}, train_acc: {train_acc: .5f}, \
            val_acc: {val_acc: .5f}")

    # append one row per epoch: epoch, train_loss, train_acc, val_loss, val_acc
    item = np.array([epoch+1, train_loss, train_acc, val_loss, val_acc])
    history = np.vstack((history, item))

  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [1/50], train_loss:  218.93081,             val_loss:  26.22046, train_acc:  0.58257,             val_acc:  0.77390


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [2/50], train_loss:  122.44262,             val_loss:  15.55150, train_acc:  0.80657,             val_acc:  0.84530


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [3/50], train_loss:  83.00241,             val_loss:  11.79095, train_acc:  0.84637,             val_acc:  0.86480


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [4/50], train_loss:  67.44711,             val_loss:  10.04496, train_acc:  0.86172,             val_acc:  0.87680


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [5/50], train_loss:  59.43935,             val_loss:  9.06265, train_acc:  0.87178,             val_acc:  0.88480


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [6/50], train_loss:  54.50665,             val_loss:  8.41135, train_acc:  0.87962,             val_acc:  0.88890


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [7/50], train_loss:  51.14661,             val_loss:  7.95011, train_acc:  0.88428,             val_acc:  0.89310


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [8/50], train_loss:  48.67792,             val_loss:  7.58547, train_acc:  0.88843,             val_acc:  0.89550


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [9/50], train_loss:  46.76969,             val_loss:  7.33467, train_acc:  0.89075,             val_acc:  0.89830


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [10/50], train_loss:  45.25613,             val_loss:  7.10226, train_acc:  0.89485,             val_acc:  0.90040


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [11/50], train_loss:  43.97045,             val_loss:  6.92480, train_acc:  0.89645,             val_acc:  0.90360


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [12/50], train_loss:  42.91024,             val_loss:  6.77833, train_acc:  0.89867,             val_acc:  0.90460


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [13/50], train_loss:  41.96687,             val_loss:  6.65597, train_acc:  0.90063,             val_acc:  0.90500


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [14/50], train_loss:  41.17319,             val_loss:  6.52700, train_acc:  0.90185,             val_acc:  0.90740


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [15/50], train_loss:  40.47266,             val_loss:  6.42383, train_acc:  0.90318,             val_acc:  0.90840


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [16/50], train_loss:  39.78980,             val_loss:  6.34285, train_acc:  0.90513,             val_acc:  0.90940


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [17/50], train_loss:  39.21239,             val_loss:  6.25502, train_acc:  0.90667,             val_acc:  0.91110


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [18/50], train_loss:  38.65179,             val_loss:  6.17499, train_acc:  0.90790,             val_acc:  0.91260


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [19/50], train_loss:  38.16213,             val_loss:  6.09762, train_acc:  0.90893,             val_acc:  0.91210


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [20/50], train_loss:  37.69191,             val_loss:  6.05432, train_acc:  0.90985,             val_acc:  0.91460


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [21/50], train_loss:  37.24921,             val_loss:  5.96601, train_acc:  0.91077,             val_acc:  0.91480


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [22/50], train_loss:  36.82177,             val_loss:  5.91802, train_acc:  0.91215,             val_acc:  0.91620


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [23/50], train_loss:  36.43077,             val_loss:  5.85684, train_acc:  0.91287,             val_acc:  0.91710


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [24/50], train_loss:  36.04090,             val_loss:  5.80335, train_acc:  0.91370,             val_acc:  0.91760


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [25/50], train_loss:  35.67210,             val_loss:  5.75057, train_acc:  0.91480,             val_acc:  0.91760


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [26/50], train_loss:  35.32687,             val_loss:  5.71313, train_acc:  0.91575,             val_acc:  0.91870


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [27/50], train_loss:  34.98146,             val_loss:  5.67654, train_acc:  0.91642,             val_acc:  0.92060


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [28/50], train_loss:  34.65913,             val_loss:  5.60208, train_acc:  0.91700,             val_acc:  0.92010


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [29/50], train_loss:  34.32410,             val_loss:  5.56740, train_acc:  0.91837,             val_acc:  0.92070


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [30/50], train_loss:  34.02147,             val_loss:  5.52587, train_acc:  0.91903,             val_acc:  0.92180


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [31/50], train_loss:  33.70092,             val_loss:  5.47155, train_acc:  0.91997,             val_acc:  0.92230


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [32/50], train_loss:  33.41537,             val_loss:  5.43811, train_acc:  0.92010,             val_acc:  0.92290


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [33/50], train_loss:  33.08825,             val_loss:  5.38269, train_acc:  0.92107,             val_acc:  0.92380


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [34/50], train_loss:  32.80287,             val_loss:  5.37866, train_acc:  0.92192,             val_acc:  0.92460


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [35/50], train_loss:  32.53898,             val_loss:  5.30759, train_acc:  0.92265,             val_acc:  0.92380


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [36/50], train_loss:  32.25151,             val_loss:  5.27606, train_acc:  0.92315,             val_acc:  0.92440


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [37/50], train_loss:  31.95764,             val_loss:  5.23826, train_acc:  0.92373,             val_acc:  0.92600


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [38/50], train_loss:  31.67968,             val_loss:  5.19217, train_acc:  0.92480,             val_acc:  0.92590


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [39/50], train_loss:  31.42717,             val_loss:  5.15656, train_acc:  0.92502,             val_acc:  0.92650


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [40/50], train_loss:  31.14969,             val_loss:  5.12276, train_acc:  0.92605,             val_acc:  0.92690


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [41/50], train_loss:  30.87855,             val_loss:  5.08228, train_acc:  0.92680,             val_acc:  0.92700


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [42/50], train_loss:  30.63609,             val_loss:  5.03800, train_acc:  0.92737,             val_acc:  0.92760


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [43/50], train_loss:  30.35929,             val_loss:  5.01321, train_acc:  0.92802,             val_acc:  0.92780


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [44/50], train_loss:  30.10963,             val_loss:  4.96296, train_acc:  0.92852,             val_acc:  0.92800


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [45/50], train_loss:  29.88225,             val_loss:  4.92693, train_acc:  0.92947,             val_acc:  0.93010


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [46/50], train_loss:  29.61082,             val_loss:  4.90576, train_acc:  0.93003,             val_acc:  0.92890


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [47/50], train_loss:  29.36425,             val_loss:  4.85819, train_acc:  0.93072,             val_acc:  0.93050


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [48/50], train_loss:  29.10852,             val_loss:  4.80815, train_acc:  0.93147,             val_acc:  0.93130


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [49/50], train_loss:  28.86355,             val_loss:  4.77119, train_acc:  0.93213,             val_acc:  0.93230


  0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch [50/50], train_loss:  28.60604,             val_loss:  4.75254, train_acc:  0.93257,             val_acc:  0.93170
