Term: YAI 2021 summer session

Team Members: Dongha Kim, Jeongeun Lee, Junho Lee, Suyeong Choi.

In [1]:
#
# Import and prepare libraries
#

# Import numpy and pytorch

import torch
from torchvision import transforms, datasets
from torch.autograd import Variable
import numpy as np

# Random seeding: fix the seed of PyTorch's global random number generator
# so that repeated runs of this notebook report identical accuracy values.

torch.manual_seed(77)


<torch._C.Generator at 0x1057c4730>

In [2]:
#
# Define Shortcut Functions
#

# Sigmoid: 1 / (1 + exp(-x))

def sigmoid(x):
  """Apply the logistic function 1 / (1 + exp(-x)) element-wise to tensor x."""
  one = torch.tensor(1.0)
  return one / (one + torch.exp(-x))


# Sigmoid derivative: y * (1 - y), where y = sigmoid(x)

def sigmoid_prime(x):
  """Return the element-wise derivative of the sigmoid, evaluated at x.

  x is the pre-activation value (not the sigmoid output); the result has
  the same shape as x.
  """
  # Evaluate the sigmoid once and reuse it for both factors instead of
  # recomputing it.
  y = sigmoid(x)
  return torch.mul(y, torch.subtract(torch.tensor(1.0), y))


In [3]:
#
# Prepare dataset
#

# MNIST training split, stored under MNIST_data/ (downloaded on first use).
# ToTensor() turns every image into a tensor when it is read.
train_MNIST = datasets.MNIST(
    root="MNIST_data/",
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)

# Wrap the dataset in a data loader.
# Only training is performed in this notebook, so no validation/test split
# is prepared. Samples are served one at a time (no mini-batching) in
# shuffled order.
train_loader = torch.utils.data.DataLoader(  # type: ignore
    dataset=train_MNIST,
    batch_size=1,
    shuffle=True,
    drop_last=True,
)


**Model information**

---

The MLP has an input layer, one hidden layer, and one output layer.

The input layer, the hidden layer, and the output layer have 784 nodes, 128 nodes, and 10 nodes, respectively.

The sigmoid function defined above is used as the activation function of both layers.

---

Equations implementing the forward pass ($x$ = input, $a_2$ = prediction):


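$$ z_1 = x W_1 + b_1 $$
$$ a_1 = \sigma(z_1) $$
$$ z_2 = a_1 W_2 + b_2 $$
$$ a_2 = \sigma(z_2) $$

Here $x$ is a $1 \times 784$ row vector, $W_1$ is $784 \times 128$ and $W_2$ is $128 \times 10$, matching the `x @ w1 + b1` form used in the code.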


In [4]:
#
# Set hyper-parameters and initialize model params
#

# Configure layer
# In this task, we use a two-layer net: input size, hidden size, output size
D_in, H, D_out = 784, 128, 10

# Learning rate for SGD
learning_rate = 0.1

# Epochs
epochs = 5

# Initialize model params from a standard normal distribution, scaled by
# 1 / sqrt(fan_in), where fan_in is the number of inputs feeding the layer.
#
# The params are plain tensors that do not require grad: the training loop
# derives every gradient by hand, so asking autograd to track them would
# only record an unused graph for each update step.

# A weight and a bias for input nodes (input -> hidden)
w1 = torch.randn(D_in, H, dtype=torch.float32) * np.sqrt(1. / D_in)
b1 = torch.randn(1, H, dtype=torch.float32) * np.sqrt(1. / D_in)

# A weight and a bias for hidden nodes (hidden -> output)
w2 = torch.randn(H, D_out, dtype=torch.float32) * np.sqrt(1. / H)
b2 = torch.randn(1, D_out, dtype=torch.float32) * np.sqrt(1. / H)


In [5]:
#
# Start Training
#

# Every gradient below is derived by hand, so autograd is disabled for the
# whole loop. Without this, parameters that require grad would make each
# forward/backward/update step get recorded in a graph that is never used.
with torch.no_grad():

  for epoch in range(epochs):

    # Counter for measuring train accuracy
    corrects = 0

    # For each data,
    for i, (x, y) in enumerate(train_loader):

      # Batch size = 1 (without batching): flatten the sample into one row
      x = x.reshape((1, -1))

      # Convert label to one-hot vector
      y_onehot = torch.zeros((1, D_out))
      y_onehot[0, y] = 1.0

      #
      # forward pass
      #

      # z1 = x @ w1 + b1
      z1 = torch.add(torch.mm(x, w1), b1)
      a1 = sigmoid(z1)
      # z2 = a1 @ w2 + b2
      z2 = torch.add(torch.mm(a1, w2), b2)
      a2 = sigmoid(z2)

      # Used below as dE/da2, i.e. the gradient of the squared error
      # E = 1/2 * ||a2 - y_onehot||^2
      diff = a2 - y_onehot

      #
      # backward pass
      #

      # dE/dz2 = diff * da2/dz2
      d_z2 = torch.mul(diff, sigmoid_prime(z2))
      # dE/db2 = sum of dE/dz2 over the batch dimension; keepdim keeps the
      # (1, D_out) shape of b2
      d_b2 = torch.sum(d_z2, dim=0, keepdim=True)
      # dE/dw2 = a1_transpose @ dE/dz2
      d_w2 = torch.mm(torch.transpose(a1, 0, 1), d_z2)

      # dE/da1 = dE/dz2 @ w2_transpose
      d_a1 = torch.mm(d_z2, torch.transpose(w2, 0, 1))
      # dE/dz1 = dE/da1 * da1/dz1
      d_z1 = torch.mul(d_a1, sigmoid_prime(z1))
      # dE/db1 = sum of dE/dz1 over the batch dimension; keepdim keeps the
      # (1, H) shape of b1
      d_b1 = torch.sum(d_z1, dim=0, keepdim=True)
      # dE/dw1 = x_transpose @ dE/dz1
      d_w1 = torch.mm(torch.transpose(x, 0, 1), d_z1)

      #
      # optimize weight by Gradient Descent
      #

      w1 -= d_w1 * learning_rate
      b1 -= d_b1 * learning_rate
      w2 -= d_w2 * learning_rate
      b2 -= d_b2 * learning_rate

      # Update counter: the prediction is the index of the largest output
      if torch.argmax(a2) == y:
        corrects += 1

      # Report procedure
      if i % 10000 == 0:
        print("\rEpoch {}: {}/{}".format(epoch+1, i, len(train_MNIST)), end='')

    # Report accuracy per each epoch
    print("Epoch {}, Accuracy: {:.3f}".format(epoch+1, corrects/len(train_MNIST)))



Epoch 1: 0/60000
Epoch 1: 10000/60000
Epoch 1: 20000/60000
Epoch 1: 30000/60000
Epoch 1: 40000/60000
Epoch 1: 50000/60000
Epoch 1, Accuracy: 0.896
Epoch 2: 0/60000
Epoch 2: 10000/60000
Epoch 2: 20000/60000
Epoch 2: 30000/60000
Epoch 2: 40000/60000
Epoch 2: 50000/60000
Epoch 2, Accuracy: 0.948
Epoch 3: 0/60000
Epoch 3: 10000/60000
Epoch 3: 20000/60000
Epoch 3: 30000/60000
Epoch 3: 40000/60000
Epoch 3: 50000/60000
Epoch 3, Accuracy: 0.961
Epoch 4: 0/60000
Epoch 4: 10000/60000
Epoch 4: 20000/60000
Epoch 4: 30000/60000
Epoch 4: 40000/60000
Epoch 4: 50000/60000
Epoch 4, Accuracy: 0.969
Epoch 5: 0/60000
Epoch 5: 10000/60000
Epoch 5: 20000/60000
Epoch 5: 30000/60000
Epoch 5: 40000/60000
Epoch 5: 50000/60000
Epoch 5, Accuracy: 0.975
