# Lab 8-4: Backpropagation with MNIST

**Jonathan Choi 2021**

**[Deep Learning By Torch] End to End study scripts of Deep Learning by implementing code practice with Pytorch.**

If you have an any issue, please PR below.

[[Deep Learning By Torch] - Github @JonyChoi](https://github.com/jonychoi/Deep-Learning-By-Torch)
Here, we are going to learn how to backpropagation works at the deep inside. Before we used ```cost.backward()```, but here we are going to implement backward at the low level.

## Imports

In [12]:
import torch
import torchvision.datasets as datasets
import torchvision.transforms as transforms

In [13]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

torch.manual_seed(1)
if device == 'cuda':
    torch.cuda.manual_seed_all(1)

## Set Hyperparameters

In [14]:
learning_rate = 0.5
batch_size = 10

## MNIST datasets

In [15]:
mnist_train = datasets.MNIST(root='MNIST_data/',
                             download=True,
                             train=True,
                             transform=transforms.ToTensor())
mnist_test = datasets.MNIST(root='MNIST_data/',
                            train = False,
                            transform=transforms.ToTensor(),
                            download=True)

In [16]:
data_loader = torch.utils.data.DataLoader(dataset=mnist_train,
                                          batch_size = batch_size,
                                          shuffle = True,
                                          drop_last = True)

## Set Parameters

In [17]:
w1 = torch.nn.Parameter(torch.Tensor(784, 30)).to(device)
b1 = torch.nn.Parameter(torch.Tensor(30)).to(device)
w2 = torch.nn.Parameter(torch.Tensor(30, 10)).to(device)
b2 = torch.nn.Parameter(torch.Tensor(10)).to(device)

In [18]:
torch.nn.init.normal_(w1)
torch.nn.init.normal_(b1)
torch.nn.init.normal_(w2)
torch.nn.init.normal_(b2)

tensor([ 0.1696, -1.2966,  0.3153,  0.9196, -0.1853, -1.0896, -0.2633,  0.3830,
        -0.6385,  1.4271], device='cuda:0', grad_fn=<CopyBackwards>)

## Set Activation Functions and Its derivative terms

In [19]:
def sigmoid(x):
    return 1.0 / (1.0 + torch.exp(-x))

In [20]:
def sigmoid_prime(x):
    #derivative of the sigmoid function
    return sigmoid(x) * (1 - sigmoid(x))

In [21]:
X_test = mnist_test.test_data.view(-1, 28 * 28).float().to(device)[:1000]
Y_test = mnist_test.test_labels.to(device)[:1000]

i = 0

while not i == 10000:
    for X, Y in data_loader:
        i += 1

        #forward
        X = X.view(-1 ,28* 28).to(device)
        Y = torch.zeros((batch_size , 10)).scatter_(1, Y.unsqueeze(1), 1).to(device)

        #one-hot
        layer1 = torch.add(torch.matmul(X, w1), b1)
        activation1 = sigmoid(layer1)

        layer2 = torch.add(torch.matmul(activation1, w2), b2)
        y_pred = sigmoid(layer2)

        diff = y_pred - Y

        #backward (back prop: Chain Rule)
        diff_layer2 = diff * sigmoid_prime(layer2)
        diff_b2 = diff_layer2
        diff_w2 = torch.matmul(torch.transpose(activation1, 0, 1), diff_layer2)

        diff_activation1 = torch.matmul(diff_layer2, torch.transpose(w2, 0, 1))
        diff_layer1 = diff_activation1 * sigmoid_prime(layer1)
        diff_b1 = diff_layer1
        diff_w1 = torch.matmul(torch.transpose(X, 0, 1), diff_layer1)

        w1 = w1 - learning_rate * diff_w1
        b1 = b1 - learning_rate * torch.mean(diff_b1, 0)
        w2 = w2 - learning_rate * diff_w2
        b2 = b2 - learning_rate * torch.mean(diff_b2, 0)

        if i % 1000 == 0:
            layer1 = torch.add(torch.matmul(X_test, w1), b1)
            activation1 = sigmoid(layer1)
            layer2 = torch.add(torch.matmul(activation1, w2), b2)
            y_pred = sigmoid(layer2)
            accuracy_mat = torch.argmax(y_pred, 1) == Y_test
            accuracy_res = accuracy_mat.sum()
            print(accuracy_res.item())

        if i == 10000:
            break


752
858
882
868
891
889
905
896
897
