## (2) Design an MLP model for multiclass classification on the MNIST dataset, under the condition that the number of parameters in the network is less than 1M ($10^6$). Then train it on the MNIST dataset using minibatch SGD, with a batch size that is a power of 2 less than 1024.

In [33]:
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
from torchvision import datasets
import torchvision.transforms as transforms

# Training split: the samples used to fit the parameters.
# download=True fetches the files into ./mnist_data/ when they are missing.
train_dataset = datasets.MNIST(
    root='./mnist_data/',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
# Test split: held out to measure the accuracy of the trained parameters.
test_dataset = datasets.MNIST(
    root='./mnist_data/',
    train=False,
    transform=transforms.ToTensor(),
)

'''
(1) Output shape of the model: 10 values (one score per class) instead of 1
(2) Sigmoid --> Softmax (applied inside CrossEntropyLoss, so the model itself returns raw scores)
(3) BCELoss --> CrossEntropyLoss
'''

'''
Using GPU
'''
import torch
import torch.optim as optim # See https://pytorch.org/docs/stable/optim.html
from utils import *

# Pick the compute device: the first CUDA GPU when one is available, else the CPU.
# To force CPU execution, assign torch.device("cpu") unconditionally instead.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

'''
Define Model
'''
# https://pytorch.org/docs/stable/generated/torch.nn.Module.html
class MLP(nn.Module):
    """Fully connected classifier: three hidden Linear+ReLU stages and a linear head.

    The head applies no activation, so ``forward`` returns unnormalised
    per-class scores.

    Args:
        in_dim: size of the last dimension of the input.
        out_dim: number of output scores.
        inter_dim1: width of the first hidden layer.
        inter_dim2: width of the second hidden layer.
        inter_dim3: width of the third hidden layer.
    """

    def __init__(self, in_dim, out_dim, inter_dim1=512, inter_dim2=512, inter_dim3=512):
        super().__init__()
        self.in_dim = in_dim

        # Hidden stage 1: in_dim -> inter_dim1
        self.linear1 = nn.Linear(in_features=in_dim, out_features=inter_dim1, bias=True)
        self.act1 = nn.ReLU()

        # Hidden stage 2: inter_dim1 -> inter_dim2
        self.linear2 = nn.Linear(in_features=inter_dim1, out_features=inter_dim2, bias=True)
        self.act2 = nn.ReLU()

        # Hidden stage 3: inter_dim2 -> inter_dim3
        self.linear3 = nn.Linear(in_features=inter_dim2, out_features=inter_dim3, bias=True)
        self.act3 = nn.ReLU()

        # Output head: inter_dim3 -> out_dim, no activation
        self.linear4 = nn.Linear(in_features=inter_dim3, out_features=out_dim, bias=True)

    def forward(self, x):
        """Run ``x`` through the three hidden stages and return the head's output."""
        hidden = self.act1(self.linear1(x))
        hidden = self.act2(self.linear2(hidden))
        hidden = self.act3(self.linear3(hidden))
        return self.linear4(hidden)

'''
Choose loss function
'''
# https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
# Multiclass criterion; it is applied directly to the scores returned by the model.
loss = nn.CrossEntropyLoss()


''' Define model and optimizer'''
# 28*28 flattened pixel values in, 10 class scores out; hidden widths keep their defaults.
MLP_model = MLP(in_dim=28 * 28, out_dim=10)
MLP_model = MLP_model.to(device)
# Plain SGD over all model parameters with a fixed learning rate of 0.1.
OPTIMIZER = optim.SGD(params=MLP_model.parameters(), lr=0.1)

'''
DataLoader is used to apply minibatch SGD
'''
batch_size = 2 ** 7   # 128, a power of two below 1024
image_size = 28 * 28  # number of values in one flattened image
# https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader
# shuffle=True draws the minibatches in a new random order each epoch.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)


import time

# Minibatch SGD training: num_epoch full passes over train_loader.
start = time.time()
num_epoch = 50

# Moving the model is loop-invariant, so do it once instead of once per epoch.
MLP_model.to(device)
MLP_model.train()

for epoch in range(num_epoch):
    for image, label in train_loader:
        # Put the minibatch on the same device as the model.
        image = image.to(device)
        label = label.to(device)

        # Wipe the gradients left over from the previous step
        OPTIMIZER.zero_grad()

        # Forward process: flatten every image to one row per sample.
        # The output is already on the model's device, so no extra transfer is needed.
        hat_y = MLP_model(image.view(image.size(0), -1))
        cost = loss(hat_y, label)

        # Computes the gradient of the loss w.r.t. the graph leaves (the parameters).
        cost.backward()

        # Updates the parameters
        OPTIMIZER.step()

    # NOTE: this is the loss of the LAST minibatch of the epoch, not an epoch average,
    # so the printed values fluctuate from epoch to epoch.
    print("{}-epoch/total {}, loss: {:.4f}".format(epoch, num_epoch, cost.item()))

end = time.time()
print("Training Time: {:.2f}".format(end-start))



def _evaluate(model, dataset, eval_batch_size=512):
    """Count correct predictions of ``model`` over ``dataset``.

    Samples are read through a DataLoader so the dataset's own ``transform``
    is applied, i.e. the model sees inputs preprocessed the same way as
    during training. Indexing ``dataset.data`` directly would bypass the
    transform and feed raw pixel values instead.

    Args:
        model: module mapping a (batch, features) tensor to per-class scores;
            it must live on the CPU, because the batches are not moved.
        dataset: dataset yielding (image, label) pairs.
        eval_batch_size: number of samples per forward pass; it only affects
            speed and memory, not the result.

    Returns:
        Tuple ``(correct, wrong_indices)``: the number of correctly
        classified samples and the dataset indices of the misclassified ones.
    """
    # shuffle=False keeps dataset order, so batch offsets map back to dataset indices.
    loader = DataLoader(dataset=dataset, batch_size=eval_batch_size, shuffle=False)
    correct = 0
    wrong_indices = []
    offset = 0

    was_training = model.training
    model.eval()
    # No gradients are needed for scoring; skipping autograd saves time and memory.
    with torch.no_grad():
        for images, labels in loader:
            predictions = torch.argmax(model(images.view(images.size(0), -1)), dim=1)
            mismatched = predictions != labels
            correct += labels.size(0) - int(mismatched.sum())
            wrong_indices += (offset + torch.nonzero(mismatched).flatten()).tolist()
            offset += labels.size(0)
    # Leave the model in the mode it was found in.
    model.train(was_training)
    return correct, wrong_indices


# Evaluation runs on the CPU.
MLP_model.to("cpu")

''' Test for train_dataset '''
count, _ = _evaluate(MLP_model, train_dataset)
num_train = len(train_dataset.targets)
print('Test for train_dataset: correctly matched/total: {}/{} accuracy: {:.2f}%\n'.format(count, num_train, count/num_train * 100. ) )

''' Test for test_dataset '''
# wrong_answer holds the test-set indices of the misclassified images.
count, wrong_answer = _evaluate(MLP_model, test_dataset)
num_test = len(test_dataset.targets)
print('Test for test_dataset: correctly matched/total: {}/{} accuracy: {:.2f}%\n'.format(count, num_test, count/num_test * 100. ) )

total_params = sum(p.numel() for p in MLP_model.parameters())
print("Number of Parameters: ", total_params)

0-epoch/total 50, loss: 0.4479
1-epoch/total 50, loss: 0.2593
2-epoch/total 50, loss: 0.1000
3-epoch/total 50, loss: 0.0913
4-epoch/total 50, loss: 0.0452
5-epoch/total 50, loss: 0.0470
6-epoch/total 50, loss: 0.0323
7-epoch/total 50, loss: 0.0325
8-epoch/total 50, loss: 0.0531
9-epoch/total 50, loss: 0.0173
10-epoch/total 50, loss: 0.0963
11-epoch/total 50, loss: 0.0295
12-epoch/total 50, loss: 0.0097
13-epoch/total 50, loss: 0.0544
14-epoch/total 50, loss: 0.0088
15-epoch/total 50, loss: 0.0043
16-epoch/total 50, loss: 0.0042
17-epoch/total 50, loss: 0.0101
18-epoch/total 50, loss: 0.0026
19-epoch/total 50, loss: 0.0046
20-epoch/total 50, loss: 0.0022
21-epoch/total 50, loss: 0.0024
22-epoch/total 50, loss: 0.0018
23-epoch/total 50, loss: 0.0011
24-epoch/total 50, loss: 0.0012
25-epoch/total 50, loss: 0.0010
26-epoch/total 50, loss: 0.0008
27-epoch/total 50, loss: 0.0015
28-epoch/total 50, loss: 0.0004
29-epoch/total 50, loss: 0.0006
30-epoch/total 50, loss: 0.0007
31-epoch/total 50,

# --- Test Code ---

In [35]:
''' Test for test_dataset '''
# Batched evaluation on the test split. Reading through the DataLoader applies the
# dataset transform, so the inputs are preprocessed the same way as in training.
count = 0
wrong_answer = []  # test-set indices of the misclassified images
# shuffle=False keeps dataset order, so batch positions map back to dataset indices.
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
num_seen = 0
# No gradients are needed for scoring; skipping autograd saves time and memory.
with torch.no_grad():
    for image, label in test_loader:
        prediction = torch.argmax(MLP_model(image.view(len(image), image_size)), dim=1)
        is_wrong = prediction != label
        # Keep count a plain int instead of letting it turn into a tensor.
        count += len(label) - int(is_wrong.sum())
        wrong_answer += (num_seen + torch.nonzero(is_wrong).flatten()).tolist()
        num_seen += len(label)

print('Test for test_dataset: correctly matched/total: {}/{} accuracy: {:.2f}%\n'.format(count, len(test_dataset.targets), count/len(test_dataset.targets) * 100. ) )

Test for test_dataset: correctly matched/total: 9807/10000 accuracy: 98.07%



# --- Review ---

### layer 수를 4개 미만으로 내리니 정확도 미묘하게 내려감.
### batch_size를 128 위로 올리니 정확도 내려감.
### (batch_size를 올린 만큼 epoch 수를 올려야 하지만 학습 시간이 오래걸려 test 불가)
### batch = 128, epoch = 50으로 마무리
### inter_dim을 512보다 크게 (예: 1024) 하면 Parameter 1M 제한을 넘어감. (inter_dim = 512일 때 Parameter 수는 932,362개)
### inter_dim 512미만은 정확도 내려감.