# Load Data
Note that we normalize the training data by dividing the each pixel value with 255.

In [1]:
import mnist
import torch
import torch.nn as nn
import torch.tensor as tensor
import torch.nn.functional as F
from time import time
x_train, t_train, x_test, t_test = mnist.load()
xtrain=tensor(x_train/255,dtype=torch.float)
ttrain=tensor(t_train,dtype=torch.int64)
xtest=tensor(x_test/255,dtype=torch.float)
ttest=tensor(t_test,dtype=torch.int64)
print(xtrain.shape)

torch.Size([60000, 784])


# Set the Model
We use CNN (convolution neural network) to predict the data.
* Conv1: We use **Conv2d(1,6,5)** to build our first convolution layer. Since our digit image is not colorful, it only have one channel. We set the first parameter to one as the inital channel of given image. We set our output channel to six, and set our kernel size to five (i.e., each filter kernel is 5*5)

* Conv2: we call **Conv2d(6,16,5)** which accept six-channel input to builf second convolution layer. We set the out channel to sixteen and the kernal size is five (5*5 kernel).

* MaxPooling: After each convolution layer, we will call **MaxPool2d(2,2)** to do the max pooling. We set the size of the window (2,2) to take a max over.

 

In the Net.forward() we set our procedure.
* We first apply *conv1*, call the relu (rectified linear unit) funtion to filter the unnecessary features, and then apply max pooling.
* We then apply *conv2*, call the relu function, and then apply max pooling.
* We transform the dataset to have 256 columns which can fit in the the fisrt linear layer *fc1*, and then apply *tanh* 
* We fit the data to second linear layer *fc2* and then apply *softmax* along with dimension 1 (softmax between different classes)

Note that we set the first linear layer transform data with 256 features. Since sfter we fit the data into conv1, max pooling, conv2, and max pooling again, the data shape became (16,4,4) where 16 denotes the channel number and other two denote the width and height. So after we flatten the data, the data size became 256(4*4*16).

In [2]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1,6,5)
        self.pool = nn.MaxPool2d(2,2)
        self.conv2 = nn.Conv2d(6,16,5)
        self.fc1 = nn.Linear(256, 120)#(28-4)/2=12,(12-4)/2=4(size=4*4), there are 16 filters
        self.fc2 = nn.Linear(120, 10)
        
    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1,256)
        x = torch.tanh(self.fc1(x))
        x=F.softmax(self.fc2(x),dim=1)
        return x
net = Net()

# Model training 
* CrossEntropyLoss: cross entropy loss in pytorch already encoded softmax. so we mark out the softmax function in our model (above cell).

* optimizer: We use the most popular optimer Adam, a methed for a method for stochastic Optimization. We also try the SGD(Stochastic gradient descent) with momentun 0.9, but the result is not good, only achive 0.7-0.8 accuracy.

* We run 20 epoch, in each epoch:
    * **optimizer.zero_grad** is to set gradient to zero to avoid mixing the result with previous epoch.
    * we fit the training data (xtrain) to our model, and have the output y
    * we compute the cross entropy loss with our output y and the training labels
    * we call **loss.backward** to run the backpogation to and compute the error derivative for model parameters
    * we call **optimizer.step** to update the model parameters with the 
    
* We can see the loss decreases with increasing epoch. 

In [None]:
criterion = torch.nn.CrossEntropyLoss() 
#optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
optimizer = torch.optim.Adam(net.parameters(),lr = 0.01)

start=time()
for epoch in range(1):
    for i,x in enumerate(xtrain):
        optimizer.zero_grad()
        y = net(x.view(-1,1,28,28))
        loss = criterion(y, tensor([ttrain[i]]))
        
        if i%2000==0: print('epoch: ', epoch, 'loss:', loss.item())
        
        loss.backward()
        optimizer.step()
print(time()-start)

epoch:  0 loss: 2.3072292804718018
epoch:  0 loss: 2.4611213207244873
epoch:  0 loss: 2.332186460494995
epoch:  0 loss: 2.338225841522217
epoch:  0 loss: 2.261850357055664
epoch:  0 loss: 2.4608004093170166
epoch:  0 loss: 2.461057424545288
epoch:  0 loss: 2.461148262023926
epoch:  0 loss: 2.4611024856567383
epoch:  0 loss: 2.4611501693725586
epoch:  0 loss: 2.4611496925354004
epoch:  0 loss: 1.461152195930481
epoch:  0 loss: 2.4611501693725586
epoch:  0 loss: 2.4611501693725586
epoch:  0 loss: 2.4611501693725586
epoch:  0 loss: 2.4611501693725586
epoch:  0 loss: 2.4611501693725586
epoch:  0 loss: 2.4611501693725586
epoch:  0 loss: 2.4611501693725586


In [None]:


ypred=net(xtest.view(-1,1,28,28))

print(ypred.max(1)[1])
print(ttest)

In [None]:
cmp=ttest.eq(ypred.max(1)[1])
true=len([x for x in cmp if x==1])
false=len([x for x in cmp if x==0])
total=true+false
print(true/total)