# Loading and normalizing data

In [1]:
import mnist
import torch
import torch.nn as nn
import torch.nn.functional as F
# `torch.tensor` is a factory function, not a submodule. The former spelling
# `import torch.tensor as tensor` only worked while PyTorch shipped a private
# `torch/tensor.py` file and raises ModuleNotFoundError on releases without it.
# Importing the function by name keeps `tensor` bound to the same callable.
from torch import tensor

# Load the train/test images and labels through the local `mnist` helper module.
x_train, t_train, x_test, t_test = mnist.load()

# Divide the pixel values by 255 (assumes 8-bit intensities -- TODO confirm
# against mnist.load()) and store them as float32. Labels are stored as int64,
# the dtype nn.CrossEntropyLoss expects for class-index targets.
xtrain = tensor(x_train / 255, dtype=torch.float)
ttrain = tensor(t_train, dtype=torch.int64)
xtest = tensor(x_test / 255, dtype=torch.float)
ttest = tensor(t_test, dtype=torch.int64)
print(xtrain.shape)

torch.Size([60000, 784])


# Setting the model
We define a class Net which inherits from **nn.Module**.

Initial model:

**nn.Linear**: apply a linear transformation to the incoming data
1. We set the first layer *ly1* as a linear transformation from 784 neurons (784 pixels) to 80 neurons.
2. We set the second layer *ly2* as a linear transformation from 80 neurons to 30 neurons. 
3. We set the third layer *ly3* as a linear transformation from 30 neurons to 10 neurons (digit 0-9).

**Note:** We also tried *ly1_drop*, which drops out 50% of the output of *ly1*, and *ly2_drop*, which drops out 50% of the output of *ly2*. Because the performance was not good with dropout here, it is disabled in the code below.



In [2]:
class Net(nn.Module):
    """Fully connected classifier with two ReLU hidden layers.

    With the default arguments the network maps 784 inputs -> 80 -> 30 -> 10
    outputs, i.e. one raw score (logit) per class.

    Args:
        in_features: Size of each input sample (default 784).
        hidden1: Number of units in the first hidden layer (default 80).
        hidden2: Number of units in the second hidden layer (default 30).
        num_classes: Number of output scores per sample (default 10).
    """

    def __init__(self, in_features=784, hidden1=80, hidden2=30, num_classes=10):
        super().__init__()
        # Dropout(0.5) after ly1 and ly2 was tried but did not work well here,
        # so no dropout layers are registered.
        self.ly1 = nn.Linear(in_features, hidden1)
        self.ly2 = nn.Linear(hidden1, hidden2)
        self.ly3 = nn.Linear(hidden2, num_classes)

    def forward(self, x):
        """Return the raw class scores for a batch `x` of shape (N, in_features)."""
        x = torch.relu(self.ly1(x))
        x = torch.relu(self.ly2(x))
        # No softmax on the output: the cross-entropy loss used for training
        # already includes it, so the model returns unnormalised logits.
        return self.ly3(x)


net = Net()

# Model training 
* CrossEntropyLoss: the cross entropy loss in PyTorch already includes the softmax, so we omit the softmax function in our model (above cell).

* optimizer: We use the most popular optimizer, Adam, a method for stochastic optimization. We also tried SGD (stochastic gradient descent) with momentum 0.9, but the result was not good: it only achieved 0.7-0.8 accuracy.

* We run 20 epochs; in each epoch:
    * **optimizer.zero_grad** resets the gradients to zero so that they do not accumulate with those from the previous epoch.
    * we feed the training data (xtrain) to our model and obtain the output y
    * we compute the cross entropy loss between our output y and the training labels
    * we call **loss.backward** to run backpropagation and compute the derivative of the loss with respect to the model parameters
    * we call **optimizer.step** to update the model parameters with the computed gradients
    
* We can see the loss decreases with increasing epoch. 

In [3]:
# Hyperparameters for the training run.
NUM_EPOCHS = 20
LEARNING_RATE = 0.01

# Cross-entropy loss on the raw logits returned by the network; the targets
# are integer class indices.
criterion = torch.nn.CrossEntropyLoss()
# Alternative that was tried and gave worse accuracy:
#optimizer = torch.optim.SGD(net.parameters(), lr=0.1, momentum=0.9)
optimizer = torch.optim.Adam(net.parameters(), lr=LEARNING_RATE)

# Make sure the model is in training mode before optimising.
net.train()
# Each "epoch" is one full-batch update: the whole training set goes through
# the network in a single forward/backward pass.
for epoch in range(NUM_EPOCHS):
    optimizer.zero_grad()            # clear gradients left from the previous step
    y = net(xtrain)                  # forward pass on the full training set
    loss = criterion(y, ttrain)
    print('epoch: ', epoch, 'loss:', loss.item())

    loss.backward()                  # back-propagate to fill the .grad fields
    optimizer.step()                 # apply the Adam update
print('done')

epoch:  0 loss: 2.306840181350708
epoch:  1 loss: 2.216726303100586
epoch:  2 loss: 2.009770631790161
epoch:  3 loss: 1.7456607818603516
epoch:  4 loss: 1.4443638324737549
epoch:  5 loss: 1.1498793363571167
epoch:  6 loss: 0.9130261540412903
epoch:  7 loss: 0.7470263242721558
epoch:  8 loss: 0.6418042778968811
epoch:  9 loss: 0.5765392780303955
epoch:  10 loss: 0.5262182950973511
epoch:  11 loss: 0.4958030581474304
epoch:  12 loss: 0.4630966782569885
epoch:  13 loss: 0.4482825994491577
epoch:  14 loss: 0.4266660213470459
epoch:  15 loss: 0.4154680073261261
epoch:  16 loss: 0.4017115831375122
epoch:  17 loss: 0.3927520215511322
epoch:  18 loss: 0.38061603903770447
epoch:  19 loss: 0.37106236815452576
done


# Evaluation 
We feed the testing data (xtest) into our model, print out the predictions, and compare them with the testing labels. The final accuracy is 0.9074.

Note that by tuning hyperparameters such as the learning rate, batch size, or number of epochs, it may be possible to accelerate the training process or improve the accuracy.

In [4]:
# Inference only: switch the model to evaluation mode and disable autograd so
# that no computation graph is built for the test-set forward pass.
net.eval()
with torch.no_grad():
    ypred = net(xtest)

# The predicted class is the index of the largest score in each row; print the
# predictions followed by the true labels for a quick visual comparison.
print(ypred.max(1)[1])
print(ttest)

tensor([7, 2, 1,  ..., 4, 5, 6])
tensor([7, 2, 1,  ..., 4, 5, 6])


In [5]:
# Test accuracy: the number of samples whose highest-scoring class equals the
# true label, divided by the number of test samples.
(
    ttest.eq(torch.max(ypred, dim=1).indices).sum().item()
    / ttest.size(0)
)

0.9074