## <font color='green'> <div align="center">In the name of God </div></font>

### <font color='red'> Author: Sayed Kamaledin Ghiasi-Shirazi</font> <a href="http://profsite.um.ac.ir/~k.ghiasi">(http://profsite.um.ac.ir/~k.ghiasi)</a>

# A multi-layer feedforward neural network in PyTorch

### Importing general modules

In [1]:
import numpy as np
import scipy.io as sio
import matplotlib as mpl
import matplotlib.pyplot as plt

### Importing PyTorch modules

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

## Defining the MLFF network by inheriting from nn.Module

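Note that the following command is crucial:

```python
self.layers = nn.ModuleList(layersList)
```

Wrapping the list in `nn.ModuleList` registers every layer as a submodule of the network. If the layers were kept in a plain Python list, `net.parameters()` would not return their weights (so the optimizer would receive no parameters to update) and `net.to(device)` would not move them to the selected device.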

In [3]:
class MLFF(nn.Module):
    """Multi-layer feedforward network: a stack of Linear layers with ReLU in between.

    The network has ``num_hidden_layers`` hidden layers of ``num_hidden_neurons``
    units each, followed by one output layer. ReLU is applied after every layer
    except the last, so ``forward`` returns the raw outputs of the final Linear
    layer.

    Args:
        num_hidden_layers: number of hidden layers (>= 0). With 0 the network
            is a single Linear layer from input to output.
        num_input_neurons: number of input features.
        num_hidden_neurons: width of every hidden layer.
        num_output_neurons: number of outputs.

    Raises:
        ValueError: if ``num_hidden_layers`` is negative.
    """

    def __init__(self, num_hidden_layers, num_input_neurons,
                 num_hidden_neurons, num_output_neurons):
        super().__init__()
        if num_hidden_layers < 0:
            raise ValueError(
                "num_hidden_layers must be non-negative, got {0}".format(num_hidden_layers))

        # Total number of Linear layers: the hidden ones plus the output layer.
        self.layersCount = num_hidden_layers + 1

        # Width at every layer boundary: input, each hidden layer, output.
        # Building the layers from consecutive pairs also handles the case of
        # zero hidden layers (a single input -> output Linear layer).
        widths = ([num_input_neurons]
                  + [num_hidden_neurons] * num_hidden_layers
                  + [num_output_neurons])
        layersList = [nn.Linear(fan_in, fan_out)
                      for fan_in, fan_out in zip(widths[:-1], widths[1:])]

        # nn.ModuleList (not a plain list) registers the layers as submodules
        # so that their parameters are visible through net.parameters().
        self.layers = nn.ModuleList(layersList)

    def forward(self, x):
        """Apply every layer in order, with ReLU after all but the last one."""
        last_index = self.layersCount - 1
        for i, layer in enumerate(self.layers):
            x = layer(x)
            if i < last_index:
                x = torch.relu(x)
        return x

In [4]:
# Build the network: 784 inputs, five hidden layers of 100 units, 10 outputs.
net = MLFF(
    num_hidden_layers=5,
    num_input_neurons=784,
    num_hidden_neurons=100,
    num_output_neurons=10,
)
print(net)

MLFF(
  (layers): ModuleList(
    (0): Linear(in_features=784, out_features=100, bias=True)
    (1): Linear(in_features=100, out_features=100, bias=True)
    (2): Linear(in_features=100, out_features=100, bias=True)
    (3): Linear(in_features=100, out_features=100, bias=True)
    (4): Linear(in_features=100, out_features=100, bias=True)
    (5): Linear(in_features=100, out_features=10, bias=True)
  )
)


### Choosing device

In [5]:
# Use the first CUDA device when PyTorch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


## Loading MNIST

In [6]:
# Read the four MNIST arrays from MATLAB .mat files. Each file stores its array
# under a key equal to the file name. The X arrays are divided by 255.
MnistTrainX = sio.loadmat('../../datasets/mnist/MnistTrainX')['MnistTrainX'] / 255
MnistTrainY = sio.loadmat('../../datasets/mnist/MnistTrainY')['MnistTrainY']
MnistTestX = sio.loadmat('../../datasets/mnist/MnistTestX')['MnistTestX'] / 255
MnistTestY = sio.loadmat('../../datasets/mnist/MnistTestY')['MnistTestY']

# Keep at most the first N training rows; inputs and labels are cut identically.
N = 60000
MnistTrainX, MnistTrainY = MnistTrainX[:N, :], MnistTrainY[:N, :]

# Train/test aliases; the label arrays are flattened with squeeze().
XTrain, yTrain = MnistTrainX, MnistTrainY.squeeze()
XTest, yTest = MnistTestX, MnistTestY.squeeze()

# Replace N by the actual number of training rows and record the input dimension.
N, dim = XTrain.shape

## Optimization

In [7]:
# Training configuration used by the optimization loop below.
num_epochs = 2  # number of passes over the training data
batch_size = 100  # number of samples per mini-batch
report_after_X_iterations = 100  # progress is printed whenever itr is a multiple of this
learning_rate = 0.1  # step size passed to the SGD optimizer
# Number of whole mini-batches per epoch; floor division drops any trailing partial batch.
num_batches = N // batch_size

In [8]:
def _accuracy_percent(model, inputs, labels, batch_size, device):
    """Return the percentage of rows in `inputs` that `model` classifies as `labels`.

    Args:
        model: network whose output has one score per class along dim 1.
        inputs: 2-D NumPy array, one sample per row.
        labels: NumPy array with one integer class label per sample
            (any shape; it is flattened).
        batch_size: number of rows sent through the model at a time.
        device: torch device the model lives on.

    Returns:
        Accuracy in percent as a float (0.0 when there are no samples).
    """
    labels = np.asarray(labels).reshape(-1)
    num_samples = len(labels)
    if num_samples == 0:
        return 0.0

    # Evaluate in eval mode and restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    num_correct = 0
    with torch.no_grad():
        # Step over every sample, including a trailing partial batch, so the
        # count and the denominator always refer to the same set of samples.
        for start in range(0, num_samples, batch_size):
            stop = start + batch_size
            batch = torch.tensor(inputs[start:stop], dtype=torch.float).to(device)
            predicted = torch.argmax(model(batch), dim=1).cpu().numpy()
            num_correct += int(np.sum(predicted == labels[start:stop]))
    model.train(was_training)
    return num_correct / num_samples * 100


net.to(device)

optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9)
criterion = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    for itr in range(num_batches):
        rows = slice(itr * batch_size, (itr + 1) * batch_size)
        X = torch.tensor(MnistTrainX[rows, :], dtype=torch.float).to(device)
        # reshape(-1) rather than squeeze(): the labels stay 1-D even when
        # batch_size == 1, which is the target shape CrossEntropyLoss expects
        # for a (batch, classes) input.
        T = torch.tensor(MnistTrainY[rows].reshape(-1), dtype=torch.long).to(device)

        output = net(X)
        loss = criterion(output, T)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if itr % report_after_X_iterations == 0:
            print('\n---- iteration #{0} of {1} at epoch #{2} of {3} ---- :'.format(
                itr, num_batches, epoch, num_epochs))
            # Note: this scores the whole training set, so each report is
            # considerably more expensive than a single training step.
            score = _accuracy_percent(net, MnistTrainX, MnistTrainY, batch_size, device)
            print('Loss = {0}, Accuracy on training data = {1}%'.format(loss.item(), score))


---- iteration #0 of 600 at epoch #0 of 2 ---- :
Loss = 2.300250291824341, Accuracy on training data = 10.218333333333334%

---- iteration #100 of 600 at epoch #0 of 2 ---- :
Loss = 1.4320164918899536, Accuracy on training data = 60.105%


KeyboardInterrupt: 

In [11]:
# Show the optimizer's instance attributes (defaults, per-parameter state, parameter groups).
vars(optimizer)

{'defaults': {'lr': 0.1,
  'momentum': 0.9,
  'dampening': 0,
  'weight_decay': 0,
  'nesterov': False},
 'state': defaultdict(dict, {Parameter containing:
              tensor([[ 0.0053, -0.0174,  0.0354,  ...,  0.0093,  0.0181,  0.0264],
                      [ 0.0276, -0.0252,  0.0225,  ..., -0.0288, -0.0337, -0.0163],
                      [ 0.0146,  0.0033,  0.0016,  ..., -0.0317,  0.0240,  0.0204],
                      ...,
                      [ 0.0248,  0.0107,  0.0045,  ...,  0.0246, -0.0100,  0.0021],
                      [ 0.0300,  0.0112,  0.0305,  ...,  0.0124, -0.0256,  0.0116],
                      [-0.0017, -0.0070,  0.0314,  ..., -0.0340, -0.0182, -0.0340]],
                     device='cuda:0', requires_grad=True): {'momentum_buffer': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
                       [0., 0., 0.,  ..., 0., 0., 0.],
                       [0., 0., 0.,  ..., 0., 0., 0.],
                       ...,
                       [0., 0., 0.,  ..., 0., 0., 0.],


In [25]:
# Take the first parameter tensor of the optimizer's first parameter group.
p = optimizer.param_groups[0]['params'][0]

In [26]:
# Display the optimizer state stored for parameter p (for SGD with momentum: its momentum buffer).
optimizer.state[p]

{'momentum_buffer': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')}