# Introduction


**What?** Dealing with ANNs with functional API



# Import modules

In [32]:
import torch
from torch import nn
from torchvision import datasets, transforms
from torch import optim
import torch.nn.functional as F

# Load the dataset

In [None]:
"""
For the particular case that we are dealing with, an image consisting of 28 x 28 grayscale pixels, we first need
to read from the image and convert it into a tensor using a transforms.ToTensor() transform. We then make the 
mean and standard deviation of the pixel values 0.5 and 0.5 respectively so that it becomes easier for the model 
to train; to do this, we use transforms.Normalize((0.5,),(0.5,)). We combine all of the transformations together 
with transform.Compose().
"""

In [2]:
transform = transforms.Compose([transforms.ToTensor(),
                              transforms.Normalize((0.5,), (0.5,)),
                              ])

In [3]:
batch_size = 64

In [None]:
"""
We will be using the Fashion–MNIST dataset. This is a dataset of Zalando's article images, consisting of a 
training set of 60,000 examples and a test set of 10,000 examples. We will take an individual grayscale 
image 28 x 28 in size and convert it into a vector of 784.
"""

In [None]:
trainset = datasets.FashionMNIST('Fashion_MNIST/', download=True, train=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)

testset = datasets.FashionMNIST('Fashion_MNIST/', download=True, train=False, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=True)

# Create the model

In [None]:
"""
We could define the model class with any name, but what is important is that it is a subclass of nn.Module and
has super().__init__(), which provides the model with a lot of useful methods and attributes and retains 
knowledge of the architecture.

nn.Linear describes a fully connected
784 -> 256 -> 128 ->10

We can define the network architecture without defining a network class using the nn.Sequential module, 
and it is important to ensure that the sequence of operation in the forward method is ordered properly.
"""

In [9]:
class FashionNetwork(nn.Module):
    
    """
    Describes the architecture for the fashion-MINST dataset
    The nn.Module automatically creates the weight and bias 
    tensors that we'll use in the forward method.     
    """
    
    def __init__(self):
        """
        Initialisation of the elemnts. They can be initialise in any order
        The linear unit by itself defines a linear function, such as xW + B; 
        to have nonlinear capabilities, we need to insert nonlinear activation 
        functions, and here we use one of the most popular activation functions, ReLU.
        """
        super().__init__()        
        self.hidden1 = nn.Linear(784, 256)
        self.hidden2 = nn.Linear(256, 128)
        self.output = nn.Linear(128, 10)
        # dim = 1, ensures that logsoftmax is taken across the columns of the output
        self.softmax = nn.LogSoftmax(dim=1)
        self.activation = nn.ReLU()
    
    def forward(self, x):
        x = self.hidden1(x)
        x = self.activation(x)
        x = self.hidden2(x)
        # RELU because there are 10 output class
        x = self.activation(x)
        x = self.output(x)
        output = self.softmax(x)
        return output

In [None]:
class FashionNetwork(nn.Module):       
    def __init__(self):
        super().__init__()        
        self.hidden1 = nn.Linear(784, 256)
        self.hidden2 = nn.Linear(256, 128)
        self.output = nn.Linear(128, 10)
        self.softmax = nn.LogSoftmax(dim=1)
        self.activation = nn.ReLU()
    
    def forward(self, x):
        x = self.hidden1(x)
        x = self.activation(x)
        x = self.hidden2(x)        
        x = self.activation(x)
        x = self.output(x)
        output = self.softmax(x)
        return output

In [10]:
model = FashionNetwork()

In [11]:
print(model)

FashionNetwork(
  (hidden1): Linear(in_features=784, out_features=256, bias=True)
  (hidden2): Linear(in_features=256, out_features=128, bias=True)
  (output): Linear(in_features=128, out_features=10, bias=True)
  (softmax): LogSoftmax(dim=1)
  (activation): ReLU()
)


In [12]:
# Inpsecting the weights
model.hidden1.weight

Parameter containing:
tensor([[-0.0278, -0.0343, -0.0317,  ..., -0.0337, -0.0053,  0.0335],
        [-0.0080, -0.0127,  0.0107,  ...,  0.0160, -0.0034, -0.0128],
        [-0.0190, -0.0076,  0.0054,  ...,  0.0203,  0.0209, -0.0223],
        ...,
        [-0.0176, -0.0125,  0.0177,  ..., -0.0221,  0.0114,  0.0014],
        [-0.0318,  0.0170, -0.0219,  ...,  0.0044,  0.0261,  0.0231],
        [ 0.0220,  0.0277, -0.0186,  ...,  0.0126, -0.0341,  0.0317]],
       requires_grad=True)

# Loss function

In [None]:
"""
In PyTorch, the loss function is called a criterion, and so we named our
loss. “The log would ensure that we are not dealing with very small values
between 0 and 1, and negative values would ensure that a logarithm of 
probability that is less than 1 is nonzero. Our goal would be to reduce this
negative log loss error function.
"""

In [15]:
criterion = nn.NLLLoss()

In [16]:
criterion

NLLLoss()

# Optimiser

In [None]:
"""
Other optimizer functions, such as Adadelta, Adagrad, SGD.
"""

In [18]:
optimizer = optim.Adam(model.parameters())

In [19]:
optimizer.defaults

{'lr': 0.001,
 'betas': (0.9, 0.999),
 'eps': 1e-08,
 'weight_decay': 0,
 'amsgrad': False}

In [20]:
optimizer = optim.Adam(model.parameters(), lr=3e-3)

In [23]:
optimizer.defaults

{'lr': 0.003,
 'betas': (0.9, 0.999),
 'eps': 1e-08,
 'weight_decay': 0,
 'amsgrad': False}

In [24]:
# One epoch sees all the images in the training set
epoch = 10

In [28]:
epochNo = 0
for _ in range(epoch):
    epochNo +=1
    running_loss = 0
    for image, label in trainloader:
        # Zeroing the gradient because pytroch accumulated the gradients
        # on each backward pass
        optimizer.zero_grad()
        # reshape each batch of 64 images
        # from “64 x 28 x 28 to 64 x 784”
        image = image.view(image.shape[0],-1)
        # get a prediction
        pred = model(image)
        # calculate the loss
        loss = criterion(pred, label)
        # propagate the error backward
        # partial derivative of the error wrt the weigths
        loss.backward()
        # we update the weights
        optimizer.step()
        # keep track of the loss
        # the .item() method pulled a scalar out of the tensor
        running_loss += loss.item()
    else:
        print("Epoch No: ", epochNo)
        print(f'Training loss: {running_loss/len(trainloader):.4f}')

Epoch No:  1
Training loss: 0.2109
Epoch No:  2
Training loss: 0.2092
Epoch No:  3
Training loss: 0.2018
Epoch No:  4
Training loss: 0.2020
Epoch No:  5
Training loss: 0.1941
Epoch No:  6
Training loss: 0.1930
Epoch No:  7
Training loss: 0.1943
Epoch No:  8
Training loss: 0.1922
Epoch No:  9
Training loss: 0.1863
Epoch No:  10
Training loss: 0.1833


# Adding dropout

In [None]:
"""
We add the dropout layer with a dropout probability of 0.25, which means that 25% 
of the neurons in the layer where this dropout is applied will be turned off randomly. 
Then, we edited our forward function, applied it to the first hidden layer with 256 
units in it, and then we applied the dropout on the second layer, which has 128 units. 
We applied the activation in both the layers after going through the activation functions.
We have to keep in mind that dropouts must be applied only on hidden layers in order to 
prevent us from losing the input data and missing outputs.
"""

In [29]:
class FashionNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden1 = nn.Linear(784, 256)
        self.hidden2 = nn.Linear(256, 128)
        self.output = nn.Linear(128, 10)
        self.log_softmax = nn.LogSoftmax()
        self.activation = nn.ReLU()
        self.drop = nn.Dropout(p=0.25)
    def forward(self, x):
        x = self.hidden1(x)
        x = self.activation(x)
        x = self.drop(x)
        x = self.hidden2(x)
        x = self.activation(x)
        x = self.drop(x)
        x = self.output(x)
        output = self.log_softmax(x)
        return output

In [30]:
model = FashionNetwork()

In [31]:
model

FashionNetwork(
  (hidden1): Linear(in_features=784, out_features=256, bias=True)
  (hidden2): Linear(in_features=256, out_features=128, bias=True)
  (output): Linear(in_features=128, out_features=10, bias=True)
  (log_softmax): LogSoftmax(dim=None)
  (activation): ReLU()
  (drop): Dropout(p=0.25, inplace=False)
)

# Implementing functional APIs

In [25]:
"""
In this recipe, we defined the exact same network as before, but replaced the activation
function and the log softmax with function.relu and function.log_softmax, which makes our 
code look a lot cleaner and more concise.
"""

In [26]:
class FashionNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden1 = nn.Linear(784,256)
        self.hidden2 = nn.Linear(256,128)
        self.output = nn.Linear(128,10)
        # This two lines are removed
        #self.log_softmax = nn.LogSoftmax()
        #self.activation = nn.ReLU()
    """
    def forward(self, x):
        x = self.hidden1(x)
        x = self.activation(x)
        x = self.hidden2(x)
        x = self.activation(x)
        x = self.output(x)
        output = self.log_softmax(x)
        return output
    """
    def forward(self,x):
        x = F.relu(self.hidden1(x))
        x = F.relu(self.hidden2(x))
        x = F.log_softmax(self.output(x))
        return x
        

In [None]:
class FashionNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden1 = nn.Linear(784,256)
        self.hidden2 = nn.Linear(256,128)
        self.output = nn.Linear(128,10)

    def forward(self,x):
        x = F.relu(self.hidden1(x))
        x = F.relu(self.hidden2(x))
        x = F.log_softmax(self.output(x))
        return x
        

In [None]:
"""
An EASIER way to understand when to use functional and when NOT to use is the following:

As a result, layers with parameters are usually initialized in init to be shared by the whole module, while 
some connections or simple operations without parameters can be defined in forward to be used in forward 
propagation. torch.nn module is used mainly for methods which have learnable parameters, whereas
functional is used for methods which do not have learnable parameters.

A nn.Module is actually a OO wrapper around the functional interface, that contains a number of utility methods,
like eval() and parameters(), and it automatically creates the parameters of the modules for you.

Reference: https://discuss.pytorch.org/t/how-to-choose-between-torch-nn-functional-and-torch-nn-module/2800/2
"""

# References


- Jibin Mathew, PyTorch Artificial Intelligence Fundamentals  
- https://github.com/jibinmathew69
    
