# Introduction to Neural Networks
## Fully connected neural net for FashionMNIST

### Objectives

The main objective is to implement a neural network for FashionMNIST. 

* The architecture of the network is as follows:
    1. Input layer - 28x28 grayscale image (784 neurons + 1 bias neuron at the input layer) 
    2. Layer 1 - 300 neurons followed by a ReLU activation
    3. Layer 2 - 100 neurons followed by a ReLU activation
    4. Layer 3 / Output layer - 10 neurons, one per class


* Points to note:
    1. Validation set == Test set for this problem set
    2. Use -log(softmax) as the loss function
    3. Use GPU if possible
    
    
* Features
    1. Classwise accuracy and rankings
    2. Which class is hard to predict?
    3. Return loss and accuracies for every training step
    

### Import relevant libraries

In [1]:
import torch
import torch.nn as nn # Neural networks module of torch package
from torchvision import datasets, transforms
from tqdm import tqdm
import torch.nn.functional as F # Functions such as sigmoid, softmax, cross entropy etc
import torch.optim as optim
import numpy as np
from torch.utils.data import SubsetRandomSampler
import matplotlib.pyplot as plt

### Define Parameters

In [2]:
# Hyperparameters shared by the cells below.
seed = 2019  # passed to both the NumPy and the PyTorch random number generators
epochs = 30  # number of full passes over the training split
batch_size_train = batch_size_valid = 64  # mini-batch size of the two data loaders

### PyTorch insights
1. PyTorch separates data handling into a dataset class (access to individual samples) and a dataloader class (batching and sampling).
2. `torch.nn.Module` is the base class for all neural network modules. Your models should also subclass this class.

### Load data - Create dataset class and dataloader class

In [3]:
# FashionMNIST training images, downloaded into ../data when missing.
# ToTensor is the only transform applied to each image.
dataset = datasets.FashionMNIST(
    "../data",
    train=True,
    download=True,
    transform=transforms.Compose([transforms.ToTensor()]),
)

# Shuffle every sample index reproducibly, then use the first 20% of the
# shuffled indices for validation and the remaining 80% for training.
dataset_size = len(dataset)
indices = list(range(dataset_size))
np.random.seed(seed)
np.random.shuffle(indices)

split = int(np.floor(0.2 * dataset_size))
val_indices = indices[:split]
train_indices = indices[split:]

valid_sampler = SubsetRandomSampler(val_indices)
train_sampler = SubsetRandomSampler(train_indices)

# Both loaders wrap the same dataset object; each sampler restricts its
# loader to one index subset. When a sampler is given, shuffle must stay
# False (its default), so it is not passed here.
# (Note that test set == validation set in this question)
train_loader = torch.utils.data.DataLoader(
    dataset, batch_size=batch_size_train, sampler=train_sampler
)
valid_loader = torch.utils.data.DataLoader(
    dataset, batch_size=batch_size_valid, sampler=valid_sampler
)

### Model - Define model

In [4]:
class SimpleNN(nn.Module):
    """Fully connected classifier: input_dims -> 300 -> 100 -> 10.

    Both hidden layers are followed by a ReLU. The output layer is followed
    by a log-softmax, so the model returns log-probabilities that can be fed
    directly to ``F.nll_loss``.

    Args:
        input_dims: Number of features per sample after flattening
            (28 * 28 = 784 for FashionMNIST images).
    """

    def __init__(self, input_dims):
        super().__init__()
        self.l1 = nn.Linear(input_dims, 300)
        self.l2 = nn.Linear(300, 100)
        self.l3 = nn.Linear(100, 10)

    def forward(self, x):
        """Return log-probabilities of shape (batch, 10) for the batch ``x``.

        ``x`` may have any shape whose elements split evenly into rows of
        ``input_dims`` values, e.g. (batch, 1, 28, 28) images.
        """
        # Flatten to the width the first layer was built with instead of a
        # hard-coded 784, so the model honours its ``input_dims`` argument.
        # reshape (rather than view) also accepts non-contiguous tensors.
        x = x.reshape(-1, self.l1.in_features)

        # Hidden block 1
        x = F.relu(self.l1(x))

        # Hidden block 2
        x = F.relu(self.l2(x))

        # Output layer: log-softmax over the class dimension
        return F.log_softmax(self.l3(x), dim=1)

### Define Train Function that includes loss function

In [5]:
def train(model, device, train_loader, optimizer):
    """Train ``model`` for one epoch and report average loss and accuracy.

    Args:
        model: Network whose forward pass returns log-probabilities of shape
            (batch, classes); it is passed straight to ``F.nll_loss``.
        device: Device each batch is moved to before the forward pass.
        train_loader: DataLoader yielding ``(data, target)`` batches. Its
            ``sampler`` length is used as the number of training samples.
        optimizer: Optimizer that updates the parameters of ``model``.

    Returns:
        Tuple ``(avg_train_loss, accuracy)``: the per-sample negative
        log-likelihood and the fraction of correct predictions, both
        accumulated over the epoch while the weights were being updated.
    """
    # Set the module in training mode.
    model.train(True)

    running_loss = 0
    running_correct = 0

    for data, target in train_loader:
        # Load batch data to device
        data, target = data.to(device), target.to(device)

        # Set optimizer gradients to zero
        optimizer.zero_grad()

        # Feed forward the network to determine the output
        output = model(data)

        # Evaluate the negative log-likelihood only once, summed over the
        # batch. The sum feeds the epoch statistics ...
        loss_sum = F.nll_loss(output, target, reduction="sum")
        running_loss += loss_sum.item()

        # ... and dividing it by the batch size gives the mean loss used for
        # the gradient step (what reduction="mean" would return here).
        loss = loss_sum / target.size(0)

        # Get the number of correctly predicted samples
        pred = output.argmax(dim=1, keepdim=True)  # index of the max log-probability

        # View the target tensor as the same size as pred tensor
        running_correct += pred.eq(target.view_as(pred)).sum().item()

        # Backpropagate through the network to determine the gradients
        loss.backward()

        # Update the parameters of the model
        optimizer.step()

    num_samples = float(len(train_loader.sampler))
    avg_train_loss = running_loss/num_samples

    print('loss: {:.4f}, accuracy: {}/{} ({:.3f})'.format(
        avg_train_loss, running_correct, num_samples,
        running_correct / num_samples))

    return avg_train_loss, running_correct/num_samples

In [6]:
def validation(model, device, valid_loader):
    """Evaluate ``model`` on ``valid_loader`` without updating any weights.

    Args:
        model: Network whose forward pass returns log-probabilities.
        device: Device each batch is moved to before the forward pass.
        valid_loader: DataLoader yielding ``(data, target)`` batches. Its
            ``sampler`` length is used as the number of validation samples.

    Returns:
        Tuple ``(avg_valid_loss, accuracy)``: per-sample negative
        log-likelihood and fraction of correct predictions.
    """
    # Switch to inference mode (same effect as model.train(False)).
    model.eval()

    total_loss = 0
    n_correct = 0

    # Gradients are not needed because nothing is back-propagated here.
    with torch.no_grad():
        for inputs, labels in valid_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            log_probs = model(inputs)

            # Accumulate the summed batch loss; it is averaged once at the end.
            total_loss += F.nll_loss(log_probs, labels, reduction='sum').item()

            # Predicted class = index of the largest log-probability.
            top1 = log_probs.argmax(dim=1, keepdim=True)
            n_correct += top1.eq(labels.view_as(top1)).sum().item()

    num_samples = float(len(valid_loader.sampler))
    avg_valid_loss = total_loss / num_samples
    accuracy = n_correct / num_samples

    print('val_loss: {:.4f}, val_accuracy: {}/{} ({:.3f})'.format(
        avg_valid_loss, n_correct, num_samples, accuracy))

    return avg_valid_loss, accuracy

In [7]:
def test(model, device, test_loader):
    model.eval()
    
    running_loss = 0
    running_correct = 0
    
    clf_matrix = torch.zeros(10, 10)
        
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            running_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
            running_correct += pred.eq(target.view_as(pred)).sum().item()
            
            for t, p in zip(target.view(-1), pred.view(-1)):
                clf_matrix[t.long(), p.long()] += 1
                
    num_samples = float(len(test_loader.sampler))
    avg_test_loss = running_loss/num_samples

    print('test_loss: {:.4f}, test_accuracy: {}/{} ({:.3f})\n'.format(
        avg_test_loss, running_correct, num_samples,
        running_correct / num_samples))
    
    clf_report = clf_matrix.diag()/clf_matrix.sum(1)
    
    return avg_test_loss, running_correct/num_samples, clf_report

In [9]:
# Pick the GPU when one is available and seed PyTorch before the model is
# created so the run is repeatable.
use_cuda = torch.cuda.is_available()
torch.manual_seed(seed)
device = torch.device("cuda" if use_cuda else "cpu")

print("Available device = ", device)
model = SimpleNN(input_dims=28*28).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01)

# training and validation history: one (train, validation) tuple per epoch
loss_hist = []
acc_hist = []

# Start from +inf rather than a fixed threshold so that the first epoch
# always writes a checkpoint, whatever the scale of the loss; the evaluation
# cell further down relies on the "model" file existing.
best_val_loss = float("inf")
for epoch in tqdm(range(epochs)):
    tr_loss, tr_acc = train(model, device, train_loader, optimizer)
    val_loss, val_acc = validation(model, device, valid_loader)
    loss_hist.append((tr_loss, val_loss))
    acc_hist.append((tr_acc, val_acc))

    # Keep only the weights with the lowest validation loss seen so far.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        print("Saving best model")
        torch.save(model.state_dict(), "model")
    print("--------------------------------")


Available device =  cuda



  0%|                                                                                           | 0/30 [00:00<?, ?it/s]

loss: 1.5164, accuracy: 25113/48000.0 (0.523)
val_loss: 0.9399, val_accuracy: 7863/12000.0 (0.655)
Saving best model
--------------------------------



  3%|██▊                                                                                | 1/30 [00:09<04:42,  9.76s/it]

loss: 0.7874, accuracy: 34333/48000.0 (0.715)
val_loss: 0.7010, val_accuracy: 8997/12000.0 (0.750)
Saving best model
--------------------------------



  7%|█████▌                                                                             | 2/30 [00:19<04:31,  9.71s/it]

loss: 0.6399, accuracy: 37226/48000.0 (0.776)
val_loss: 0.6204, val_accuracy: 9428/12000.0 (0.786)
Saving best model
--------------------------------



 10%|████████▎                                                                          | 3/30 [00:29<04:21,  9.70s/it]

loss: 0.5671, accuracy: 38595/48000.0 (0.804)
val_loss: 0.5577, val_accuracy: 9658/12000.0 (0.805)
Saving best model
--------------------------------



 13%|███████████                                                                        | 4/30 [00:39<04:18,  9.94s/it]

loss: 0.5241, accuracy: 39320/48000.0 (0.819)
val_loss: 0.5207, val_accuracy: 9862/12000.0 (0.822)
Saving best model
--------------------------------



 17%|█████████████▊                                                                     | 5/30 [00:50<04:19, 10.40s/it]

loss: 0.4958, accuracy: 39726/48000.0 (0.828)
val_loss: 0.4923, val_accuracy: 9981/12000.0 (0.832)
Saving best model
--------------------------------



 20%|████████████████▌                                                                  | 6/30 [01:02<04:16, 10.69s/it]

loss: 0.4758, accuracy: 39991/48000.0 (0.833)
val_loss: 0.4863, val_accuracy: 9950/12000.0 (0.829)
Saving best model
--------------------------------



 23%|███████████████████▎                                                               | 7/30 [01:13<04:07, 10.77s/it]

loss: 0.4631, accuracy: 40243/48000.0 (0.838)
val_loss: 0.4746, val_accuracy: 9986/12000.0 (0.832)
Saving best model
--------------------------------



 27%|██████████████████████▏                                                            | 8/30 [01:24<03:59, 10.87s/it]

loss: 0.4513, accuracy: 40388/48000.0 (0.841)
val_loss: 0.4629, val_accuracy: 10050/12000.0 (0.838)
Saving best model
--------------------------------



 30%|████████████████████████▉                                                          | 9/30 [01:35<03:49, 10.91s/it]

loss: 0.4407, accuracy: 40597/48000.0 (0.846)
val_loss: 0.4587, val_accuracy: 10080/12000.0 (0.840)
Saving best model
--------------------------------



 33%|███████████████████████████▎                                                      | 10/30 [01:46<03:41, 11.06s/it]

loss: 0.4325, accuracy: 40756/48000.0 (0.849)
val_loss: 0.4505, val_accuracy: 10156/12000.0 (0.846)
Saving best model
--------------------------------



 37%|██████████████████████████████                                                    | 11/30 [01:58<03:34, 11.29s/it]

loss: 0.4256, accuracy: 40888/48000.0 (0.852)
val_loss: 0.4647, val_accuracy: 10073/12000.0 (0.839)
--------------------------------



 40%|████████████████████████████████▊                                                 | 12/30 [02:09<03:21, 11.22s/it]

loss: 0.4171, accuracy: 40975/48000.0 (0.854)
val_loss: 0.4387, val_accuracy: 10149/12000.0 (0.846)
Saving best model
--------------------------------



 43%|███████████████████████████████████▌                                              | 13/30 [02:20<03:10, 11.23s/it]

loss: 0.4101, accuracy: 41154/48000.0 (0.857)
val_loss: 0.4261, val_accuracy: 10256/12000.0 (0.855)
Saving best model
--------------------------------



 47%|██████████████████████████████████████▎                                           | 14/30 [02:32<03:02, 11.39s/it]

loss: 0.4037, accuracy: 41244/48000.0 (0.859)
val_loss: 0.4216, val_accuracy: 10267/12000.0 (0.856)
Saving best model
--------------------------------



 50%|█████████████████████████████████████████                                         | 15/30 [02:44<02:51, 11.46s/it]

loss: 0.3978, accuracy: 41334/48000.0 (0.861)
val_loss: 0.4179, val_accuracy: 10269/12000.0 (0.856)
Saving best model
--------------------------------



 53%|███████████████████████████████████████████▋                                      | 16/30 [02:55<02:39, 11.37s/it]

loss: 0.3914, accuracy: 41439/48000.0 (0.863)
val_loss: 0.4182, val_accuracy: 10280/12000.0 (0.857)
--------------------------------



 57%|██████████████████████████████████████████████▍                                   | 17/30 [03:07<02:29, 11.51s/it]

loss: 0.3866, accuracy: 41479/48000.0 (0.864)
val_loss: 0.4058, val_accuracy: 10310/12000.0 (0.859)
Saving best model
--------------------------------



 60%|█████████████████████████████████████████████████▏                                | 18/30 [03:18<02:17, 11.48s/it]

loss: 0.3812, accuracy: 41626/48000.0 (0.867)
val_loss: 0.4009, val_accuracy: 10334/12000.0 (0.861)
Saving best model
--------------------------------



 63%|███████████████████████████████████████████████████▉                              | 19/30 [03:31<02:10, 11.86s/it]

loss: 0.3759, accuracy: 41729/48000.0 (0.869)
val_loss: 0.4089, val_accuracy: 10306/12000.0 (0.859)
--------------------------------



 67%|██████████████████████████████████████████████████████▋                           | 20/30 [03:43<02:00, 12.03s/it]

loss: 0.3703, accuracy: 41758/48000.0 (0.870)
val_loss: 0.4005, val_accuracy: 10317/12000.0 (0.860)
Saving best model
--------------------------------



 70%|█████████████████████████████████████████████████████████▍                        | 21/30 [03:58<01:56, 12.91s/it]

loss: 0.3662, accuracy: 41827/48000.0 (0.871)
val_loss: 0.3988, val_accuracy: 10350/12000.0 (0.863)
Saving best model
--------------------------------



 73%|████████████████████████████████████████████████████████████▏                     | 22/30 [04:10<01:40, 12.57s/it]

loss: 0.3629, accuracy: 41876/48000.0 (0.872)
val_loss: 0.3894, val_accuracy: 10340/12000.0 (0.862)
Saving best model
--------------------------------



 77%|██████████████████████████████████████████████████████████████▊                   | 23/30 [04:22<01:25, 12.20s/it]

loss: 0.3581, accuracy: 41961/48000.0 (0.874)
val_loss: 0.3808, val_accuracy: 10407/12000.0 (0.867)
Saving best model
--------------------------------



 80%|█████████████████████████████████████████████████████████████████▌                | 24/30 [04:33<01:12, 12.03s/it]

loss: 0.3538, accuracy: 42034/48000.0 (0.876)
val_loss: 0.3885, val_accuracy: 10354/12000.0 (0.863)
--------------------------------



 83%|████████████████████████████████████████████████████████████████████▎             | 25/30 [04:47<01:02, 12.48s/it]

loss: 0.3496, accuracy: 42086/48000.0 (0.877)
val_loss: 0.3863, val_accuracy: 10390/12000.0 (0.866)
--------------------------------



 87%|███████████████████████████████████████████████████████████████████████           | 26/30 [04:59<00:49, 12.33s/it]

loss: 0.3472, accuracy: 42150/48000.0 (0.878)
val_loss: 0.3749, val_accuracy: 10415/12000.0 (0.868)
Saving best model
--------------------------------



 90%|█████████████████████████████████████████████████████████████████████████▊        | 27/30 [05:11<00:36, 12.27s/it]

loss: 0.3418, accuracy: 42234/48000.0 (0.880)
val_loss: 0.3691, val_accuracy: 10439/12000.0 (0.870)
Saving best model
--------------------------------



 93%|████████████████████████████████████████████████████████████████████████████▌     | 28/30 [05:23<00:24, 12.29s/it]

loss: 0.3371, accuracy: 42323/48000.0 (0.882)
val_loss: 0.3726, val_accuracy: 10457/12000.0 (0.871)
--------------------------------



 97%|███████████████████████████████████████████████████████████████████████████████▎  | 29/30 [05:36<00:12, 12.52s/it]

loss: 0.3340, accuracy: 42310/48000.0 (0.881)
val_loss: 0.3679, val_accuracy: 10455/12000.0 (0.871)
Saving best model
--------------------------------



100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [05:48<00:00, 12.42s/it]


In [None]:
# Plot training and validation loss per epoch.
# Each loss_hist entry is a (training loss, validation loss) tuple.
train_losses = [tr for tr, _ in loss_hist]
valid_losses = [val for _, val in loss_hist]
xi = list(range(0, len(loss_hist), 2))  # label every second epoch on the x-axis

plt.plot(train_losses, label="Training Loss")
plt.plot(valid_losses, label="Validation Loss")
plt.xticks(xi)
plt.legend()
plt.show()

In [None]:
# Plot training and validation accuracy per epoch.
# Each acc_hist entry is a (training accuracy, validation accuracy) tuple.
# Tick positions come from acc_hist itself, the series being plotted,
# rather than from loss_hist.
xi = list(range(0, len(acc_hist), 2))  # label every second epoch on the x-axis
plt.plot([tr for tr, _ in acc_hist], label="Training Accuracy")
plt.plot([val for _, val in acc_hist], label="Validation Accuracy")
plt.xticks(xi)
plt.legend()
plt.show()

### Evaluate Model on Test Set

In [None]:
# Load model with best saved weights.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can still be loaded on a CPU-only machine.
model = SimpleNN(input_dims=28*28).to(device)
model.load_state_dict(torch.load("model", map_location=device))

In [None]:
# Evaluate the model on the test set, which is the validation set here:
# the validation loader is passed as the test loader.
# clf_report receives the per-class accuracies returned by test().
test_loss, test_acc, clf_report = test(model, device, valid_loader)

Best accuracy based on test set (which is the validation set in our case) = 0.871

### Class-Wise Accuracy Report

In [None]:
# Label id -> clothing category; the position in the tuple is the label id.
CLASS_CLOTHING = dict(enumerate((
    'T-shirt/top',
    'Trouser',
    'Pullover',
    'Dress',
    'Coat',
    'Sandal',
    'Shirt',
    'Sneaker',
    'Bag',
    'Ankle boot',
)))

# Map each class name to its accuracy, then order the classes from the
# lowest to the highest accuracy so the hardest class is drawn first.
class_wise_acc = {
    CLASS_CLOTHING[label]: accuracy.item()
    for label, accuracy in enumerate(clf_report)
}
class_wise_acc = dict(sorted(class_wise_acc.items(), key=lambda x: x[1]))

# One bar per class, labelled with the class name.
positions = range(len(class_wise_acc))
plt.bar(positions, list(class_wise_acc.values()), align='center')
plt.xticks(positions, list(class_wise_acc.keys()), rotation=60)
plt.title("Classification Accuracy per class")
plt.show()

Analyzing the classification report, **Shirt** is the most difficult class to predict.
The ranking of the classes by accuracy is shown in the bar chart above, from lowest to highest.