In [None]:
## libraries

from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

In [None]:
## data transforms
# Both splits use the same preprocessing: convert to a tensor, then normalise
# the single channel with a fixed mean/std pair.
# NOTE(review): 0.1307 / 0.3081 are presumably the MNIST mean and std - confirm.
_NORMALIZE_MEAN = (0.1307,)
_NORMALIZE_STD = (0.3081,)

train_transforms = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(_NORMALIZE_MEAN, _NORMALIZE_STD)]
)

test_transforms = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(_NORMALIZE_MEAN, _NORMALIZE_STD)]
)

In [None]:
## datasets

# MNIST splits, downloaded into ./data on first use.
# NOTE: the names `train` and `test` are rebound later in this notebook by the
# `train()` / `test()` functions, so the loaders must be built before those
# functions are defined (as happens when the cells run top to bottom).
_mnist_root = "./data"
train = datasets.MNIST(_mnist_root, train=True, transform=train_transforms, download=True)
test = datasets.MNIST(_mnist_root, train=False, transform=test_transforms, download=True)

In [None]:
## dataloader args

SEED = 1

cuda = torch.cuda.is_available()
print("CUDA Available: ", cuda)

# Seed unconditionally so CPU-only runs are reproducible as well;
# torch.manual_seed sets the seed for the CPU and all CUDA devices.
torch.manual_seed(SEED)

# Pinned host memory is only useful when batches are copied to a CUDA device,
# so it is enabled only when CUDA is available.
dataloader_args = dict(shuffle=True, batch_size=128, num_workers=4, pin_memory=cuda)

# Both loaders share the same arguments, so the test split is shuffled too.
train_loader = torch.utils.data.DataLoader(train, **dataloader_args)
test_loader = torch.utils.data.DataLoader(test, **dataloader_args)


In [None]:
## load data
import matplotlib.pyplot as plt

# Pull a single batch from the training loader for a visual sanity check.
dataiter = iter(train_loader)
images, labels = next(dataiter)

fig = plt.figure()
num_images = 70

# Draw a 7x10 grid starting at the first image of the batch. Subplot indices
# are 1-based while tensor indices are 0-based, hence the `idx + 1`.
# The count is capped at the batch size so a small batch cannot raise IndexError.
for idx in range(min(num_images, len(images))):
    plt.subplot(7, 10, idx + 1)
    plt.axis('off')
    plt.imshow(images[idx].numpy().squeeze(), cmap='gray_r')
    

In [None]:
## Neural Network model

class Net(nn.Module):
    """Plain convolutional classifier: eight unpadded conv layers and one max-pool.

    Output size, jump (J) and receptive field (RF) per stage for a 28x28 input,
    using RF_out = RF_in + (k - 1) * J_in:

        stage     kernel  size  J   RF
        conv1     3x3     26    1    3
        conv2     3x3     24    1    5
        conv3     3x3     22    1    7
        maxpool1  2x2/2   11    2    8
        conv4     3x3      9    2   12
        conv5     3x3      7    2   16
        conv6     1x1      7    2   16
        conv7     3x3      5    2   20
        conv8     5x5      1    2   28

    conv8 has no activation; its 10 output channels are the class scores.
    """

    @staticmethod
    def _conv_relu(in_ch, out_ch, kernel):
        """Build an unpadded, stride-1 Conv2d followed by ReLU."""
        return nn.Sequential(
            nn.Conv2d(in_channels=in_ch, out_channels=out_ch, kernel_size=kernel),
            nn.ReLU(),
        )

    def __init__(self):
        super().__init__()
        make = self._conv_relu

        # Feature extraction at full resolution.
        self.conv1 = make(1, 10, 3)
        self.conv2 = make(10, 20, 3)
        self.conv3 = make(20, 20, 3)

        # Halve the spatial resolution.
        self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Feature extraction at half resolution; conv6 is a 1x1 channel reducer.
        self.conv4 = make(20, 32, 3)
        self.conv5 = make(32, 32, 3)
        self.conv6 = make(32, 20, 1)
        self.conv7 = make(20, 20, 3)

        # Final 5x5 conv collapses the 5x5 map into one score per class.
        self.conv8 = nn.Sequential(
            nn.Conv2d(in_channels=20, out_channels=10, kernel_size=5),
        )

    def forward(self, x):
        """Return per-class log-probabilities, reshaped to ``(-1, 10)``."""
        pipeline = (
            self.conv1, self.conv2, self.conv3,
            self.maxpool1,
            self.conv4, self.conv5, self.conv6, self.conv7, self.conv8,
        )
        for stage in pipeline:
            x = stage(x)
        scores = x.view(-1, 10)
        return F.log_softmax(scores, dim=-1)

# Run on the GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

# Build the baseline network and move its parameters to the chosen device.
model = Net()
model = model.to(device)

In [None]:
# Requires the third-party `torchsummary` package; if it is missing, install it once with: !pip install torchsummary

In [None]:
from torchsummary import summary

# Report the baseline model layer by layer for one single-channel 28x28 input.
summary(model, (1, 28, 28))

In [None]:
## Train & Test Loops

from tqdm import tqdm

# Module-level metric histories filled in by train() and test():
# losses and accuracies are kept in four independent lists.
train_losses, test_losses = [], []
train_acc, test_acc = [], []

def train(model, device, train_loader, optimizer, epoch):
    """Run one training epoch, recording per-batch loss and running accuracy.

    Args:
        model: network to optimise; it is switched to training mode here.
        device: device every batch is moved to before the forward pass.
        train_loader: iterable yielding ``(data, target)`` batches.
        optimizer: optimizer whose ``zero_grad`` / ``step`` run once per batch.
        epoch: current epoch index; kept for interface compatibility, unused.

    Side effects:
        Appends one entry per batch to the module-level ``train_losses``
        (detached loss tensor) and ``train_acc`` (running accuracy in percent),
        and updates the tqdm progress-bar description.
    """
    model.train()
    pbar = tqdm(train_loader)
    correct = 0
    processed = 0
    for batch_idx, (data, target) in enumerate(pbar):
        # Get samples
        data, target = data.to(device), target.to(device)

        # PyTorch accumulates gradients across backward passes, so they must be
        # zeroed before each batch for the parameter update to be correct.
        optimizer.zero_grad()

        # Predict
        y_pred = model(data)

        # Calculate loss
        loss = F.nll_loss(y_pred, target)
        # Store a detached tensor so the history list does not keep autograd
        # history alive for every batch; `.item()` still works on it later.
        train_losses.append(loss.detach())

        # Backpropagation
        loss.backward()
        optimizer.step()

        # Running accuracy over the batches seen so far in this epoch
        pred = y_pred.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
        correct += pred.eq(target.view_as(pred)).sum().item()
        processed += len(data)

        # Update pbar-tqdm
        pbar.set_description(desc= f'Loss={loss.item()} Batch_id={batch_idx} Accuracy={100*correct/processed:0.2f}')
        train_acc.append(100*correct/processed)

def test(model, device, test_loader):
    """Evaluate ``model`` on ``test_loader`` and record loss and accuracy.

    The model is put in eval mode and run without gradient tracking. The summed
    NLL loss is divided by ``len(test_loader.dataset)`` to get the average loss.

    Side effects:
        Prints a one-line summary and appends the average loss to the
        module-level ``test_losses`` and the accuracy (percent) to ``test_acc``.
    """
    model.eval()
    loss_sum = 0
    num_correct = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            log_probs = model(inputs)
            # Sum (not mean) per batch so the final division is over all samples.
            loss_sum += F.nll_loss(log_probs, labels, reduction='sum').item()
            # Predicted class = index of the largest log-probability.
            predicted = log_probs.argmax(dim=1, keepdim=True)
            num_correct += predicted.eq(labels.view_as(predicted)).sum().item()

    num_samples = len(test_loader.dataset)
    average_loss = loss_sum / num_samples
    test_losses.append(average_loss)

    accuracy = 100. * num_correct / num_samples
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        average_loss, num_correct, num_samples, accuracy))

    test_acc.append(accuracy)

In [None]:
# Train the baseline network from a fresh initialisation with SGD + momentum,
# evaluating on the test split after every epoch.
model = Net().to(device)
optimizer = optim.SGD(params=model.parameters(), momentum=0.9, lr=0.01)
EPOCHS = 20
for epoch in range(EPOCHS):
    print("EPOCH:", epoch)
    train(model=model, device=device, train_loader=train_loader,
          optimizer=optimizer, epoch=epoch)
    test(model=model, device=device, test_loader=test_loader)

In [None]:
t = [t_items.item() for t_items in train_losses]
%matplotlib inline
import matplotlib.pyplot as plt
fig, axs = plt.subplots(2,2,figsize=(15,10))
axs[0, 0].plot(t)
axs[0, 0].set_title("Training Loss")
axs[1, 0].plot(train_acc)
axs[1, 0].set_title("Training Accuracy")
axs[0, 1].plot(test_losses)
axs[0, 1].set_title("Test Loss")
axs[1, 1].plot(test_acc)
axs[1, 1].set_title("Test Accuracy")

In [None]:
## Batch Normalization with Regularization

In [None]:
## Neural Network model with batchNormalization

class Net2(nn.Module):
    """Same layout as the baseline CNN, with BatchNorm after every hidden conv
    and a light (p=0.05) Dropout after conv2, conv3, conv5 and conv6.

    Output size, jump (J) and receptive field (RF) per stage for a 28x28 input,
    using RF_out = RF_in + (k - 1) * J_in:

        stage     kernel  size  J   RF
        conv1     3x3     26    1    3
        conv2     3x3     24    1    5
        conv3     3x3     22    1    7
        maxpool1  2x2/2   11    2    8
        conv4     3x3      9    2   12
        conv5     3x3      7    2   16
        conv6     1x1      7    2   16
        conv7     3x3      5    2   20
        conv8     5x5      1    2   28

    conv8 has neither BatchNorm nor activation; its 10 output channels are the
    class scores.
    """

    @staticmethod
    def _conv_bn_relu(in_ch, out_ch, kernel):
        """Build an unpadded, stride-1 Conv2d -> BatchNorm2d -> ReLU block."""
        return nn.Sequential(
            nn.Conv2d(in_channels=in_ch, out_channels=out_ch, kernel_size=kernel),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(),
        )

    def __init__(self):
        super().__init__()
        make = self._conv_bn_relu

        # Feature extraction at full resolution.
        self.conv1 = make(1, 10, 3)
        self.conv2 = make(10, 20, 3)
        self.conv3 = make(20, 20, 3)

        # Halve the spatial resolution.
        self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Feature extraction at half resolution; conv6 is a 1x1 channel reducer.
        self.conv4 = make(20, 32, 3)
        self.conv5 = make(32, 32, 3)
        self.conv6 = make(32, 20, 1)
        self.conv7 = make(20, 20, 3)

        # Final 5x5 conv collapses the 5x5 map into one score per class.
        self.conv8 = nn.Sequential(
            nn.Conv2d(in_channels=20, out_channels=10, kernel_size=5),
        )

        # One Dropout module, reused at several points in forward().
        self.dropout = nn.Dropout(0.05)

    def forward(self, x):
        """Return per-class log-probabilities, reshaped to ``(-1, 10)``."""
        drop = self.dropout
        pipeline = (
            self.conv1,
            self.conv2, drop,
            self.conv3, drop,
            self.maxpool1,
            self.conv4,
            self.conv5, drop,
            self.conv6, drop,
            self.conv7,
            self.conv8,
        )
        for stage in pipeline:
            x = stage(x)
        scores = x.view(-1, 10)
        return F.log_softmax(scores, dim=-1)

# Build the BatchNorm + Dropout variant and move it to the selected device.
model2 = Net2()
model2 = model2.to(device)

In [None]:
# Report the BatchNorm/Dropout model layer by layer for one single-channel 28x28 input.
summary(model2, (1, 28, 28))

In [None]:
# Train the BatchNorm + Dropout model with the same optimiser settings and
# epoch budget as the baseline run.
# NOTE: train()/test() append to the module-level history lists without
# clearing them, so entries from this run follow those of any earlier run.
optimizer2 = optim.SGD(params=model2.parameters(), momentum=0.9, lr=0.01)
EPOCHS = 20
for epoch in range(EPOCHS):
    print(f"[EPOCH]: ", epoch)
    train(model=model2, device=device, train_loader=train_loader,
          optimizer=optimizer2, epoch=epoch)
    test(model=model2, device=device, test_loader=test_loader)

In [None]:
## Using Batch Normalization together with Dropout regularization appears to have reduced overfitting.

**Target**:
----------

1. Set up a lightweight model for the MNIST problem (28x28 images) as a first step, then add Batch Normalization and Dropout regularization as a next step.

2. Apply the basic necessary transforms: ToTensor() and Normalize().

3. Load the data through DataLoaders.

4. Use a neural network with fewer parameters.

5. Basic Training & Testing Loops

6. Train for 20 Epochs


**Result**:
---------
A. Parameters: 30K (30,178)

B. Best Training Accuracy: 99.84 (19th EPOCH)

C. Best Test Accuracy: 99.48 (19th EPOCH)


**Analysis**:
------------

A. The model has a relatively small number of parameters.

B. The small model trained without any additional techniques produced fluctuating logs and overfitted. Once Batch Normalization and Dropout regularization were added, performance improved and overfitting was reduced, yielding a good inference model.