# Convolutional NN on [MNIST](https://github.com/pytorch/examples/tree/master/mnist) with [torch](https://pytorch.org/)
stough 202-

The following is a simple digit classification on MNIST, using a small convolutional neural network (CNN).

A [convolutional layer](https://pytorch.org/docs/stable/nn.html#conv2d) learns simple spatial filters
whose outputs (feature maps) carry useful information for the downstream, deeper layers
in the network. With $in\_channels$ input channels, $out\_channels$ output feature maps, and a $kernel\_size \times kernel\_size$ filter, the layer has $in\_channels \times out\_channels \times kernel\_size^2 + out\_channels$ parameters
to optimize (the final term counts one bias per output channel). For example, the first layer below, `nn.Conv2d(1, 32, 3, 1)`, has $1 \times 32 \times 3^2 + 32 = 320$ parameters.

In [None]:
%matplotlib inline
# or widget
import matplotlib.pyplot as plt

import numpy as np
from random import shuffle
import copy
import tempfile

# from keras.datasets import mnist
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR

# torch
import torch
import torch.optim as optim
import torch.backends.cudnn as cudnn

import torch.nn.functional as F

from torch import nn
from torch.autograd import Variable
from torch.nn import Module

from torchvision.transforms import ToTensor
from torchvision.utils import make_grid

# For timing.
import time
tic, toc = (time.time, time.time)

## Define the Network
A more complicated network, using Convolutional Layers to transform the original image
into a collection of features that a linear layer can use to discriminate 
among the classes/digits.

In [None]:
class Net(nn.Module):
    """Small CNN that maps 1x28x28 digit images to 10 log-probabilities.

    Two unpadded 3x3 convolutions extract features, a 2x2 max-pool shrinks
    them, and two fully connected layers turn the flattened features into
    one log-probability per digit class.
    """

    def __init__(self):
        super().__init__()
        # Padding defaults to 0, so every 3x3 convolution trims one pixel
        # from each side of the image.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32,
                               kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64,
                               kernel_size=3, stride=1)
        self.dropout1 = nn.Dropout2d(p=0.25)
        self.dropout2 = nn.Dropout(p=0.5)
        # 64 channels * 12 * 12 pooled pixels = 9216 flattened features.
        self.fc1 = nn.Linear(in_features=9216, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images."""
        x = F.relu(self.conv1(x))      # 1x28x28  -> 32x26x26
        x = F.relu(self.conv2(x))      # 32x26x26 -> 64x24x24
        x = F.max_pool2d(x, 2)         # 64x24x24 -> 64x12x12
        x = self.dropout1(x)           # active in training mode only
        x = x.flatten(start_dim=1)     # 64x12x12 -> 9216 per sample
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)           # active in training mode only
        return F.log_softmax(self.fc2(x), dim=1)

## Training and Test functions.
During training, the optimizer modifies the parameters of the model in a way that minimizes the loss
function. A lot is hidden from you: reading the Python alone, it is not clear how the loss
and the optimizer are connected. The link is the model's parameters: `loss.backward()` stores a gradient in each parameter's `.grad` attribute, and `optimizer.step()` reads those gradients to update the same parameters. See more detail [here](https://stackoverflow.com/questions/53975717/pytorch-connection-between-loss-backward-and-optimizer-step).

In [None]:
def train(args, model, device, train_loader, optimizer, epoch):
    """Run one training epoch and return the mean per-sample loss.

    Args:
        args: Settings object; only ``args.log_interval`` (number of batches
            between progress lines) is read here.
        model: Network to optimize; it is switched to training mode. Its
            output is fed to ``F.nll_loss``, so it should be log-probabilities
            (as ``Net`` produces with ``log_softmax``).
        device: Device each batch is moved to before the forward pass.
        train_loader: Iterable of ``(data, target)`` batches that also exposes
            ``.dataset`` (used for the progress line).
        optimizer: Optimizer whose ``zero_grad``/``step`` update the model.
        epoch: Epoch number, used only in the progress output.

    Returns:
        The loss averaged over every sample seen in this epoch (0.0 if the
        loader yielded no samples).
    """
    start = time.perf_counter()
    model.train()
    loss_sum = 0.0
    samples_seen = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        batch_size = len(data)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)  # mean over this batch
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, samples_seen, len(train_loader.dataset),
                100. * batch_idx / len(train_loader), batch_loss))

        # nll_loss returned a per-batch mean, so weight it by the batch size
        # before accumulating; otherwise dividing by the sample count below
        # would under-report the loss by roughly a factor of batch_size.
        loss_sum += batch_loss * batch_size
        samples_seen += batch_size

    loss_avg = loss_sum / samples_seen if samples_seen else 0.0

    elapsed = time.perf_counter() - start
    print('\nTrain set: Average loss: {:.4f} ({:.3f} sec)'.format(loss_avg, elapsed))

    return loss_avg


def test(args, model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            # test_loss += F.cross_entropy(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    test_acc = correct / len(test_loader.dataset)

    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * test_acc))
    
    return test_loss, test_acc

## Namespace to replace the argparse business

In [None]:
from argparse import Namespace

# A Namespace is a plain attribute container: every keyword passed in becomes
# an attribute reachable with the dot operator, like a struct in C. Roughly:
#     class Namespace:
#         def __init__(self, **kwargs):
#             self.__dict__.update(kwargs)

args = Namespace(**{
    'no_cuda': False,         # True forces CPU even when CUDA is available
    'seed': 1,                # passed to torch.manual_seed
    'batch_size': 64,         # training batch size
    'test_batch_size': 1000,  # evaluation batch size
    'epochs': 5,              # number of train/test passes
    'lr': 1.0,                # learning rate given to the optimizer
    'gamma': 0.7,             # learning-rate decay factor for the scheduler
    'log_interval': 250,      # batches between training progress lines
    'save_model': False,      # whether to write the weights after training
})

## Organize the MNIST data
We'll use the [torchvision transforms](https://pytorch.org/docs/stable/torchvision/transforms.html#) to 
modify the dataset without having to convert to numpy arrays ourselves. If organized correctly, we won't
need our own Dataset class and collate function.

In [None]:
# Thank you: https://www.aiworkbox.com/lessons/load-mnist-dataset-from-pytorch-torchvision
# https://pytorch.org/docs/stable/torchvision/datasets.html
# Both splits are built the same way (downloaded to the root folder if absent);
# only the `train` flag differs. Each image is converted to a tensor and then
# normalized with Normalize(mean, std).
mnist_trainset, mnist_testset = (
    datasets.MNIST(
        root='/home/dip365/data',
        train=is_train,
        download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,)),
        ]),
    )
    for is_train in (True, False)
)

In [None]:
# Seed torch's random number generator with the configured seed.
torch.manual_seed(args.seed)

# Use the GPU only when it is both permitted by the settings and present.
use_cuda = (not args.no_cuda) and torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

# Extra DataLoader options are only supplied on the CUDA path.
kwargs = {}
if use_cuda:
    kwargs.update(num_workers=4, pin_memory=False)

# Training batches are reshuffled every epoch; test batches keep dataset order.
train_loader = DataLoader(
    mnist_trainset, batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = DataLoader(
    mnist_testset, batch_size=args.test_batch_size, shuffle=False, **kwargs)

&nbsp;

## Instantiate the model and count parameters

In [None]:
# Build the network and move its parameters to the chosen device.
model = Net()
model = model.to(device)
# Count the number of trainable parameters (those that receive gradients).
trainable = [p.numel() for p in model.parameters() if p.requires_grad]
print(f'model has {sum(trainable)} parameters.')

In [None]:
# Show the model as the cell output (its repr lists the registered layers).
model

In [None]:
# Is the model actually on the GPU?
# Thanks: https://discuss.pytorch.org/t/how-to-check-if-model-is-on-cuda/180
# Only the first parameter tensor is inspected: the expression is True when
# that tensor lives on a CUDA device and False when it is on the CPU.
next(model.parameters()).is_cuda

&nbsp;

## Run the optimization
- [Momentum, Learning Rate, etc](https://distill.pub/2017/momentum/)
- [Learning Rate Schedulers](https://towardsdatascience.com/learning-rate-schedules-and-adaptive-learning-rate-methods-for-deep-learning-2c8f433990d1)

In [None]:
# Adadelta performs the weight updates; StepLR multiplies its learning rate by
# args.gamma every time scheduler.step() is called (once per epoch below).
optimizer = optim.Adadelta(params=model.parameters(), lr=args.lr)
scheduler = StepLR(optimizer=optimizer, step_size=1, gamma=args.gamma)

# Per-epoch histories, one entry appended after every epoch.
train_loss, test_loss, test_acc = [], [], []

for epoch in range(1, args.epochs + 1):
    # Train for one epoch, then evaluate on the held-out test set.
    train_loss.append(
        train(args=args, model=model, device=device,
              train_loader=train_loader, optimizer=optimizer, epoch=epoch))
    results = test(args=args, model=model, device=device,
                   test_loader=test_loader)
    test_loss.append(results[0])  # mean test loss
    test_acc.append(results[1])   # fraction of correct test predictions
    scheduler.step()

# Optionally save the trained weights.
if args.save_model:
    torch.save(model.state_dict(), "mnist_cnn.pth")

In [None]:
# https://matplotlib.org/gallery/api/two_scales.html

# Turn the per-epoch histories into arrays.
train_loss, test_loss, test_acc = (
    np.array(history) for history in (train_loss, test_loss, test_acc)
)

fig, ax1 = plt.subplots()

# Left axis: the two loss curves, one point per epoch.
ax1.plot(train_loss, label='train_loss');
ax1.plot(test_loss, label='test_loss');

# Right axis (shared x): test accuracy, which lives on a different scale.
ax2 = ax1.twinx()
ax2.plot(test_acc, 'r--', label='test_acc');

ax1.legend(loc='upper left')
ax2.legend(loc='upper right');

# plt.savefig('../dip_outs/conv_MNIST_torch_training.png', dpi=300)

# plt.savefig('../dip_outs/conv_MNIST_torch_training.png', dpi=300)

&nbsp;

## Let's get the output of the model for all the test data.
The test_loader is not shuffled (`shuffle=False`), so the outputs come back in the same order as `mnist_testset`. We still want to keep track of the targets, to know what the answer should have been, so further down we collect those too.

In the below cell we loop over all the test data, push it through the model and store the resulting classifier outputs. The expression `model(data.to(device))` sends the data to the GPU (where the model resides) and applies the model to it. The sequence that happens after, `.cpu().detach().numpy()`, takes the resulting outputs and brings it back to the cpu memory space in the form of a numpy array.

A [note on pretty-printing arrays](https://stackoverflow.com/questions/2891790/how-to-pretty-print-a-numpy-array-without-scientific-notation-and-with-given-pre).

In [None]:
# eval() switches layers such as dropout to inference behaviour, so the
# predictions are deterministic; it does not freeze the weights.
model.eval()
# no_grad() skips building the autograd graph, which a forward-only pass never
# uses; this saves memory and time without changing the values.
with torch.no_grad():
    outputs = np.concatenate([model(data.to(device)).cpu().detach().numpy()
                              for data, target in test_loader], axis=0)

In [None]:
# Raw model output for the first test batch entry: 10 log-probabilities.
outputs[0]

In [None]:
# The last function in the model is a log_softmax, which is a log after softmax.
# Softmax makes the outputs all in [0,1] and sum to 1, but then there is a log
# after that. So we undo that with exp() to see the [0,1] numbers, which can be
# thought of as probabilities.
# printoptions temporarily sets 4-decimal, non-scientific formatting for this
# print only.
with np.printoptions(precision=4, suppress=True):
    print(np.exp(outputs[0]))

## Just for fun, let's see some of the mistakes...

In [None]:
# Walk the test loader once more, keeping only the label half of each batch,
# and join the per-batch label arrays into one array.
correct_labels = np.concatenate([labels.numpy() for _, labels in test_loader])

In [None]:
# Sanity check: shape of the collected label array.
correct_labels.shape

In [None]:
# The predicted class is the position of the largest log-probability in each row.
proposed_labels = outputs.argmax(axis=-1)

In [None]:
# Sanity check: should match the shape of correct_labels.
proposed_labels.shape

In [None]:
# Number of test images whose predicted label equals the true label.
(proposed_labels == correct_labels).sum()

In [None]:
# np.where on a 1-D mask returns a one-element tuple; unpack it to get the
# positions of the misclassified test images.
(wrong_guesses,) = np.where(proposed_labels != correct_labels)

In [None]:
# Indices (into the test set order) of the misclassified images.
wrong_guesses

In [None]:
from torchvision.utils import make_grid

# Show up to 64 misclassified test digits in an 8-per-row grid. The sample
# size is capped because np.random.choice(..., replace=False) raises
# ValueError when asked for more items than wrong_guesses contains, which
# happens as soon as the model makes fewer than 64 mistakes.
num_shown = min(64, len(wrong_guesses))
which_wrong = np.random.choice(wrong_guesses, num_shown, replace=False)

if num_shown:
    # mnist_testset[r] is an (image, label) pair; keep the image tensor only.
    samples = torch.stack([mnist_testset[r][0]
                           for r in which_wrong])
    # make_grid returns channels-first; imshow wants channels-last.
    plt.imshow(make_grid(samples, nrow=8, pad_value=1.0).permute(1,2,0))
else:
    print('No misclassified test images to show.')

In [None]:
# One line of predicted digits and one of true digits, in the same order as
# the sampled mistakes, so the two lines can be compared column by column.
print('Predicted: ' + ' '.join(str(int(label)) for label in proposed_labels[which_wrong]))
print('Actual:    ' + ' '.join(str(int(label)) for label in correct_labels[which_wrong]))

In [None]:
# Same information as (predicted, actual) pairs, one pair per line.
print(*(str(pair) for pair in zip(proposed_labels[which_wrong], correct_labels[which_wrong])), sep='\n')

In [None]:
# !jupyter nbconvert --to script 'conv_MNIST_torch.ipynb'