In [None]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import model_selection
import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils import data
from torchvision import models, transforms

# Improving Performance with Data Augmentation

In [None]:
# Read the training CSV into a single uint8 array; the first row is skipped
# (skiprows=1) and fields are comma separated.
mnist_arr = np.loadtxt(
    "../data/raw/mnist/train.csv",
    dtype=np.uint8,
    skiprows=1,
    delimiter=',',
)

In [None]:
# Hold out 10% of the rows for validation. Column 0 of the array is used as the
# target and the remaining columns as features; the seeded RandomState makes the
# split repeatable.
_prng = np.random.RandomState(42)
(training_features,
 validation_features,
 training_target,
 validation_target) = model_selection.train_test_split(
    mnist_arr[:, 1:],
    mnist_arr[:, 0],
    random_state=_prng,
    test_size=0.10,
)

## Create custom `DataSet` class to handle transformations

In [None]:
class CustomDataSet(data.Dataset):
    """In-memory dataset pairing a NumPy feature array with integer targets.

    Parameters
    ----------
    X : np.ndarray
        Feature array whose first axis indexes samples. Any number of trailing
        axes is accepted (e.g. flat pixel rows or image-shaped samples).
    y : np.ndarray
        Target array with one entry per sample. It is converted once, at
        construction time, to a ``torch.long`` tensor.
    transforms : callable, optional
        Applied to a sample's features on every ``__getitem__`` call. When
        ``None`` the raw features are returned unchanged.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of samples.
    """

    def __init__(self, X: np.ndarray, y: np.ndarray, transforms=None):
        super().__init__()
        # Fail fast instead of surfacing a confusing IndexError (or silently
        # ignoring trailing targets) later during iteration.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
            )
        self._X = X
        # PyTorch loss functions such as CrossEntropyLoss expect long integer targets.
        self._y = torch.from_numpy(y).long()
        self._transforms = transforms

    def __getitem__(self, index: int):
        """Return the ``(features, target)`` pair stored at ``index``."""
        features = self._X[index]
        target = self._y[index]
        if self._transforms is not None:
            features = self._transforms(features)
        return features, target

    def __len__(self) -> int:
        """Return the number of samples (size of the first axis of ``X``)."""
        # len() works for any rank, unlike unpacking ``shape`` into two names.
        return len(self._X)


## Define the DataLoader instances for training and validation datasets

In [None]:
# Training pipeline: reshape each flat pixel row to 28x28x1, convert it to a
# single-channel PIL image, apply a random affine distortion (the augmentation
# step) and finally convert the result to a tensor. Augmentation is applied to
# training data only.
_training_transforms = transforms.Compose([
    transforms.Lambda(lambda pixels: pixels.reshape((28, 28, 1))),
    transforms.ToPILImage(mode='L'),
    transforms.RandomAffine(degrees=15, translate=(0.1, 0.1), scale=(1.0, 1.1), shear=15),
    transforms.ToTensor(),
])
training_dataset = CustomDataSet(
    X=training_features,
    y=training_target,
    transforms=_training_transforms,
)

_batch_size = 32
training_data_loader = data.DataLoader(
    dataset=training_dataset,
    batch_size=_batch_size,
    shuffle=True,
)

# Validation pipeline: the same reshape / PIL / tensor conversion, but without
# the random affine step so that validation images are never distorted.
_validation_transforms = transforms.Compose([
    transforms.Lambda(lambda pixels: pixels.reshape((28, 28, 1))),
    transforms.ToPILImage(mode='L'),
    transforms.ToTensor(),
])
validation_dataset = CustomDataSet(
    X=validation_features,
    y=validation_target,
    transforms=_validation_transforms,
)
validation_data_loader = data.DataLoader(dataset=validation_dataset, batch_size=1024)


## Exploring transformed images

In [None]:
# One row per training digit: the first column shows the untouched pixels, the
# remaining five columns show independent random augmentations of that digit.
fig, axes = plt.subplots(nrows=5, ncols=6, sharex=True, sharey=True, figsize=(20, 20))
for i, _row_axes in enumerate(axes):
    for j, _ax in enumerate(_row_axes):
        if j == 0:
            # raw pixels straight from the training array
            _image = training_features[i].reshape((28, 28))
        else:
            # every dataset access re-applies the random affine transform
            _image = training_dataset[i][0][0]
        _ = _ax.imshow(_image, cmap="gray")

        # only the top row carries column titles
        if i == 0:
            _ax.set_title("Original Digit" if j == 0 else f"Augmented Digit {j}")

fig.tight_layout()

## Train a CNN

In [None]:
def partial_fit(model_fn, loss_fn, X_batch, y_batch, opt):
    """Run a single optimisation step on one mini-batch.

    Parameters
    ----------
    model_fn : callable
        The model; called as ``model_fn(X_batch)`` to produce predictions.
    loss_fn : callable
        Called as ``loss_fn(predictions, y_batch)``; must return a tensor that
        supports ``backward()``.
    X_batch, y_batch :
        Features and targets of the mini-batch.
    opt :
        Optimizer exposing ``zero_grad()`` and ``step()``.
    """
    # Reset gradients *before* the backward pass so that anything left over from
    # an earlier backward() call (outside this function) cannot leak into this
    # update.
    opt.zero_grad()

    # forward pass
    loss = loss_fn(model_fn(X_batch), y_batch)

    # back propagation followed by the parameter update
    loss.backward()
    opt.step()


def fit(model_fn, loss_fn, training_data_loader, opt, validation_data_loader=None, number_epochs=1):
    """Train ``model_fn`` and optionally report the validation loss after each epoch.

    Parameters
    ----------
    model_fn :
        Model exposing ``train()`` / ``eval()`` and callable on a feature batch.
    loss_fn : callable
        Called as ``loss_fn(predictions, targets)``.
    training_data_loader : iterable
        Yields ``(X_batch, y_batch)`` pairs; each pair is passed to ``partial_fit``.
    opt :
        Optimizer forwarded to ``partial_fit``.
    validation_data_loader : iterable, optional
        Yields ``(X, y)`` pairs. When given, the sample-weighted mean of the
        per-batch losses is printed after every epoch.
    number_epochs : int
        Number of passes over ``training_data_loader``.
    """
    for epoch in range(number_epochs):
        model_fn.train()
        for X_batch, y_batch in training_data_loader:
            partial_fit(model_fn, loss_fn, X_batch, y_batch, opt)

        if validation_data_loader is None:
            continue

        # compute validation loss after each training epoch
        model_fn.eval()
        weighted_loss_sum, sample_count = 0.0, 0
        with torch.no_grad():
            for X, y in validation_data_loader:
                batch_size = len(X)
                # .item() yields a plain Python float, so no implicit
                # tensor -> NumPy conversion is needed (that conversion fails
                # for tensors that do not live on the CPU).
                weighted_loss_sum += loss_fn(model_fn(X), y).item() * batch_size
                sample_count += batch_size

        # An empty validation loader has no meaningful loss; report NaN rather
        # than crashing.
        validation_loss = weighted_loss_sum / sample_count if sample_count else float("nan")
        print(f"Training epoch: {epoch}, Validation loss: {validation_loss}")


In [None]:
class LambdaLayer(nn.Module):
    """Module wrapper that lets an arbitrary callable be used as a layer.

    The callable ``f`` is stored at construction time and applied to the input
    on every forward pass.
    """

    def __init__(self, f):
        super().__init__()
        self._f = f

    def forward(self, X):
        """Apply the wrapped callable to ``X`` and return its result."""
        wrapped = self._f
        return wrapped(X)
    

In [None]:
# Seed torch's global RNG so that the weight initialisation below (and later
# draws from the same generator) is repeatable, mirroring the seeded
# train/validation split.
torch.manual_seed(42)

# LeNet-5 style network for 1x28x28 inputs.
lenet5 = nn.Sequential(
    nn.Conv2d(1, 6, kernel_size=5),    # 1x28x28 -> 6x24x24
    nn.ReLU(),
    nn.MaxPool2d(2),                   # 6x24x24 -> 6x12x12
    nn.Conv2d(6, 16, kernel_size=5),   # 6x12x12 -> 16x8x8
    nn.ReLU(),
    nn.MaxPool2d(2),                   # 16x8x8  -> 16x4x4
    LambdaLayer(lambda X: X.view(X.size(0), -1)),  # flatten to (batch, 16 * 4 * 4)
    nn.Linear(256, 120),               # 256 == 16 * 4 * 4
    nn.ReLU(),
    nn.Linear(120, 84),
    nn.ReLU(),
    nn.Linear(84, 10)                  # one output (logit) per digit class
)

# The network outputs raw scores; CrossEntropyLoss is applied directly to them.
loss_fn = nn.CrossEntropyLoss()

# Adam with its default hyper-parameters.
opt = optim.Adam(lenet5.parameters())

In [None]:
# Train for 100 epochs; the validation loss is printed after each one.
fit(
    model_fn=lenet5,
    loss_fn=loss_fn,
    training_data_loader=training_data_loader,
    opt=opt,
    validation_data_loader=validation_data_loader,
    number_epochs=100,
)

## Use our trained model to make predictions using the test data

In [None]:
# Load the test CSV (first row skipped), divide every value by 255 in float32
# and wrap the resulting array in a tensor.
_testing_features = np.loadtxt(
    "../data/raw/mnist/test.csv",
    dtype=np.int64,
    skiprows=1,
    delimiter=',',
)
_scaled_testing_features = np.divide(
    _testing_features,
    255,
    dtype=np.float32,
)
scaled_testing_features_tensor = torch.from_numpy(_scaled_testing_features)

In [None]:
# Inference only: make sure the model is in evaluation mode and disable
# autograd so that no computation graph is recorded for the test set.
lenet5.eval()
with torch.no_grad():
    output = lenet5(scaled_testing_features_tensor.view(-1, 1, 28, 28))

# The predicted digit is the index of the largest of the 10 output scores.
predictions = torch.argmax(output, dim=1)

### Visually check model predictions

In [None]:
# Show the first 25 test digits in a 5x5 grid (filled row by row), each titled
# with the label the model predicted for it.
fig, axes = plt.subplots(nrows=5, ncols=5, sharex=True, sharey=True, figsize=(20, 20))
for idx, _ax in enumerate(axes.flat):
    _ = _ax.imshow(scaled_testing_features_tensor[idx].reshape((28, 28)), cmap="gray")
    _ax.set_title(f"Predicted digit: {predictions[idx]}")
fig.tight_layout()

### Reformat predictions

In [None]:
# Print the first lines of the sample submission file to see the CSV layout
# that a Kaggle submission is expected to follow.
!head ../data/raw/mnist/sample_submission.csv

In [None]:
# Write the predictions to a timestamped CSV with "ImageId" and "Label" columns.
_submissions_dir = "../data/kaggle-submissions/mnist/"
# exist_ok=True makes the cell safe to re-run without a separate isdir() check.
os.makedirs(_submissions_dir, exist_ok=True)

timestamp = time.strftime("%Y%m%d-%H%M%S")
number_predictions, = predictions.shape
# Hand pandas a plain NumPy array rather than a torch tensor so that the
# "Label" column is an ordinary integer column.
_predicted_labels = predictions.cpu().numpy()
df = pd.DataFrame({
    "ImageId": range(1, number_predictions + 1),  # image ids are 1-based
    "Label": _predicted_labels,
})
df.to_csv(f"{_submissions_dir}submission-{timestamp}.csv", index=False)

### Submit to Kaggle!

Once you have successfully submitted your predictions, you can check the [Digit-Recognizer competition](https://www.kaggle.com/c/digit-recognizer) website to see how well your best model compares with those of your peers.

In [None]:
%%bash
# Placeholders only: substitute your own Kaggle credentials before running and
# never commit the real values with the notebook.
export KAGGLE_USERNAME="YOUR_USERNAME"
export KAGGLE_KEY="YOUR_API_KEY"
# Submit the last file listed by `ls` (selected with `tail -n 1`); with the
# timestamped submission-*.csv names this is presumably the most recent one.
kaggle competitions submit digit-recognizer \
  -f $(ls ../data/kaggle-submissions/mnist/submission-*.csv | tail -n 1) \
  -m "My first digit recognizer submission!"