Based on the PyTorch tutorial [What is torch.nn really?](https://pytorch.org/tutorials/beginner/nn_tutorial.html)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn import model_selection, preprocessing
import torch

In [None]:
%matplotlib notebook

# Getting the MNIST data

## Download the data from Kaggle...

### 1. Register for an account with Kaggle

In order to download Kaggle competition data you will first need to create a [Kaggle](https://www.kaggle.com/) account.

### 2. Create an API key

Once you have registered for a Kaggle account you will need to create some [API credentials](https://github.com/Kaggle/kaggle-api#api-credentials) in order to be able to use the `kaggle` CLI to download data.

### 3. Download the Data

Execute the code in the following cell to download the Kaggle [Digit Recognizer: Learn computer vision with the famous MNIST data](https://www.kaggle.com/c/digit-recognizer) competition data. In order for the following Kaggle API call to work you will need to log in to your Kaggle account and accept the rules for this competition.

In [None]:
# Shell command: fetch the digit-recognizer competition files (-c) into the
# raw-data directory (-p). Needs the `kaggle` CLI and the API credentials from step 2.
!kaggle competitions download \
    -c digit-recognizer \
    -p ../data/raw/mnist/

## ...or not!

If you don't want to set up an account with Kaggle, then no worries! I have included the training and testing data sets for you.

    ../data/raw/mnist/train.csv
    ../data/raw/mnist/test.csv
    

# Load the MNIST data

In [None]:
# Shell command: peek at the first lines of the training CSV before parsing it.
!head ../data/raw/mnist/train.csv

In [None]:
# Parse the training CSV into one integer array; the first line (the header) is skipped.
mnist_arr = np.loadtxt(
    "../data/raw/mnist/train.csv",
    dtype=np.int64,
    delimiter=',',
    skiprows=1,
)

In [None]:
# raw features are between 0 and 255
# (min/max are taken over the whole array, so the label column is included too)
mnist_arr.min(), mnist_arr.max()

## Split the MNIST data into training and validation sets

In [None]:
# Hold out 20% of the rows for validation; the seeded generator makes the split repeatable.
prng = np.random.RandomState(42)
training_arr, validation_arr = model_selection.train_test_split(
    mnist_arr,
    test_size=0.20,
    random_state=prng,
)

In [None]:
# (rows, columns) of the training split; the label column is still attached here.
training_arr.shape

In [None]:
# Column 0 is used as the target; every remaining column is a feature.
training_target = training_arr[:, 0]
training_features = training_arr[:, 1:]

In [None]:
# (rows, columns) of the held-out validation split.
validation_arr.shape

In [None]:
# Same column layout as the training split: target first, features after.
validation_target = validation_arr[:, 0]
validation_features = validation_arr[:, 1:]

## Need to rescale the raw data

In [None]:
# Fit the scaler on the training features only and reuse that same scaling for
# the validation features. Re-fitting on the validation set would scale the two
# sets inconsistently and leak validation statistics into the preprocessing.
min_max_scaler = preprocessing.MinMaxScaler()
scaled_training_features = min_max_scaler.fit_transform(training_features)
# A validation pixel can exceed the range seen in training (e.g. a column that
# was constant in the training split), so clip back into the scaler's [0, 1] range.
scaled_validation_features = np.clip(min_max_scaler.transform(validation_features), 0.0, 1.0)

## Check out a training sample

In [None]:
# Each sample is a flattened image: reshape the first one to 28x28 to display it.
_, ax = plt.subplots(nrows=1, ncols=1)
_ = ax.imshow(
    scaled_training_features[0].reshape((28, 28)),
    cmap="gray",
)

PyTorch operates on `torch.Tensor` objects rather than `numpy.ndarray`, so we need to convert the data.

In [None]:
# Targets keep the integer dtype of the source array (they are class indices).
training_target, validation_target = (
    torch.tensor(training_target),
    torch.tensor(validation_target),
)

# Features are cast explicitly to float32.
scaled_training_features, scaled_validation_features = (
    torch.tensor(scaled_training_features, dtype=torch.float32),
    torch.tensor(scaled_validation_features, dtype=torch.float32),
)

In [None]:
# Display the converted feature tensor.
scaled_training_features

In [None]:
# After min-max scaling fitted on the training data, the largest value should be 1.
scaled_training_features.max()

In [None]:
# Confirm the explicit cast took effect (expect torch.float32).
scaled_training_features.dtype

In [None]:
# Display the target tensor: one integer class label per training sample.
training_target

# Neural network from scratch

In [None]:
import math

In [None]:
# Problem dimensions come straight from the (samples, features) training tensor.
number_samples, number_features = scaled_training_features.shape

# using Xavier initialization (divide weights by sqrt(number_features));
# gradient tracking is switched on only after the scaling so that `weights`
# itself is the leaf tensor (trailing underscore indicates in-place operation)
weights = (torch.randn(number_features, 10) / math.sqrt(number_features)).requires_grad_()
bias = torch.zeros(10).requires_grad_()

In [None]:
def _linear_transformation(X):
    # affine map using the notebook-level `weights` and `bias` tensors
    projected = torch.matmul(X, weights)
    return projected + bias

def _log_softmax_activation(X):
    # log_softmax(x) = x - log(sum(exp(x))), taken over the last dimension.
    # torch.logsumexp evaluates the second term in a numerically stable way;
    # exponentiating directly overflows to inf for large scores, which turned
    # every output in that row into -inf.
    return X - torch.logsumexp(X, dim=-1, keepdim=True)
    
def model(X):
    # linear layer followed by log-softmax: returns log-probabilities per class
    return _log_softmax_activation(_linear_transformation(X))

In [None]:
# Smoke test: run the (still untrained) model on the first mini-batch.
batch_size = 64
output = model(scaled_training_features[:batch_size])

In [None]:
# Log-probabilities over the 10 classes for the second sample in the batch.
output[1]

In [None]:
# Expect (batch_size, 10): one row of class scores per sample.
output.shape

In [None]:
def negative_log_likelihood(output, target):
    """Mean negative log-likelihood of the target classes.

    `output` is a 2-D tensor of per-class log-probabilities (one row per
    sample) and `target` holds the class index of each row.
    """
    number_rows, _ = output.shape
    row_index = torch.arange(number_rows, device=output.device)
    # pick, for every row, the log-probability assigned to its true class
    target_log_probabilities = output[row_index, target]
    return -target_log_probabilities.mean()
    

In [None]:
# Loss of the untrained model on the first mini-batch.
negative_log_likelihood(output, training_target[:batch_size])

In [None]:
def accuracy(output, target):
    """Fraction of rows whose highest-scoring class equals the target class."""
    predicted_classes = output.argmax(dim=1)
    is_correct = torch.eq(predicted_classes, target)
    return is_correct.float().mean()

In [None]:
# Slice with batch_size (not a literal 64) so the targets always line up with `output`.
accuracy(output, training_target[:batch_size])

In [None]:
number_epochs = 2
# ceiling division: the final batch may hold fewer than batch_size samples
number_batches = (number_samples - 1) // batch_size + 1

learning_rate = 0.5
for epoch in range(number_epochs):
    for batch in range(number_batches):
        # forward pass
        # (slicing past the end of the tensor simply yields a shorter last batch)
        start = batch * batch_size
        X = scaled_training_features[start:(start + batch_size)]
        y = training_target[start:(start + batch_size)]
        output = model(X)
        loss = negative_log_likelihood(output, y)
        
        # back propagation
        # backward() fills weights.grad and bias.grad
        loss.backward()
        # the manual gradient-descent step must not itself be recorded by autograd
        with torch.no_grad():
            weights -= learning_rate * weights.grad
            bias -= learning_rate * bias.grad
            # gradients accumulate across backward() calls, so reset them each step
            weights.grad.zero_()
            bias.grad.zero_()
            

In [None]:
# Loss and accuracy after training, measured on the last mini-batch that the
# training loop left in X and y.
negative_log_likelihood(model(X), y), accuracy(model(X), y)

# Refactor using `torch.nn.functional`

In [None]:
import torch.nn.functional as F

In [None]:
def model(X):
    # raw class scores only: no activation is applied here any more
    scores = torch.matmul(X, weights)
    return scores + bias

# F.cross_entropy combines log-softmax and negative log-likelihood in one call,
# so it is applied directly to the raw scores returned by `model`.
loss_function = F.cross_entropy

In [None]:
# Same mini-batch and same parameters as before the refactor, so loss and
# accuracy should match the values computed by the from-scratch version.
loss_function(model(X), y), accuracy(model(X), y)

# Refactor using `torch.nn.Module`

In [None]:
from torch import nn


class MNISTLogisticRegression(nn.Module):
    """Multinomial logistic regression with hand-declared parameters.

    number_features: size of each flattened input row (default 784 = 28 * 28).
    number_classes: number of output scores per row (default 10).

    `forward` returns raw class scores; no softmax is applied.
    """

    def __init__(self, number_features=784, number_classes=10):
        super().__init__()
        # nn.Parameter registers the tensors so that .parameters() finds them;
        # weights use the same 1 / sqrt(number_features) scaling as before
        self._weights = nn.Parameter(
            torch.randn(number_features, number_classes) / math.sqrt(number_features)
        )
        self._bias = nn.Parameter(torch.zeros(number_classes))

    def forward(self, X):
        return X @ self._weights + self._bias
    


In [None]:
# Fresh, untrained instance; this rebinds `model`, which was a plain function until now.
model = MNISTLogisticRegression()

In [None]:
# Loss of the untrained module on the mini-batch still held in X and y.
loss_function(model(X), y)

In [None]:
def fit(model, loss_function, number_samples, number_epochs=2, batch_size=64):
    """Train `model` with hand-written mini-batch gradient descent.

    Batches are consecutive slices of the notebook-level
    `scaled_training_features` and `training_target` tensors, and the step
    size is the notebook-level `learning_rate`.
    """
    for _ in range(number_epochs):
        # stepping by batch_size visits the same start offsets as counting
        # ceil(number_samples / batch_size) batches
        for start in range(0, number_samples, batch_size):
            stop = start + batch_size

            # forward pass
            X = scaled_training_features[start:stop]
            y = training_target[start:stop]
            loss = loss_function(model(X), y)

            # back propagation
            loss.backward()
            with torch.no_grad():
                for parameter in model.parameters():
                    parameter.sub_(learning_rate * parameter.grad)
                model.zero_grad()


In [None]:
# Train with the defaults (2 epochs, batches of 64).
fit(model, loss_function, number_samples)

In [None]:
# Loss after training; X and y are still the notebook-level mini-batch
# (the X and y inside fit are local to that function).
loss_function(model(X), y)

# Refactoring using `nn.Linear`

In [None]:
from torch import nn


class MNISTLogisticRegression(nn.Module):
    """Multinomial logistic regression built on `nn.Linear`.

    number_features: size of each flattened input row (default 784 = 28 * 28).
    number_classes: number of output scores per row (default 10).

    `forward` returns raw class scores; no softmax is applied.
    """

    def __init__(self, number_features=784, number_classes=10):
        super().__init__()
        # nn.Linear creates and initialises the weight and bias itself
        self._linear_layer = nn.Linear(number_features, number_classes)

    def forward(self, X):
        return self._linear_layer(X)
    


In [None]:
# Fresh, untrained instance of the nn.Linear-based version.
model = MNISTLogisticRegression()

In [None]:
# Loss before training, on the notebook-level mini-batch X, y.
loss_function(model(X), y)

In [None]:
# Train with the defaults (2 epochs, batches of 64).
fit(model, loss_function, number_samples)

In [None]:
# Loss after training, on the same notebook-level mini-batch X, y.
loss_function(model(X), y)

# Refactoring using `torch.optim`

In [None]:
from torch import optim

In [None]:
def fit(model, loss_function, optimizer, number_samples, number_epochs=2, batch_size=64):
    """Train `model`, delegating the parameter update to `optimizer`.

    Batches are consecutive slices of the notebook-level
    `scaled_training_features` and `training_target` tensors.
    """
    for _ in range(number_epochs):
        # stepping by batch_size visits the same start offsets as counting
        # ceil(number_samples / batch_size) batches
        for start in range(0, number_samples, batch_size):
            stop = start + batch_size

            # forward pass
            X = scaled_training_features[start:stop]
            y = training_target[start:stop]
            loss = loss_function(model(X), y)

            # back propagation, update, then clear gradients for the next batch
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()


In [None]:
# Plain SGD with the same step size (0.5) that the hand-written update used.
fit(model,
    F.cross_entropy,
    optim.SGD(model.parameters(), lr=0.5),
    number_samples)

In [None]:
# Loss on the notebook-level mini-batch X, y after the additional training.
F.cross_entropy(model(X), y)

# Refactor using `TensorDataset`

In [None]:
from torch.utils import data

In [None]:
# Bundle features and targets so a single slice returns both (see the X, y unpacking in fit).
training_data = data.TensorDataset(scaled_training_features, training_target)

In [None]:
def fit(model, loss_function, optimizer, number_samples, number_epochs=2, batch_size=64):
    """Train `model` on consecutive slices of the notebook-level `training_data`."""
    for _ in range(number_epochs):
        # stepping by batch_size visits the same start offsets as counting
        # ceil(number_samples / batch_size) batches
        for start in range(0, number_samples, batch_size):
            # forward pass: one slice of the dataset yields features and targets together
            X, y = training_data[start:(start + batch_size)]
            loss = loss_function(model(X), y)

            # back propagation, update, then clear gradients for the next batch
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()


In [None]:
# Same call as before; only the batching inside fit changed.
fit(model,
    F.cross_entropy,
    optim.SGD(model.parameters(), lr=0.5),
    number_samples)

In [None]:
# Loss on the notebook-level mini-batch X, y after the additional training.
F.cross_entropy(model(X), y)

# Refactor using `DataLoader`

In [None]:
# The loader takes over batching; shuffle=True reorders the samples on every pass.
training_data_loader = data.DataLoader(training_data, batch_size=batch_size, shuffle=True)

In [None]:
def fit(model, loss_function, optimizer, data_loader, number_epochs=2):
    """Run `number_epochs` passes over `data_loader`, one optimizer step per batch."""
    for _ in range(number_epochs):
        for features, target in data_loader:
            # forward pass
            loss = loss_function(model(features), target)

            # back propagation, update, then clear gradients for the next batch
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()


In [None]:
# The data loader replaces the number_samples argument.
fit(model,
    F.cross_entropy,
    optim.SGD(model.parameters(), lr=0.5),
    training_data_loader)

In [None]:
# Loss on the notebook-level mini-batch X, y after the additional training.
F.cross_entropy(model(X), y)

# Adding Validation

In [None]:
validation_data = data.TensorDataset(scaled_validation_features, validation_target)
# No shuffling for validation, and batches twice the training size.
validation_data_loader = data.DataLoader(validation_data, batch_size=2*batch_size)

In [None]:
def fit(model, loss_function, optimizer, training_data_loader, validation_data_loader=None, number_epochs=2):
    """Train `model` and optionally report the validation loss after each epoch.

    model: module to train; switched to train mode for the training pass and
        to eval mode for the validation pass.
    loss_function: callable `(output, target) -> scalar tensor`. The
        per-batch value is weighted by batch size, i.e. it is treated as a
        mean over the batch.
    optimizer: optimizer already bound to the model's parameters.
    training_data_loader: iterable of `(X, y)` training batches.
    validation_data_loader: optional iterable of `(X, y)` validation batches.
        When given, `epoch validation_loss` is printed after every epoch.
    number_epochs: number of passes over the training data.

    Returns None.
    """
    for epoch in range(number_epochs):
        model.train()
        for X, y in training_data_loader:
            loss = loss_function(model(X), y)

            # back propagation
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        if validation_data_loader is None:
            continue

        # compute validation loss after each training epoch
        model.eval()
        total_loss, total_samples = 0.0, 0
        with torch.no_grad():
            for X, y in validation_data_loader:
                # .item() yields a Python float whatever device the loss is on;
                # handing CUDA tensors to NumPy raises a TypeError
                total_loss += loss_function(model(X), y).item() * len(X)
                total_samples += len(X)
        # an empty validation loader reports nan instead of failing
        validation_loss = total_loss / total_samples if total_samples else float("nan")
        print(epoch, validation_loss)


In [None]:
# Passing the validation loader makes fit print the validation loss per epoch.
fit(model,
    F.cross_entropy,
    optim.SGD(model.parameters(), lr=0.5),
    training_data_loader,
    validation_data_loader)

# Switching to CNN

In [None]:
class MNISTCNN(nn.Module):
    """Three stride-2 convolutions followed by average pooling.

    `forward` accepts any input that can be viewed as (N, 1, 28, 28) and
    returns one row of 10 class scores per image.
    """

    def __init__(self):
        super().__init__()
        self._conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=2, padding=1)
        self._conv2 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, stride=2, padding=1)
        self._conv3 = nn.Conv2d(in_channels=16, out_channels=10, kernel_size=3, stride=2, padding=1)

    def forward(self, X):
        # implicit knowledge of MNIST data shape!
        hidden = X.view(-1, 1, 28, 28)
        for convolution in (self._conv1, self._conv2, self._conv3):
            hidden = F.relu(convolution(hidden))
        pooled = F.avg_pool2d(hidden, 4)
        # drop the remaining spatial dimensions: one row of channel values per image
        return pooled.view(-1, pooled.size(1))
    

In [None]:
model = MNISTCNN()
# SGD with momentum, and a smaller learning rate than the 0.5 used for the linear model
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

fit(model, F.cross_entropy, optimizer, training_data_loader, validation_data_loader)

# Refactor using `nn.Sequential`

In [None]:
class LambdaLayer(nn.Module):
    """Module wrapper that lets an arbitrary callable be used as a layer.

    The callable `f` is stored as-is and applied to the input in `forward`;
    the layer has no parameters of its own.
    """

    def __init__(self, f):
        super().__init__()
        self._f = f

    def forward(self, X):
        transformed = self._f(X)
        return transformed
    


In [None]:
# Same architecture as MNISTCNN, expressed as a flat list of layers.
model = nn.Sequential(
    # reshape flat pixel rows into single-channel 28x28 images
    LambdaLayer(lambda X: X.view(-1, 1, 28, 28)),
    nn.Conv2d(1, 16, kernel_size=3, stride=2, padding=1),   # 28x28 -> 14x14
    nn.ReLU(),
    nn.Conv2d(16, 16, kernel_size=3, stride=2, padding=1),  # 14x14 -> 7x7
    nn.ReLU(),
    nn.Conv2d(16, 10, kernel_size=3, stride=2, padding=1),  # 7x7 -> 4x4
    nn.ReLU(),
    nn.AvgPool2d(4),                                        # 4x4 -> 1x1
    # flatten to (batch, channels)
    LambdaLayer(lambda X: X.view(X.size(0), -1))
)

In [None]:
# Train the Sequential model with the same optimizer settings as the MNISTCNN class.
fit(model,
    F.cross_entropy,
    optim.SGD(model.parameters(), lr=0.1, momentum=0.9),
    training_data_loader,
    validation_data_loader)

# Wrapping DataLoader

In [None]:
class WrappedDataLoader:
    """Iterable adapter that passes every batch of a data loader through `f`.

    `f` is called with the unpacked batch (`f(*batch)`) and its return value
    is what iteration yields. `len()` is delegated to the wrapped loader.
    """

    def __init__(self, data_loader, f):
        self._f = f
        self._data_loader = data_loader

    def __len__(self):
        return len(self._data_loader)

    def __iter__(self):
        # the generator expression is lazy, so batches are transformed one at a time
        yield from (self._f(*batch) for batch in self._data_loader)


In [None]:
def preprocess(X, y):
    # reshape flat pixel rows into (N, 1, 28, 28) image batches; targets pass through
    return X.view(-1, 1, 28, 28), y


# From here on the loaders hand out image-shaped batches.
training_data_loader = WrappedDataLoader(training_data_loader, preprocess)
validation_data_loader = WrappedDataLoader(validation_data_loader, preprocess)

In [None]:
# No reshaping layer at the front: the wrapped loaders already deliver
# (N, 1, 28, 28) batches.
model = nn.Sequential(
    nn.Conv2d(1, 16, kernel_size=3, stride=2, padding=1),
    nn.ReLU(),
    nn.Conv2d(16, 16, kernel_size=3, stride=2, padding=1),
    nn.ReLU(),
    nn.Conv2d(16, 10, kernel_size=3, stride=2, padding=1),
    nn.ReLU(),
    # pools to 1x1 whatever the incoming spatial size, so no window size is hard-coded
    nn.AdaptiveAvgPool2d(1),
    # flatten to (batch, channels)
    LambdaLayer(lambda X: X.view(X.size(0), -1))
)

In [None]:
# Train the input-shape-agnostic model on the wrapped (pre-reshaped) loaders.
fit(model,
    F.cross_entropy,
    optim.SGD(model.parameters(), lr=0.1, momentum=0.9),
    training_data_loader,
    validation_data_loader)

# Using GPU

In [None]:
# The cells below hard-code "cuda", so they only work if this is True.
torch.cuda.is_available()

In [None]:
# Reshape as before and additionally move every batch to the GPU.
preprocess = lambda X, y: (X.view(-1, 1, 28, 28).to("cuda"), y.to("cuda"))
# NOTE(review): both loaders are already WrappedDataLoader instances at this
# point, so this nests a second wrapper around the first (the reshape runs twice).
training_data_loader = WrappedDataLoader(training_data_loader, preprocess)
validation_data_loader = WrappedDataLoader(validation_data_loader, preprocess)

In [None]:
# model.to("cuda") is evaluated before the optimizer argument, so the optimizer
# is built from the parameters after the move to the GPU.
fit(model.to("cuda"),
    F.cross_entropy,
    optim.SGD(model.parameters(), lr=0.1, momentum=0.9),
    training_data_loader,
    validation_data_loader,
    number_epochs=5)

# Submitting to Kaggle

## Re-train the model using the entire training set

In [None]:
# Use every labelled row this time (no validation hold-out).
training_target, training_features = mnist_arr[:, 0], mnist_arr[:, 1:]
# the scaler is re-fitted here on the full training set
scaled_training_features = min_max_scaler.fit_transform(training_features)
scaled_training_features_tensor = torch.tensor(scaled_training_features, dtype=torch.float32)
training_target_tensor = torch.tensor(training_target)

training_data = data.TensorDataset(scaled_training_features_tensor, training_target_tensor)
training_data_loader = data.DataLoader(training_data, batch_size=batch_size, shuffle=True)
# `preprocess` is the GPU version: it reshapes each batch and moves it to "cuda"
wrapped_training_data_loader = WrappedDataLoader(training_data_loader, preprocess)


In [None]:
# NOTE(review): `model` is not re-created here, so this continues from the
# weights fitted in the GPU section rather than training from scratch.
# No validation loader is passed, so no validation loss is printed.
fit(model.to("cuda"),
    F.cross_entropy,
    optim.SGD(model.parameters(), lr=0.1, momentum=0.9),
    wrapped_training_data_loader,
    number_epochs=5)

In [None]:
# submission format for Kaggle: peek at the sample file that ships with the competition data
!head ../data/raw/mnist/sample_submission.csv

## Use trained model to make predictions using the test data

In [None]:
# Parse the test CSV the same way as the training CSV (header line skipped);
# every column is used as a feature.
testing_features = np.loadtxt(
    "../data/raw/mnist/test.csv",
    dtype=np.int64,
    delimiter=',',
    skiprows=1,
)

In [None]:
# Reuse the scaling fitted on the training data. Re-fitting on the test set
# would feed the model inputs scaled differently from those it was trained on.
# A test pixel can exceed the range seen in training (e.g. a column that was
# constant in the training data), so clip back into the scaler's [0, 1] range.
scaled_testing_features = np.clip(min_max_scaler.transform(testing_features), 0.0, 1.0)
scaled_testing_features = torch.tensor(scaled_testing_features, dtype=torch.float32)

In [None]:
# Inference only: switch to evaluation mode and disable gradient tracking so
# no autograd graph is kept for the whole test set.
model.eval()
with torch.no_grad():
    output = model(scaled_testing_features.view(-1, 1, 28, 28).to("cuda"))
# predicted digit = index of the highest class score in each row
predictions = torch.argmax(output, dim=1)

In [None]:
# predictions is 1-D, so its shape unpacks to exactly one value
(number_predictions,) = predictions.shape

In [None]:
import time

import pandas as pd


# Timestamped file name so repeated runs do not overwrite earlier submissions.
timestamp = time.strftime("%Y%m%d-%H%M%S")
# Convert the predictions to a NumPy array explicitly instead of relying on
# how pandas happens to interpret a torch tensor column.
df = pd.DataFrame({
    "ImageId": range(1, number_predictions + 1),  # ids are 1-based
    "Label": predictions.cpu().numpy(),
})
df.to_csv(f"../data/kaggle-submissions/mnist/submission-{timestamp}.csv", index=False)

In [None]:
# IPython expands {timestamp} from the Python namespace, so this submits the
# file written by the previous cell instead of a hard-coded, stale file name.
!kaggle competitions submit digit-recognizer -f ../data/kaggle-submissions/mnist/submission-{timestamp}.csv -m "My first ever Kaggle submission!"