# MNIST from scratch with numpy

In [4]:
# load changed modules automatically
# XXX: doesn't seem to work, fix this
%load_ext autoreload
%autoreload = 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# load numpy
import numpy as np

# load dataloaders and required layers
from mnist import dataloader
from mnist import layers
from mnist.layers import Softmax, Linear

# load pyplot for displaying images
from matplotlib import pyplot as plt

# show images inline on notebook
%matplotlib inline

# debugging
import pdb

## Load training data

In [6]:
dl = dataloader.DataLoader()
((x_train, y_train), (x_valid, y_valid), _) = dl.load_data()

In [7]:
x_train.shape, y_train.shape, x_valid.shape, y_valid.shape

((50000, 784), (50000,), (10000, 784), (10000,))

## Normalize data

In [8]:
(x_train, y_train), (x_valid, y_valid) = dl.normalize(((x_train, y_train), (x_valid, y_valid)))

## Exploring the data

In [None]:
def show(img):
    plt.imshow(img, cmap="gray")

In [None]:
valid_images = np.reshape(x_valid, (-1,28,28))

In [None]:
show(valid_images[0])

## Start building the model

In [9]:
def shuffle(arr1, arr2):
    random_idxs = np.arange(len(arr1))
    np.random.shuffle(random_idxs)
    return x_train[random_idxs], y_train[random_idxs]

In [17]:
class Net():

    def __init__(self):
        self.layer1 = Linear(28*28, 10) # linear layer with bias
        self.softmax = Softmax()
    
    def forward(self, x):
        x = self.layer1.forward(x)
        x = self.softmax.forward(x)
        return x

net = Net()

In [20]:
n_epochs = 10
batch_size = 128
learning_rate = 1e-6
debug = False
prev_loss = None

for epoch in range(n_epochs):
    
    for i in range(0, len(x_train), batch_size):

        n = batch_size
        inputs = x_train[i:i+n]
        targets = y_train[i:i+n]
        
        inputs, targets = shuffle(inputs, targets)
            
        if debug:
            print("inputs.shape", inputs.shape)
            print("targets.shape", targets.shape)

        # forward propagation
        y_pred = net.forward(inputs)
        
        if debug:
            print("y_pred.shape:", y_pred.shape)
        
        # calculate cross-entropy loss
        loss = net.softmax.cross_entropy(y_pred, targets)
        
        if debug:
            print(epoch, loss)
        
        # backpropagation
        
        # get the predictions
        predictions = np.diag(y_pred[:, targets])
        grad_loss = predictions - targets
        if debug:
            print("predictions.shape", predictions.shape)
            print("grad_loss.shape", grad_loss.shape)
        
        # compute layer1 gradient
        # XXX: How???
        grad_layer1 = inputs.T @ grad_loss
        grad_bias = np.mean(grad_loss)
        
        # XXX: Figure out how to implement this, maybe it's easier when
        #      just calling layers' backward method...
        #grad_layer1, grad_bias = net.layer1.backward(inputs)
        
        if debug:
            print("grad_layer1.shape", grad_layer1.shape)
            print("net.layer1.weights.shape", net.layer1.weights.shape)
            print("net.layer1.bias.shape", net.layer1.bias.shape)
        
        net.layer1.weights -= learning_rate * grad_layer1
        net.layer1.bias -= learning_rate * grad_bias
        
    # calculate validation loss for some random indices
    random_idxs = np.random.randint(0, len(x_valid), batch_size)
    y_valid_pred = net.forward(x_valid[random_idxs])
    loss_valid = net.softmax.cross_entropy(y_valid_pred, y_valid[random_idxs])
    
    #calculate accuracy and validation accuracy
    accuracy = np.mean(y_pred.argmax(axis=1) == targets)
    valid_accuracy = np.mean(y_valid_pred.argmax(axis=1) == y_valid[random_idxs])
    
    print(epoch, loss, loss_valid, accuracy, valid_accuracy)

    if prev_loss is None:
        prev_loss = loss
    else:
        if loss > prev_loss:
            print("EPOCH", epoch, "ITS BIGGER")
            #pdb.set_trace()


0 30.163867417706467 57.75674351750538 0.15 0.1015625
1 57.07530770697142 60.9413864519667 0.15 0.171875
EPOCH 1 ITS BIGGER
2 84.09682128223903 140.2615951418866 0.15 0.1015625
EPOCH 2 ITS BIGGER
3 111.18219486323062 148.38698424889355 0.15 0.1171875
EPOCH 3 ITS BIGGER
4 138.30605967027725 146.41257653072802 0.15 0.1640625
EPOCH 4 ITS BIGGER
5 165.4528919005424 222.3960714939671 0.15 0.1015625
EPOCH 5 ITS BIGGER
6 192.61351504907424 286.70239640631564 0.15 0.140625
EPOCH 6 ITS BIGGER
7 219.7825109734018 240.26942109218515 0.15 0.1328125
EPOCH 7 ITS BIGGER
8 246.95664082206696 nan 0.15 0.0859375
EPOCH 8 ITS BIGGER


  return np.exp(x)/np.sum(np.exp(x))
  return np.exp(x)/np.sum(np.exp(x))
  return -np.mean(np.log(out))


9 nan nan 0.125 0.125


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
