In [0]:
import numpy as np
# Seed NumPy's global RNG so that draws made through np.random (for example the
# np.random.normal weight initialisation in MLP.__init__) repeat across runs.
# This call seeds NumPy only; it does not touch any other library's generator.
np.random.seed(0)

In [0]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets

In [0]:
# http://pytorch.org/
# Bootstrap cell: build the wheel platform tag and CUDA flavour, then pip-install
# a pinned PyTorch 0.4.1 wheel together with torchvision.
# NOTE(review): an earlier cell already runs `import torch`, so on a fresh kernel
# this install would have to execute first — confirm the intended cell order.
from os.path import exists
# NOTE(review): presumably `wheel.pep425tags` is unavailable in recent `wheel`
# releases — verify this import against the installed version.
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Tag of the form '<impl><ver>-<abi>'; interpolated into the wheel URL below.
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
# Shell capture: for each cudart.so entry listed by ldconfig, the sed expression
# rewrites a line ending in '.X.Y' to the string 'cuXY'.
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
# Use the first captured CUDA tag only if the /dev/nvidia0 device node exists;
# otherwise fall back to the 'cpu' wheel directory.
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

# NOTE(review): the wheel is fetched over plain http and torchvision carries no
# version pin — confirm both are acceptable.
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision
import torch

In [0]:
class MLP:
    """Single-layer softmax classifier trained with a squared-error loss.

    The weight matrix ``w`` has 785 rows (one bias row followed by 784 input
    features) and 10 columns (one per class). Forward and backward passes are
    written by hand in NumPy; ``backward`` applies one plain gradient-descent
    step per call.
    """

    def __init__(self):
        self.w = np.random.normal(size=(785, 10))
        # Row 0 multiplies the constant-1 bias column that forward() prepends.
        self.w[0, :] = 0
        self.lr = 0.01
        # Nominal mini-batch size. forward() sizes the bias column from the
        # actual input, so this attribute is informational only.
        self.batch_size = 20

    def softmax(self, x):
        """Row-wise softmax of a 2-D array.

        The row maximum is subtracted first so ``np.exp`` cannot overflow;
        this does not change the result.
        """
        x = x - np.max(x, axis=1).reshape((-1, 1))
        x = np.exp(x)
        return x / np.sum(x, axis=1).reshape((-1, 1))

    def softmax_der(self, softmax):
        '''
        Jacobian of the softmax for every row, computed with an explicit loop.

        ``softmax`` is an (N, C) array of softmax outputs; the result is
        (N, C, C) with ``result[n] = diag(s_n) - s_n s_n^T``.

        If you don't know how to calculate it, see:
        https://eli.thegreenplace.net/2016/the-softmax-function-and-its-derivative/
        https://math.stackexchange.com/questions/945871/derivative-of-softmax-loss-function
        https://deepnotes.io/softmax-crossentropy
        https://medium.com/@aerinykim/how-to-implement-the-softmax-derivative-independently-from-any-loss-function-ae6d44363a9d
        https://stackoverflow.com/questions/40575841/numpy-calculate-the-derivative-of-the-softmax-function
        '''
        result = np.zeros((softmax.shape[0], softmax.shape[1], softmax.shape[1]))
        for idx in range(softmax.shape[0]):
            result_tmp = (np.diagflat(softmax[idx]) -
                          np.dot(softmax[idx].reshape((-1, 1)),
                                 softmax[idx].reshape((1, -1))))
            result[idx] = result_tmp

        return result

    def softmax_der_vectorized(self, softmax):
        '''
        Loop-free equivalent of ``softmax_der`` for an (N, C) input.

        http://ajcr.net/Basic-guide-to-einsum/
        https://stackoverflow.com/questions/48627163/construct-n1-dimensional-diagonal-matrix-from-values-in-n-dimensional-array
        '''
        diagonal = np.zeros((*softmax.shape, softmax.shape[-1]), softmax.dtype)
        # einsum('...jj->...j') returns a writable view of each matrix diagonal,
        # so this assignment fills diag(s_n) for every row at once.
        np.einsum('...jj->...j', diagonal)[...] = softmax
        # Batched outer product s_n s_n^T.
        return diagonal - np.einsum('ij,i...->ij...', softmax, softmax)

    def mse_loss(self, predicted, target):
        """Sum of squared differences over every element of the batch."""
        return np.sum(np.square(target - predicted))

    def mse_der(self, predicted, target):
        """Derivative of ``mse_loss`` with respect to ``predicted``.

        A trailing axis of length 1 is appended so the result broadcasts
        against the (N, C, C) softmax Jacobian in ``backward``.
        """
        result = 2 * (predicted - target)
        return result[..., np.newaxis]

    def forward(self, input_data):
        """Return class probabilities for a 2-D batch of flattened inputs.

        A column of ones is prepended for the bias term. The augmented batch
        is cached on ``self._data`` because ``backward`` needs it.
        """
        # Size the bias column from the batch itself rather than from
        # self.batch_size, so batches of any length are accepted.
        bias = np.ones((input_data.shape[0], 1))
        input_data = np.hstack((bias, input_data))
        self._data = input_data
        return self.softmax(input_data @ self.w)

    def backward(self, predicted, target):
        """Apply one gradient-descent step and return the batch loss.

        ``predicted`` must be the output of the most recent ``forward`` call,
        since the cached ``self._data`` from that call is used here. The
        returned loss is computed from ``predicted``, i.e. before the update.
        """
        # Chain rule through the softmax: weight row i of the Jacobian by
        # dL/ds_i, then sum over i to obtain dL/dz for every logit.
        delta = self.mse_der(predicted, target) * self.softmax_der_vectorized(predicted)
        delta = np.sum(delta, axis=1)

        update = np.dot(self._data.T, delta)

        self.w -= self.lr * update
        return self.mse_loss(predicted, target)

In [0]:
# Mini-batch size used by both loaders.
batch_size = 20

# DataLoader(shuffle=True) draws its permutation from torch's global RNG, which
# np.random.seed does not affect; seed it so the batch order is repeatable.
torch.manual_seed(0)

# MNIST train and test splits, downloaded into ./data when not already present.
train_dataset = dsets.MNIST(root='./data', train=True, transform=transforms.ToTensor(), download=True)
test_dataset = dsets.MNIST(root='./data', train=False, transform=transforms.ToTensor(), download=True)

# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [28]:
epochs = 10
mlp = MLP()
print('Train for %d epochs' % epochs)
for epoch in range(epochs):
    print('Epoch %d' % epoch)
    for batch_idx, (data, target) in enumerate(train_loader):
        # Flatten every image to one row. The row count comes from the batch
        # itself rather than assuming each batch holds exactly batch_size samples.
        data = data.numpy()
        data = data.reshape((data.shape[0], -1))
        target = target.numpy()
        # Row i of the 10x10 identity is the one-hot vector for label i.
        one_hot = np.eye(10)[target]
        output = mlp.forward(data)
        loss = mlp.backward(output, one_hot)
        # Report the loss of every 1000th batch only.
        if batch_idx % 1000 == 0:
            print('[%d/%d]Loss: ' % (batch_idx, len(train_loader)), loss)

Train for 10 epochs
Epoch 0
[0/3000]Loss:  33.609620349549225
[1000/3000]Loss:  21.81188541909328
[2000/3000]Loss:  14.472095336450208
Epoch 1
[0/3000]Loss:  10.181574589920944
[1000/3000]Loss:  11.620620287393734
[2000/3000]Loss:  8.177264129520966
Epoch 2
[0/3000]Loss:  9.156730459009804
[1000/3000]Loss:  7.998133788065669
[2000/3000]Loss:  10.891740814890941
Epoch 3
[0/3000]Loss:  5.557405341184383
[1000/3000]Loss:  12.51427489952469
[2000/3000]Loss:  3.98951496719494
Epoch 4
[0/3000]Loss:  3.4146170312996316
[1000/3000]Loss:  12.789549485792115
[2000/3000]Loss:  4.205618289211201
Epoch 5
[0/3000]Loss:  3.885511574926349
[1000/3000]Loss:  4.775450359579029
[2000/3000]Loss:  7.278697003932824
Epoch 6
[0/3000]Loss:  5.1018734808151915
[1000/3000]Loss:  1.5425730747371793
[2000/3000]Loss:  0.10309253480573724
Epoch 7
[0/3000]Loss:  4.79644239681873
[1000/3000]Loss:  4.736821875760951
[2000/3000]Loss:  5.315254708540284
Epoch 8
[0/3000]Loss:  4.265360212973095
[1000/3000]Loss:  3.086614

In [29]:
print('Test')
correct_labels = 0
for data, target in test_loader:
    # Flatten every image to one row. The row count comes from the batch
    # itself rather than assuming each batch holds exactly batch_size samples.
    data = data.numpy()
    data = data.reshape((data.shape[0], -1))
    target = target.numpy()
    output = mlp.forward(data)
    # The predicted class is the index of the largest output per row.
    predicted = np.argmax(output, axis=1)
    correct_labels += np.sum(predicted == target)

print('Test accuracy: ', correct_labels / len(test_dataset))

Test
Test accuracy:  0.9101
