# Todo list

1. one_hot √

* mini-batch

* normalization

* train/dev/test set √

* linear function √

* sigmoid function

* tanh function

* relu function √

* softmax function √

* loss function √

* cost function

* regularization

* drop out

* batch normalization

* momentum

* exponentially moving average

* Adam

* all backpropagation of above

In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
import struct
from array import array
%matplotlib inline

In [2]:
def load_mnist():
    """ 
    load MNIST dataset into numpy array 
    MNIST dataset can be downloaded manually.
    url: http://yann.lecun.com/exdb/mnist/
    """
    ret = {}
    with open('MNIST/train-images.idx3-ubyte', 'rb') as f:
        magic, size, rows, cols = struct.unpack(">IIII", f.read(16))
        assert(magic==2051)
        ret['X_train'] = np.array(array("B", f.read())).reshape(size,rows,cols)

    with open('MNIST/t10k-images.idx3-ubyte', 'rb') as f:
        magic, size, rows, cols = struct.unpack(">IIII", f.read(16))
        assert(magic==2051)
        ret['X_test'] = np.array(array("B", f.read())).reshape(size,rows,cols)

    with open('MNIST/train-labels.idx1-ubyte', 'rb') as f:
        magic, size = struct.unpack(">II", f.read(8))
        assert(magic==2049)
        ret['Y_train'] = np.array(array("B", f.read())).reshape(size,1)

    with open('MNIST/t10k-labels.idx1-ubyte', 'rb') as f:
        magic, size = struct.unpack(">II", f.read(8))
        assert(magic==2049)
        ret['Y_test'] = np.array(array("B", f.read())).reshape(size,1)

    return ret

In [3]:
mnist_original = load_mnist()

""" random shuffle the training set """
permutation = np.random.permutation(mnist_original['X_train'].shape[0])
mnist_original['X_train'] = mnist_original['X_train'][permutation]
mnist_original['Y_train'] = mnist_original['Y_train'][permutation]

""" divide trainset into trainset and devset """
len_of_dev = 10000
mnist_original['X_dev'] = mnist_original['X_train'][:len_of_dev]
mnist_original['Y_dev'] = mnist_original['Y_train'][:len_of_dev]
mnist_original['X_train'] = mnist_original['X_train'][len_of_dev:]
mnist_original['Y_train'] = mnist_original['Y_train'][len_of_dev:]

print('X_train:', mnist_original['X_train'].shape,
      'X_dev:', mnist_original['X_dev'].shape,
      'X_test:', mnist_original['X_test'].shape)
print('Y_train:', mnist_original['Y_train'].shape,
      'Y_dev:', mnist_original['Y_dev'].shape,
      'Y_test:', mnist_original['Y_test'].shape)


X_train: (50000, 28, 28) X_dev: (10000, 28, 28) X_test: (10000, 28, 28)
Y_train: (50000, 1) Y_dev: (10000, 1) Y_test: (10000, 1)


In [4]:
def manually_validate_dataset(dataset):
    random_train = np.random.randint(1, len(dataset['X_train']))-1
    random_dev = np.random.randint(1, len(dataset['X_dev']))-1
    random_test = np.random.randint(1, len(dataset['X_test']))-1
    print(dataset['Y_train'][random_train], dataset['Y_dev'][random_dev], dataset['Y_test'][random_test])
    fig, (ax1, ax2, ax3) = plt.subplots(1,3, sharey=True, figsize=[10,3])
    ax1.imshow(dataset['X_train'][random_train], cmap='gray')
    ax2.imshow(dataset['X_dev'][random_dev], cmap='gray')
    ax3.imshow(dataset['X_test'][random_test], cmap='gray')
    plt.show()

#manually_validate_dataset(mnist_original)

$$ X = 
\begin{bmatrix}
\vert & & \vert & & \vert \\
x^{(1)} & ... & x^{(i)} & ... & x^{(m)} \\
\vert & & \vert & & \vert
\end{bmatrix} 
\quad \quad
Y = 
\begin{bmatrix}
\vert & & \vert & & \vert \\
y_{one\_hot}^{(1)} & ... & y_{one\_hot}^{(i)} & ... & y_{one\_hot}^{(m)} \\
\vert & & \vert & & \vert \\
\end{bmatrix}
$$

In [5]:
mnist = {}

""" X is 28*28 image """
def flatten(X):
    """ prepare X to (nx, m) shape """
    X = X.reshape(-1, 28*28).T
    return X

mnist['X_train'] = flatten(mnist_original['X_train'])
mnist['X_dev'] = flatten(mnist_original['X_dev'])
mnist['X_test'] = flatten(mnist_original['X_test'])

""" Y is label 0-9 """
def one_hot(Y, C):
    """ prepare Y to (1, m) shape """
    assert(Y.shape[1]==1)
    Y_ret = np.zeros((Y.shape[0], C))
    Y_ret[np.arange(Y.shape[0]), Y.reshape(-1).astype(int)] = 1
    Y_ret = Y_ret.T
    return Y_ret

def test_one_hot():
    Y = np.ones((5,1))
    Y = one_hot(Y, 10)
    assert(Y[0,0]==0)
    assert(Y[1,0]==1)
    assert(Y[2,1]==0)

def back_one_hot(Y):
    """ convert one hot Y back to real number """
    Y_ret = np.repeat( [np.arange(Y.shape[0])], repeats=Y.shape[1], axis=0 )
    assert(Y_ret.shape == Y.T.shape)
    Y_ret = Y_ret[Y.T.astype(bool)]
    return Y_ret.reshape(-1,1)

mnist['Y_train'] = one_hot(mnist_original['Y_train'], 10)
mnist['Y_dev'] = one_hot(mnist_original['Y_dev'], 10)
mnist['Y_test'] = one_hot(mnist_original['Y_test'], 10)

print(mnist['X_train'].shape, mnist['Y_train'].shape)

(784, 50000) (10, 50000)


In [6]:
""" layers of network, include the last softmax layer """
layers = [1, 2, 3, mnist['Y_train'].shape[0]]

def initialize_parameters(layers, x_size):
    """ init W b or any other parameters in every layer """
    parameters = {}
    cells_prev = x_size
    for layer_idx, cells in enumerate(layers):
        parameters['W'+str(layer_idx+1)] = np.random.randn(cells, cells_prev) * 0.01
        parameters['b'+str(layer_idx+1)] = np.zeros((cells, 1))
        cells_prev = layers[layer_idx]
    return parameters

parameters = initialize_parameters(layers, mnist['X_train'].shape[0])
for key, value in sorted(parameters.items()):
    print(key, ':', value.shape)

W1 : (1, 784)
W2 : (2, 1)
W3 : (3, 2)
W4 : (10, 3)
b1 : (1, 1)
b2 : (2, 1)
b3 : (3, 1)
b4 : (10, 1)


$$
ReLu = max(0, x)
\quad \quad
Softmax = \frac{\exp(Z)}{\sum_i^n{\exp(Z)}}
$$

$$
Z = W \dot X + b
\quad \quad
A = active(Z)
$$

In [7]:
def ReLU(X):
    return X * (X > 0)

def test_ReLU():
    X = np.array([1.,2.,-2.,-3.])
    Y = np.array([1.,2.,0.,0.])
    bias = np.sum(np.abs(Y - ReLU(X)))
    assert(bias<0.0001)
#test_ReLU()

def softmax(X):
    s = np.sum(np.exp(X))
    return np.exp(X) / s

def test_softmax():
    X = np.array([-3.44,1.16,-0.81,3.91])
    Y = np.array([0.0006, 0.0596, 0.0083, 0.9315])
    bias = np.sum(np.abs(Y - softmax(X)))
    assert(bias<0.0001)
#test_softmax()

def forward_propagation(X, layers, parameters):
    A = X
    for layer_idx, cells in enumerate(layers):
        Z = np.dot(parameters['W'+str(layer_idx+1)], A)
        if layer_idx<len(layers)-1:
            """ normal layers use relu """
            A = ReLU(Z)
        else:
            """ last layers use softmax """
            A = softmax(Z)
    return A

Y_hat = forward_propagation(mnist['X_train'], layers, parameters)
print(Y_hat.shape)

(10, 50000)


$$
L(\hat{y}, y) = -\frac{1}{m} \sum_i^m{(y_i\log(\hat{y}_i) + (1-y_i)\log(1-\hat{y}_i))}
$$

In [8]:
def loss(Y_hat, Y):
    A = (np.multiply(Y, np.log(Y_hat)))
    B = (np.multiply(1-Y, np.log(1-Y_hat)))
    C = -np.mean(A+B)
    return C

def test_loss():
    Y = np.asarray([[1, 1, 1]])
    aL = np.array([[.8,.9,0.4]])
    assert(loss(aL, Y) - 0.414931599615 < 0.0001)
#test_loss()
L = loss(Y_hat, mnist['Y_train'])
L

1.3122381664312115

In [9]:
def predict(Y_hat):
    return np.argmax(Y_hat,axis=0).reshape(-1,1)

def test_predict():
    Y = np.array([[0.9,0.01,0.01,0.01,0.02,0.01,0.01,0.01,0.01,0.01],
                  [0.01,0.01,0.01,0.02,0.9,0.01,0.01,0.01,0.01,0.01]
                 ]).T
    print(Y.shape)
    Y_result = predict(Y)
    print(Y_result)
    Z = np.array([0,4]).reshape(2,1)
    assert(np.sum(Y_result != Z)==0)
#test_predict()

Y_predict = predict(Y_hat)
accurate = np.sum(np.equal(Y_predict, back_one_hot(mnist['Y_train']))) / Y_predict.shape[0]
accurate

0.091800000000000007