In [1]:
import numpy as np


In [2]:
def load_data(test=None):
    train_data = np.loadtxt("train_image.csv", delimiter=",")
    test_data = np.loadtxt("test_image.csv",delimiter=",")
    train_labels = np.loadtxt("train_label.csv")
    if test is None:
        return train_data, test_data, train_labels
    test_labels = np.loadtxt("test_label.csv")
    return train_data, test_data, train_labels, test_labels
        

def lab_2_onehot(labels):
    vecs = 0.01*np.ones((labels.shape[0], 10))
    for i in range(labels.shape[0]):
        vecs[i][int(labels[i])] = 0.99
    return vecs

In [3]:
X_train, X_test, Y_train, Y_test = load_data(test=True)
Y_train_one_hot = lab_2_onehot(Y_train)
Y_test_one_hot  = lab_2_onehot(Y_test)

In [5]:
lr = 0.5
epochs = 50
nx = 784
nh = 64
momentum = 0.9
batch_size = 64
params = {"W1": np.random.randn(nh, nx)/np.sqrt(nx),
          "b1": np.ones((nh, 1))/np.sqrt(nx),
          "W2": np.random.randn(10,nh)/np.sqrt(nh),
          "b2": np.ones((10,1))/np.sqrt(nh)
         }
batches = 60000//64

In [7]:
def sigmoid(z):
    """
    sigmoid activation function.

    inputs: z
    outputs: sigmoid(z)
    """
    s = 1. / (1. + np.exp(-z))
    return s

def compute_loss(Y, Y_hat):
    """
    compute loss function
    """
    L_sum = np.sum(np.multiply(Y, np.log(Y_hat)))
    m = Y.shape[1]
    L = -(1./m) * L_sum

    return L


def feed_forward(X, params):
    """
    feed forward network: 2 - layer neural net

    inputs:
        params: dictionay a dictionary contains all the weights and biases

    return:
        cache: dictionay a dictionary contains all the fully connected units and activations
    """
    cache = {}

    # Z1 = W1.dot(x) + b1
    cache["Z1"] = np.matmul(params["W1"], X) + params["b1"]

    # A1 = sigmoid(Z1)
    cache["A1"] = sigmoid(cache["Z1"])

    # Z2 = W2.dot(A1) + b2
    cache["Z2"] = np.matmul(params["W2"], cache["A1"]) + params["b2"]

    # A2 = softmax(Z2)
    cache["A2"] = np.exp(cache["Z2"]) / np.sum(np.exp(cache["Z2"]), axis=0)

    return cache


def back_propagate(X, Y, params, cache, m_batch):
    """
    back propagation

    inputs:
        params: dictionay a dictionary contains all the weights and biases
        cache: dictionay a dictionary contains all the fully connected units and activations

    return:
        grads: dictionay a dictionary contains the gradients of corresponding weights and biases
    """
    # error at last layer
    dZ2 = cache["A2"] - Y

    # gradients at last layer (Py2 need 1. to transform to float)
    dW2 = (1. / m_batch) * np.matmul(dZ2, cache["A1"].T)
    db2 = (1. / m_batch) * np.sum(dZ2, axis=1, keepdims=True)

    # back propgate through first layer
    dA1 = np.matmul(params["W2"].T, dZ2)
    dZ1 = dA1 * sigmoid(cache["Z1"]) * (1 - sigmoid(cache["Z1"]))

    # gradients at first layer (Py2 need 1. to transform to float)
    dW1 = (1. / m_batch) * np.matmul(dZ1, X.T)
    db1 = (1. / m_batch) * np.sum(dZ1, axis=1, keepdims=True)

    grads = {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}

    return grads



for i in range(epochs):

    # shuffle training set
    permutation = np.random.permutation(X_train.shape[1])
    X_train_shuffled = X_train[:, permutation]
    Y_train_shuffled = Y_train[permutation]

    for j in range(batches):

        # get mini-batch
        begin = j * batch_size
        end = min(begin + batch_size, X_train.shape[1] - 1)
        X = X_train_shuffled[:, begin:end]
        Y = Y_train_shuffled[:, begin:end]
        m_batch = end - begin

        # forward and backward
        cache = feed_forward(X, params)
        grads = back_propagate(X, Y, params, cache, m_batch)

        # with momentum (optional)
        dW1 = (momentum * dW1 + (1. - momentum) * grads["dW1"])
        db1 = (momentum * db1 + (1. - momentum) * grads["db1"])
        dW2 = (momentum * dW2 + (1. - momentum) * grads["dW2"])
        db2 = (momentum * db2 + (1. - momentum) * grads["db2"])

        # gradient descent
        params["W1"] = params["W1"] - lr * dW1
        params["b1"] = params["b1"] - lr * db1
        params["W2"] = params["W2"] - lr * dW2
        params["b2"] = params["b2"] - lr * db2

    # forward pass on training set
    cache = feed_forward(X_train, params)
    train_loss = compute_loss(Y_train, cache["A2"])

    # forward pass on test set
    cache = feed_forward(X_test, params)
    test_loss = compute_loss(Y_test, cache["A2"])
    print("Epoch {}: training loss = {}, test loss = {}".format(
        i + 1, train_loss, test_loss))



NameError: name 'batches' is not defined