In [1]:
######################################
from IPython.display import HTML
from IPython.display import display

# Taken from https://stackoverflow.com/questions/31517194/how-to-hide-one-specific-cell-input-or-output-in-ipython-notebook
# Inject a small script + link into the cell output. The script defines
# code_toggle(), which hides or shows the `div.input` element of the currently
# selected rendered code cell and flips the `code_show` flag on every call.
# It is invoked once when the document is ready (hiding the input) and again
# each time the "here" link is clicked. The script uses `$`, so it relies on
# jQuery being available in the page that renders the notebook.
tag = HTML('''<script>
code_show=true; 
function code_toggle() {
    if (code_show){
        $('div.cell.code_cell.rendered.selected div.input').hide();
    } else {
        $('div.cell.code_cell.rendered.selected div.input').show();
    }
    code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
To show/hide this cell's raw code input about last HW2 code, click <a href="javascript:code_toggle()">here</a>.''')
# Render the script and the toggle link as this cell's output.
display(tag)
#######################################


import numpy as np

# Seed NumPy's global random generator so that the np.random.randn / np.random.rand
# draws made by initialize() are repeatable from run to run.
np.random.seed(12345)

def initialize(input_dim, hidden1_dim, output_dim, batch_size):
    """Create parameters for a one-hidden-layer network plus a random batch.

    Weights are Gaussian noise scaled by 0.01; biases start at zero.

    Returns:
        parameters: list [W1, b1, W3, b3] with shapes
            (hidden1_dim, input_dim), (hidden1_dim,),
            (output_dim, hidden1_dim), (output_dim,).
        x: uniform random inputs, shape (input_dim, batch_size).
        y: Gaussian random targets, shape (output_dim, batch_size).
    """
    weight_scale = 0.01

    # The global RNG is consumed in the order W1, W3, x, y; the zero biases
    # do not touch the generator.
    W1 = weight_scale * np.random.randn(hidden1_dim, input_dim)
    W3 = weight_scale * np.random.randn(output_dim, hidden1_dim)
    b1 = np.zeros(hidden1_dim)
    b3 = np.zeros(output_dim)

    x = np.random.rand(input_dim, batch_size)
    y = np.random.randn(output_dim, batch_size)

    return [W1, b1, W3, b3], x, y


def sigmoid(x):
    """Element-wise logistic function, 1 / (1 + exp(-x))."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def deriv_sigmoid(x):
    """Sigmoid derivative written in terms of the sigmoid's OUTPUT.

    `x` is expected to already be s = sigmoid(z); the function returns
    s * (1 - s), which equals d sigmoid / dz at that point.
    """
    complement = 1 - x
    return complement * x

def forward(parameters, X):
    """Run the forward pass of the one-hidden-layer network.

    Args:
        parameters: list [W1, b1, W3, b3] with W1 of shape (hidden, input),
            b1 of shape (hidden,), W3 of shape (output, hidden) and b3 of
            shape (output,).
        X: input batch laid out as (input_dim, batch_size).

    Returns:
        List [X, hid_1, outputs] where hid_1 is the sigmoid hidden activation
        (hidden, batch_size) and outputs is the linear output layer
        (output, batch_size).

    Raises:
        ValueError: if X is not 2-D (a 1-D X would otherwise broadcast
            against the column-vector bias and give a wrongly shaped result).
    """
    W1, b1, W3, b3 = parameters

    if X.ndim != 2:
        raise ValueError(
            'X must be 2-D with shape (input_dim, batch_size), got shape {}'.format(X.shape)
        )

    # Biases are reshaped to column vectors so they broadcast over the batch axis.
    hid_1 = sigmoid(np.dot(W1, X) + b1.reshape(-1, 1))
    outputs = np.dot(W3, hid_1) + b3.reshape(-1, 1)

    return [X, hid_1, outputs]

def squared_loss(predictions, targets):
    """Half squared error, summed over outputs and averaged over the batch.

    predictions: (output_dim, batch_size)
    targets: (output_dim, batch_size)

    Returns a scalar: (1 / batch_size) * sum(0.5 * (predictions - targets)**2).
    """
    n_samples = targets.shape[1]

    half_squared_error = .5 * (predictions - targets) ** 2
    per_sample = np.sum(half_squared_error, axis=0)
    batch_average = (1. / n_samples) * np.sum(per_sample)

    # np.mean of a scalar is that scalar; kept so the return type is unchanged.
    return np.mean(batch_average)


def deriv_squared_loss(predictions, targets):
    """Gradient of squared_loss with respect to the predictions.

    Both arguments are (output_dim, batch_size); the result has the same
    shape and equals (predictions - targets) / batch_size.
    """
    n_samples = targets.shape[1]
    residual = predictions - targets
    return residual / n_samples


def backward(activations, targets, parameters):
    """Back-propagate the squared loss through the one-hidden-layer network.

    Args:
        activations: list [X, hid_1, predictions] as returned by forward().
        targets: array shaped like predictions, (output_dim, batch_size).
        parameters: list [W1, b1, W3, b3]; only W3 is needed here.

    Returns:
        List [dW1, db1, dW3, db3] of gradients, each with the same shape as
        the corresponding parameter.
    """
    X, hid_1, predictions = activations
    _W1, _b1, W3, _b3 = parameters

    # dL/d(outputs); already divided by the batch size.
    out_delta = deriv_squared_loss(predictions, targets)

    # Push the error back through W3, then through the sigmoid.
    # deriv_sigmoid takes the sigmoid OUTPUT, which is why hid_1 is passed.
    hid_1_delta = np.dot(W3.T, out_delta) * deriv_sigmoid(hid_1)

    dW1 = np.dot(hid_1_delta, X.T)
    db1 = np.sum(hid_1_delta, axis=1)  # sum over the batch axis

    dW3 = np.dot(out_delta, hid_1.T)
    db3 = np.sum(out_delta, axis=1)

    return [dW1, db1, dW3, db3]

def convert_to_1d_vector(parameters):
    """Flatten [W1, b1, W3, b3] into one 1-D array, in that order."""
    W1, b1, W3, b3 = parameters
    flat_pieces = [piece.ravel() for piece in (W1, b1, W3, b3)]
    return np.concatenate(flat_pieces, axis=0)


def convert_to_list(params, input_dim, hidden1_dim, output_dim):
    """Split a flat parameter vector back into [W1, b1, W3, b3].

    This is the inverse of convert_to_1d_vector: the vector is read in the
    order W1 (hidden1_dim x input_dim), b1 (hidden1_dim),
    W3 (output_dim x hidden1_dim), b3 (output_dim).
    """
    # End offsets of the four consecutive segments.
    w1_end = input_dim * hidden1_dim
    b1_end = w1_end + hidden1_dim
    w3_end = b1_end + hidden1_dim * output_dim
    b3_end = w3_end + output_dim

    W1 = np.reshape(params[0:w1_end], (hidden1_dim, input_dim))
    b1 = params[w1_end:b1_end]
    W3 = np.reshape(params[b1_end:w3_end], (output_dim, hidden1_dim))
    b3 = params[w3_end:b3_end]

    return [W1, b1, W3, b3]


def gradient_check(parameters, gradients, X, Y, loss, eps=1e-7):
    """Compare analytic gradients against central finite differences.

    Args:
        parameters: list [W1, b1, W3, b3].
        gradients: list [dW1, db1, dW3, db3] to be verified.
        X: inputs, (input_dim, batch_size).
        Y: targets, (output_dim, batch_size).
        loss: callable loss(predictions, targets) returning a scalar.
        eps: perturbation applied to each parameter in turn.

    Returns:
        Relative difference
        ||grads - num_grads|| / (||grads|| + ||num_grads||).
        Returns 0.0 when both gradient vectors are exactly zero, where the
        ratio would otherwise be 0/0 (NaN).
    """
    W1, _b1, W3, _b3 = parameters
    network_structure = [X.shape[0], W1.shape[0], W3.shape[0]]

    # Work on flat vectors so every scalar parameter can be addressed by index.
    params = convert_to_1d_vector(parameters)
    grads = convert_to_1d_vector(gradients)

    def loss_with_offset(index, offset):
        """Loss after adding `offset` to the single parameter at `index`."""
        shifted = np.copy(params)
        shifted[index] += offset
        shifted_parameters = convert_to_list(shifted, *network_structure)
        predictions = forward(shifted_parameters, X)[-1]
        return loss(predictions, Y)

    n_params = len(params)
    num_grads = np.zeros((n_params,))
    for i in range(n_params):
        # Central difference: (L(theta + eps) - L(theta - eps)) / (2 * eps)
        num_grads[i] = (loss_with_offset(i, eps) - loss_with_offset(i, -eps)) / (2 * eps)

    denominator = np.linalg.norm(grads) + np.linalg.norm(num_grads)
    if denominator == 0:
        # Both gradients vanish, so they agree exactly.
        return 0.0

    return np.linalg.norm(grads - num_grads) / denominator

# Gradient-check demo on a tiny random problem (3 inputs, 4 hidden units,
# 2 outputs, batch of 5): runs forward/backward once and compares the analytic
# gradients with finite differences.
# NOTE(review): the guard is `!=`, the inverse of the usual `== '__main__'`
# idiom, so this body is skipped when the code runs as the main script/notebook
# and only executes when __name__ is something else (e.g. on import). This
# looks like a deliberate way to switch the demo off here -- confirm before
# changing it, because running it would also draw from the seeded global NumPy
# generator and rebind names such as `parameters`, `X`, `Y`, `loss`, `grads`.
if __name__ != '__main__':

    input_dim = 3
    hidden_dim = 4
    output_dim = 2
    batch_size = 5

    parameters, X, Y = initialize(input_dim, hidden_dim, output_dim, batch_size)

    activations = forward(parameters, X)

    # Network output is the last entry of the activations list.
    P = activations[-1]

    loss = squared_loss(P, Y)
    print('Loss: {}'.format(loss))

    grads = backward(activations, Y, parameters)

    diff = gradient_check(parameters, grads, X, Y, squared_loss)

    # The check passes when the relative difference is below 1e-7.
    print('Gradient checking: ')
    if diff < 1e-7:
        print('\tPassed')
    else:
        print('\tFailed')

In [2]:
import Ipynb_importer #To import ipynotebook

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_mldata
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state 

# Load MNIST and keep the pixel data as float64 and the labels as returned.
# NOTE(review): fetch_mldata is deprecated/removed in newer scikit-learn
# releases (fetch_openml is the replacement) -- confirm the installed version
# before re-running this cell.
mnist = fetch_mldata('MNIST original')
X = mnist.data.astype('float64')
y = mnist.target
# Fixed seed (0) so the train/test split below is repeatable.
random_state = check_random_state(0) 

# Small subset: 1000 training samples and 300 test samples.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=1000, test_size=300, random_state= random_state)

# Standardise features using statistics fitted on the training split only,
# then transpose so that columns are samples, which is the layout that
# forward() multiplies against (np.dot(W1, X)).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test) 
X_train = X_train.T
X_test = X_test.T

# One-hot encode the labels: the encoder is fitted on the values 0..9 and the
# encoded matrices are transposed to match the column-per-sample layout above.
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder() 
enc.fit(np.arange(10).reshape(10,1) )

y_train_onehot =  enc.transform( y_train.reshape(y_train.shape[0],1) ).toarray()
y_test_onehot = enc.transform( y_test.reshape(y_test.shape[0],1) ).toarray()
y_train_onehot, y_test_onehot = y_train_onehot.T, y_test_onehot.T

In [3]:
if __name__ == '__main__':
    # ---- Network and optimisation settings ----
    input_dim = X_train.shape[0]  # features per sample (X_train is features x samples)
    hidden_dim = 30
    output_dim = 10
    batch_size = 5
    epoc = 200      # number of mini-batch SGD steps, not full passes over the data
    rate = 0.141    # SGD learning rate (loop-invariant, so set once here)

    # Only `parameters` is used for training; the random X, Y batch that
    # initialize() also returns is bound exactly as before.
    parameters, X, Y = initialize(input_dim, hidden_dim, output_dim, batch_size)

    n_batches = X_train.shape[1] // batch_size
    if n_batches == 0:
        raise ValueError('batch_size is larger than the number of training samples')

    # ---- Training: plain mini-batch gradient descent ----
    for i in range(epoc):
        # Wrap around the training set so that epoc > n_batches revisits
        # earlier batches instead of slicing empty ones.
        start = (i % n_batches) * batch_size
        stop = start + batch_size
        x = X_train[:, start:stop]
        y = y_train_onehot[:, start:stop]

        activations = forward(parameters, x)
        P = activations[-1]
        loss = squared_loss(P, y)  # loss of the current mini-batch, kept for inspection
        grads = backward(activations, y, parameters)

        # In-place update of W1, b1, W3, b3.
        for param, grad in zip(parameters, grads):
            param -= rate * grad

    # ---- Evaluation on the held-out split ----
    expected = y_test

    outputs = forward(parameters, X_test)[-1]  # network output, (classes, samples)
    # Predicted class = row index of the largest output in each column. The
    # cast keeps the labels as floats, as they were when read back out of the
    # float output array.
    predicted = np.argmax(outputs, axis=0).astype(outputs.dtype)

    print("Classification report:")

    print(classification_report(expected, predicted))

    print("Confusion matrix:")

    print(confusion_matrix(expected, predicted))

Classification report:
             precision    recall  f1-score   support

        0.0       1.00      0.69      0.81        32
        1.0       0.91      0.94      0.93        34
        2.0       0.54      0.91      0.67        32
        3.0       0.91      0.65      0.75        31
        4.0       0.69      0.92      0.79        26
        5.0       0.59      0.83      0.69        24
        6.0       0.90      0.96      0.93        27
        7.0       0.62      0.94      0.74        31
        8.0       0.94      0.48      0.64        31
        9.0       1.00      0.19      0.32        32

avg / total       0.82      0.74      0.73       300

Confusion matrix:
[[22  0  7  0  0  2  0  1  0  0]
 [ 0 32  1  0  0  1  0  0  0  0]
 [ 0  0 29  0  1  0  1  0  1  0]
 [ 0  0  8 20  0  2  0  1  0  0]
 [ 0  1  0  0 24  0  1  0  0  0]
 [ 0  0  2  0  1 20  1  0  0  0]
 [ 0  0  1  0  0  0 26  0  0  0]
 [ 0  1  1  0  0  0  0 29  0  0]
 [ 0  1  5  1  0  9  0  0 15  0]
 [ 0  0  0  1  9  0  0 