In [9]:
import numpy as np
import sys
from IPython.core.debugger import set_trace

In [10]:
import ipynb_reader
import mnist_load as load

#defining data 
train_images_array = load.train_images_array;
train_labels_array = load.train_labels_array;
test_images_array = load.test_images_array;
test_labels_array = load.test_labels_array;

In [11]:
train_images_array = train_images_array.reshape((train_images_array.shape[0], -1))
test_images_array = test_images_array.reshape((test_images_array.shape[0], -1))

Simplest implementation - Two layer neural network

In [None]:
input = train_images_array
goal_pred = train_labels_array
weights = np.zeros((input.shape[1], goal_pred.shape[1]))
iterations, alpha = 100, .01

def neural_network(input, weights): 
    for i in range(iterations):
        pred = input.dot(weights)
        delta = pred - goal_pred
        error = np.sum((delta ** 2)) / input.shape[0]
        weight_delta = delta.T.dot(input).T
        weights -= weight_delta * alpha
        print(' Error: ' + str(error))
        
neural_network(input, weights)

**Three layer neural network - Nonlinearity and backpropagation**

In [30]:
np.random.seed(1)

iterations, alpha = 350, .01

hidden_size = 30

weights_0_1 = .2 * np.random.random((train_images_array.shape[1], hidden_size)) - .1
weights_1_2 = .2 * np.random.random((hidden_size, train_labels_array.shape[1])) - .1

weights = {
    'weights_0_1': weights_0_1,
    'weights_1_2': weights_1_2
}

def relu(x):
    return (x > 0) * x

def relu2deriv(x): 
    return (x > 0)


def neural_network(input: np.ndarray, weights: dict) -> None:
    for iteration in range(iterations):
        train_error = 0
        train_correct_cnt = 0
        for index, observation in enumerate(input):
            #forward propagation
            layer_0 = input[index:index+1]
            layer_1 = relu(layer_0.dot(weights['weights_0_1']))
            layer_2 = layer_1.dot(weights['weights_1_2'])
            
            layer_2_delta = layer_2 - train_labels_array[index:index+1]
            
            train_error += np.sum((layer_2_delta ** 2))
            train_correct_cnt += int(np.argmax(layer_2) == np.argmax(train_labels_array[index:index+1]))
            
            layer_1_delta = layer_2_delta.dot(weights['weights_1_2'].T) \
                                * relu2deriv(layer_1)
            
            weights_1_2_delta = layer_1.T.dot(layer_2_delta)
            weights_0_1_delta = layer_0.T.dot(layer_1_delta)
            
            weights['weights_1_2'] -= weights_1_2_delta * alpha
            weights['weights_0_1'] -= weights_0_1_delta * alpha
            
        
        if(iteration % 10 == 0):
            test_error = 0
            test_correct_cnt = 0
            for index, observation in enumerate(test_images_array):
                #forward propagation
                layer_0 = test_images_array[index:index+1]
                layer_1 = relu(layer_0.dot(weights['weights_0_1']))
                layer_2 = layer_1.dot(weights['weights_1_2'])

                layer_2_delta = layer_2 - test_labels_array[index:index+1]

                test_error += np.sum((layer_2_delta ** 2))
                test_correct_cnt += int(np.argmax(layer_2) == np.argmax(test_labels_array[index:index+1]))

            print('Train err: ' + str(train_error/len(train_images_array)) + ' Train acc: ' + str(train_correct_cnt) \
                      + ' Test err: ' + str(test_error/len(test_images_array)) + 'Test acc: ' + str(test_correct_cnt))
            
            
neural_network(train_images_array, weights)

Train err: 0.8288112117657275 Train acc: 326 Test err: 0.7739978581822734Test acc: 337
Train err: 0.3052834866693787 Train acc: 857 Test err: 0.5574162363545417Test acc: 641
Train err: 0.2151555808615371 Train acc: 912 Test err: 0.5006069547010612Test acc: 713
Train err: 0.1731913840373818 Train acc: 934 Test err: 0.48507880715910784Test acc: 734
Train err: 0.14830335106149323 Train acc: 951 Test err: 0.4743294954691758Test acc: 735
Train err: 0.1293786979814382 Train acc: 961 Test err: 0.47386161318692216Test acc: 754
Train err: 0.11452623769554238 Train acc: 966 Test err: 0.46489997488910384Test acc: 763
Train err: 0.10282767254153001 Train acc: 970 Test err: 0.45471101068323333Test acc: 774
Train err: 0.09412530429867552 Train acc: 972 Test err: 0.44942717047693626Test acc: 772
Train err: 0.08733014543876857 Train acc: 974 Test err: 0.45083337256342976Test acc: 767
Train err: 0.1054236640373662 Train acc: 957 Test err: 0.4512706436225698Test acc: 765
Train err: 0.07426604516798402 T

**Add dropout**

This should point out to what dropout really is: it's noise. It makes it more difficult for the network to train on the training data.

In [10]:
np.random.seed(1)

iterations, alpha = 300, .005

hidden_size = 100

weights_0_1 = .2 * np.random.random((train_images_array.shape[1], hidden_size)) - .1
weights_1_2 = .2 * np.random.random((hidden_size, train_labels_array.shape[1])) - .1

weights = {
    'weights_0_1': weights_0_1,
    'weights_1_2': weights_1_2
}

def relu(x):
    return (x > 0) * x

def relu2deriv(x): 
    return (x > 0)


def neural_network(input: np.ndarray, weights: dict) -> None:
    for iteration in range(iterations):
        train_error = 0
        train_correct_cnt = 0
        for index, observation in enumerate(input):
            #forward propagation
            layer_0 = input[index:index+1]            
            
            layer_1 = relu(layer_0.dot(weights['weights_0_1']))
            dropout_mask = np.random.randint(2, size=layer_1.shape)
            layer_1 *= dropout_mask * 2
            
            layer_2 = layer_1.dot(weights['weights_1_2'])
            
            layer_2_delta = layer_2 - train_labels_array[index:index+1]
            
            train_error += np.sum((layer_2_delta ** 2))
            train_correct_cnt += int(np.argmax(layer_2) == np.argmax(train_labels_array[index:index+1]))
            
            layer_1_delta = layer_2_delta.dot(weights['weights_1_2'].T) \
                                * relu2deriv(layer_1)
            layer_1_delta *= dropout_mask
            
            weights_1_2_delta = layer_1.T.dot(layer_2_delta)
            weights_0_1_delta = layer_0.T.dot(layer_1_delta)
            
            weights['weights_1_2'] -= weights_1_2_delta * alpha
            weights['weights_0_1'] -= weights_0_1_delta * alpha
            
        
        if(iteration % 10 == 0):
            test_error = 0
            test_correct_cnt = 0
            for index, observation in enumerate(test_images_array):
                #forward propagation
                layer_0 = test_images_array[index:index+1]
                layer_1 = relu(layer_0.dot(weights['weights_0_1']))
                layer_2 = layer_1.dot(weights['weights_1_2'])

                layer_2_delta = layer_2 - test_labels_array[index:index+1]

                test_error += np.sum((layer_2_delta ** 2))
                test_correct_cnt += int(np.argmax(layer_2) == np.argmax(test_labels_array[index:index+1]))

            print('Train err: ' + str(train_error/len(train_images_array)) + ' Train acc: ' + str(train_correct_cnt) \
                      + ' Test err: ' + str(test_error/len(test_images_array)) + ' Test acc: ' + str(test_correct_cnt))
            
            
neural_network(train_images_array, weights)

Train err: 0.9250876349606809 Train acc: 265 Test err: 0.7831812858165159 Test acc: 432
Train err: 0.5569089972691266 Train acc: 637 Test err: 0.5348375824714446 Test acc: 665
Train err: 0.49294010009549516 Train acc: 697 Test err: 0.49867365412301534 Test acc: 683
Train err: 0.4348550132879813 Train acc: 761 Test err: 0.4329321580006654 Test acc: 760
Train err: 0.3991034824374928 Train acc: 793 Test err: 0.4052929193195025 Test acc: 777
Train err: 0.3910061741624961 Train acc: 811 Test err: 0.42118278966625483 Test acc: 782
Train err: 0.379394629428567 Train acc: 807 Test err: 0.37356590907920745 Test acc: 807
Train err: 0.36493899436383537 Train acc: 839 Test err: 0.4030171396893566 Test acc: 811
Train err: 0.3659492229094198 Train acc: 841 Test err: 0.3687700744381114 Test acc: 802
Train err: 0.35500889780196093 Train acc: 853 Test err: 0.3639346974165983 Test acc: 815
Train err: 0.3434234782673846 Train acc: 841 Test err: 0.3743493231847987 Test acc: 799
Train err: 0.34456115152547

**Add Batch gradient descent**

In [41]:
np.random.seed(1)

iterations, alpha = 300, .1

hidden_size, batch_size = 100, 100

weights_0_1 = .2 * np.random.random((train_images_array.shape[1], hidden_size)) - .1
weights_1_2 = .2 * np.random.random((hidden_size, train_labels_array.shape[1])) - .1

weights = {
    'weights_0_1': weights_0_1,
    'weights_1_2': weights_1_2
}

def relu(x):
    return (x > 0) * x

def relu2deriv(x): 
    return (x > 0)


def neural_network(input: np.ndarray, weights: dict) -> None:
    for iteration in range(iterations):
        train_error = 0
        train_correct_cnt = 0
        for index in range(int(len(train_images_array)/batch_size)):
            
            #batch
            batch_start, batch_end = index * batch_size, (index + 1) * batch_size
            
            #forward propagation
            layer_0 = input[batch_start:batch_end]            
            
            layer_1 = relu(layer_0.dot(weights['weights_0_1']))
            dropout_mask = np.random.randint(2, size=layer_1.shape)
            layer_1 *= dropout_mask * 2
            
            layer_2 = layer_1.dot(weights['weights_1_2'])
            
            layer_2_delta = (layer_2 - train_labels_array[batch_start:batch_end]) / batch_size
            
            train_error += np.sum((layer_2_delta ** 2))
            train_correct_cnt += np.sum((np.argmax(layer_2, axis=1) == np.argmax(train_labels_array[batch_start:batch_end], axis=1)).astype(int))
            
            layer_1_delta = layer_2_delta.dot(weights['weights_1_2'].T) \
                                * relu2deriv(layer_1)
            layer_1_delta *= dropout_mask
            
            weights_1_2_delta = layer_1.T.dot(layer_2_delta)
            weights_0_1_delta = layer_0.T.dot(layer_1_delta)
            
            weights['weights_1_2'] -= weights_1_2_delta * alpha
            weights['weights_0_1'] -= weights_0_1_delta * alpha
            
        
        if(iteration % 10 == 0):
            test_error = 0
            test_correct_cnt = 0
            for index, observation in enumerate(test_images_array):
                #forward propagation
                layer_0 = test_images_array[index:index+1]
                layer_1 = relu(layer_0.dot(weights['weights_0_1']))
                layer_2 = layer_1.dot(weights['weights_1_2'])

                layer_2_delta = layer_2 - test_labels_array[index:index+1]

                test_error += np.sum((layer_2_delta ** 2))
                test_correct_cnt += int(np.argmax(layer_2) == np.argmax(test_labels_array[index:index+1]))

            print('Train err: ' + str(train_error/len(train_images_array)) + ' Train acc: ' + str(train_correct_cnt) \
                      + ' Test err: ' + str(test_error/len(test_images_array)) + ' Test acc: ' + str(test_correct_cnt))
            
            
neural_network(train_images_array, weights)

Train err: 0.0005573970698997744 Train acc: 141 Test err: 0.9830041578389545 Test acc: 209
Train err: 8.189187596183152e-05 Train acc: 301 Test err: 0.8055807344912501 Test acc: 291
Train err: 7.554899178711649e-05 Train acc: 405 Test err: 0.7293137678964844 Test acc: 481
Train err: 7.204728741796208e-05 Train acc: 417 Test err: 0.6474245415902079 Test acc: 538
Train err: 6.935513674738765e-05 Train acc: 437 Test err: 0.6254958069504352 Test acc: 584
Train err: 6.742269984629588e-05 Train acc: 460 Test err: 0.6199360940979987 Test acc: 604
Train err: 6.653739280397698e-05 Train acc: 477 Test err: 0.5900802644333478 Test acc: 632
Train err: 6.62595075372212e-05 Train acc: 467 Test err: 0.5926669787746321 Test acc: 699
Train err: 6.384734976636664e-05 Train acc: 503 Test err: 0.5922647776910404 Test acc: 649
Train err: 6.177237911438533e-05 Train acc: 521 Test err: 0.5607088685792284 Test acc: 728
Train err: 5.971055902248943e-05 Train acc: 541 Test err: 0.522805397067298 Test acc: 732
T

**Add better activation functions - Tanh for hidden layer and Softmax for output layer**

In [8]:
np.random.seed(1)

iterations, alpha = 300, 2

hidden_size, batch_size = 100, 100

weights_0_1 = .02 * np.random.random((train_images_array.shape[1], hidden_size)) - .01
weights_1_2 = .2 * np.random.random((hidden_size, train_labels_array.shape[1])) - .1

weights = {
    'weights_0_1': weights_0_1,
    'weights_1_2': weights_1_2
}

def tanh(x):
    return np.tanh(x)

def tanh2deriv(output):
    return 1 - output ** 2

def softmax(x):
    tmp = np.exp(x)
    return tmp / np.sum(tmp, axis=1, keepdims=True)

def neural_network(input: np.ndarray, weights: dict) -> None:
    for iteration in range(iterations):
        #train_error = 0 no error function for we are not ready for that yet - cross entropy
        train_correct_cnt = 0
        for index in range(int(len(train_images_array)/batch_size)):
            
            #batch
            batch_start, batch_end = index * batch_size, (index + 1) * batch_size
            
            #forward propagation
            layer_0 = input[batch_start:batch_end]            
            
            layer_1 = tanh(layer_0.dot(weights['weights_0_1']))
            dropout_mask = np.random.randint(2, size=layer_1.shape)
            layer_1 *= dropout_mask * 2
            
            layer_2 = softmax(layer_1.dot(weights['weights_1_2']))
            
            layer_2_delta = (layer_2 - train_labels_array[batch_start:batch_end]) / (batch_size * layer_2.shape[0])
            
            train_correct_cnt += np.sum((np.argmax(layer_2, axis=1) == np.argmax(train_labels_array[batch_start:batch_end], axis=1)).astype(int))
            
            layer_1_delta = layer_2_delta.dot(weights['weights_1_2'].T) \
                                * tanh2deriv(layer_1)
            layer_1_delta *= dropout_mask
            
            weights_1_2_delta = layer_1.T.dot(layer_2_delta)
            weights_0_1_delta = layer_0.T.dot(layer_1_delta)
            
            weights['weights_1_2'] -= weights_1_2_delta * alpha
            weights['weights_0_1'] -= weights_0_1_delta * alpha
            
        
        if(iteration % 10 == 0):
             #test_error = 0 no error function for we are not ready for that yet - cross entropy
            test_correct_cnt = 0
            for index, observation in enumerate(test_images_array):
                #forward propagation
                layer_0 = test_images_array[index:index+1]
                layer_1 = tanh(layer_0.dot(weights['weights_0_1']))
                layer_2 = layer_1.dot(weights['weights_1_2'])

                test_correct_cnt += int(np.argmax(layer_2) == np.argmax(test_labels_array[index:index+1]))

            print('Train acc: ' + str(train_correct_cnt) \
                      + ' Test acc: ' + str(test_correct_cnt))
            
            
neural_network(train_images_array, weights)

Train acc: 171 Test acc: 99
Train acc: 631 Test acc: 569
Train acc: 719 Test acc: 617
Train acc: 744 Test acc: 670
Train acc: 798 Test acc: 717
Train acc: 829 Test acc: 735
Train acc: 838 Test acc: 756
Train acc: 859 Test acc: 770
Train acc: 880 Test acc: 788
Train acc: 870 Test acc: 793
Train acc: 884 Test acc: 805
Train acc: 893 Test acc: 809
Train acc: 902 Test acc: 812
Train acc: 909 Test acc: 814
Train acc: 901 Test acc: 818
Train acc: 911 Test acc: 818
Train acc: 918 Test acc: 822
Train acc: 923 Test acc: 825
Train acc: 917 Test acc: 827
Train acc: 920 Test acc: 824
Train acc: 924 Test acc: 832
Train acc: 918 Test acc: 831
Train acc: 924 Test acc: 840
Train acc: 932 Test acc: 841
Train acc: 937 Test acc: 832
Train acc: 937 Test acc: 839
Train acc: 941 Test acc: 842
Train acc: 934 Test acc: 839
Train acc: 937 Test acc: 837
Train acc: 945 Test acc: 844


**Add Convolutional layer**

In [23]:
np.random.seed(1)

iterations, alpha = 300, 2

batch_size = 128
input_rows, input_cols = 28, 28
num_kernels, kernel_rows, kernel_cols = 16, 3, 3
hidden_size = (input_rows - kernel_rows) * (input_cols - kernel_cols) * num_kernels # 10000

kernels = .02 * np.random.random((kernel_rows * kernel_cols, num_kernels)) - .01
weights_1_2 = .2 * np.random.random((hidden_size, train_labels_array.shape[1])) - .1

weights = {
    'kernels': kernels,
    'weights_1_2': weights_1_2
}


def tanh(x):
    return np.tanh(x)

def tanh2deriv(output):
    return 1 - output ** 2

def softmax(x):
    tmp = np.exp(x)
    return tmp / np.sum(tmp, axis=1, keepdims=True)

def get_image_section(layer, row_from, row_to, col_from, col_to): 
    subsection = layer[:, row_from:row_to, col_from:col_to]
    return subsection.reshape(-1, 1, row_to-row_from, col_to-col_from)

def neural_network(input: np.ndarray, weights: dict) -> None:
    for iteration in range(iterations):
        #train_error = 0 no error function for we are not ready for that yet - cross entropy
        train_correct_cnt = 0
        for index in range(int(len(train_images_array)/batch_size)):
            
            #batch
            batch_start, batch_end = index * batch_size, (index + 1) * batch_size
            
            #forward propagation
            layer_0 = input[batch_start:batch_end]            
            layer_0 = layer_0.reshape(-1, input_rows, input_cols)
            
            #convolutional layer
            sects = list()
            for row_start in range(layer_0.shape[1] - kernel_rows):
                for col_start in range(layer_0.shape[2] - kernel_cols):
                    sect = get_image_section(layer_0,
                                            row_start, row_start+kernel_rows,
                                            col_start, col_start+kernel_cols)
                    
                    sects.append(sect)
                    
            expanded_input = np.concatenate(sects, axis=1) #(128, 625, 3, 3)           
            flattened_input = expanded_input.reshape(expanded_input.shape[0]*expanded_input.shape[1], -1) #(80000, 9)            
            kernel_output = flattened_input.dot(weights['kernels']) #(80000, 16)
            
            layer_1 = tanh(kernel_output.reshape(expanded_input.shape[0], -1)) #(128, 10000)
            
            dropout_mask = np.random.randint(2, size=layer_1.shape)
            layer_1 *= dropout_mask * 2
            
            layer_2 = softmax(layer_1.dot(weights['weights_1_2']))
            
            layer_2_delta = (layer_2 - train_labels_array[batch_start:batch_end]) / (batch_size * layer_2.shape[0])
            
            train_correct_cnt += np.sum((np.argmax(layer_2, axis=1) == np.argmax(train_labels_array[batch_start:batch_end], axis=1)).astype(int))
            
            layer_1_delta = layer_2_delta.dot(weights['weights_1_2'].T) \
                                * tanh2deriv(layer_1)
            layer_1_delta *= dropout_mask
            
            weights_1_2_delta = layer_1.T.dot(layer_2_delta)
            
            #Updating kernels
            layer_1_delta_reshaped = layer_1_delta.reshape(kernel_output.shape)
            kernel_update = flattened_input.T.dot(layer_1_delta_reshaped)
            
            weights['weights_1_2'] -= weights_1_2_delta * alpha
            weights['kernels'] -= kernel_update * alpha
            
        #continue from here ....
        if(iteration % 10 == 0):
            #test_error = 0 no error function for we are not ready for that yet - cross entropy
            test_correct_cnt = 0
            for index, observation in enumerate(test_images_array):
                
                #forward propagation
                layer_0 = test_images_array[index:index+1]
                layer_0 = layer_0.reshape(-1, input_rows, input_cols)
                
                sects = list()
                for row_start in range(layer_0.shape[1] - kernel_rows):
                    for col_start in range(layer_0.shape[2] - kernel_cols):
                        sect = get_image_section(layer_0, 
                                                 row_start, row_start + kernel_rows,
                                                 col_start, col_start + kernel_cols)
                        sects.append(sect)
                        
                expanded_input = np.concatenate(sects, axis=1)
                flattened_input = expanded_input.reshape(expanded_input.shape[0]*expanded_input.shape[1], -1)
                kernel_output = flattened_input.dot(kernels)
                
                layer_1 = tanh(kernel_output.reshape(expanded_input.shape[0], -1))
                layer_2 = layer_1.dot(weights['weights_1_2'])

                test_correct_cnt += int(np.argmax(layer_2) == np.argmax(test_labels_array[index:index+1]))

            print('Iteration: ' + str(iteration) + ' Train acc: ' + str(train_correct_cnt) \
                      + ' Test acc: ' + str(test_correct_cnt))
            
            
neural_network(train_images_array, weights)

Iteration: 0 Train acc: 93 Test acc: 92
Iteration: 10 Train acc: 191 Test acc: 273
Iteration: 20 Train acc: 350 Test acc: 429
Iteration: 30 Train acc: 507 Test acc: 584
Iteration: 40 Train acc: 609 Test acc: 635
Iteration: 50 Train acc: 668 Test acc: 682
Iteration: 60 Train acc: 693 Test acc: 697
Iteration: 70 Train acc: 712 Test acc: 720
Iteration: 80 Train acc: 727 Test acc: 739
Iteration: 90 Train acc: 747 Test acc: 745
Iteration: 100 Train acc: 768 Test acc: 757
Iteration: 110 Train acc: 757 Test acc: 779
Iteration: 120 Train acc: 768 Test acc: 785
Iteration: 130 Train acc: 775 Test acc: 794
Iteration: 140 Train acc: 784 Test acc: 796
Iteration: 150 Train acc: 782 Test acc: 797
Iteration: 160 Train acc: 791 Test acc: 801
Iteration: 170 Train acc: 802 Test acc: 802
Iteration: 180 Train acc: 793 Test acc: 808
Iteration: 190 Train acc: 809 Test acc: 807
Iteration: 200 Train acc: 803 Test acc: 805
Iteration: 210 Train acc: 818 Test acc: 811
Iteration: 220 Train acc: 814 Test acc: 811
I

One output node trainning => it wouldn't actually work for we would update the same set weights, therefore the last number(image) predicted would set the weights to recognize those specific patterns. 

One output node for each of our numbers i.e. 0 to 9;

In [5]:
weights = np.random.rand(784, 10);
alpha = .00000000001;
goal_pred = [];

def goal_pred_by_label(label):
    return {
         0: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         1: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
         2: [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
         3: [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
         4: [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
         5: [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
         6: [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
         7: [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
         8: [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
         9: [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
    }.get(label);

for i in range(len(imageBs_array)):
    goal_pred.append(goal_pred_by_label(labels_array[i]));
    
def calc_delta(pred, labels_array):
    goal_pred, delta = [[], []];
    for i in range(len(labels_array)):
        goal_pred.append(goal_pred_by_label(labels_array[i]));
    for j in range(len(goal_pred)):
        delta.append(np.subtract(pred[j], goal_pred[j]));
    return delta;
        
def neural_network(input, weights):
    for i in range(1):
        pred = np.dot(input, weights);
        #print('Weight: ' + str(weights[0]));
        msquared_error = (np.subtract(pred, goal_pred)) ** 2;
        delta = np.subtract(pred, goal_pred);
        weight_delta = np.dot(delta.T, input).T;
        #print('Weight_delta: ' + str(weight_delta[0]));
        #print('Error: ' + str(msquared_error[0]) + '\n-----------------------------');
        weights = np.subtract(weights, (weight_delta * alpha));
        if(i == 99999 or i == 99998 or i == 99997 or i == 99996):
            print('Error: ' + str(msquared_error[0]) + '\n-----------------------------');
        #plot_it_all(weights, msquared_error, weight_delta);
#         print('Label: ' + str(labels_array[0]) + ' \nError: ' + str(msquared_error[0])
#              + '\nPred: ' + str(pred[0]) + '\nGoal_pred: ' + str(goal_pred[0]) 
#              + '\n-----------------------------');

def plot_it_all(weights, errors, derivatives):
    
    ax1.set_title('How much changing each weight' 
                  + '\n contributed to the error?');
    ax1.set_ylabel('Mean squared error');
    ax1.set_xlabel('Weight');
    ax1.scatter(weights, errors, s=None, c='g');
    ax1.plot(weights, errors);
    for i in range(len(weights)):
        ax1.annotate(i, (weights[i], errors[i]));

neural_network(images_array, weights);7

MNIST stochastic gradient descent.

In [6]:
np.random.seed(1);

alpha, iterations, hidden_size = (.001, 500, 400); 
pixels_per_image, num_of_labels = (784, 10);

synapse_0 = .2 * np.random.random((pixels_per_image, hidden_size)) - .1;
synapse_1 = .2 * np.random.random((hidden_size, num_of_labels)) - .1;

relu = lambda x:(x > 0) * x;
relu2deriv = lambda x:(x > 0);

for iteration in range(iterations):
    msquared_error_layer_2, correct_cnt = (0.0, 0);
    for index in range(len(train_images_array)):
        #forward propagation
        layer_0 = train_images_array[index:index+1];
        layer_1 = relu(layer_0.dot(synapse_0));
        
        #dropout
        dropout_mask = np.random.randint(2, size=layer_1.shape);
        layer_1 *= dropout_mask * 2;
        
        layer_2 = layer_1.dot(synapse_1);#l_2 = relu(l_0S_0)S_1;
        #print(layer_2);
        msquared_error_layer_2 += np.sum((layer_2 
                - train_labels_array[index:index+1]) ** 2);
        
        correct_cnt += int(np.argmax(layer_2) == np.argmax(train_labels_array[index:index+1]));
        
        layer_2_delta = layer_2 - train_labels_array[index:index+1];
        
        layer_1_delta = layer_2_delta.dot(synapse_1.T) * relu2deriv(layer_1);
        layer_1_delta *= dropout_mask;
        
        synapse_1_delta = layer_1.T.dot(layer_2_delta);
        synapse_0_delta = layer_0.T.dot(layer_1_delta);
        
        synapse_1 -= synapse_1_delta * alpha;
        synapse_0 -= synapse_0_delta * alpha;
    #time for inference    
    if(iteration % 10 == 0):
        #page -r msquared_error_layer_2
        #print(msquared_error_layer_2);
        #time for inference 
        msquared_error_test, correct_cnt_test = (0.0, 0.0);
        for index in range(len(test_images_array)):
            layer_0 = test_images_array[index:index+1];
            layer_1 = relu(layer_0.dot(synapse_0));
            layer_2 = layer_1.dot(synapse_1);
    
            msquared_error_test += np.sum((layer_2 - test_labels_array[index:index+1]) ** 2);
            correct_cnt_test += int(np.argmax(layer_2) == np.argmax(test_labels_array[index:index+1]));
            
        sys.stdout.write("\n"
                             + "I:" + str(iteration)
                             + " Train Error:" + str(msquared_error_layer_2/float(len(train_images_array)))[0:5]
                             + " Train Correct:" + str(correct_cnt/len(train_images_array))
                             + " Test Error: " + str(msquared_error_test/len(test_images_array))[0:5]
                              +" Test Correct: " + str(correct_cnt_test/len(test_images_array)));
                                 


I:0 Train Error:22.65 Train Correct:0.299 Test Error: 18.69 Test Correct: 0.489
I:10 Train Error:10.98 Train Correct:0.608 Test Error: 9.846 Test Correct: 0.583
I:20 Train Error:9.138 Train Correct:0.667 Test Error: 10.34 Test Correct: 0.654
I:30 Train Error:9.256 Train Correct:0.677 Test Error: 11.16 Test Correct: 0.6
I:40 Train Error:9.656 Train Correct:0.678 Test Error: 7.557 Test Correct: 0.662
I:50 Train Error:9.443 Train Correct:0.694 Test Error: 8.033 Test Correct: 0.66
I:60 Train Error:9.879 Train Correct:0.688 Test Error: 9.030 Test Correct: 0.698
I:70 Train Error:9.843 Train Correct:0.685 Test Error: 8.937 Test Correct: 0.695
I:80 Train Error:9.548 Train Correct:0.685 Test Error: 8.801 Test Correct: 0.707
I:90 Train Error:8.859 Train Correct:0.698 Test Error: 9.016 Test Correct: 0.712
I:100 Train Error:9.345 Train Correct:0.703 Test Error: 8.374 Test Correct: 0.706
I:110 Train Error:10.22 Train Correct:0.682 Test Error: 7.809 Test Correct: 0.707
I:120 Train Error:10.08 Train

Mini-bitched stochastic gradient descent.

In [10]:
np.random.seed(1);

batch_size = 100; 
alpha, iterations, hidden_size = (.01, 500, 400); 
pixels_per_image, num_of_labels = (784, 10);

synapse_0 = .2 * np.random.random((pixels_per_image, hidden_size)) - .1;
synapse_1 = .2 * np.random.random((hidden_size, num_of_labels)) - .1;

relu = lambda x:(x > 0) * x;
relu2deriv = lambda x:(x > 0);

for iteration in range(iterations):
    msquared_error_layer_2, correct_cnt = (0.0, 0);
    for index in range(int(len(train_images_array)/batch_size)):
        batch_start, batch_end = (index * batch_size, (index + 1) * batch_size);
        #forward propagation
        layer_0 = train_images_array[batch_start:batch_end];
        layer_1 = relu(layer_0.dot(synapse_0));
        #dropout
        dropout_mask = np.random.randint(2, size=layer_1.shape);
        layer_1 *= dropout_mask * 2;
        
        layer_2 = layer_1.dot(synapse_1);#l_2 = relu(l_0S_0)S_1;
        #print(layer_2);100x10
        msquared_error_layer_2 += np.sum((layer_2 
                - train_labels_array[batch_start:batch_end]) ** 2);
        
        for index_cnt in range(batch_size):
            correct_cnt += int(np.argmax(layer_2[index_cnt:index_cnt + 1]) == \
                                   np.argmax(train_labels_array[batch_start + index_cnt: batch_start + index_cnt + 1]));
            
        layer_2_delta = (layer_2 - train_labels_array[batch_start:batch_end]) \
                                        / batch_size;
        
        layer_1_delta = layer_2_delta.dot(synapse_1.T) * relu2deriv(layer_1);
        
        #I`ve added the '*2' in the backward step for otherwise
        #we would be computing a gradient of a different function
        #than we`re evaluating.
        #Thus, generally, it`s important to account for anything
        #we're doing in the forward step
        #in the backward step as well.
        #layer_1_delta *= dropout_mask * 2;
        
        synapse_1_delta = layer_1.T.dot(layer_2_delta);
        synapse_0_delta = layer_0.T.dot(layer_1_delta);
        
        synapse_1 -= synapse_1_delta * alpha;
        synapse_0 -= synapse_0_delta * alpha;
        
    #time for inference    
    if(iteration % 10 == 0):
        #page -r msquared_error_layer_2
        #print(msquared_error_layer_2);
        #time for inference 
        msquared_error_test, correct_cnt_test = (0.0, 0.0);
        for index in range(len(test_images_array)):
            layer_0 = test_images_array[index:index+1];
            layer_1 = relu(layer_0.dot(synapse_0));
            layer_2 = layer_1.dot(synapse_1);
    
            msquared_error_test += np.sum((layer_2 - test_labels_array[index:index+1]) ** 2);
            correct_cnt_test += int(np.argmax(layer_2) == np.argmax(test_labels_array[index:index+1]));
            
        sys.stdout.write("\n"
                             + "I:" + str(iteration)
                             + " Train Error:" + str(msquared_error_layer_2/float(len(train_images_array)))[0:5]
                             + " Train Correct:" + str(correct_cnt/len(train_images_array))
                             + " Test Error: " + str(msquared_error_test/len(test_images_array))[0:5]
                              +" Test Correct: " + str(correct_cnt_test/len(test_images_array)));
                                 


I:0 Train Error:68.47 Train Correct:0.159 Test Error: 26.20 Test Correct: 0.249
I:10 Train Error:16.62 Train Correct:0.388 Test Error: 17.53 Test Correct: 0.364
I:20 Train Error:12.72 Train Correct:0.494 Test Error: 14.17 Test Correct: 0.469
I:30 Train Error:11.17 Train Correct:0.532 Test Error: 12.09 Test Correct: 0.521
I:40 Train Error:10.53 Train Correct:0.575 Test Error: 11.06 Test Correct: 0.569
I:50 Train Error:9.583 Train Correct:0.604 Test Error: 10.07 Test Correct: 0.607
I:60 Train Error:9.053 Train Correct:0.642 Test Error: 9.406 Test Correct: 0.624
I:70 Train Error:8.277 Train Correct:0.662 Test Error: 9.060 Test Correct: 0.616
I:80 Train Error:7.795 Train Correct:0.683 Test Error: 8.583 Test Correct: 0.647
I:90 Train Error:7.250 Train Correct:0.703 Test Error: 8.761 Test Correct: 0.654
I:100 Train Error:7.290 Train Correct:0.695 Test Error: 8.535 Test Correct: 0.671
I:110 Train Error:6.904 Train Correct:0.703 Test Error: 8.314 Test Correct: 0.679
I:120 Train Error:6.701 Tr

In [30]:
array = np.random.randint(3, size=(3, 4, 5, 6))
print(len(array))
np.concatenate(array, axis=2).shape

3


(4, 5, 18)

weights also can be seen as a high dimensional shape.
As you train, this shape molds around your data,
learning to distinguish one pattern from another.
The images in our testing dataset were slightly different 
than the patterns in our train set.


Modeling specific types of phenomenon in data. See you there!