In [2]:
# Single-input and multi-input gradient descent are almost the same. The only difference is that with multi-input, you have
# more than one input and each input must be multiplied by delta (pred - true) individually to affect the output the way we
# want. weight_delta (input * (pred - true)) aka the derivative is then subtracted from the weight for each input. alpha is also
# applied to each weight_delta when updating each weight.

# There are multiple inputs and multiple weights (because each input needs a weight), but there is still only one error value
# ((pred - true) ** 2) because there is one prediction.

# Remember that delta is raw error. It is a measure of how different you want the prediction to be. weight_delta is the actual
# estimate for how much we should adjust an input's weight to get the result we want (the derivative-based estimate).

# The parabola that represents error over weight is narrower with larger inputs and wider with smaller ones. This is because
# larger inputs = higher derivative/bigger slope values to get to the error = 0 point.

# If one input is larger than the others, learning tends to favor making changes to the weight for this input. That's because
# a large input has the ability to make large changes to the output more easily. You may also have issues with divergence with
# this output while the other outputs are fine. You might have to use lower alpha values than would otherwise be desirable.
# There is a subfield called normalization that deals with this problem and encourages learning across all weights.

# Freezing a weight
# You can learn by editing the weights of the other two smaller inputs and freezing the weight of the large input. The large
# input will still get its error down near 0 because, once again, there is only one prediction and one error value.
# This could be bad because the network could learn how to predict without the large input. Assuming this large input is
# actually important to the prediction process, the network is missing data when making predictions.

# You can also have 1 input and multiple outputs. This works the same as above except there is one input and multiple delta
# values. weight_delta = input * (pred - true) for each of these. Each of the new weight_deltas is again multiplied by the alpha
# and each weight is updated.

# For networks with multiple inputs and multiple outputs, first calculate the error and delta for each output. Then, treat
# each input like a network with multiple outputs and multiply all the deltas for each output by that input to get weight_delta.
# Next, multiply all weight_deltas by the appropriate alpha value (there should be one for each input) and update the weight of
# that input for each output. 

In [9]:
# Example of a multi-input multi-output gradient descent neural network:

import numpy as np

          # toes %win # fans   predictions:
weights = [[0.1, 0.1, -0.3], # players hurt?
          [0.1, 0.2, 0.0],   # did the team win?
          [0.0, 1.3, 0.1]]   # players sad?

# multiply two vectors together and then add up all the parts
def w_sum(a, b):
    assert (len(a) == len(b))
    output = 0
    for i in range(len(a)):
        output += (a[i] * b[i])
    return output

# multiply a vector in the matrix by one outside vector, then sum the parts of the resulting vector to get one value per
# vector in the matrix
def vector_matrix_multiplication (vect, matrix):
    assert(len(vect) == len(matrix))
    output = [0, 0, 0]
    for i in range(len(vect)):
        output[i] = w_sum(vect, matrix[i])
    return output

# multiply the inputs by the weights to get the prediction. the weights are a set of weights for each input, each set
# has one value corresponding to each output (matrix). the inputs are one value per input (vector).
# we get one prediction for each output: hurt, win, sad
def neural_network(input, weights):
    pred = vector_matrix_multiplication(input, weights)
    return pred

# multiply input by delta to get weight_delta. multiplies each input by 3 deltas because one input goes into 3 outputs.
# and there are 3 inputs. the inputs each contribute to each output.
def outer_prod(vec_a, vec_b):
    out = np.zeros([len(vec_a), len(vec_b)])
    for i in range(len(vec_a)):
        for j in range(len(vec_b)):
            out[i][j] = vec_a[i] * vec_b[i]
    return out

#inputs
toes = [8.5, 9.5, 9.9, 9.0] # current average number of toes per player
wlrec = [0.65, 0.8, 0.8, 0.9] # current games won (%)
nfans = [1.2, 1.3, 0.5, 1.0] # number of fans (millions)

#true value
hurt = [0.1, 0.0, 0.0, 0.1]
win = [1, 1, 0, 1]
sad = [0.1, 0.0, 0.1, 0.2]

alpha = 0.01

for n in range(len(toes)):

    input = [toes[n], wlrec[n], nfans[n]]
    true = [hurt[n], win[n], sad[n]]

    pred = neural_network(input, weights)

    error = [0, 0, 0]
    delta = [0, 0, 0]

    # since we are only using 0th of each input and each true value, we are only getting learning for one set of inputs right now
    for i in range(len(true)):
        error[i] = (pred[i] - true[i]) ** 2
        delta[i] = pred[i] - true[i]

    weight_deltas = outer_prod(input, delta)

    for i in range(len(weights)):
        for j in range(len(weights[0])):
            weights[i][j] -= alpha * weight_deltas[i][j]

    # show the adjusted weight after the current round of learning
    print("Round: ", n + 1)
    print("Pred: ", pred)
    print("Error: ", error)
    # print("Weights: ", weights)

# Some divergence may be happening, but we only have 4 data points. It may be happening because each time our true value
# is different and previous examples just repeated the same true value over and over to demonstrate how the prediction 
# approaches the true value after multiple repetitions.

Round:  1
Pred:  [0.555, 0.9800000000000001, 0.9650000000000001]
Error:  [0.20702500000000007, 0.0003999999999999963, 0.7482250000000001]
Round:  2
Pred:  [0.19136999999999987, 1.1115080000000002, 1.049592]
Error:  [0.03662247689999995, 0.012434034064000036, 1.1016433664640002]
Round:  3
Pred:  [0.28322232000000014, 1.1414648832000003, 0.8209234047999999]
Error:  [0.08021488254618248, 1.3029420795787903, 0.5197305555884246]
Round:  4
Pred:  [-0.23534634051200004, 0.9721577645849601, 0.9688404880384]
Error:  [0.1124571680947903, 0.0007751900729065042, 0.5911156960471249]


In [3]:
# MNISTPreprocessor notebook code provided at 
# https://github.com/iamtrask/Grokking-Deep-Learning/blob/master/MNISTPreprocessor.ipynb
# It is essentially 4 lines long. Swapped to Python 3.8.8 to use TensorFlow.

from keras.datasets import mnist

# Load training data (set of 1000 images of handwritten numbers and their respective labels) and test data (more images and
# labels intended to be used for seeing if the trained neural network can properly label images it has never seen before)
(x_train, y_train), (x_test, y_test) = mnist.load_data()

images = x_train[0:1000]
labels = y_train[0:1000]

# Images consists of 1000 arrays that are 784 long. This is because the images are 28x28, so they have 784 pixels. 
# Each pixel gets a value in the array: 1 if they are completely black and 0 if they are completely white. 
# Labels is an array that is 1000 long and each item in the array is a number between 0 and 9 (inclusive).

In [5]:
images

array([[[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       ...,

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 

In [7]:
labels

array([5, 0, 4, 1, 9, 2, 1, 3, 1, 4, 3, 5, 3, 6, 1, 7, 2, 8, 6, 9, 4, 0,
       9, 1, 1, 2, 4, 3, 2, 7, 3, 8, 6, 9, 0, 5, 6, 0, 7, 6, 1, 8, 7, 9,
       3, 9, 8, 5, 9, 3, 3, 0, 7, 4, 9, 8, 0, 9, 4, 1, 4, 4, 6, 0, 4, 5,
       6, 1, 0, 0, 1, 7, 1, 6, 3, 0, 2, 1, 1, 7, 9, 0, 2, 6, 7, 8, 3, 9,
       0, 4, 6, 7, 4, 6, 8, 0, 7, 8, 3, 1, 5, 7, 1, 7, 1, 1, 6, 3, 0, 2,
       9, 3, 1, 1, 0, 4, 9, 2, 0, 0, 2, 0, 2, 7, 1, 8, 6, 4, 1, 6, 3, 4,
       5, 9, 1, 3, 3, 8, 5, 4, 7, 7, 4, 2, 8, 5, 8, 6, 7, 3, 4, 6, 1, 9,
       9, 6, 0, 3, 7, 2, 8, 2, 9, 4, 4, 6, 4, 9, 7, 0, 9, 2, 9, 5, 1, 5,
       9, 1, 2, 3, 2, 3, 5, 9, 1, 7, 6, 2, 8, 2, 2, 5, 0, 7, 4, 9, 7, 8,
       3, 2, 1, 1, 8, 3, 6, 1, 0, 3, 1, 0, 0, 1, 7, 2, 7, 3, 0, 4, 6, 5,
       2, 6, 4, 7, 1, 8, 9, 9, 3, 0, 7, 1, 0, 2, 0, 3, 5, 4, 6, 5, 8, 6,
       3, 7, 5, 8, 0, 9, 1, 0, 3, 1, 2, 2, 3, 3, 6, 4, 7, 5, 0, 6, 2, 7,
       9, 8, 5, 9, 2, 1, 1, 4, 4, 5, 6, 4, 1, 2, 5, 3, 9, 3, 9, 0, 5, 9,
       6, 5, 7, 4, 1, 3, 4, 0, 4, 8, 0, 4, 3, 6, 8,

In [87]:
from copy import deepcopy

row = []
for i in range(0, 28):
    row.append([1 for _ in range(0, 28)])

# made a weight array, is 28x28 and each of the 784 items is 1. starts all weights at 1. weight array is actually 10 arrays
# of this form, one for each number 0 to 9 inclusive. deepcopy = will be editable and not change other parts of array
weight = []
for j in range(0, 10):
    weight.append(deepcopy(row))

# convert truth array to 784 arrays of length 10 that are filled with 0s except there is a 1 in the index position matching the
# correct prediction.
converted_labels = []
for item in labels:
    conversion = []
    for i in range(0, 10):
        if item == i:
            conversion.append(1)
        else:
            conversion.append(0)
    converted_labels.append(conversion)
    
def wsum(a, b):
    assert (len(a) == len(b))
    output = 0
    for i in range(len(a)):
        for j in range(len(a[0])):
            output += (a[i][j] * b[i][j])
    return output
    
def vm_multiplication (vect, matrix):
    output = []
    for i in range(len(matrix)):
        assert(len(vect) == len(matrix[i]))
        # w_sum just multiplies vect[n] by matrix[n] and adds this number to output, output is 1 number
        output.append(wsum(vect, matrix[i]))
    return output
    
def n_network(input, weights):
    pred = vm_multiplication(input, weights)
    return pred

# multiply input by delta to get weight_delta (magnitude and direction of weight change to approach 0 error)
def oprod(vec_a, vec_b):
    out = []
    for i in range(len(vec_a)):
        for j in range(len(vec_a[0])):
            out.append(vec_a[i][j] * vec_b)
    return out

for n in range(len(labels)):

    input = images[n]
    true = converted_labels[n]

    # prediction should be an array of length 10 that has numbers in it. ideally, the numbers would all be 0 except the 
    # predicted value, which should be 1. hopefully something like this happens after training.
    pred = n_network(input, weight)

    # there are 10 output values, so there are 10 possible errors and 10 possible raw errors.
    error = [0 for _ in range(0, 10)]
    delta = [0 for _ in range(0, 10)]
    
    # populate the error and raw error values
    for i in range(len(error)):
        error[i] = (pred[i] - true[i]) ** 2
        delta[i] = pred[i] - true[i]
    
    # get the "derivative" value aka the amount and direction we should change the weight, we must do this for EACH of the 784
    # inputs for EACH of the 10 possible outputs. so 784 inputs will be multiplied by the 0 delta value, the 1 delta value, etc.
    # this will produce an array 784 long for each possible output, so there are 10 arrays 784 long each in weight_deltas
    weight_deltas = []
    for i in range(len(delta)):
        weight_deltas.append(oprod(input, delta[i]))
    
    # weight is 10 arrays that are 784 items long. subtract all 10 arrays that are 784 items long of the weight_deltas from
    # each weight subarray
    for i in range(len(weight)): #10
        for j in range(len(weight[0])): #28
            for k in range(len(weight[0][0])): #28
                # need to fix weight_deltas generation to have a 28 array for each row instead of being 1 array of 784
                weight[i][j][k] -= alpha * weight_deltas[i][j][k]

    # show the adjusted weight after the current round of learning
    print("Round: ", n + 1)
    print("Pred: ", pred)
    print("Error: ", error)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 82575, 495450, 495450, 495450, 3468150, 3743400, 4816875, 715650, 4569150, 7018875, 6798675, 3495675, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 825750, 990900, 2587350, 4238850, 4679250, 6963825, 6963825, 6963825, 6963825, 6963825, 6193125, 4734300, 6963825, 6661050, 5367375, 1761600, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1348725, 6550950, 6963825, 6963825, 6963825, 6963825, 6963825, 6963825, 6963825, 6963825, 6908775, 2559825, 2257050, 2257050, 1541400, 1073475, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 495450, 6027975, 6963825, 6963825, 6963825, 696382

IndexError: invalid index to scalar variable.