In [2]:
# Single-input and multi-input gradient descent are almost the same. The only difference is that with multi-input, you have
# more than one input and each input must be multiplied by delta (pred - true) individually to affect the output the way we
# want. weight_delta (input * (pred - true)) aka the derivative is then subtracted from the weight for each input. alpha is also
# applied to each weight_delta when updating each weight.

# There are multiple inputs and multiple weights (because each input needs a weight), but there is still only one error value
# ((pred - true) ** 2) because there is one prediction.

# Remember that delta is raw error. It is a measure of how different you want the prediction to be. weight_delta is the actual
# estimate for how much we should adjust an input's weight to get the result we want (the derivative-based estimate).

# The parabola that represents error over weight is narrower with larger inputs and wider with smaller ones. This is because
# larger inputs = higher derivative/bigger slope values to get to the error = 0 point.

# If one input is larger than the others, learning tends to favor making changes to the weight for this input. That's because
# a large input has the ability to make large changes to the output more easily. You may also have issues with divergence with
# this output while the other outputs are fine. You might have to use lower alpha values than would otherwise be desirable.
# There is a subfield called normalization that deals with this problem and encourages learning across all weights.

# Freezing a weight
# You can learn by editing the weights of the other two smaller inputs and freezing the weight of the large input. The large
# input will still get its error down near 0 because, once again, there is only one prediction and one error value.
# This could be bad because the network could learn how to predict without the large input. Assuming this large input is
# actually important to the prediction process, the network is missing data when making predictions.

# You can also have 1 input and multiple outputs. This works the same as above except there is one input and multiple delta
# values. weight_delta = input * (pred - true) for each of these. Each of the new weight_deltas is again multiplied by the alpha
# and each weight is updated.

# For networks with multiple inputs and multiple outputs, first calculate the error and delta for each output. Then, treat
# each input like a network with multiple outputs and multiply all the deltas for each output by that input to get weight_delta.
# Next, multiply all weight_deltas by the appropriate alpha value (there should be one for each input) and update the weight of
# that input for each output. 

In [18]:
# Example of a multi-input multi-output gradient descent neural network:

import numpy as np

          # toes %win # fans   predictions:
weights = [[0.1, 0.1, -0.3], # players hurt?
          [0.1, 0.2, 0.0],   # did the team win?
          [0.0, 1.3, 0.1]]   # players sad?

# multiply two vectors together and then add up all the parts
def w_sum(a, b):
    assert (len(a) == len(b))
    output = 0
    for i in range(len(a)):
        output += (a[i] * b[i])
    return output

# multiply a vector in the matrix by one outside vector, then sum the parts of the resulting vector to get one value per
# vector in the matrix
def vector_matrix_multiplication (vect, matrix):
    assert(len(vect) == len(matrix))
    output = [0, 0, 0]
    for i in range(len(vect)):
        output[i] = w_sum(vect, matrix[i])
    return output

# multiply the inputs by the weights to get the prediction. the weights are a set of weights for each input, each set
# has one value corresponding to each output (matrix). the inputs are one value per input (vector).
# we get one prediction for each output: hurt, win, sad
def neural_network(input, weights):
    pred = vector_matrix_multiplication(input, weights)
    return pred

# multiply input by delta to get weight_delta. multiplies each input by 3 deltas because one input goes into 3 outputs.
# and there are 3 inputs. the inputs each contribute to each output.
def outer_prod(vec_a, vec_b):
    out = np.zeros([len(vec_a), len(vec_b)])
    for i in range(len(vec_a)):
        for j in range(len(vec_b)):
            out[i][j] = vec_a[i] * vec_b[i]
    return out

#inputs
toes = [8.5, 9.5, 9.9, 9.0] # current average number of toes per player
wlrec = [0.65, 0.8, 0.8, 0.9] # current games won (%)
nfans = [1.2, 1.3, 0.5, 1.0] # number of fans (millions)

#true value
hurt = [0.1, 0.0, 0.0, 0.1]
win = [1, 1, 0, 1]
sad = [0.1, 0.0, 0.1, 0.2]

alpha = 0.01

for n in range(0, 4):

    input = [toes[n], wlrec[n], nfans[n]]
    true = [hurt[n], win[n], sad[n]]

    pred = neural_network(input, weights)

    error = [0, 0, 0]
    delta = [0, 0, 0]

    # since we are only using 0th of each input and each true value, we are only getting learning for one set of inputs right now
    for i in range(len(true)):
        error[i] = (pred[i] - true[i]) ** 2
        delta[i] = pred[i] - true[i]

    weight_deltas = outer_prod(input, delta)

    for i in range(len(weights)):
        for j in range(len(weights[0])):
            weights[i][j] -= alpha * weight_deltas[i][j]

    # show the adjusted weight after the current round of learning
    print("Round: ", n + 1)
    print("Error: ", error)
    print("Weights: ", weights)

# Some divergence may be happening, but we only have 4 data points.

Round:  1
Error:  [0.20702500000000007, 0.0003999999999999963, 0.7482250000000001]
Weights:  [[0.061325, 0.061325, -0.338675], [0.10013000000000001, 0.20013, 0.00012999999999999942], [-0.01038, 1.28962, 0.08962]]
Round:  2
Error:  [0.03662247689999995, 0.012434034064000036, 1.1016433664640002]
Weights:  [[0.04314485000000001, 0.04314485000000001, -0.35685515], [0.09923793600000001, 0.199237936, -0.000762064000000002], [-0.024024696, 1.275975304, 0.07597530400000001]]
Round:  3
Error:  [0.08021488254618248, 1.3029420795787903, 0.5197305555884246]
Weights:  [[0.015105840319999998, 0.015105840319999998, -0.38489415968], [0.0901062169344, 0.1901062169344, -0.009893783065600004], [-0.027629313024000002, 1.2723706869759999, 0.072370686976]]
Round:  4
Error:  [0.1124571680947903, 0.0007751900729065042, 0.5911156960471249]
Weights:  [[0.04528701096608001, 0.04528701096608001, -0.35471298903392], [0.09035679705313536, 0.19035679705313535, -0.009643202946864646], [-0.035317717904384004, 1.264682