In [1]:
import random
import numpy as np
import pandas as pd

In [2]:
# Sizes for the toy layer: x is (num_data, input_features) and
# w is (input_features, output_features).
input_features, output_features, num_data = 2, 4, 3
# Seed NumPy's global RNG so the random draws in later cells are repeatable.
np.random.seed(0)

In [3]:
# create a data matrix x with size (num_data, input_features)

# create a feature matrix w with size (input_features, output_features)

In [4]:
# Data matrix: integers drawn from [0, 5), one row per data point.
x = np.random.randint(0, 5, size=(num_data, input_features))

x

array([[4, 0],
       [3, 3],
       [3, 1]])

In [5]:
# Weight matrix: each entry is drawn from {-1, 0, 1} (the upper bound 2 is exclusive).
w = np.random.randint(low=-1, high=2, size=(input_features, output_features))

w

array([[ 0,  1, -1,  1],
       [-1, -1, -1,  1]])

In [6]:
# algorithm: 
# calculate new matrix of shape 3, 4

# For element i, j (for row i and column j) of output matrix
# only consider row i from input matrix and column j from w matrix.
# That row and that column each have input_features elements. Multiply the corresponding elements pairwise and add the products up

In [7]:
# Reference answer from NumPy's built-in matrix product.
np.dot(x, w)

array([[ 0,  4, -4,  4],
       [-3,  0, -6,  6],
       [-1,  2, -4,  4]])

In [8]:
# Hand-rolled matrix product: output[d, j] accumulates x[d, i] * w[i, j]
# over the shared input_features axis, one term at a time.
output = np.zeros((num_data, output_features))
for d, j in np.ndindex(num_data, output_features):
    for i in range(input_features):
        output[d, j] += x[d, i] * w[i, j]

In [9]:
# Display the loop-computed product (a float array, since it was built from np.zeros).
output

array([[ 0.,  4., -4.,  4.],
       [-3.,  0., -6.,  6.],
       [-1.,  2., -4.,  4.]])

In [10]:
# NumPy's matrix product again, shown next to the loop result for comparison.
np.dot(x, w)

array([[ 0,  4, -4,  4],
       [-3,  0, -6,  6],
       [-1,  2, -4,  4]])

In [11]:
# Confirm your solution works by checking that assert (output == x.dot(w)).all() passes

In [12]:
# Every element of the loop result must equal NumPy's matrix product.
assert np.all(output == x.dot(w))

In [13]:
# randomly generate a true_value signal that has the same shape as your output.

# Compute your loss by calculating the Sum of the absolute error between your output and the ground truth

# Compute the direction of your loss on each output (for instance would making the output bigger make loss bigger)
# (I choose to have 1 represent positive, -1 represent negative)

In [14]:
# Random target with the same shape as the layer output, drawn uniformly from [-1, 1).
true_value = np.random.uniform(-1, 1, output.shape)
# Loss: sum of absolute errors between the output and the target.
L = np.abs(output - true_value).sum()
print(L)
# Direction of the loss with respect to each output element:
#   +1 where the output is above its target (raising it raises the loss),
#   -1 where it is below its target (raising it lowers the loss),
#    0 on an exact tie, so an output that already matches its target is not
#      pushed away from it (the previous `(output > true_value) * 2.0 - 1`
#      returned -1 in that case).
dL_do = np.sign(output - true_value)

41.578510114266535


In [15]:
# Display the randomly drawn target values.
true_value

array([[-0.04466977,  0.62433746, -0.04004566, -0.21443041],
       [ 0.67215753, -0.32520768,  0.29634374, -0.26351692],
       [ 0.91431032, -0.71929844,  0.74017452, -0.05278391]])

In [16]:
# Display the layer output again, to compare element by element with true_value.
output

array([[ 0.,  4., -4.,  4.],
       [-3.,  0., -6.,  6.],
       [-1.,  2., -4.,  4.]])

In [17]:
# Display the per-output loss direction (+1 where output is above its target, -1 where below).
dL_do

array([[ 1.,  1., -1.,  1.],
       [-1.,  1., -1.,  1.],
       [-1.,  1., -1.,  1.]])

In [18]:
# Calculate the derivative with respect to x

# remember, the derivative of A * B with respect to A is B, and with respect to B is A.

In [19]:
# Per-output contributions to the loss direction on x, indexed as
# [output feature j, data row d, input feature i]: the upstream direction
# dL_do[d, j] multiplied by the weight w[i, j] that connects x[d, i] to output[d, j].
do_dx = np.zeros((output_features, num_data, input_features))
for j, d, i in np.ndindex(do_dx.shape):
    do_dx[j, d, i] = w[i, j] * dL_do[d, j]
do_dx

array([[[ 0., -1.],
        [-0.,  1.],
        [-0.,  1.]],

       [[ 1., -1.],
        [ 1., -1.],
        [ 1., -1.]],

       [[ 1.,  1.],
        [ 1.,  1.],
        [ 1.,  1.]],

       [[ 1.,  1.],
        [ 1.,  1.],
        [ 1.,  1.]]])

In [20]:
# Oh no, we have so many derivatives for the same value, one derivative per output. Let's sum those up to get
# one derivative for each of our values so we can make changes to make our model better (lower loss)
# Write code to sum over the "output features" dimension so that dimension is collapsed away

In [21]:
# Collapse the leading (output feature) axis: one total per entry of x.
dL_dx = np.sum(do_dx, axis=0)
dL_dx

array([[3., 0.],
       [3., 2.],
       [3., 2.]])

In [22]:
# check that it matches with the dot product of w and dL_do

In [23]:
# The summed loop result must equal the closed-form matrix expression element for element.
assert np.all(dL_dx == w.dot(dL_do.T).T)

In [24]:
# now do the same for dL_dw by multiplying dL_do by x

In [25]:
# Display dL_do again; it is one of the two factors used for dL_dw below.
dL_do

array([[ 1.,  1., -1.,  1.],
       [-1.,  1., -1.,  1.],
       [-1.,  1., -1.,  1.]])

In [26]:
# Display x; it is the other factor used for dL_dw below.
x

array([[4, 0],
       [3, 3],
       [3, 1]])

In [27]:
# NOTE(review): this cell repeats the previous display of x and could be removed.
x

array([[4, 0],
       [3, 3],
       [3, 1]])

In [28]:
# Per-data-point contributions to the loss direction on w, indexed as
# [input feature i, data row d, output feature j]: the upstream direction
# dL_do[d, j] multiplied by the input x[d, i] that w[i, j] was multiplied with.
do_dw = np.zeros((input_features, num_data, output_features))
for i, d, j in np.ndindex(do_dw.shape):
    do_dw[i, d, j] = x[d, i] * dL_do[d, j]
# Sum over the data axis (axis 1) to get one value per weight.
dL_dw = np.sum(do_dw, axis=1)
do_dw

array([[[ 4.,  4., -4.,  4.],
        [-3.,  3., -3.,  3.],
        [-3.,  3., -3.,  3.]],

       [[ 0.,  0., -0.,  0.],
        [-3.,  3., -3.,  3.],
        [-1.,  1., -1.,  1.]]])

In [29]:
# Display the loop-computed dL_dw (do_dw summed over the data axis).
dL_dw

array([[ -2.,  10., -10.,  10.],
       [ -4.,   4.,  -4.,   4.]])

In [30]:
# Closed-form equivalent of dL_dw as a single matrix product.
np.dot(x.T, dL_do)

array([[ -2.,  10., -10.,  10.],
       [ -4.,   4.,  -4.,   4.]])

In [31]:
# Display dL_dw again, to compare with x.T.dot(dL_do) shown just before.
dL_dw

array([[ -2.,  10., -10.,  10.],
       [ -4.,   4.,  -4.,   4.]])

In [32]:
# Check that your derivative matches x.T.dot(dL_do)

In [33]:
# The loop-computed dL_dw must equal the matrix-product form element for element.
assert np.all(dL_dw == x.T.dot(dL_do))

In [34]:
# compute w_updated by moving w a small fraction in the direction of -dL_dw (to make L smaller)
# recompute output, call it output_updated
# DON'T recompute the target
# recompute the loss; use a new variable name for L (like L_updated)
# Make sure the loss is now smaller than before

In [35]:
# One descent step: move w against dL_dw, scaled by a step size of 0.01.
w_updated = w - 0.01 * dL_dw

In [36]:
# Same hand-rolled matrix product as before, this time using the updated weights.
output_updated = np.zeros((num_data, output_features))
for d, j in np.ndindex(output_updated.shape):
    for i in range(input_features):
        output_updated[d, j] += x[d, i] * w_updated[i, j]

In [37]:
# Display the original output (computed with w) for comparison with output_updated.
output

array([[ 0.,  4., -4.,  4.],
       [-3.,  0., -6.,  6.],
       [-1.,  2., -4.,  4.]])

In [38]:
# Targets rounded to two decimals, for easier visual comparison.
np.round(true_value, 2)

array([[-0.04,  0.62, -0.04, -0.21],
       [ 0.67, -0.33,  0.3 , -0.26],
       [ 0.91, -0.72,  0.74, -0.05]])

In [39]:
# Display the output recomputed with w_updated.
output_updated

array([[ 0.08,  3.6 , -3.6 ,  3.6 ],
       [-2.82, -0.42, -5.58,  5.58],
       [-0.9 ,  1.66, -3.66,  3.66]])

In [40]:
# Recompute the sum-of-absolute-errors loss against the same target,
# now using the output produced by the updated weights.
L_updated = np.abs(output_updated - true_value).sum()
# The update step is meant to reduce the loss; enforce that instead of
# relying on comparing the two printed numbers by eye.
assert L_updated < L, "loss did not decrease after the weight update"
L_updated

38.088094755935614

In [41]:
# Display the original loss for comparison with L_updated.
L

41.578510114266535

In [42]:
# Yay! The loss went down.
# As for dL_dx: it could be backpropagated further to compute
# dL_d(parameters) of the function that computed x, if x were the output of a hidden layer