# Nerual Network Python Implementation

## Imports

In [1316]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
from helpers.hw2 import get_mnist_threes_nines, display_image
%matplotlib inline

## Notebook Helpers

In [1317]:
def single_exec(func):
    '''Runs a function and removes the funciton scope'''
    display(func())
    del func

## Assertion Helpers

In [1318]:
ASSERT_OFF = False # Kill switch when we don't want assertions

In [1319]:
def isarray(a, dim_len):
    if ASSERT_OFF: return
    if isinstance(dim_len, int):
        dim_len = [dim_len]
    assert isinstance(a, np.ndarray), f"Expected np.array but was {type(a)}: {a}"
    assert len(a.shape) in dim_len, f"Expected dimention {dim_len} but was {len(a.shape)}: {a}"

def test():
    a = np.array([[1,2,3]])
    #a = [1,2,3,4]
    isarray(a, 2)
single_exec(test)

None

In [1320]:
def iscallable(c, input_len, x=10):
    if ASSERT_OFF: return
    from inspect import signature
    assert callable(c)
    
    actual_input_len = len([noe for noe in signature(c).parameters.values() if "=" not in str(noe)])
    assert actual_input_len == input_len, f"Expected {c.__name__} to take {input_len} arguments, but was {actual_input_len} instead."
def test():
    iscallable(iscallable, 2)
single_exec(test)

None

In [1321]:
def isnum(i):
    if ASSERT_OFF: return
    assert isinstance(i, int) or isinstance(i, np.int64) or isinstance(i, float), f"Expected to be int, but was {type(i)}: {i}"

def test():
    #is_int("2")
    isnum(2)
single_exec(test)

None

In [1322]:
def islist(l, content_type):
    if ASSERT_OFF: return
    
    assert isinstance(l, list), f"Expected to be list, but was {type(l)}: {l}"
    def check(content):
        if content_type == "callable":
            return callable(content)
        if isinstance(content_type, tuple) and isinstance(content_type[0], type):
            if content_type[0] == np.ndarray:
                isarray(content, content_type[1])
        return isinstance(content, content_type)
    not_matching = [str(content) for content in l if not check(content)]
    assert len(not_matching) == 0, f"These elements are not of type {content_type}: {', '.join(not_matching)}"
def test():
    #is_int("2")
    #islist(["hei", 2, "bo", 3], str)
    islist(["hei", "bo"], str)
single_exec(test)

None

## Neural Network Helpers

> 1.2a (finite differences checker, used to help implement `my_nn_finite_difference_checker` in 1.3a. Feel free to modify the function signature, or to skip this part and implement `my_nn_finite_difference_checker` without this helper function.)

**`finite_difference_checker`**

Parameter 1 `f`: function

Parameter 2 `x`: np.array

Parameter 3 `k`: which $x_i$ to take partial derivative of

Parameter `epsilon`: infinitively small value (approx)

Returns: $\frac{\partial f}{\partial x_{k+1}}(x)$

In [1323]:
def finite_difference_checker(f, x, k, epsilon = 10**(-5)):
    """Returns $\frac{\partial f}{\partial x_k}(x)$"""
    iscallable(f, 1)
    isarray(x, 1)
    isnum(k)
    
    minuend_x = x+np.identity(x.shape[0])[k]*epsilon
    subtrahend_x = x-np.identity(x.shape[0])[k]*epsilon
    return (f(minuend_x)- f(subtrahend_x))/(2*epsilon)

def test():
    f = lambda x: x**2
    x = np.array([1,2,3,4])
    k = 0
    return finite_difference_checker(f, x, k)
single_exec(test)

array([2., 0., 0., 0.])

**`gradient_approx`**

Parameter 1 f: function

Parameter 2 x: np.array

Parameter `epsilon`: infinitively small value (approx)

Returns: $\nabla_x f(x)$


In [1324]:
''' def gradient_approx(f, x, epsilon = 10**(-5)):
    x_matrix = np.full((x.size, x.size), x)
    epsilon_matrix = np.identity(x.size) * epsilon
    minuend_x = x_matrix+epsilon_matrix
    subtrahend_x = x_matrix-epsilon_matrix
    
    return (f(minuend_x)- f(subtrahend_x))*(1/(2*epsilon)) '''

def gradient_approx(f, x, epsilon = 10**(-5)):
    iscallable(f, 1)
    isarray(x, 1)
    isnum(epsilon)
    
    fdc = lambda i: finite_difference_checker(f, x, i, epsilon=epsilon, )
    vfdc = np.vectorize(fdc, signature="()->(n)")
    return vfdc(np.arange(x.shape[0]))
    
def test():
    f = lambda x: np.exp(x)
    x = np.array([1,2,3,4,1])
    return gradient_approx(f, x)
single_exec(test)

array([[ 2.71828183,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  7.3890561 ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        , 20.08553692,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , 54.59815003,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  2.71828183]])

> 1.2b (functions that implement neural network layers)

**`sigmoid_activation`**

Parameter 1 `x`: np.array

Returns: (out: sigmoids output, grad: gradient of the sigmoid in respect of $x_i$) 

In [1325]:
def sigmoid_activation(x, epsilon = 10**(-15)):
    # YOUR CODE HERE
    isarray(x, 1)
    isnum(epsilon)

    positive_sigmoid = lambda x: 1 / (1 + np.exp(-x, where = x>=0))
    sigmoid = lambda x: np.where(x >= 0, positive_sigmoid(x), 1-positive_sigmoid(-x))
    #sigmoid = lambda x: 1 / (1 + np.exp(-x))
    ''' inv_sigmoid = lambda y: -np.log((1/y ) - 1)
    clipped_x = np.clip(sigmoid(x), inv_sigmoid(clip_start), inv_sigmoid(clip_end))  '''
    clip_start = epsilon
    clip_end = 1-epsilon
    out = np.clip(sigmoid(x), clip_start, clip_end)

    #grad = gradient_approx(sigmoid, x) # Disabled do to easier to calculate
    grad = (sigmoid(x)*(1-sigmoid(x)))*np.identity(x.shape[0])
    return out, grad

def test():
    x = np.array([1,2,3,np.int64(40**9), -1000])
    return sigmoid_activation(x)
single_exec(test)

(array([7.31058579e-01, 8.80797078e-01, 9.52574127e-01, 1.00000000e+00,
        1.00000000e-15]),
 array([[0.19661193, 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.10499359, 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.04517666, 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ]]))

**`logistic_loss`**

Parameter 1 `g`: np.array of values

Parameter 2 `y`: np.array of expected values

Returns: (loss: array of losses, dL_dg: gradient of the loss with respect to `g`) 

In [1326]:

def logistic_loss(g, y):
    """
    Computes the loss and gradient for binary classification with logistic
    loss

    Inputs:
    - g: Output of final layer with sigmoid activation,
         of shape (n, 1) 

    - y: Vector of labels, of shape (n,) where y[i] is the label for x[i] 
         and y[i] in {0, 1}

    Returns a tuple of:
    - loss: array of losses
    - dL_dg: Gradient of the loss with respect to g
    """
    # YOUR CODE HERE
    isarray(g, 2)
    isarray(y, 1)

    g = g.T[0]
    isarray(g, 1)

    def loss_function (g): 
        g = np.clip(g, 1e-15, 1-1e-15) # To avoid 1+1e-15 and -1e-15
        return -np.log((g**y) * ((1 - g)**(1-y)))

    loss = loss_function(g)
    dL_dg = gradient_approx(loss_function, g)
    return loss, dL_dg

def test():
    g = np.array([[.1,.2,.3,.4]]).T
    y = np.array([.1,.2,.3,.2])
    return logistic_loss(g, y)
single_exec(test)

(array([0.32508297, 0.50040242, 0.6108643 , 0.59191865]),
 array([[-3.29736238e-09,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00],
        [ 0.00000000e+00, -7.88258347e-10,  0.00000000e+00,
          0.00000000e+00],
        [ 0.00000000e+00,  0.00000000e+00, -3.05311332e-10,
          0.00000000e+00],
        [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          8.33333333e-01]]))

**`relu_activation`**

Parameter 1 `s`: np.array

Returns: (out: relus output, ds: gradient of the relu in respect of $s_i$) 

In [1327]:
def relu_activation(s):
    # YOUR CODE HERE
    isarray(s, 1)
    
    relu_function = lambda s: np.max([np.full(s.shape[0], 0), s], axis=0)
    out = relu_function(s)
    ds = gradient_approx(relu_function, s) # Disabled due to easyer to calculate
    return out, ds

def test():
    s = np.array([-1,2,-3,4, 0])
    return relu_activation(s)
single_exec(test)

(array([0, 2, 0, 4, 0]),
 array([[0. , 0. , 0. , 0. , 0. ],
        [0. , 1. , 0. , 0. , 0. ],
        [0. , 0. , 0. , 0. , 0. ],
        [0. , 0. , 0. , 1. , 0. ],
        [0. , 0. , 0. , 0. , 0.5]]))

> To find $\frac{\partial l (W, b)}{\partial W_{i,j}}$: 
> $$\frac{\partial l (W, b)}{\partial w_{i,j}} = \delta_j^{(l)}x_i^{(l-1)}= \frac {\partial e(w_{i,j})}{\partial s_j^{(l)}} x_i^{(l-1)}$$

**`layer_forward`**

Parameter 1 `x`: np.array, $n \times d^{(l-1)}$

Parameter 2 `W`: np.array, $d^{(l-1)} \times d^{(l)}$, contains all $w_{i,j}^{(l)}$ for this layer

Parameter 3 `b`: np.array, $1 \times d^{(l)}$ (matrix, not vector)

Parameter 3 `activation_fn`: (in: np.array) -> (out: np.array, grad: np.array)

Returns: (out: np.array ($n \times d^{(l)}$) with next neurons, cache: (x: same as input,  da_dz: $\frac{\partial a}{\partial z}$, W)) 


In [1328]:

def layer_forward(x, W, b, activation_fn):
    # YOUR CODE HERE
    isarray(x, 2)
    isarray(W, 2)
    isarray(b, 2)
    iscallable(activation_fn, 1)
    
    n, _ = x.shape
    b = b[0]
    repeated_b = np.full((n, b.shape[0]), b)
    act_fn_in = (x @ W) + repeated_b
    v_activation_fn = np.vectorize(activation_fn, signature="(n)->(n), (n, n)")
    out, da_dz = v_activation_fn(act_fn_in)
    cache = (x, da_dz, W)
    return out, cache

def test():
    # n = 3
    # d^(l-1) = 4
    # d^(l) = 2
    x = np.array([
        [1, 2, 3, 4], 
        [4, 2, 3, 4],
        [4, 2, 6, 4],
    ])
    W = np.array([
        [-1, -3], 
        [-4, 2],
        [2, 3],
        [2, 2],
    ])
    b = np.array([[3, 1]])
    activation_fn = relu_activation
    return layer_forward(x, W, b, activation_fn)
single_exec(test)

(array([[ 8, 19],
        [ 5, 10],
        [11, 19]]),
 (array([[1, 2, 3, 4],
         [4, 2, 3, 4],
         [4, 2, 6, 4]]),
  array([[[1., 0.],
          [0., 1.]],
  
         [[1., 0.],
          [0., 1.]],
  
         [[1., 0.],
          [0., 1.]]]),
  array([[-1, -3],
         [-4,  2],
         [ 2,  3],
         [ 2,  2]])))

> 1.3b i, ii (deliverables for the sigmoid activation)

In [1329]:
# 1.3b i
s = np.asarray([1., 0., -1])
out, grad = sigmoid_activation(s)
with np.printoptions(precision=2):
    print(out)
    print(grad)
    
print("="*80)

# 1.3b ii
s = np.asarray([-1000., 1000.])
out, grad = sigmoid_activation(s)
print(out)
print(grad)

[0.73 0.5  0.27]
[[0.2  0.   0.  ]
 [0.   0.25 0.  ]
 [0.   0.   0.2 ]]
[1.e-15 1.e+00]
[[0. 0.]
 [0. 0.]]


> 1.3b iii: What is the derivative of the negative log-likelihood loss with respect to $g$?
> 
> **your answer here**

>1.3b iv: Explain what is returned in `cache` in your `layer_forward` implementation. (Trying to answer this question before completing your implementation might help think about should go in `cache`, which should be stuff computed during the forward pass that is needed for backpropagation in the backward pass. Just make sure your final answer pertains to what you ultimately return in `cache`.)
>
>**your answer here**

> 1.2c (in this part you will code functions that initialize the neural network's weights. You will also code the forward pass which ties everything together, computing the output of a neural network with weights given by `weight_matrices` + `biases`, activation functions given by `activations`, on the input `X_batch`, a 2d input where each row is an individual input vector)

**`create_weight_matrices`**

Parameter 1 `layer_dims`: np.array, contains dimention of each layer in the neural network. `layer_dims[i]` defines
      the number of neurons in the i+1 layer

Returns: np.array, list of weight matrices


In [1330]:
def create_weight_matrices(layer_dims):
    """
    Creates a list of weight matrices defining the weights of NN
    
    Inputs:
    - layer_dims: A list whose size is the number of layers. layer_dims[i] defines
      the number of neurons in the i+1 layer.

    Returns a list of weight matrices
    """
    # YOUR CODE HERE
    isarray(layer_dims, 1)
    sizes=np.array([layer_dims[:-1], layer_dims[1:]]).T
    weights = [np.random.normal(0,1,size) for size in sizes]
    return weights

def test():
    #layer_dims = np.array([784, 200, 1])
    layer_dims = np.array([7, 4, 1])
    return create_weight_matrices(layer_dims)
single_exec(test)

[array([[ 9.62418473e-01, -2.35200248e+00,  1.00437977e+00,
          1.03908306e+00],
        [ 5.54588567e-01, -1.36129186e+00,  8.34660576e-01,
          5.07040471e-04],
        [ 5.36768149e-01,  3.40903668e+00,  4.54052129e-01,
          1.26610067e+00],
        [ 6.73983012e-01, -8.99932569e-01,  1.52008891e-01,
          4.75192965e-01],
        [-3.13269719e-01, -6.96115225e-01,  8.49764422e-01,
          8.88587366e-01],
        [-1.45182439e+00, -1.36431726e-01,  8.22917619e-01,
          6.87855429e-01],
        [ 5.58985498e-01, -3.06271987e-01,  1.26219326e-01,
          5.77255025e-01]]),
 array([[-0.02013194],
        [ 0.43121298],
        [ 0.0156675 ],
        [-0.83883607]])]

**`create_bias_vectors`**

Parameter 1 `layer_dims`: np.array, contains dimention of each layer in the neural network. `layer_dims[i]` defines
      the number of neurons in the i+1 layer

Returns: np.array, list of weight biases for n-1 layers (first layer doesn't need bias)


In [1331]:
def create_bias_vectors(layer_dims):
    # YOUR CODE HERE
    isarray(layer_dims, 1)
    biases = [np.random.normal(0,1,(1, size)) for size in layer_dims[1:]]
    return biases

def test():
    #layer_dims = np.array([784, 200, 1])
    layer_dims = np.array([7, 4, 1])
    return create_bias_vectors(layer_dims)
single_exec(test)

[array([[ 0.01836862, -0.08424494,  1.5009134 , -0.75526176]]),
 array([[1.99907498]])]

**`forward_pass`**

Parameter 1: `X_batch`: np.array, matrix og shape ($n$, $d^{(1)}$)

Parameter 2: `weight_matrices`: np.array, list of weight matrices, [$W^{(1)}, ..., W^{(\max(l)-1)}$]

Parameter 3: `biases`: np.array, list of bias vectors for each layer (without first layer), [$b^{(1)}$, ..., $b^{(\max(l)-1)}$]

Parameter 4: `activations`: np.array, list of activation funciton of each layer, size will be $\max(l)-1$

Returns: np.array, (vector of output, list of layer caches)


In [1332]:
def forward_pass(X_batch, weight_matrices, biases, activations):
    # YOUR CODE HERE
    isarray(X_batch, 2)
    islist(weight_matrices, (np.ndarray, 2))
    islist(biases, (np.ndarray, 2))
    islist(activations, "callable")

    output = X_batch
    layer_caches = []
    for W, b, activation_fn in zip(weight_matrices, biases, activations):
        output, cache = layer_forward(output, W, b, activation_fn)
        layer_caches.append(cache)
    return output, layer_caches

def test():
    X_batch = np.array([
        [1, 2, 3, 4, 8, 4, 1], 
        [4, 2, 3, 4, 8, 2, 9],
        [4, 2, 6, 4, 2, 9, 1],
    ])
    layer_dims = np.array([7, 4, 1])
    weight_matrices = create_weight_matrices(layer_dims)
    biases = create_bias_vectors(layer_dims)
    activations = [relu_activation, sigmoid_activation]
    return forward_pass(X_batch, weight_matrices, biases, activations)
single_exec(test)

(array([[0.01000946],
        [0.05939331],
        [0.2461504 ]]),
 [(array([[1, 2, 3, 4, 8, 4, 1],
          [4, 2, 3, 4, 8, 2, 9],
          [4, 2, 6, 4, 2, 9, 1]]),
   array([[[1., 0., 0., 0.],
           [0., 0., 0., 0.],
           [0., 0., 0., 0.],
           [0., 0., 0., 1.]],
   
          [[1., 0., 0., 0.],
           [0., 0., 0., 0.],
           [0., 0., 0., 0.],
           [0., 0., 0., 0.]],
   
          [[1., 0., 0., 0.],
           [0., 0., 0., 0.],
           [0., 0., 0., 0.],
           [0., 0., 0., 0.]]]),
   array([[ 0.4795968 , -0.32412168, -0.08872   , -0.88953954],
          [ 0.43331034, -0.62320052, -0.58478363,  0.55919913],
          [-1.01631523, -0.98634105, -0.448563  ,  0.70363832],
          [-0.43549427,  1.08966881,  1.21891652,  0.07512697],
          [ 0.46727646, -0.90424034, -1.23869719,  0.31595761],
          [ 0.41406365, -1.9264484 , -1.5812752 , -0.7073796 ],
          [ 1.37214123,  0.58686031, -1.03650839, -0.56970373]])),
  (array([[ 4.62740

1.3c (deliverable which has you run a forward pass of your neural network and compute its logistic loss on some output)

In [1333]:
with open("data/test_batch_weights_biases.pkl", "rb") as fn:
    (X_batch, y_batch, weight_matrices, biases) = pickle.load(fn)

activations = [relu_activation, sigmoid_activation]
output, _ = forward_pass(X_batch, weight_matrices, biases,
                         activations)
print(output)
loss, dL_dg = logistic_loss(output, y_batch)
print(loss.mean())

[[0.49732013]
 [0.49732464]]
0.698516803853688


> 1.3a (deliverable which has you compute the gradient w.r.t. `weight_matrices` and `biases` using a finite differences checker)

In [1334]:
%%script false --no-raise-error
with open("data/test_batch_weights_biases.pkl", "rb") as fn:
    (X_batch, y_batch, weight_matrices, biases) = pickle.load(fn)

def my_nn_finite_difference_checker(X_batch, y_batch, weight_matrices, biases, activations):
    # YOUR CODE HERE
    return grad_Ws, grad_bs

grad_Ws, grad_bs = my_nn_finite_difference_checker(X_batch, 
                                                   y_batch, 
                                                   weight_matrices, 
                                                   biases, 
                                                   activations)   

with np.printoptions(precision=2):
    print(grad_Ws[0])
    print()
    print(grad_Ws[1])
    print()
    print(grad_bs[0])
    print()
    print(grad_bs[1])

> 1.2d (the backward pass!!!!)

**`backward_pass`**

Parameter 1: `dL_dg`: np.array, matrix, gradient of the loss from logistic loss

Parameter 2: `layer_caches`: list of caches (x: np.ndarray,  s: np.ndarray))

Returns: (grad_Ws: , grad_bs)


In [1335]:
#%%script false --no-raise-error

def backward_pass(dL_dg, layer_caches):
    # YOUR CODE HERE
    '''  for i, cache in enumerate(layer_caches[::-1]):
        x, da_dz, W = cache
        def z_fn(w):
            w_T = np.array([w]).T
            return (x @ w_T).T # without constant "+b" as it will make no difference here
        #v_z = np.vectorize(z, signature="(n,m) -> (1, m)")
        dz_dw = x #np.array([gradient_approx(z_fn, w) for w in W.T]).T
        display(dz_dw)
        #da_dw = np.array([dz_dw_i @ da_dz_i for dz_dw_i, da_dz_i in zip(dz_dw, da_dz)])
        print("dz_dw:", dz_dw.shape) 
        print("da_dz:", da_dz.shape) 
        #print("da_dw:", da_dw.shape) 
        print("dL_dg:", dL_dg.shape) 

    grad_Ws = dL_dg @ x '''

    dL_da = [dL_dg]
    for i, (x, da_dz, W) in enumerate(layer_caches[::-1]):
        W_mul_da_dz = np.array([(W @ da_dz_i).T[0] for da_dz_i in da_dz]).T
        dL_da.append((W_mul_da_dz @ dL_da[-1]).T)
    dL_da.reverse()

    for i, ((x, da_dz, W), dL_da_i) in enumerate(zip(layer_caches, dL_da)):
        #_, da_dz, _ = layer_caches[i-1]
        #print("dL_da:", dL_da_i)
        noe = [np.array([ x_i_j * np.diag(da_dz_i) for x_i_j in x_i]).T  for x_i, da_dz_i in zip(x, da_dz)]
        noe = [res @ dL_da for res in noe]
        print(noe)
        #print("--------------------------------")
        print(x)
        print(da_dz)
        print(dL_da_i)
    
    

    grad_Ws = ...
    grad_bs = ...
    return grad_Ws, grad_bs

def test():
    X_batch = np.array([
        [1, 2, 3, 4, 8, 4, 1], 
        [4, 2, 3, 4, 8, 2, 9],
        [4, 2, 6, 4, 2, 9, 1],
    ])
    y_batch = np.array([ 0.99999995, 0.99966556, 0.9998216])
    layer_dims = np.array([7, 4, 1])
    weight_matrices = create_weight_matrices(layer_dims)
    biases = create_bias_vectors(layer_dims)
    activations = [relu_activation, sigmoid_activation]
    output, layer_caches = forward_pass(X_batch, weight_matrices, biases, activations)
    loss, dL_dg = logistic_loss(output, y_batch)
    return backward_pass(dL_dg, layer_caches)
single_exec(test)

  noe = [res @ dL_da for res in noe]


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 3 is different from 7)

> 1.3d (test your backward pass! compare it with 1.3a, the gradient computed by the finite difference checker. The answers should match!)

In [None]:
#%%script false --no-raise-error
with open("data/test_batch_weights_biases.pkl", "rb") as fn:
    (X_batch, y_batch, weight_matrices, biases) = pickle.load(fn)

activations = [relu_activation, sigmoid_activation]
output, layer_caches = forward_pass(X_batch, weight_matrices, biases,
                                    activations)
loss, dL_dg = logistic_loss(output, y_batch)
grad_Ws, grad_bs = backward_pass(dL_dg, layer_caches)

grad_bs = grad_Ws # Slett denne linja!!

with np.printoptions(precision=2):
    print(grad_Ws[0])
    print()
    print(grad_Ws[1])
    print()
    print(grad_bs[0])
    print()
    print(grad_bs[1])

  noe = [res @ dL_da for res in noe]


ValueError: could not broadcast input array from shape (2,4) into shape (2,)

1.2e (train your neural network on MNIST! save the training and test losses and accuracies at each iteration to use in 1.3e)

In [None]:
(X_train, y_train), (X_test, y_test) = get_mnist_threes_nines()

# YOUR CODE HERE
ASSERT_OFF = True
#display(["".join(["." if i==0 else "@" for i in line]) for line in X_train[0]])
#display("".join(["." if i==0 else "@" for i in X_train[0]]))
# Flatten out
m, n, o = X_train.shape
X_train = X_train.reshape((m, n*o))

# Make it easier to debug
capacity = 1000
X_train = X_train[:capacity]
y_train = y_train[:capacity]

# Train
layer_dims = np.array([784, 200, 1])
weight_matrices = create_weight_matrices(layer_dims)
biases = create_bias_vectors(layer_dims)
activations = [relu_activation, sigmoid_activation]
out = None
for i in range(2):
    output, layer_caches = forward_pass(X_train, weight_matrices, biases, activations)
    loss, dL_dg = logistic_loss(output, y_train)
    print(loss.mean())
    grad_Ws, grad_bs = backward_pass(dL_dg, layer_caches)
    weight_matrices = [(W.shape, grad_Ws.shape) for W in weight_matrices]
    print(weight_matrices)
ASSERT_OFF = False

15.271325018672163
(1000, 200, 200)
(1000, 1, 1)
[((784, 200), (1000, 200)), ((200, 1), (1000, 200))]


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 2 is different from 784)

1.3e code answers for i, ii, iii

In [None]:
# i
# Plot the train and test losses from the MNIST network with step size = 0.1
# YOUR CODE HERE

# ii
# Plot the train and test accuracies from the MNIST network with step size = 0.1
# YOUR CODE HERE

# iii
# Visualize (plot) some images that are misclassified by your network

1.3e iii:
Examine the images that your network guesses incorrectly, and explain at a high level
what patterns you see in those images.

**your answer here**

1.3e iv:
Rerun the neural network training but now increase the step size to 10.0. What happens?
You do not need to include plots here.

**your answer here**

1.3f (optional) (Train a network to fit 100 random images to the first 100 original labels! How fast can you memorize the dataset?)

In [None]:
X_train = np.random.rand(100, 784)
# YOUR CODE HERE