# MLP

## Gradient of the cost function

### Unvectorized

We work through calculating the gradient of the cost function for a two-layer MLP (without bias terms) on a single example.

$\textbf{x} \in \mathbb{R}^{p^{[1]}}$

$\textbf{W}^{[1]} \in \mathbb{R}^{p^{[2]} \times p^{[1]}}$

$\textbf{W}^{[2]} \in \mathbb{R}^{1 \times p^{[2]}}$ (the weights in the last layer form a row vector)

$y \in \{0, 1\}$

$\textbf{a}^{[0]} = \textbf{x}$

$\textbf{z}^{[1]} = \textbf{W}^{[1]} \textbf{a}^{[0]} \in \mathbb{R}^{p^{[2]}}$

$\textbf{a}^{[1]} = g^{[1]}(\textbf{z}^{[1]}) \in \mathbb{R}^{p^{[2]}}$

$z^{[2]} = \textbf{W}^{[2]} \textbf{a}^{[1]} \in \mathbb{R}$

$a^{[2]} = g^{[2]}(z^{[2]}) \in \mathbb{R}$

$l = -y \log a^{[2]} - (1-y) \log (1 - a^{[2]})$

$\frac{\partial l}{\partial W_j^{[2]}} = \frac{\partial l}{\partial a^{[2]}} \frac{\partial a^{[2]}}{\partial z^{[2]}} \frac{\partial z^{[2]}}{\partial W_j^{[2]}}$

$\frac{\partial l}{\partial a^{[2]}} = -\frac{y}{a^{[2]}} + \frac{1-y}{1-a^{[2]}}$

$\frac{\partial a^{[2]}}{\partial z^{[2]}} = a^{[2]} (1 - a^{[2]})$

$\frac{\partial z^{[2]}}{\partial W_j^{[2]}} = \frac{\partial}{\partial W_j^{[2]}} \left(W_1^{[2]} a_1^{[1]} + \dots + W_{p^{[2]}}^{[2]} a_{p^{[2]}}^{[1]} \right) = a_j^{[1]}$

$\frac{\partial l}{\partial z^{[2]}} = a^{[2]} - y$

$\frac{\partial l}{\partial W_j^{[2]}} = \frac{\partial l}{\partial z^{[2]}} a_j^{[1]}$

$\frac{\partial l}{\partial W_{i,j}^{[1]}} = \frac{\partial l}{\partial a^{[2]}} \frac{\partial a^{[2]}}{\partial z^{[2]}} \frac{\partial z^{[2]}}{\partial a^{[1]}} \frac{\partial a^{[1]}}{\partial z^{[1]}} \frac{\partial z^{[1]}}{\partial W_{i,j}^{[1]}}$

$\frac{\partial l}{\partial z_i^{[1]}} = \frac{\partial l}{\partial z^{[2]}} W_i^{[2]} g'^{[1]}(z_i^{[1]})$

$\frac{\partial l}{\partial W_{i,j}^{[1]}} = \frac{\partial l}{\partial z_i^{[1]}} \frac{\partial z_i^{[1]}}{\partial W_{i,j}^{[1]}} = \frac{\partial l}{\partial z_i^{[1]}} a_j^{[0]}$

### Vectorized (p_out, p_in)

We start with $\frac{\partial J}{\partial \textbf{Z}^{[L]}} = \textbf{A}^{[L]} - \textbf{y}$. Then, proceeding backwards through the layers, we compute $\frac{\partial J}{\partial \textbf{W}^{[l]}} = \frac{1}{n} \frac{\partial J}{\partial \textbf{Z}^{[l]}} (\textbf{A}^{[l-1]})^T$, where $n$ is the number of examples (the number of columns of $\textbf{A}^{[l-1]}$), and, for $l < L$, $\frac{\partial J}{\partial \textbf{Z}^{[l]}} = \left( (\textbf{W}^{[l+1]})^T \frac{\partial J}{\partial \textbf{Z}^{[l+1]}} \right) \odot g'^{[l]}(\textbf{Z}^{[l]})$.

## Layer implementation

see [mlp.py](https://github.com/hacobe/notes/blob/main/mlp.py)

## Function Implementation

### (p_out, p_in)

* hidden units are stacked vertically and examples are stacked horizontally
* weight matrices therefore have the form (p_out, p_in)

In [2]:
import tensorflow as tf
import numpy as np

2022-09-07 09:08:06.608787: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
def relu(x):
    """Return the element-wise rectified linear unit, max(0, x)."""
    return np.maximum(0, x)

def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) element-wise.

    The value is computed in a numerically stable form: only exp(-|x|) is
    evaluated, so large-magnitude negative inputs do not overflow in np.exp.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        NumPy array with the shape of x (0-d for a scalar input) holding
        values in [0, 1].
    """
    x = np.asarray(x)
    # exp(-|x|) lies in (0, 1], so it cannot overflow for any finite x.
    decay = np.exp(-abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- algebraically equal.
    return np.where(x >= 0, 1. / (1 + decay), decay / (1 + decay))

In [4]:
def forward(X, weights):
    """Run the forward pass with examples stored as columns.

    Args:
        X: Input array. It is transposed on entry so that each column is one
            example (assumes the rows of X are examples, as in test_mlp).
        weights: Sequence of objects exposing ``.numpy()``. Each returned
            array is transposed into the (p_out, p_in) form before use.

    Returns:
        Tuple ``(A, cache)``: the activations of the last layer and a list
        holding one ``(A_prev, W, Z)`` tuple per layer, in layer order.
    """
    last = len(weights) - 1
    cache = []
    A = X.T
    for idx, variable in enumerate(weights):
        W = variable.numpy().T
        Z = np.dot(W, A)
        cache.append((A, W, Z))
        # Sigmoid on the output layer, ReLU on every hidden layer.
        A = sigmoid(Z) if idx == last else relu(Z)
    return A, cache

In [5]:
def backward(AL, y, cache):
    """Compute the weight gradients for the (p_out, p_in) layout.

    The output-layer error is taken as ``AL - y.T`` and hidden layers are
    differentiated as ReLU units (derivative 1 where Z > 0, else 0).

    Args:
        AL: Activations of the last layer, as returned by forward.
        y: Labels; transposed before being subtracted from AL.
        cache: List of per-layer ``(A_prev, W, Z)`` tuples from forward.

    Returns:
        List of dW arrays ordered from the first layer to the last.
    """
    num_layers = len(cache)
    grads = [None] * num_layers
    for layer in reversed(range(num_layers)):
        A_prev, _, Z = cache[layer]

        if layer == num_layers - 1:
            dZ = AL - y.T
        else:
            # Pull the error back through the weights of the following layer,
            # then gate it with the ReLU derivative of this layer.
            W_next = cache[layer + 1][1]
            dA = np.dot(W_next.T, dZ)
            relu_grad = (Z > 0).astype(np.float64)
            dZ = dA * relu_grad

        # Average over the examples, which are the columns of A_prev.
        grads[layer] = (1. / A_prev.shape[1]) * np.dot(dZ, A_prev.T)
    return grads

In [9]:
class Model(tf.keras.Model):
    """Bias-free three-layer MLP used as the TensorFlow reference.

    Layer widths are 16, 8 and 1, with ReLU on the two hidden layers and a
    sigmoid on the output layer.
    """

    def __init__(self):
        super().__init__()
        hidden_kwargs = dict(activation='relu', use_bias=False)
        self.d1 = tf.keras.layers.Dense(16, **hidden_kwargs)
        self.d2 = tf.keras.layers.Dense(8, **hidden_kwargs)
        self.d3 = tf.keras.layers.Dense(1, activation='sigmoid', use_bias=False)

    def call(self, x):
        """Apply the three dense layers in order and return the last output."""
        hidden = self.d1(x)
        hidden = self.d2(hidden)
        return self.d3(hidden)

def assert_close(actual, expected):
    """Raise AssertionError unless actual and expected agree element-wise.

    Two values agree when their absolute difference is strictly below 1e-3.
    The check raises explicitly instead of relying on an ``assert`` statement,
    so it is not stripped when Python runs with -O, and the message reports
    the largest deviation found.

    Args:
        actual: Array-like (or scalar) of computed values.
        expected: Array-like (or scalar) of reference values; must broadcast
            against ``actual``.

    Raises:
        AssertionError: If any element differs by 1e-3 or more (NaN included).
    """
    TOL = 1e-3
    deviation = abs(np.asarray(actual) - np.asarray(expected))
    # Phrased as "not all below" so that NaN deviations count as failures.
    if not (deviation < TOL).all():
        raise AssertionError(
            f"max abs deviation {deviation.max()} is not below tolerance {TOL}")

def test_mlp():
    """Check the NumPy forward/backward pass against TensorFlow.

    Builds a random batch with randomly drawn binary labels, runs it through
    the Keras ``Model``, and asserts that the output activations and every
    weight gradient of the binary cross-entropy loss match what ``forward``
    and ``backward`` produce, within the tolerance of ``assert_close``.
    """
    np.random.seed(0)
    tf.random.set_seed(0)

    num_examples = 10
    num_features = 32

    X = np.random.random((num_examples, num_features))
    # Each label is set to 1 when a fresh uniform draw falls at or below that
    # example's own threshold.
    thresholds = np.random.random((num_examples,))
    y = np.zeros((num_examples, 1))
    for row, threshold in enumerate(thresholds):
        if np.random.random() <= threshold:
            y[row, 0] = 1.

    model = Model()
    bce = tf.keras.losses.BinaryCrossentropy()
    with tf.GradientTape() as tape:
        predictions = model(X)
        loss = bce(y, predictions)
    expected_grads = tape.gradient(loss, model.trainable_variables)

    expected_AL = model(X).numpy()
    actual_AL, cache = forward(X, model.weights)
    actual_grads = backward(actual_AL, y, cache)

    # The NumPy code keeps examples in columns and weights as (p_out, p_in),
    # so its results are transposed before being compared.
    assert_close(actual_AL.T, expected_AL)
    assert len(actual_grads) == len(expected_grads)
    for i, expected in enumerate(expected_grads):
        assert_close(actual_grads[i].T, expected.numpy())

test_mlp()  # run the comparison right away; a mismatch raises AssertionError

### (p_in, p_out)

In [6]:
import numpy as np
import tensorflow as tf

In [7]:
def relu(x):
    """Clamp negative entries of x to zero, element-wise (ReLU)."""
    return np.maximum(0, x)

def sigmoid(x):
    """Return the element-wise logistic sigmoid of x.

    Uses a numerically stable formulation: the only exponential evaluated is
    exp(-|x|), so large-magnitude negative inputs do not overflow np.exp.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        NumPy array shaped like x (0-d for a scalar input) with values in
        [0, 1].
    """
    x = np.asarray(x)
    # Bounded in (0, 1] for every finite x, hence no overflow.
    e = np.exp(-abs(x))
    # sigmoid(x) = 1 / (1 + e) for x >= 0 and e / (1 + e) for x < 0.
    numerator = np.where(x >= 0, 1., e)
    return numerator / (1 + e)

In [8]:
def forward(X, weights):
    """Run the forward pass with examples stored as rows.

    Args:
        X: Input array, used as-is as the first activation (assumes one
            example per row, as in test_mlp).
        weights: Sequence of objects exposing ``.numpy()``. Each returned
            array is used directly as a (p_in, p_out) weight matrix.

    Returns:
        Tuple ``(A, cache)``: the activations of the last layer and a list
        holding one ``(A_prev, W, Z)`` tuple per layer, in layer order.
    """
    num_layers = len(weights)
    cache = []
    A = X
    for depth, variable in enumerate(weights, start=1):
        W = variable.numpy()
        Z = np.dot(A, W)
        cache.append((A, W, Z))
        # The final layer is a sigmoid; all earlier layers are ReLU.
        activation = sigmoid if depth == num_layers else relu
        A = activation(Z)
    return A, cache

In [9]:
def backward(AL, y, cache):
    """Compute the weight gradients for the (p_in, p_out) layout.

    The output-layer error is taken as ``AL - y`` and hidden layers are
    differentiated as ReLU units (derivative 1 where Z > 0, else 0).

    Args:
        AL: Activations of the last layer, as returned by forward.
        y: Labels, subtracted from AL without transposition.
        cache: List of per-layer ``(A_prev, W, Z)`` tuples from forward.

    Returns:
        List of dW arrays ordered from the first layer to the last.
    """
    num_layers = len(cache)
    output_layer = num_layers - 1
    grads = []
    for layer in range(output_layer, -1, -1):
        A_prev, _, Z = cache[layer]

        if layer == output_layer:
            dZ = AL - y
        else:
            # Propagate the error back through the next layer's weights and
            # mask it with the ReLU derivative of the current layer.
            W_next = cache[layer + 1][1]
            upstream = np.dot(dZ, W_next.T)
            relu_grad = (Z > 0).astype(np.float64)
            dZ = upstream * relu_grad

        # dW is simpler than the equations for dZ: average over the examples,
        # which are the rows of A_prev.
        scale = 1. / A_prev.shape[0]
        grads.insert(0, scale * np.dot(A_prev.T, dZ))
    return grads

In [10]:
class Model(tf.keras.Model):
    """Reference Keras MLP: Dense(16) and Dense(8) with ReLU, then Dense(1) with sigmoid.

    None of the layers uses a bias term.
    """

    def __init__(self):
        super().__init__()
        Dense = tf.keras.layers.Dense
        self.d1 = Dense(16, activation='relu', use_bias=False)
        self.d2 = Dense(8, activation='relu', use_bias=False)
        self.d3 = Dense(1, activation='sigmoid', use_bias=False)

    def call(self, x):
        """Feed x through d1, d2 and d3 in sequence and return the result."""
        return self.d3(self.d2(self.d1(x)))

def assert_close(actual, expected):
    """Fail with AssertionError when actual and expected differ too much.

    Every element must satisfy ``|actual - expected| < 1e-3``. The failure is
    raised explicitly rather than through an ``assert`` statement, so the
    check still runs under ``python -O`` and reports the largest deviation.

    Args:
        actual: Array-like (or scalar) of computed values.
        expected: Array-like (or scalar) of reference values; must broadcast
            against ``actual``.

    Raises:
        AssertionError: If any element is off by 1e-3 or more, or is NaN.
    """
    TOL = 1e-3
    gap = abs(np.asarray(actual) - np.asarray(expected))
    within = gap < TOL  # NaN compares False, so it is reported as a failure
    if within.all():
        return
    raise AssertionError(
        f"max abs deviation {gap.max()} is not below tolerance {TOL}")

def test_mlp():
    """Compare the row-major NumPy MLP with TensorFlow's results.

    A seeded random batch with randomly drawn binary labels is passed through
    the Keras ``Model``. The output activations and the gradients of the
    binary cross-entropy loss must match those from ``forward`` and
    ``backward`` within the tolerance of ``assert_close``.
    """
    np.random.seed(0)
    tf.random.set_seed(0)

    batch_size = 10
    input_dim = 32

    X = np.random.random((batch_size, input_dim))
    # A label becomes 1 when a fresh uniform draw is at or below the
    # threshold drawn for that example.
    cutoffs = np.random.random((batch_size,))
    y = np.zeros((batch_size, 1))
    for row, cutoff in enumerate(cutoffs):
        draw = np.random.random()
        if draw <= cutoff:
            y[row, 0] = 1.

    model = Model()
    criterion = tf.keras.losses.BinaryCrossentropy()
    with tf.GradientTape() as tape:
        out = model(X)
        loss = criterion(y, out)
    expected_grads = tape.gradient(loss, model.trainable_variables)

    expected_AL = model(X).numpy()
    actual_AL, cache = forward(X, model.weights)
    actual_grads = backward(actual_AL, y, cache)

    # Both sides use the same orientation here, so no transposes are needed.
    assert_close(actual_AL, expected_AL)
    assert len(actual_grads) == len(expected_grads)
    for i, expected in enumerate(expected_grads):
        assert_close(actual_grads[i], expected.numpy())

test_mlp()  # run the comparison right away; a mismatch raises AssertionError

## Sources

* [Gradient Descent For Neural Networks (C1W3L09)](https://www.youtube.com/watch?v=7bLEWDZng_M&list=PLkDaE6sCZn6Ec-XTbcX1uRg2_u4xOEky0&index=33)
* [Backpropagation Intuition (C1W3L10)](https://www.youtube.com/watch?v=yXcQ4B-YSjQ&list=PLkDaE6sCZn6Ec-XTbcX1uRg2_u4xOEky0&index=35)
* https://towardsdatascience.com/lets-code-a-neural-network-in-plain-numpy-ae7e74410795