In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# 0. Notation

$L$: number of layers ($l$ denotes a layer index, $1 \le l \le L$)

m: number of examples

$n_x$ = $n^{[0]}$: number of variables (features)

$n_y$: number of outputs (classes)

x.shape: (n_x, m)

y.shape: (n_y, m)

$W^{[l]}$.shape: $(n^{[l]}, n^{[l-1]})$

$b^{[l]}$.shape: $(n^{[l]}, 1)$

# 1. Forward Propagation

## 1-1. Initialize parameters
- Weights and biases are parameters that model connections between different layers
- Define parameter shapes: keep the implementation as close as possible to the mathematical calculations

$$X = 
\left[
\begin{matrix} 
x_{1}^{(1)} & x_{1}^{(2)} & \dots & x_{1}^{(m)} \\
x_{2}^{(1)} & x_{2}^{(2)} & \dots & x_{2}^{(m)} \\
\vdots & \vdots & \vdots & \vdots \\
x_{n}^{(1)} & x_{n}^{(2)} & \dots & x_{n}^{(m)} \\
\end{matrix}
\right] \in \mathbb{R}^{n \times m} $$

$$W^{[l]} \in \mathbb{R}^{n^{[l]} \times n^{[l - 1]}}$$

$$b^{[l]} \in \mathbb{R}^{n^{[l]} \times 1}$$

In [6]:
def initialize_parameters(dimensions):
    """Create randomly initialised weights and biases for every layer.

    Parameters
    ----------
    dimensions : sequence of int
        Layer sizes ordered from input to output, e.g. ``[n_x, 32, 32, n_y]``.

    Returns
    -------
    dict
        Maps ``"W<l>"`` to an array of shape ``(dimensions[l], dimensions[l - 1])``
        and ``"b<l>"`` to an array of shape ``(dimensions[l], 1)`` for
        ``l = 1 .. len(dimensions) - 1``. All entries are drawn from a
        standard normal distribution with ``np.random.randn``.
    """
    parameters = {}

    # Walk over consecutive (previous layer, current layer) size pairs.
    layer_pairs = zip(dimensions[:-1], dimensions[1:])
    for layer, (n_prev, n_curr) in enumerate(layer_pairs, start=1):
        parameters[f"W{layer}"] = np.random.randn(n_curr, n_prev)
        parameters[f"b{layer}"] = np.random.randn(n_curr, 1)

    return parameters

In [17]:
# Example: build a 2-32-32-1 network and list each parameter with its shape.
parameters = initialize_parameters([2, 32, 32, 1])

for key in parameters:
    value = parameters[key]
    print(key, value.shape)

W1 (32, 2)
b1 (32, 1)
W2 (32, 32)
b2 (32, 1)
W3 (1, 32)
b3 (1, 1)


## 1-2. Update neuron states

$$Z^{[l]} = W^{[l]} \cdot A^{[l - 1]} + b^{[l]} $$
$$ A^{[l]} = \frac {1} {1 + e^{-Z^{[l]}}}$$

### 1-2-1. Activation function

In [18]:
def sigmoid(Z):
    """Element-wise logistic sigmoid, ``1 / (1 + exp(-Z))``.

    The value is computed in a numerically stable form: the exponential is
    only ever taken of a non-positive number, so very negative inputs do not
    overflow ``np.exp`` (the naive formula emits a RuntimeWarning there).

    Parameters
    ----------
    Z : array_like
        Pre-activation values of any shape.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``Z`` holding the sigmoid of each entry.
    """
    Z = np.asarray(Z)
    # exp(-|Z|) lies in [0, 1], so it can never overflow.
    decay = np.exp(-np.abs(Z))
    # Z >= 0: 1 / (1 + e^-Z);  Z < 0: e^Z / (1 + e^Z) -- algebraically identical.
    A = np.where(Z >= 0, 1 / (1 + decay), decay / (1 + decay))
    return A

### 1-2-2. Forward

In [19]:
def forward(X, parameters):
    """Propagate ``X`` through every layer and collect the activations.

    Parameters
    ----------
    X : numpy.ndarray
        Input matrix; stored unchanged under the key ``"A0"``.
    parameters : dict
        Holds ``"W<l>"`` and ``"b<l>"`` for each layer ``l``.

    Returns
    -------
    dict
        ``"A0"`` (the input) plus ``"A<l>" = sigmoid(W<l> . A<l-1> + b<l>)``
        for every layer, in layer order.
    """
    activations = {"A0": X}
    depth = len(parameters) // 2  # every layer contributes one W and one b

    A_prev = X
    for layer in range(1, depth + 1):
        W = parameters[f"W{layer}"]
        b = parameters[f"b{layer}"]
        A_prev = sigmoid(np.dot(W, A_prev) + b)
        activations[f"A{layer}"] = A_prev

    return activations

In [26]:
# One example with two features, stored as a (2, 1) column.
X = np.array([[2], [1]])
activations = forward(X, parameters)

for key in activations:
    value = activations[key]
    print(key, value.shape)

A0 (2, 1)
A1 (32, 1)
A2 (32, 1)
A3 (1, 1)


### 1-2-3. Predict

In [37]:
def predict(X, parameters):
    """Return hard 0/1 predictions for ``X``.

    Runs a forward pass, takes the activation of the last layer and
    thresholds it at 0.5 (values >= 0.5 become 1, everything else 0).
    The result is an integer array with the shape of the last activation.
    """
    last_layer = len(parameters) // 2
    probabilities = forward(X, parameters)[f"A{last_layer}"]
    is_positive = probabilities >= 0.5
    return is_positive.astype(int)

# 2. Backward Propagation

## 2-1. Evaluate performance - Loss

$$\mathcal{L} = - \frac {1} {m} \sum_{i=1}^{m} \left[ y^{(i)} \log\left(A^{(i)}\right) + \left(1 - y^{(i)}\right) \log\left(1 - A^{(i)}\right) \right]$$

In [28]:
def log_loss(A, y, eps=1e-15):
    """Mean binary cross-entropy between predictions ``A`` and labels ``y``.

    Parameters
    ----------
    A : array_like
        Predicted probabilities, same shape as ``y``.
    y : array_like
        Ground-truth labels, either 1-D of length ``m`` or 2-D with the
        examples along the last axis, i.e. shape ``(n_y, m)``.
    eps : float, optional
        Predictions are clipped to ``[eps, 1 - eps]`` so ``log`` never
        receives 0.

    Returns
    -------
    float
        The summed cross-entropy divided by the number of examples ``m``.
    """
    A = np.clip(A, eps, 1 - eps)
    y = np.asarray(y)
    # Examples run along the last axis, so that is the m to average over.
    # len(y) would return n_y for (n_y, m) labels and skip the averaging.
    m = y.shape[-1]
    return -np.sum(y * np.log(A) + (1 - y) * np.log(1 - A)) / m

## 2-2. Backward - gradient(derivative)

$$\frac {\partial {\mathcal{L}}} {\partial {W}} =
\frac {1} {m}  (A - y) \cdot X^T$$

$$\frac {\partial {\mathcal{L}}} {\partial {b}} =
\frac {1} {m} \sum(A - y)$$

---

$$ dZ^{[L]} = A^{[L]} - y $$
$$ dW^{[l]} = \frac {1} {m} dZ^{[l]} \cdot A^{[l - 1]^T} $$
$$ db^{[l]} = \frac {1} {m} \sum dZ^{[l]} $$

$$ dZ^{[l - 1]} = W^{[l]^T} \cdot dZ^{[l]} \times A^{[l - 1]}(1 - A^{[l - 1]}) $$


In [31]:
def backward(y, activations, parameters):
    """Back-propagate the output error and return all parameter gradients.

    Parameters
    ----------
    y : numpy.ndarray
        Labels; ``y.shape[1]`` is used as the number of examples ``m``.
    activations : dict
        ``"A0" .. "A<L>"`` as produced by the forward pass.
    parameters : dict
        ``"W<l>"`` / ``"b<l>"`` for every layer.

    Returns
    -------
    dict
        ``"dW<l>"`` and ``"db<l>"`` for every layer, inserted from the last
        layer down to the first.
    """
    n_layers = len(parameters) // 2
    m = y.shape[1]
    gradients = {}

    # Error at the output layer.
    delta = activations[f"A{n_layers}"] - y

    for layer in range(n_layers, 0, -1):
        scale = 1 / m
        A_prev = activations[f"A{layer - 1}"]
        gradients[f"dW{layer}"] = scale * np.dot(delta, A_prev.T)
        gradients[f"db{layer}"] = scale * np.sum(delta, axis=1, keepdims=True)
        if layer == 1:
            # Nothing below the first layer to propagate into.
            break
        # Push the error one layer down through W and the sigmoid derivative.
        delta = np.dot(parameters[f"W{layer}"].T, delta) * A_prev * (1 - A_prev)

    return gradients

In [34]:
# Single label for the single example above, shape (1, 1).
y = np.array([[1]])
gradients = backward(y, activations, parameters)

for key in gradients:
    value = gradients[key]
    print(key, value.shape)

dW3 (1, 32)
db3 (1, 1)
dW2 (32, 32)
db2 (32, 1)
dW1 (32, 2)
db1 (32, 1)


## 2-3. Update - Vectorized gradient descent

$$\begin{bmatrix} w_1 \\ w_2 \end{bmatrix}
= \begin{bmatrix} w_1 \\ w_2 \end{bmatrix}
- \alpha
\begin{bmatrix}
\frac {\partial {\mathcal{L}}} {\partial {w_1}} \\[6pt]
\frac {\partial {\mathcal{L}}} {\partial {w_2}}
\end{bmatrix},
\quad \text{where} \quad
W = \begin{bmatrix} w_1 \\ w_2 \end{bmatrix},
\quad
\frac {\partial {\mathcal{L}}} {\partial {W}} =
\begin{bmatrix}
\frac {\partial {\mathcal{L}}} {\partial {w_1}} \\[6pt]
\frac {\partial {\mathcal{L}}} {\partial {w_2}}
\end{bmatrix}$$

$$b = b - \alpha
\frac {\partial {\mathcal{L}}} {\partial {b}}$$

$$ W^{[l]} = W^{[l]} - \alpha \times dW^{[l]} $$

$$b^{[l]} = b^{[l]} - \alpha \times db^{[l]} $$

In [35]:
def update(gradients, parameters, lr):
    """Apply one gradient-descent step to every weight and bias.

    Each ``parameters["W<l>"]`` / ``parameters["b<l>"]`` array is modified
    in place by subtracting ``lr`` times the matching ``"dW<l>"`` /
    ``"db<l>"`` gradient. The same ``parameters`` dict is returned.
    """
    depth = len(parameters) // 2

    for layer in range(1, depth + 1):
        for prefix in ("W", "b"):
            key = f"{prefix}{layer}"
            parameters[key] -= lr * gradients["d" + key]

    return parameters

# 3. Neural Network - multilayer perceptron with configurable hidden layers

In [40]:
def neural_net(x_train, y_train, hidden_layers = (32, 32, 32), x_test=None, y_test=None, lr=0.01, epochs=100):
    """Train a sigmoid multilayer perceptron with full-batch gradient descent.

    Every 10th epoch the training loss and accuracy are recorded. When both
    ``x_test`` and ``y_test`` are given, test loss and accuracy are recorded
    as well and drawn next to the training curves. After training a
    three-panel figure is shown: loss, accuracy and decision boundary.

    Parameters
    ----------
    x_train : numpy.ndarray
        Training inputs; ``x_train.shape[0]`` is used as the input layer size.
    y_train : numpy.ndarray
        Training labels; ``y_train.shape[0]`` is used as the output layer size.
    hidden_layers : sequence of int, optional
        Sizes of the hidden layers, in order.
    x_test, y_test : numpy.ndarray, optional
        Held-out data; only used when both are provided.
    lr : float, optional
        Learning rate of the gradient-descent step.
    epochs : int, optional
        Number of full-batch training iterations.

    Returns
    -------
    tuple
        ``(parameters, train_loss, train_acc)`` where the two lists hold one
        entry per recorded epoch.
    """
    log_every = 10  # record metrics once every this many epochs

    # Fixed seed so that every call starts from the same initial parameters.
    np.random.seed(0)
    dimensions = [x_train.shape[0]] + list(hidden_layers) + [y_train.shape[0]]

    parameters = initialize_parameters(dimensions)
    output_key = "A" + str(len(parameters) // 2)
    has_test = x_test is not None and y_test is not None

    train_loss = []
    train_acc = []
    test_loss = []
    test_acc = []

    for epoch in tqdm(range(epochs)):

        activations = forward(x_train, parameters)
        gradients = backward(y_train, activations, parameters)
        parameters = update(gradients, parameters, lr)

        if epoch % log_every == 0:
            # Train loss. Note: it is computed from the activations of the
            # forward pass above, i.e. before this epoch's parameter update,
            # whereas the accuracies below use the updated parameters.
            train_loss.append(log_loss(activations[output_key], y_train))
            # Train accuracy
            y_pred = predict(x_train, parameters)
            train_acc.append(accuracy_score(y_train.flatten(), y_pred.flatten()))

            if has_test:
                # Test loss
                test_activations = forward(x_test, parameters)
                test_loss.append(log_loss(test_activations[output_key], y_test))
                # Test accuracy
                y_pred = predict(x_test, parameters)
                test_acc.append(accuracy_score(y_test.flatten(), y_pred.flatten()))

    # Epoch numbers at which the metrics above were recorded.
    logged_epochs = list(range(0, epochs, log_every))

    fig, ax = plt.subplots(1, 3, figsize=(16, 4))

    ax[0].plot(logged_epochs, train_loss, label="train loss")
    if has_test:
        ax[0].plot(logged_epochs, test_loss, label="test loss")
    ax[0].set_title("Loss")
    ax[0].set_xlabel("epoch")
    ax[0].set_ylabel("log loss")
    ax[0].legend()

    ax[1].plot(logged_epochs, train_acc, label="train_acc")
    if has_test:
        ax[1].plot(logged_epochs, test_acc, label="test_acc")
    ax[1].set_title("Accuracy")
    ax[1].set_xlabel("epoch")
    ax[1].set_ylabel("accuracy")
    ax[1].legend()

    plot_learning_curves(x_train, y_train, parameters, ax[2])

    fig.tight_layout()
    plt.show()

    return parameters, train_loss, train_acc

### Decision Boundary (`plot_learning_curves`)

In [38]:
def plot_learning_curves(X, y, parameters, ax, bounds=(-1.5, 1.5), resolution=100):
    """Draw the model's decision regions on ``ax`` with the dataset on top.

    Despite its name, this function plots a decision boundary, not learning
    curves. A square grid is classified with ``predict`` and shown as a
    filled contour; the data points are scattered over it.

    Parameters
    ----------
    X : numpy.ndarray
        Inputs; row 0 is plotted on the x-axis and row 1 on the y-axis.
    y : array_like
        Labels used to colour the points; flattened before plotting.
    parameters : dict
        Trained network parameters passed on to ``predict``. The grid has
        two rows, so the network is assumed to take two input features.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    bounds : tuple of float, optional
        ``(low, high)`` range of the grid along both axes.
    resolution : int, optional
        Number of grid nodes along each axis.
    """
    # Generate input data for the decision boundary plot
    low, high = bounds
    x1_range = np.linspace(low, high, resolution)
    x2_range = np.linspace(low, high, resolution)
    x1_grid, x2_grid = np.meshgrid(x1_range, x2_range)

    # Classify the whole grid in one batched call instead of one call per
    # node: each column of grid_points is one (x1, x2) pair.
    grid_points = np.vstack([x1_grid.ravel(), x2_grid.ravel()])
    # Row 0 is the first output unit; reshape back so z_grid[i, j] belongs to
    # (x1_grid[i, j], x2_grid[i, j]).
    z_grid = predict(grid_points, parameters)[0].reshape(x1_grid.shape)

    # Plotting decision boundary
    contour = ax.contourf(x1_grid, x2_grid, z_grid, levels=50, cmap='viridis', alpha=0.7)
    plt.colorbar(contour, ax=ax)

    # Plot the dataset points; a flat colour array avoids a (1, m) label row
    # being mistaken for a single RGB/RGBA colour when m is 3 or 4.
    ax.scatter(X[0, :], X[1, :], c=np.ravel(y), cmap="summer", edgecolor='k')
    ax.set_title("Decision Boundary")
    ax.set_xlabel('x1')
    ax.set_ylabel('x2')