# ANN Backpropagation

## Transfer Functions

The derivatives take the output of the functions

In [99]:
import random
import numpy as np

class Relu:
    def __call__(self, x):
        y = np.copy(x)
        y[y<0]=0
        return y

    def der(self, x):
        y = np.copy(x)
        y[y>0]=1
        y[y<=0]=0
        return y


class Linear:
    def __call__(self, x):
        return x

    def der(self, x):
        return np.ones_like(x)
    

class Sigmoid:
    def __call__(self, x):
        return 1/(1+np.exp(-x))

    def der(self, y):
        return y*(1-y)  
    
    
class Tanh:
    def __call__(self, x):
        return np.tanh(x)

    def der(self, y):
        return y*(1-y)  


## Define Network Functions

In [112]:
# forward pass
def forward(X, weights, bias, transfer_funcs):
    
    activations = [X]
    
    for W, b, f in zip(weights, bias, transfer_funcs):
        activations.append(f(activations[-1]@W +b))
    
    # first activation is input
    return activations[-1], activations



# backward pass for calculating gradients
def backward(act, err_der, weights, bias, transfer_funcs):
    
    grad = []
    # output_layer
    l_out = transfer_funcs[-1].der(act[-1])*err_der
    l_in = act[-2]
        
    dW = l_in.T@l_out
    dB = np.mean(l_out, axis=0)
    bias[-1] -= lr*dB
    
    grad.append(dW/err_der.shape[0])
    
    df2_df1 = l_out@weights[-1].T

    # middle layers
    for i in range(len(weights)-2, -1, -1):
        
        l_out = transfer_funcs[i].der(act[i+1])*df2_df1
        l_in = act[i]
        dW = l_in.T@l_out
        df2_df1 = l_out@weights[i].T
        
        grad.append(dW/err_der.shape[0])
        
    # return the gradients
    return list(reversed(grad))



# optimize step, apply gradients
def apply_gradients(lr, grads, weights, bias):
    
    for i, grad in enumerate(grads):
        weights[i]-=lr*grad
      

## Artificial Data, Training Loop

Here we define the training loop, iterate over the dataset, calculate the gradients and apply them

In [113]:
import numpy as np
np.random.seed(222)

# Define the network
weights = [
    
    np.random.normal(0,1,(2,32)),
    np.random.normal(0,1,(32,16)),
    np.random.normal(0,1,(16,1)),
]

bias = [
    
    np.random.normal(0,1,(1,32)),
    np.random.normal(0,1,(1,16)),
    np.random.normal(0,1,(1,1)),
]

transfer_funcs = [
    
    Relu(),
    Relu(),
    Linear()
]


# training loop
X = np.random.normal(0,1, (1000, 2))
Y = np.sum(X, axis=1)[:, None]

BATCH_SIZE = 64
SHUFFLE = True
EPOCHS = 500
# set the learning rate
lr = 1e-3

data_indices = np.arange(len(X))
np.random.shuffle(data_indices)

if SHUFFLE:
    data_X = X[data_indices]
    data_Y = Y[data_indices]
else:
    data_X = X
    data_Y = Y
    
for epoch in range(EPOCHS):
    epoch_error = 0
    for i in range(0, len(X), BATCH_SIZE):
        X_batch = data_X[i:i+BATCH_SIZE]
        Y_batch = data_Y[i:i+BATCH_SIZE]
        
        
        # forward pass, save activations of layers for gradient calculation
        Y_, activations = forward(X_batch, weights, bias, transfer_funcs)

        # calculate error and error derivative
        err = np.sum(np.square(Y_-Y_batch))
        err_der = Y_-Y_batch
        
        epoch_error+=err

        # calculate the gradients
        gradients = backward(activations, err_der, weights, bias, transfer_funcs)
        
        # apply the gradients
        apply_gradients(lr, gradients, weights, bias)
        
    
    if epoch%20 == 0:
        print(f"Epoch {epoch} MSE={epoch_error/2/len(X)}")


Epoch 0 MSE=6.974657149304077
Epoch 20 MSE=0.44559598953795493
Epoch 40 MSE=0.24153250997052933
Epoch 60 MSE=0.16406958068346228
Epoch 80 MSE=0.12541631608615145
Epoch 100 MSE=0.10287426661650317
Epoch 120 MSE=0.08756556369002266
Epoch 140 MSE=0.07693257452058418
Epoch 160 MSE=0.06895468971080009
Epoch 180 MSE=0.06259217878447224
Epoch 200 MSE=0.05756630219572251
Epoch 220 MSE=0.053492277617714815
Epoch 240 MSE=0.050110683093865425
Epoch 260 MSE=0.047170538974549564
Epoch 280 MSE=0.044644258071217566
Epoch 300 MSE=0.04240931371342241
Epoch 320 MSE=0.040447061858661905
Epoch 340 MSE=0.0386943663874325
Epoch 360 MSE=0.03711733824943413
Epoch 380 MSE=0.03567395088019383
Epoch 400 MSE=0.034346901979347165
Epoch 420 MSE=0.033099640866293946
Epoch 440 MSE=0.03191696374254339
Epoch 460 MSE=0.030821562200785718
Epoch 480 MSE=0.02980973065333577
