# Multi-layer Perceptron (MLP)

MLP is the vanilla neural network structure that was actively investigated and developed during the previous wave of neural network research in the 1990s. It is built on a single-neuron model called the *Perceptron*.

## Single neuron (Perceptron)

$$
y = \operatorname{logsig}\left(\sum_j w_jx_j+w_0\right)
$$

Logsig is a sigmoidal function that provides the desired non-linearity to the neuron output. The purpose of the non-linearity is to push the output towards the discrete values $\left\{0,1\right\}$ needed for two-class classification problems.

Study the behavior of logsig

In [None]:
import matplotlib.pyplot as plt
from scipy.special import expit

# Sample points covering both saturated tails and the transition around zero
x = [-10, -5.0, -2.5, 0, +2.5, +5.0, +10]

print('scipy.special.expit is the logistic sigmoid function')

# expit evaluates the logistic sigmoid at every sample point
y = expit(x)

plt.plot(x, y)

A multiplier on $x$ makes the transition from 0 to 1 more rapid.

In [None]:
import numpy as np

# Sample points for the sigmoid; the input is multiplied by a constant gain
# before the sigmoid is applied
x = [-10, -5.0, -2.5, 0, +2.5, +5.0, +10]
gain = 10
y = expit(gain * np.array(x))

plt.plot(x, y)

**Example:** Handmade OR neuron

In [None]:
# Truth table of OR: the four input corners and their target outputs
x1 = [0,1,1,0]
x2 = [0,0,1,1]
y  = [0,1,1,1] # x1 OR x2

# Hand-picked weights: the bias alone gives a negative activation (-10),
# while any active input adds 20 and makes the activation positive
w1, w2, w0 = 20, 20, -10

plt.plot(x1,x2,'ro')
plt.title('x1 OR x2 using manually selected weights')
# Label every corner with the neuron output computed for that input pair
for a, b in zip(x1, x2):
    plt.text(a, b, f'y={expit(w1*a+w2*b+w0):.2f}')

Inspect the decision boundary

In [None]:
from sklearn.inspection import DecisionBoundaryDisplay

# Evaluate the hand-made neuron on a regular grid covering [-0.1, 1.1] on both axes
axis_values = np.linspace(-0.1,1.1)
feature_1, feature_2 = np.meshgrid(axis_values, axis_values)
# One row per grid point: column 0 holds x1, column 1 holds x2
grid = np.column_stack((feature_1.ravel(), feature_2.ravel()))
print('Decision boundary of hand selected weights')
activation = w1*grid[:,0]+w2*grid[:,1]+w0
# Bring the flat response back to the grid shape expected by the display
y_pred = expit(activation).reshape(feature_1.shape)
display = DecisionBoundaryDisplay(xx0=feature_1, xx1=feature_2, response=y_pred)
display.plot()
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()

**Example:** GD for the OR data

In [None]:
# Train a single logsig neuron on the OR data with batch gradient descent on the MSE loss
num_of_epochs = 10000
lr = 0.15 # learning rate
N = len(x1)
w1_t = 0
w2_t = 0
w0_t = 0

# Convert the training data to arrays once, so the element-wise arithmetic in
# the loop does not depend on implicit list-to-array coercion in every epoch
x1_arr = np.asarray(x1, dtype=float)
x2_arr = np.asarray(x2, dtype=float)
y_arr = np.asarray(y, dtype=float)

for e in range(num_of_epochs):
    # Forward pass with the current weights
    y_h = expit(w1_t*x1_arr+w2_t*x2_arr+w0_t)

    # Per-sample derivative of (y - y_h)^2 with respect to the neuron's
    # weighted sum (chain rule through logsig); shared by all three gradients
    delta = 2*(y_arr-y_h)*-y_h*(1-y_h)
    nablaL_w1 = 1/N*np.sum(delta*x1_arr)
    nablaL_w2 = 1/N*np.sum(delta*x2_arr)
    nablaL_w0 = 1/N*np.sum(delta)

    # Gradient descent step
    w1_t = w1_t-lr*nablaL_w1
    w2_t = w2_t-lr*nablaL_w2
    w0_t = w0_t-lr*nablaL_w0

    if np.mod(e,1000) == 0 or e == 1: # Report at epoch 1 and at every 1000th epoch
        y_pred = expit(w1_t*x1_arr+w2_t*x2_arr+w0_t)
        MSE = np.sum((y_arr-y_pred)**2)/(len(y))
        print(f'Epoch {e} MSE is {MSE:.5f}')

# Recompute the predictions with the final weights; the value left over from
# the last progress report would otherwise be 999 epochs out of date
y_pred = expit(w1_t*x1_arr+w2_t*x2_arr+w0_t)

np.set_printoptions(precision=3,suppress=True)
print(f'True values y={y} and predicted values y_pred={y_pred}')
print(f'w_1 = {w1_t} w_2 = {w2_t} w_0 = {w0_t}')

In [None]:
plt.plot(x1,x2,'ro')
plt.title('x1 OR x2 using GD weights')
# Label every training point with the output of the neuron trained by gradient descent
for a, b in zip(x1, x2):
    neuron_out = expit(w1_t*a+w2_t*b+w0_t)
    plt.text(a, b, f'y={neuron_out:.2f}')

Decision boundary

In [None]:
# Evaluate the GD-trained neuron on a regular grid covering [-0.1, 1.1] on both axes
axis_values = np.linspace(-0.1,1.1)
feature_1, feature_2 = np.meshgrid(axis_values, axis_values)
# One row per grid point: column 0 holds x1, column 1 holds x2
grid = np.column_stack((feature_1.ravel(), feature_2.ravel()))
print('Decision boundary of GD weights')
activation = w1_t*grid[:,0]+w2_t*grid[:,1]+w0_t
y_pred = expit(activation).reshape(feature_1.shape)
display = DecisionBoundaryDisplay(xx0=feature_1, xx1=feature_2, response=y_pred)
display.plot()
plt.show()

## Multi-layer neural network

**Example:** Handmade XOR MLP

In [None]:
# Truth table of XOR: the four input corners and their target outputs
x1 = [0,1,1,0]
x2 = [0,0,1,1]
y  = [0,1,0,1] # x1 XOR x2

# Hidden neuron 1: positive activation only for (x1, x2) = (0, 1)
w11, w12, w10 = -20, 30, -20
# Hidden neuron 2: positive activation only for (x1, x2) = (1, 0)
w21, w22, w20 = 30, -20, -20

# Output neuron: same weights as the hand-made OR neuron, applied to (y1, y2)
w1, w2, w0 = 20, 20, -10

plt.plot(x1,x2,'ro')
plt.title('x1 XOR x2 for manually selected weights')
# Label every corner with the network output and both hidden activations
for a, b in zip(x1, x2):
    y1 = expit(w11*a+w12*b+w10)
    y2 = expit(w21*a+w22*b+w20)
    y_h = expit(w1*y1+w2*y2+w0)
    plt.text(a, b, f'y={y_h:.2f} (y1={y1:.2f},y2={y2:.2f})')

Define necessary computational structures

In [None]:
def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)), evaluated element-wise.

    Parameters
    ----------
    x : scalar or array-like
        Input value(s); plain Python lists are accepted as well as arrays.

    Returns
    -------
    A NumPy scalar for scalar input, otherwise an array shaped like ``x``.
    """
    x = np.asarray(x)
    # exp() is only ever applied to non-positive values, so it cannot
    # overflow no matter how large |x| is
    e = np.exp(-np.abs(x))
    # For x >= 0 this is the textbook form; for x < 0 numerator and
    # denominator are both multiplied by exp(x), which is the same value
    result = np.where(x >= 0, 1 / (1 + e), e / (1 + e))
    # Indexing with () turns a 0-d result into a scalar and leaves arrays as-is
    return result[()]

# Forward pass of a single neuron
def perceptron(x1,x2,w1,w2,w0):
    """Return sigmoid(w1*x1 + w2*x2 + w0) for a neuron with two inputs.

    x1, x2 are the inputs, w1, w2 their weights and w0 the bias term.
    """
    activation = w1*x1 + w2*x2 + w0
    return sigmoid(activation)


def mlp(w11,w12,w10,w21,w22,w20,w1,w2,w0,x1,x2):
    """Forward pass of a 2-2-1 network built from three perceptrons.

    (w11, w12, w10) and (w21, w22, w20) are the weights and bias of the two
    hidden neurons, both of which read the inputs x1 and x2. (w1, w2, w0)
    belong to the output neuron, which reads the two hidden outputs.
    Returns the output neuron's value.
    """
    hidden_1 = perceptron(x1, x2, w11, w12, w10)
    hidden_2 = perceptron(x1, x2, w21, w22, w20)
    return perceptron(hidden_1, hidden_2, w1, w2, w0)

In [None]:
# Evaluate the hand-made XOR network on a regular grid covering [-0.1, 1.1] on both axes
axis_values = np.linspace(-0.1,1.1)
feature_1, feature_2 = np.meshgrid(axis_values, axis_values)
# One row per grid point: column 0 holds x1, column 1 holds x2
grid = np.column_stack((feature_1.ravel(), feature_2.ravel()))
print('Decision boundary for manually selected weights')
network_out = mlp(w11,w12,w10,w21,w22,w20,w1,w2,w0,grid[:,0],grid[:,1])
y_pred = network_out.reshape(feature_1.shape)
display = DecisionBoundaryDisplay(xx0=feature_1, xx1=feature_2, response=y_pred)
display.plot()
plt.show()

### MLP gradient descent

Initialize weights randomly

In [None]:
# Every weight is an independent draw from a normal distribution with
# mean -1 and standard deviation 1; the draw order is w11, w12, w10,
# w21, w22, w20, w1, w2, w0

# Hidden neuron 1
w11_t, w12_t, w10_t = (np.random.normal(loc=-1, scale=1) for _ in range(3))
# Hidden neuron 2
w21_t, w22_t, w20_t = (np.random.normal(loc=-1, scale=1) for _ in range(3))

# Output neuron
w1_t, w2_t, w0_t = (np.random.normal(loc=-1, scale=1) for _ in range(3))

In [None]:
# Turn the training data into NumPy arrays so the training loop can use
# element-wise arithmetic on them
x1, x2, y = (np.array(values) for values in (x1, x2, y))

Note that you may need to run this several times (why?)

In [None]:
num_of_epochs = 5000
lr = 0.05

# Mean squared error of the forward pass of every epoch
MSE = np.zeros([num_of_epochs,1])

# Main training loop
for e in range(num_of_epochs):
    ## Forward pass

    y_1 = perceptron(x1,x2,w11_t,w12_t,w10_t)
    y_2 = perceptron(x1,x2,w21_t,w22_t,w20_t)
    y_h = perceptron(y_1,y_2,w1_t,w2_t,w0_t)

    # Mean (not sum) of the squared errors, so the stored value matches its name
    MSE[e] = np.mean((y-y_h)**2)

    ## Backward pass

    # Loss gradient: derivative of (y - y_h)^2 with respect to y_h
    nabla_L = 2*(y-y_h)*-1

    # Gradient at the output neuron's weighted sum (chain rule through the sigmoid)
    delta_out = nabla_L*y_h*(1-y_h)

    # Gradients at the hidden neurons' weighted sums. These must be computed
    # before the output weights are updated, because they use w1_t and w2_t
    delta_h1 = delta_out*w1_t*y_1*(1-y_1)
    delta_h2 = delta_out*w2_t*y_2*(1-y_2)

    ## Update (gradients are summed over the samples, as in the loss gradient above)

    # Output weights
    w1_t = w1_t - lr*np.sum(delta_out*y_1)
    w2_t = w2_t - lr*np.sum(delta_out*y_2)
    w0_t = w0_t - lr*np.sum(delta_out)

    # Hidden layer y_1 weights
    w11_t = w11_t - lr*np.sum(delta_h1*x1)
    w12_t = w12_t - lr*np.sum(delta_h1*x2)
    w10_t = w10_t - lr*np.sum(delta_h1)

    # Hidden layer y_2 weights
    w21_t = w21_t - lr*np.sum(delta_h2*x1)
    w22_t = w22_t - lr*np.sum(delta_h2*x2)
    w20_t = w20_t - lr*np.sum(delta_h2)

# Final evaluation with the trained weights
y_1 = perceptron(x1,x2,w11_t,w12_t,w10_t)
y_2 = perceptron(x1,x2,w21_t,w22_t,w20_t)
y_h = perceptron(y_1,y_2,w1_t,w2_t,w0_t)
print(y_h)
plt.plot(range(num_of_epochs),MSE)
plt.xlabel('epoch')
plt.ylabel('MSE')
plt.show()

In [None]:
plt.plot(x1,x2,'ro')
plt.title('x1 XOR x2 for GD found weights')
# Label every training point with the trained network's output and both hidden activations
for a, b in zip(x1, x2):
    y1 = expit(w11_t*a+w12_t*b+w10_t)
    y2 = expit(w21_t*a+w22_t*b+w20_t)
    y_h = expit(w1_t*y1+w2_t*y2+w0_t)
    plt.text(a, b, f'y_h={y_h:.2f} (y1={y1:.2f},y2={y2:.2f})')

In [None]:
# Evaluate the GD-trained XOR network on a regular grid covering [-0.1, 1.1] on both axes
feature_1, feature_2 = np.meshgrid(np.linspace(-0.1,1.1), np.linspace(-0.1,1.1))
# One row per grid point: column 0 holds x1, column 1 holds x2
grid = np.vstack([feature_1.ravel(), feature_2.ravel()]).T
# The weights plotted here are the ones found by gradient descent, so the
# caption says so (it previously repeated the hand-made network's caption)
print('Decision boundary for GD found weights')
y_pred = np.reshape(mlp(w11_t,w12_t,w10_t,w21_t,w22_t,w20_t,w1_t,w2_t,w0_t,grid[:,0],grid[:,1]), feature_1.shape)
display = DecisionBoundaryDisplay(xx0=feature_1, xx1=feature_2, response=y_pred)
display.plot()
plt.show()

## References

 * C.M. Bishop (2006): Pattern Recognition and Machine Learning, Chapter 5 ([PDF](https://www.microsoft.com/en-us/research/uploads/prod/2006/01/Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf))