# Adjusting the weights to learn

<p>This notebook shows how to train the network by adjusting its weights.</p>

<small>Author: Fernando Carlos López Hernández</small>

In [1]:
import numpy as np

def init_random_NN(Ns_per_layer):
    """ Create a NN with the number of Ns indicated for each input+hidden+output layer.

    Ns_per_layer is a sequence with the number of Ns of every layer, in order:
    input, hidden(s), output. The first entry only fixes how many weights the Ns of
    the first returned layer have, so the result has len(Ns_per_layer)-1 layers.

    Returns a list of layers. Each layer is a list of Ns and each N is a dict whose
    'w' entry holds n_previous+1 weights (one extra for the bias) drawn from a
    normal distribution with mean 0 and standard deviation 1.

    The sequence passed by the caller is not modified.
    Raises IndexError when Ns_per_layer is empty.
    """
    # Work on a copy: popping from the argument would silently shrink the caller's list
    sizes = list(Ns_per_layer)
    n_previous = sizes[0]
    NN = list()
    for n in sizes[1:]:
        layer = [{'w': np.random.normal(0,1, size = n_previous+1)} for i in range(n)]
        NN.append(layer)
        n_previous = n
    return NN

def activation_value(weights, inputs):
    """ Activation value of the N: the dot product of its weights and its inputs """
    weighted_sum = np.dot(weights, inputs)
    return weighted_sum

def sigmoid(v):
    """ Logistic transfer function: 1/(1+e^-v) """
    denominator = 1.0 + np.exp(-v)
    return 1.0 / denominator

def step(v):
    """ Step transfer function: 1 when v is strictly positive, 0 otherwise """
    return 1 if v > 0.0 else 0

def forward_propagate(NN, inputs, transfer_fn):
    """ Propagate an input sample (given without bias) through every layer of the NN.

    Each N gets its activation value stored in 'v' and its output in 'y'.
    Returns the list with the outputs of the last layer.
    """
    # A constant 1.0 is prepended so that the first weight of each N acts as the bias
    layer_inputs = np.concatenate([[1.0], inputs])
    for layer in NN:
        outputs = []
        for neuron in layer:
            neuron['v'] = activation_value(neuron['w'], layer_inputs)
            neuron['y'] = transfer_fn(neuron['v'])
            outputs.append(neuron['y'])
        # The outputs of this layer (plus the bias) feed the next one
        layer_inputs = np.concatenate([[1.0], outputs])
    return outputs

## Defining the NN problem
First we define the problem to solve: the AND, OR and XOR gates, i.e. 2 inputs and 3 outputs (AND, OR, XOR). The NNs created below, with random weights, follow this structure.

X contains the samples by columns. Its first row is the bias input, which is always 1.

G contains the ground truth of each sample by columns

In [2]:
# AND, OR, XOR problem: every column is one sample
# X rows: constant bias input (always 1), first gate input, second gate input
X = np.array([
    [1, 1, 1, 1],
    [0, 0, 1, 1],
    [0, 1, 0, 1],
])
# G rows: expected AND, OR and XOR outputs for the sample in the same column
G = np.array([
    [0, 0, 0, 1],
    [0, 1, 1, 1],
    [0, 1, 1, 0],
])

## Training with a single layer

Then we initialize the NN with random weights

In [3]:
# The first row of X is the bias, so it does not count as an input
n_inputs = len(X) - 1
n_outputs = len(G)
# Single layer: the inputs are connected directly to the output Ns
NN = init_random_NN([n_inputs, n_outputs])
print(NN)

[[{'w': array([0.03649985, 1.28319384, 0.28645509])}, {'w': array([-0.70454226, -0.07538271, -0.60932087])}, {'w': array([ 2.02422437, -0.51085988, -0.63239757])}]]


And we train the single layer NN using the delta rule.

In [4]:
def train_single_layer(NN, X, G, transfer_fn, max_epochs = 1000, alpha = 0.1, loss_threshold = 0.01):
    """ Train the last layer of the NN with the delta rule, one sample at a time.
        X has the training set batch with the samples in columns; its first row is
        left out when forward-propagating and is used as the bias input in the update
        G has the expected output values for each sample in columns
        Training stops after max_epochs, or earlier as soon as the squared error
        accumulated over an epoch is less than or equal to loss_threshold """
    for _ in range(max_epochs):
        squared_error_sum = 0.0
        for column in range(X.shape[1]):
            sample, target = X[:,column], G[:,column]
            # forward_propagate adds the bias itself, so the first row is skipped here
            prediction = np.array(forward_propagate(NN, sample[1:], transfer_fn))
            for i, neuron in enumerate(NN[-1]):
                error = target[i] - prediction[i]
                squared_error_sum += error**2
                # Delta rule: the error scaled by y*(1-y), the derivative of the sigmoid
                delta = prediction[i]*(1-prediction[i])*error
                # The full sample (bias included) matches the weights of the N one to one
                neuron['w'] += alpha*delta*sample
        if squared_error_sum <= loss_threshold:
            break

# Train the single layer NN with sigmoid Ns.
# alpha and loss_threshold keep the default values of train_single_layer
transfer_fn = sigmoid
max_epochs = 10000
train_single_layer(NN, X, G, transfer_fn, max_epochs=max_epochs)


We show the results with the trained NN. Note that the NN trained with the delta rule correctly predicts the AND and OR gates, but it fails to recognize the XOR gate

In [5]:
# Evaluate the trained NN on every sample; the bias row of X is removed because
# forward_propagate adds the bias itself
inputs = X[1:,:]
for sample in inputs.T:  # named `sample` so the builtin input() is not shadowed
    y_pred = forward_propagate(NN, sample, transfer_fn)
    # The third output is trained against the XOR row of G, so it is labelled XOR
    print('INPUT ', sample, ' AND y:%f: OR y:%f: XOR y:%f' %  tuple(y_pred))

INPUT  [0 0]  AND y:0.000243: OR y:0.055037: NOR y:0.503144
INPUT  [0 1]  AND y:0.055452: OR y:0.965638: NOR y:0.500000
INPUT  [1 0]  AND y:0.055471: OR y:0.965633: NOR y:0.496856
INPUT  [1 1]  AND y:0.934074: OR y:0.999926: NOR y:0.493711


## Training with backpropagation

First we initialize the NN with a hidden layer and random weights

In [6]:
# The hidden layer gets as many Ns as the larger of the input and output counts
n_hiddens = max(n_inputs, n_outputs)
# Layer sizes are given in order: input, hidden, output
NN = init_random_NN([n_inputs,n_hiddens,n_outputs])
print(NN)

[[{'w': array([0.28468051, 0.12332044, 1.9822295 ])}, {'w': array([-0.66979376, -0.02946977,  1.06339887])}, {'w': array([-0.52217385, -0.05858228,  0.93349379])}], [{'w': array([ 0.1924408 , -0.69651302, -0.05787673, -0.1032432 ])}, {'w': array([-0.26851213,  0.34483267, -0.68647113, -1.02890057])}, {'w': array([ 0.52742456, -0.18107997,  0.72957427,  0.29135883])}]]


Now we train the NN using backpropagation

In [7]:
def N_transfer_derivative(y_pred):
    """ Derivative of the transfer function written in terms of the output of the N: y*(1-y) """
    complement = 1.0 - y_pred
    return y_pred * complement

def backward_propagate_error(NN, ground_sample):
    """ Backward propagate the error in the NN as currently configured by forward_propagate.

    Walks the layers from the last one to the first one and stores in every N
    its error 'e' and its delta 'd' (the error times the transfer derivative of 'y').
    """
    last = len(NN) - 1
    for l in range(last, -1, -1):
        layer = NN[l]
        if l == last:
            # Output layer: the error is the difference with the expected value
            for i, neuron in enumerate(layer):
                neuron['e'] = ground_sample[i] - neuron['y']
        else:
            # Hidden layer: add up the deltas of the next layer, weighted by the
            # connection to this N. Weight 0 of the next layer is its bias, hence i+1
            next_layer = NN[l+1]
            for i, neuron in enumerate(layer):
                neuron['e'] = 0.0
                for forward_neuron in next_layer:
                    neuron['e'] += forward_neuron['w'][i+1] * forward_neuron['d']
        # The delta is computed in the same way in all the layers
        for neuron in layer:
            neuron['d'] = neuron['e'] * N_transfer_derivative(neuron['y'])

def update_weights(NN, inputs, alpha):
    """ Update the weights of every N, layer by layer, starting from the input sample.

    Each weight is increased by alpha times the delta 'd' of the N times the input
    that reaches that weight. The outputs 'y' of a layer are the inputs of the next one.
    """
    layer_inputs = inputs
    for layer in NN:
        for neuron in layer:
            scaled_delta = alpha*neuron['d']
            # Weight 0 is the bias, whose input is always 1.0
            neuron['w'][0] += scaled_delta*1.0
            for i, value in enumerate(layer_inputs, start=1):
                neuron['w'][i] += scaled_delta*value
        layer_inputs = [neuron['y'] for neuron in layer]

def train_backpropagation(NN, X, G, transfer_fn, max_epochs = 1000, alpha = 0.1, loss_threshold = 0.01):
    """ Train the NN with backpropagation, updating the weights after every sample.
        X has the training set batch with the samples in columns (first row: bias)
        G has the expected output values for each sample in columns
        Training stops after max_epochs, or earlier as soon as the squared error
        accumulated over an epoch is less than or equal to loss_threshold """
    # The bias row is dropped: forward_propagate and update_weights add the bias themselves
    samples = X[1:,:]
    n_samples = samples.shape[1]
    n_outputs = G.shape[0]
    for _ in range(max_epochs):
        epoch_loss = 0.0
        for s in range(n_samples):
            sample, target = samples[:,s], G[:,s]
            prediction = np.array(forward_propagate(NN, sample, transfer_fn))
            squared_errors = ((target[i]-prediction[i])**2 for i in range(n_outputs))
            epoch_loss += sum(squared_errors)
            # Compute the deltas with the current weights, then apply the update
            backward_propagate_error(NN, target)
            update_weights(NN, sample, alpha)
        if epoch_loss <= loss_threshold:
            break

# Train the multi layer NN using backpropagation and sigmoid Ns.
# loss_threshold keeps the default value of train_backpropagation
transfer_fn = sigmoid
max_epochs = 1000
alpha = 0.5
train_backpropagation(NN, X, G, transfer_fn, max_epochs=max_epochs, alpha=alpha)

We show the results with the trained NN. Note that now the NN can recognize all the gates, including the XOR gate

In [8]:
# Evaluate the trained NN on every sample; the bias row of X is removed because
# forward_propagate adds the bias itself
inputs = X[1:,:]
for sample in inputs.T:  # named `sample` so the builtin input() is not shadowed
    y_pred = forward_propagate(NN, sample, transfer_fn)
    print('INPUT ', sample, ' AND y:%f: OR y:%f: XOR y:%f' %  tuple(y_pred))

INPUT  [0 0]  AND y:0.000452: OR y:0.053638: XOR y:0.123319
INPUT  [0 1]  AND y:0.040539: OR y:0.967055: XOR y:0.927659
INPUT  [1 0]  AND y:0.044682: OR y:0.961518: XOR y:0.861161
INPUT  [1 1]  AND y:0.934601: OR y:0.997399: XOR y:0.130921
