In [1]:
%%capture
%run ch04_linear_algebra.ipynb
%run ch08_gradient_descent.ipynb

In [2]:
# Perceptron (single neuron)

def step_function(x: float) -> float:
    return 1.0 if x >= 0 else 0.0

def perceptron_output(weights: Vector, bias: float, x: Vector) -> float:
    """Returns 1 if the perceptron 'fires', 0 if not"""
    calculation = dot(weights, x) + bias
    
    return step_function(calculation)

In [3]:
# AND operation

and_weights = [2., 2]
and_bias = -3.

assert perceptron_output(and_weights, and_bias, [1, 1]) == 1
assert perceptron_output(and_weights, and_bias, [0, 1]) == 0
assert perceptron_output(and_weights, and_bias, [1, 0]) == 0
assert perceptron_output(and_weights, and_bias, [0, 0]) == 0

In [4]:
# OR operation (XOR is not possible using a single neuron)

or_weights = [2., 2]
or_bias = -1.

assert perceptron_output(or_weights, or_bias, [1, 1]) == 1
assert perceptron_output(or_weights, or_bias, [0, 1]) == 1
assert perceptron_output(or_weights, or_bias, [1, 0]) == 1
assert perceptron_output(or_weights, or_bias, [0, 0]) == 0

In [5]:
# NOT operation

not_weights = [-2.]
not_bias = 1.

assert perceptron_output(not_weights, not_bias, [0]) == 1
assert perceptron_output(not_weights, not_bias, [1]) == 0

In [6]:
# Feed-forward Neural Networks (FNN)

import math

def sigmoid(t: float) -> float:
    # We cannot use step function to train a neural network (cannot compute gradients from it), 
    # so we need smooth functions like Sigmoid.
    
    return 1 / (1 + math.exp(-t))

def neuron_output(weights: Vector, inputs: Vector) -> float:
    # weights includes the bias term, inputs includes a 1
    
    return sigmoid(dot(weights, inputs))

In [7]:
from typing import List

# A neural network is a list (layers) of lists (layer) of vectors (weights connected to each neuron)

def feed_forward(neural_network: List[List[Vector]], 
                 input_vector: Vector) -> List[Vector]:
    
    """
    Feeds the input vector through the neural network.
    Returns the outputs of all layers (not just the last one).
    """
    outputs: List[Vector] = []
        
    for layer in neural_network:
        input_with_bias = input_vector + [1]             # Add a constant.
        output = [neuron_output(neuron, input_with_bias) # Compute the output 
                  for neuron in layer]                   # for each neuron.
        
        outputs.append(output)                           # Add to results.
    
        input_vector = output                            # Then the input to the next layer is the output of this one
        
        # Repeat this for all remaining layers
    
    return outputs

In [8]:
# XOR operation via FNN

xor_network = [
    [[20., 20, -30],    # AND neuron in the hidden layer 
     [20., 20, -10]],   # OR neuron in the hidden layer
    [[-60., 60, -30]]]  # XOR ('2nd input but not 1st input') neuron in the output layer

# feed_forward returns the outputs of all layers, so the [-1] gets the final output,
# and [0] gets the value of the resulting vector
assert 0.000 < feed_forward(xor_network, [0, 0])[-1][0] < 0.001
assert 0.999 < feed_forward(xor_network, [1, 0])[-1][0] < 1.000
assert 0.999 < feed_forward(xor_network, [0, 1])[-1][0] < 1.000
assert 0.000 < feed_forward(xor_network, [1, 1])[-1][0] < 0.001

In [9]:
hidden_outputs, final_output = feed_forward(xor_network, [1, 1])

print(hidden_outputs)
print(final_output)

[0.9999546021312976, 0.9999999999999065]
[9.383146683006828e-14]


In [10]:
# Backpropagation

def sqerror_gradients(network: List[List[Vector]], 
                      input_vector: Vector, 
                      target_vector: Vector) -> List[List[Vector]]:
    
    """
    Given a neural network, an input vector, and a target vector,
    make a prediction and compute the gradient of the squared error
    loss with repect to the neuron weights.
    """
    
    # forward pass
    hidden_outputs, outputs = feed_forward(network, input_vector)
    
    # gradients with respect to 'output' neuron 'pre-activation' outputs
    output_deltas = [output * (1 - output) * (output - target) 
                     for output, target in zip(outputs, target_vector)]
    
    # gradients with respect to 'output' neuron weights
    output_grads = [[output_deltas[i] * hidden_output 
                     for hidden_output in hidden_outputs + [1]] 
                    for i, output_neuron in enumerate(network[-1])]
    
    # gradients with respect to 'hidden' neuron 'pre-activation' outputs
    hidden_deltas = [hidden_output * (1 - hidden_output) * dot(output_deltas, [n[i] for n in network[-1]]) 
                     for i, hidden_output in enumerate(hidden_outputs)]
    
    # gradients with respect to 'hidden' neuron weights
    hidden_grads = [[hidden_deltas[i] * input for input in input_vector + [1]] 
                    for i, hidden_neuron in enumerate(network[0])]
    
    return [hidden_grads, output_grads]

In [11]:
# XOR operation via FNN (backpropagation)

import random
random.seed(0)

# training data
xs = [[0., 0], [0., 1], [1., 0], [1., 1]]
ys = [[0.], [1.], [1.], [0.]]

# start with random weights
network = [
    [[random.random() for _ in range(2 + 1)],    # 1st hidden neuron
     [random.random() for _ in range(2 + 1)]],   # 2nd hidden neuron 
    [[random.random() for _ in range(2 + 1)]]    # output neuron
]

network

[[[0.8444218515250481, 0.7579544029403025, 0.420571580830845],
  [0.25891675029296335, 0.5112747213686085, 0.4049341374504143]],
 [[0.7837985890347726, 0.30331272607892745, 0.4765969541523558]]]

In [12]:
import tqdm

learning_rate = 1.0

for epoch in tqdm.trange(20000, desc="neural net for xor"):
    for x, y in zip(xs, ys):
        gradients = sqerror_gradients(network, x, y)
        
        # Take a gradient step for each neuron in each layer
        network = [[gradient_step(neuron, grad, -learning_rate) 
                    for neuron, grad in zip(layer, layer_grad)] 
                   for layer, layer_grad in zip(network, gradients)]
        
network

neural net for xor: 100%|█████████████████████████████████████████████████████| 20000/20000 [00:01<00:00, 13093.90it/s]


[[[6.953505610104289, 6.952785792366962, -3.1484761965046655],
  [5.115899442661922, 5.115407875835949, -7.839603434415663]],
 [[10.961705832630562, -11.63060534664317, -5.144229056613082]]]

In [13]:
assert feed_forward(network, [0, 0])[-1][0] < 0.01
assert feed_forward(network, [1, 0])[-1][0] > 0.99
assert feed_forward(network, [0, 1])[-1][0] > 0.99
assert feed_forward(network, [1, 1])[-1][0] < 0.01

In [14]:
# Fizz Buzz example

def fizz_buzz_encode(x: int) -> Vector:
    if x % 15 == 0:
        return [0, 0, 0, 1]
    elif x % 5 == 0:
        return [0, 0, 1, 0]
    elif x % 3 == 0:
        return [0, 1, 0, 0]
    else:
        return [1, 0, 0, 0]
    
assert fizz_buzz_encode(2) == [1, 0, 0, 0]
assert fizz_buzz_encode(6) == [0, 1, 0, 0]
assert fizz_buzz_encode(10) == [0, 0, 1, 0]
assert fizz_buzz_encode(30) == [0, 0, 0, 1]

In [15]:
def binary_encode(x: int) -> Vector:
    binary: List[float] = []
        
    for i in range(10): # 10 bits representation
        binary.append(x % 2)
        x = x // 2
    
    return binary

#                             1  2  4  8 16 32 64 128 256 512
assert binary_encode(0)   == [0, 0, 0, 0, 0, 0, 0, 0,  0,  0]
assert binary_encode(1)   == [1, 0, 0, 0, 0, 0, 0, 0,  0,  0]
assert binary_encode(10)  == [0, 1, 0, 1, 0, 0, 0, 0,  0,  0]
assert binary_encode(101) == [1, 0, 1, 0, 0, 1, 1, 0,  0,  0]
assert binary_encode(999) == [1, 1, 1, 0, 0, 1, 1, 1,  1,  1]

In [16]:
# Generate training data

xs = [binary_encode(n) for n in range(101, 1024)]
ys = [fizz_buzz_encode(n) for n in range(101, 1024)]

In [17]:
# FNN (10-25-4 neurons)

NUM_HIDDEN = 25

network = [
    [[random.random() for _ in range(10 + 1)] for _ in range(NUM_HIDDEN)], # hidden layer: 10 inputs -> 25 hidden outputs
    [[random.random() for _ in range(NUM_HIDDEN + 1)] for _ in range(4)]   # output layer: 25 inputs -> 4 final outputs
]

learning_rate = 1.0

with tqdm.trange(500) as t:
    for epoch in t:
        epoch_loss = 0.0
        
        for x, y in zip(xs, ys):
            predicted = feed_forward(network, x)[-1] # output layer only
            epoch_loss += squared_distance(predicted, y)
            gradients = sqerror_gradients(network, x, y)
            
            # Take a gradient step for each neuron in each layer
            network = [[gradient_step(neuron, grad, -learning_rate) 
                        for neuron, grad in zip(layer, layer_grad)] 
                       for layer, layer_grad in zip(network, gradients)]
        
        t.set_description(f"fizz buzz (loss: {epoch_loss:.2f})")

fizz buzz (loss: 29.53): 100%|███████████████████████████████████████████████████████| 500/500 [02:55<00:00,  2.85it/s]


In [18]:
def argmax(xs: list) -> int:
    """Returns the index of the largest value"""
    return max(range(len(xs)), key=lambda i: xs[i])

assert argmax([0, -1]) == 0
assert argmax([-1, 0]) == 1
assert argmax([-1, 10, 5, 20, -3]) == 3

In [19]:
num_correct = 0

for n in range(1, 101):
    x = binary_encode(n)
    predicted = argmax(feed_forward(network, x)[-1])
    actual = argmax(fizz_buzz_encode(n))
    labels = [str(n), "fizz", "buzz", "fizzbuzz"]
    print(n, labels[predicted], labels[actual])
    
    if predicted == actual:
        num_correct += 1
    
    print(num_correct, "/", 100)

1 1 1
1 / 100
2 2 2
2 / 100
3 fizz fizz
3 / 100
4 4 4
4 / 100
5 buzz buzz
5 / 100
6 fizz fizz
6 / 100
7 7 7
7 / 100
8 8 8
8 / 100
9 fizz fizz
9 / 100
10 buzz buzz
10 / 100
11 11 11
11 / 100
12 fizz fizz
12 / 100
13 13 13
13 / 100
14 14 14
14 / 100
15 fizzbuzz fizzbuzz
15 / 100
16 16 16
16 / 100
17 17 17
17 / 100
18 fizz fizz
18 / 100
19 19 19
19 / 100
20 20 buzz
19 / 100
21 fizz fizz
20 / 100
22 22 22
21 / 100
23 23 23
22 / 100
24 fizz fizz
23 / 100
25 buzz buzz
24 / 100
26 26 26
25 / 100
27 fizz fizz
26 / 100
28 28 28
27 / 100
29 29 29
28 / 100
30 fizzbuzz fizzbuzz
29 / 100
31 31 31
30 / 100
32 32 32
31 / 100
33 fizz fizz
32 / 100
34 34 34
33 / 100
35 buzz buzz
34 / 100
36 fizz fizz
35 / 100
37 37 37
36 / 100
38 38 38
37 / 100
39 fizz fizz
38 / 100
40 buzz buzz
39 / 100
41 41 41
40 / 100
42 fizz fizz
41 / 100
43 43 43
42 / 100
44 44 44
43 / 100
45 fizzbuzz fizzbuzz
44 / 100
46 46 46
45 / 100
47 47 47
46 / 100
48 fizz fizz
47 / 100
49 49 49
48 / 100
50 buzz buzz
49 / 100
51 fizz fizz
5