In [1]:
import numpy as np
import time
np.random.seed(1) # A seed is set for reproducibility, ensuring that any random number generation will produce the same results each time the code is run.

In [2]:
# def relu(x):
#     return (x > 0) * x # relu is a popular activation function used in neural networks. It returns the input if it's positive and returns 0 if it's negative or zero.

# def relu_grad(x):
#     return x > 0 # relu_grad returns the gradient (derivative) of the ReLU function. This gradient is 1 for positive inputs and 0 for negative or zero inputs.

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_grad(x):
    sig_x = sigmoid(x)
    return sig_x * (1 - sig_x)

In [3]:
streetlights = np.array([[1,0,1], 
                         [0,1,1], 
                         [0,0,1], 
                         [1,1,1], 
                         [0,1,1], 
                         [1,0,1]])

In [4]:
walk_vs_stop = np.array([[0], [1], [0], [1], [1], [0]])

In [5]:
X,y = streetlights, walk_vs_stop

In [6]:
hidden_nodes = 8 # specifies that the hidden layer of the network contains n nodes.

In [7]:
epochs = 100  # number of iterations to go through the network

lr = 0.01      # how much we change the weights of the network each iteration

In [8]:
ws_1 = np.random.rand(X.shape[1], hidden_nodes) - 0.5 #  weights for the input-to-hidden layers
ws_2 = np.random.rand(hidden_nodes, y.shape[1]) - 0.5 # weights for the hidden-to-output layers

In [9]:
for epoch in range(epochs):     #number of training iterations, or times to change the weights of the nn
    for i in range(X.shape[0]): #for all samples in X, each streetlight
        layer_in = X[i:i+1]
        
        #forward pass/prediction
        layer_1 = sigmoid(layer_in.dot(ws_1))
        
        layer_out = layer_1.dot(ws_2)
        
        #calc error/distance (how far are we from goal)
        delta_2 = layer_out - y[i:i+1]
        
        #calc the the error each node in prev layer contributed
        delta_1 = delta_2.dot(ws_2.T) * sigmoid_grad(layer_1)
        
        #update weights
        ws_2 -= lr * (layer_1.T.reshape(hidden_nodes,1).dot(delta_2))
        ws_1 -= lr * (layer_in.T.reshape(X.shape[1],1).dot(delta_1))
    
    # Every 10 epochs, the squared error for the last sample is printed.
    if epoch % 10 == 0:
        error = delta_2**2
        print(round(error[0][0],6))#, end='\r')

1e-05
0.118242
0.195882
0.217798
0.219033
0.213802
0.20667
0.199017
0.191246
0.183476


Results relu:
0.019479
0.046095
0.062677
0.061989
0.052434
0.041336
0.031658
0.024053
0.018328
0.01518

Observations:
Starts at 0.019479 and gradually decreases to 0.01518.
Decreases in error are smaller with each epoch.

Results sigmoid:
1e-05
0.118242
0.195882
0.217798
0.219033
0.213802
0.20667
0.199017
0.191246
0.183476

Observations:
Starts extremely low at 1e-05 (0.00001) and then increases substantially in the first two checkpoints.
Peaks around 0.219033 and then begins to slightly decrease again.

What differences did you experience?
With ReLU, the initial error is much higher compared to the near-zero error of Sigmoid. However, the error in Sigmoid spikes significantly after the first iteration.

The ReLU error consistently decreases, whereas the Sigmoid error goes up before starting to decrease again after reaching its peak.

The maximum error in the ReLU case is much lower than that in the Sigmoid case.


Why do you think this difference happened?
ReLU and Sigmoid work differently. ReLU can produce a wide range of positive numbers, which might help it learn faster for this data. Sigmoid, on the other hand, gives numbers between 0 and 1, and this can cause some problems, like "saturation" or tiny changes that slow down learning. How we initially set up the system and the learning speed we pick also affects how well each function works.

In [10]:
### Code for testing different learning rates and epochs ###
learning_rates = [0.001, 0.01, 0.1, 1, 10]
max_epochs = 1000  # set a reasonable upper limit
tolerance = 1e-5  # threshold to determine when error changes are insignificant

best_lr = None
best_epochs = None
lowest_error = float('inf')

for lr in learning_rates:
    ws_1 = np.random.rand(X.shape[1], hidden_nodes) - 0.5
    ws_2 = np.random.rand(hidden_nodes, y.shape[1]) - 0.5
    
    previous_error = float('inf')
    
    for epoch in range(max_epochs):
        for i in range(X.shape[0]):
            layer_in = X[i:i+1]
            layer_1 = sigmoid(layer_in.dot(ws_1))
            layer_out = layer_1.dot(ws_2)
            delta_2 = layer_out - y[i:i+1]
            delta_1 = delta_2.dot(ws_2.T) * sigmoid_grad(layer_1)
            
            ws_2 -= lr * (layer_1.T.reshape(hidden_nodes,1).dot(delta_2))
            ws_1 -= lr * (layer_in.T.reshape(X.shape[1],1).dot(delta_1))
            
        error = (delta_2**2).mean()
        
        # Check for plateau or increase in error
        if abs(previous_error - error) < tolerance:
            break
        
        previous_error = error
        
    print(f"Learning rate: {lr}, Epochs taken: {epoch}, Final error: {error}")
    
    if error < lowest_error:
        best_lr = lr
        best_epochs = epoch
        lowest_error = error

print(f"\nBest Learning Rate: {best_lr} with {best_epochs} epochs and error: {lowest_error}")

Learning rate: 0.001, Epochs taken: 29, Final error: 3.3451762178580132e-06
Learning rate: 0.01, Epochs taken: 798, Final error: 0.0005182082504585931
Learning rate: 0.1, Epochs taken: 87, Final error: 0.0002286416733087626
Learning rate: 1, Epochs taken: 15, Final error: 0.002080598498202877
Learning rate: 10, Epochs taken: 1, Final error: 9.02821384069402e-50

Best Learning Rate: 10 with 1 epochs and error: 9.02821384069402e-50


When choosing how fast our model learns, we see some trade-offs. Using a slow learning speed (like 0.001) gets us a pretty accurate result, but it takes a long time. On the other hand, speeding things up massively (with a rate like 10) makes the model learn a lot faster but might be too quick, possibly missing out on the best solution. When the error becomes really small, like with the rate of 10, the model might be just memorizing the data, rather than understanding its pattern, which isn't good. Meanwhile, a middle-speed rate, like 0.1, seems to be the best learning rate - not too slow, not too fast. However, the really fast learning rate of 10 worked surprisingly well here.

In [11]:
### adding another layer ###
hidden_nodes = 8
hidden_nodes_2 = 8  # second hidden layer with 8 nodes

epochs = 100  # number of iterations to go through the network
lr = 0.1      # how much we change the weights of the network each iteration

ws_1 = np.random.rand(X.shape[1], hidden_nodes) - 0.5
ws_2 = np.random.rand(hidden_nodes, hidden_nodes_2) - 0.5
ws_3 = np.random.rand(hidden_nodes_2, y.shape[1]) - 0.5

for epoch in range(epochs):
    for i in range(X.shape[0]):
        layer_in = X[i:i+1]
        
        # forward pass/prediction
        layer_1 = sigmoid(layer_in.dot(ws_1))
        layer_2 = sigmoid(layer_1.dot(ws_2))
        layer_out = layer_2.dot(ws_3)
        
        # calc error/distance
        delta_3 = layer_out - y[i:i+1]
        
        # propagate the error backward
        delta_2 = delta_3.dot(ws_3.T) * sigmoid_grad(layer_2)
        delta_1 = delta_2.dot(ws_2.T) * sigmoid_grad(layer_1)
        
        # update weights
        ws_3 -= lr * (layer_2.T.reshape(hidden_nodes_2, 1).dot(delta_3))
        ws_2 -= lr * (layer_1.T.reshape(hidden_nodes, 1).dot(delta_2))
        ws_1 -= lr * (layer_in.T.reshape(X.shape[1], 1).dot(delta_1))

    if epoch % 10 == 0:
        error = delta_3**2
        print(round(error[0][0],6))


0.520186
0.426413
0.420115
0.413735
0.407131
0.400111
0.39241
0.383656
0.373325
0.360691


Comparing these results with the single-layer network with sigmoid activation, the error values are significantly higher for the two-layer model. The single-layer network achieved errors as low as 1e−05 and mostly stayed below 0.22, while the two-layer network doesn't go below 0.360691 even after the same number of epochs. 

In [12]:
### 4. Understanding the effect of activation function: Repeat the experiment as in 3. but by including an activation functions at various stages as shown. ###
### neural net (a) ###
hidden_nodes = 8
hidden_nodes_2 = 6  # second hidden layer with 8 nodes

epochs = 100  # number of iterations to go through the network
lr = 0.1      # how much we change the weights of the network each iteration

ws_1 = np.random.rand(X.shape[1], hidden_nodes) - 0.5
ws_2 = np.random.rand(hidden_nodes, hidden_nodes_2) - 0.5
ws_3 = np.random.rand(hidden_nodes_2, y.shape[1]) - 0.5
print("Neural net (a):")

for epoch in range(epochs):
    for i in range(X.shape[0]):
        layer_in = X[i:i+1]
        
        # forward pass/prediction
        layer_1_linear = layer_in.dot(ws_1)
        layer_1_active = sigmoid(layer_1_linear)
        layer_2 = layer_1_active.dot(ws_2)
        layer_out = layer_2.dot(ws_3)
        
        # calc error/distance
        delta_3 = layer_out - y[i:i+1]
        
        # propagate the error backward
        delta_2 = delta_3.dot(ws_3.T) * sigmoid_grad(layer_2)
        delta_1 = delta_2.dot(ws_2.T) * sigmoid_grad(layer_1)
        
        # update weights
        ws_3 -= lr * (layer_2.T.reshape(hidden_nodes_2, 1).dot(delta_3))
        ws_2 -= lr * (layer_1.T.reshape(hidden_nodes, 1).dot(delta_2))
        ws_1 -= lr * (layer_in.T.reshape(X.shape[1], 1).dot(delta_1))

    if epoch % 10 == 0:
        error = delta_3**2
        print(round(error[0][0],6))

Neural net (a):
0.052689
0.202071
0.155539
0.112984
0.076007
0.046419
0.025315
0.012132
0.004979
0.001645


In [13]:
### neural net (b) ###
hidden_nodes = 8
hidden_nodes_2 = 6  # second hidden layer with 8 nodes

epochs = 100  # number of iterations to go through the network
lr = 0.1      # how much we change the weights of the network each iteration

ws_1 = np.random.rand(X.shape[1], hidden_nodes) - 0.5
ws_2 = np.random.rand(hidden_nodes, hidden_nodes_2) - 0.5
ws_3 = np.random.rand(hidden_nodes_2, y.shape[1]) - 0.5
print("Neural net (b):")
for epoch in range(epochs):
    for i in range(X.shape[0]):
        layer_in = X[i:i+1]
        
        # forward pass/prediction
        layer_1 = layer_in.dot(ws_1)
        layer_2_linear = layer_1.dot(ws_2)
        layer_2_active = sigmoid(layer_2_linear)
        layer_out = layer_2_active.dot(ws_3)
        
        # calc error/distance
        delta_3 = layer_out - y[i:i+1]
        
        # propagate the error backward
        delta_2 = delta_3.dot(ws_3.T) * sigmoid_grad(layer_2)
        delta_1 = delta_2.dot(ws_2.T) * sigmoid_grad(layer_1)
        
        # update weights
        ws_3 -= lr * (layer_2.T.reshape(hidden_nodes_2, 1).dot(delta_3))
        ws_2 -= lr * (layer_1.T.reshape(hidden_nodes, 1).dot(delta_2))
        ws_1 -= lr * (layer_in.T.reshape(X.shape[1], 1).dot(delta_1))

    if epoch % 10 == 0:
        error = delta_3**2
        print(round(error[0][0],6))

Neural net (b):
0.105736
0.09129
0.102012
0.039496
0.012884
0.004266
0.001618
0.000755
0.000442
0.000312


In [14]:
### neural net (c) ###
hidden_nodes = 8
hidden_nodes_2 = 6  # second hidden layer with 8 nodes

epochs = 100  # number of iterations to go through the network
lr = 0.1      # how much we change the weights of the network each iteration

ws_1 = np.random.rand(X.shape[1], hidden_nodes) - 0.5
ws_2 = np.random.rand(hidden_nodes, hidden_nodes_2) - 0.5
ws_3 = np.random.rand(hidden_nodes_2, y.shape[1]) - 0.5
print("Neural net (c):")

for epoch in range(epochs):
    for i in range(X.shape[0]):
        layer_in = X[i:i+1]
        
        # forward pass/prediction
        layer_1 = sigmoid(layer_in.dot(ws_1))
        layer_2 = sigmoid(layer_1.dot(ws_2))
        layer_out = layer_2.dot(ws_3)
        layer_out = layer_2_active.dot(ws_3)
        
        # calc error/distance
        delta_3 = layer_out - y[i:i+1]
        
        # propagate the error backward
        delta_2 = delta_3.dot(ws_3.T) * sigmoid_grad(layer_2)
        delta_1 = delta_2.dot(ws_2.T) * sigmoid_grad(layer_1)
        
        # update weights
        ws_3 -= lr * (layer_2.T.reshape(hidden_nodes_2, 1).dot(delta_3))
        ws_2 -= lr * (layer_1.T.reshape(hidden_nodes, 1).dot(delta_2))
        ws_1 -= lr * (layer_in.T.reshape(X.shape[1], 1).dot(delta_1))

    if epoch % 10 == 0:
        error = delta_3**2
        print(round(error[0][0],6))

Neural net (c):
0.248042
0.360744
0.360419
0.360253
0.360293
0.360598
0.361252
0.362377
0.364154
0.36687


Neural net (a)
Starts with an error of 0.052689, and by the end of training, it has reduced this error significantly to 0.001645. The decline in error shows that this configuration learns efficiently from the data.

Neural net (b)
It starts with an error of 0.105736 and drops to 0.000312 by the end. This implies that this model seems to be the most effective at reducing the error compared to the other two, given the significant drop.

Neural net (c)
Starts with an error of 0.248042, but the error only reduces slightly to 0.36687. The error seems to be reducing at a very slow pace compared to the other two.