In [1]:
import numpy as np

In [2]:
# Hyperparameters - Think of these as tuning knobs
# learning_rate = This controls how much we descend on each iteration, too fast we don't converge, too slow and we get stuck in a local minima
# epochs = This is how many iterations we will allow our model to train on the full training dataset
# N = This is the number of samples in our training data
learning_rate = 0.01
epochs = 500
N = 10

# Random Variable Creation
x = np.random.rand(N,)
w = np.random.rand(1)
b = np.random.rand(1)
y = w*x + b

w_pred = 0.0  # <-- These two init values for w and b are used as the starting "guesses" for there values,
b_pred = 0.0  #     when doing gradient descent we should slowly migrate away from 0.0 and migrate towards their true values

In [3]:
# Gradient Descent for Linear Regression
# y_pred = wx + b
# loss = (y_true-y_pred)**2 / N   <-- MSE Loss Function
# loss = (y_true-(wx+b))**2 / N
# Now we take the derivate of loss with respect to w and b

def calculate_gradients(x_data, y_data, w, b, learning_rate, N):
    # Zero out the gradients
    dl_dw = 0.0
    dl_db = 0.0    
    
    # Now we loop through all of the training data and for each datapoint (each x and y pair), 
    # we calculate the sum of the gradients with respect to w and b
    for x, y_true in zip(x_data, y_data):
        dl_dw += (2/N)*(y_true-(w*x+b))*(-x)  # These derivatives are calculated using the chain rule
        dl_db += (2/N)*(y_true-(w*x+b))*(-1)
    
    # Update the parameters
    # The reason we do the parameter minus the learning rate * the gradient rather than the parameter plus the learning rate * the gradient
    # is because when we calculate the gradient, the value of the gradient provides the direction for the steepest rise of the curve, so by doing
    # a negative gradient, we invert the direction so we move move the parameters in the steepest descent because we want to minimize the parameters
    # that make up the loss function.
    w = w - dl_dw*learning_rate
    b = b - dl_db*learning_rate
    return w, b

In [4]:
for epoch in range(epochs):
    w_pred, b_pred = calculate_gradients(x, y, w_pred, b_pred, learning_rate, N)
    
    if epoch % 50 == 0:
        y_pred = w_pred*x + b_pred
        loss = (y-y_pred)**2 / N
        print(f'Epoch: {epoch} -- Loss: {np.sum(loss, axis=0)}')
    
print(f'Converged w: {w_pred} -- Actual w: {w}')
print(f'Converged b: {b_pred} -- Actual b: {b}')

Epoch: 0 -- Loss: 0.5611210401156657
Epoch: 50 -- Loss: 0.04560257093629283
Epoch: 100 -- Loss: 0.004656157898502693
Epoch: 150 -- Loss: 0.0012337146331380861
Epoch: 200 -- Loss: 0.0008086758099996045
Epoch: 250 -- Loss: 0.0006480255679699843
Epoch: 300 -- Loss: 0.0005309633590740321
Epoch: 350 -- Loss: 0.00043599278452311407
Epoch: 400 -- Loss: 0.0003580839063620629
Epoch: 450 -- Loss: 0.000294102713545505
Converged w: 0.394434903360037 -- Actual w: [0.43831717]
Converged b: 0.5665055819784126 -- Actual b: [0.54325005]
