### Gradient descent for simple Linear Regression with ordinary derivative

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Training data: every target is exactly 2 * x, so the best-fit weight is w = 2
x = np.arange(1, 6)             # inputs 1, 2, 3, 4, 5
y = np.array([2, 4, 6, 8, 10])  # targets
n = x.size                      # number of data points

# Function to calculate MSE
def calculate_mse(w, x, y):
    """
    Return the mean squared error (MSE) of the model ``y_hat = w * x``.

    The MSE is the cost function being minimised. Its value is informative
    and is often used to monitor the performance and convergence of the model
    during training; the weight update itself is driven by the gradient of
    this cost, not by its value.

    Args:
        w: Scalar weight (slope) of the model.
        x: Input values; a NumPy array or any array-like (e.g. a list).
        y: Target values, same shape as ``x``.

    Returns:
        The mean of ``(w * x - y) ** 2`` over all data points.
    """
    # Convert up front so plain Python sequences are scaled element-wise;
    # ``w * [1, 2]`` would otherwise repeat (int w) or reject (float w) the list.
    x = np.asarray(x)
    y = np.asarray(y)

    squared_errors = (w * x - y) ** 2
    return squared_errors.mean()

# Function to calculate the gradient of the cost function with respect to 'w'
def calculate_gradient(w, x, y):
    """
    Return dJ/dw, the derivative of the MSE cost with respect to the weight.

    For the model ``y_hat = w * x`` the cost is
    ``J(w) = (1 / N) * sum((w * x_i - y_i) ** 2)``, hence
    ``dJ/dw = (2 / N) * sum((w * x_i - y_i) * x_i)``.

    In gradient descent the weight is updated from this gradient, NOT from
    the cost value itself. The gradient points in the direction of steepest
    increase of the cost, so the caller steps in the opposite direction.

    Args:
        w: Scalar weight (slope) of the model.
        x: Input values; a NumPy array or any array-like (e.g. a list).
        y: Target values, same shape as ``x``.

    Returns:
        The slope of the cost function at ``w``. A positive value means the
        cost grows as ``w`` increases, and vice versa.

    Raises:
        ValueError: If ``x`` contains no data points.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # Use the size of the data actually passed in rather than a module-level
    # count, so the scaling stays correct for any dataset.
    num_points = x.size
    if num_points == 0:
        raise ValueError("x must contain at least one data point")

    # Error of the prediction for each data point
    errors = w * x - y

    # d/dw (e_i ** 2) = 2 * e_i * x_i; the factor 2 is applied after summing
    gradient_contributions = errors * x

    # Average over all points: the 2 comes from the squared term, the 1/N
    # from the mean in the cost function.
    return (2.0 / num_points) * np.sum(gradient_contributions)


# Function to plot the current model
def plot_current_model(w, x, y, epoch):
    """
    Show the data points together with the current model line ``y = w * x``.

    Args:
        w: Current weight of the model.
        x: Input values (plotted on the horizontal axis).
        y: Target values (plotted as blue points).
        epoch: Zero-based epoch index; the title shows ``epoch + 1``.
    """
    # Predictions of the current model at every input
    fitted_line = w * x

    plt.scatter(x, y, color='blue')  # observed data
    plt.plot(x, fitted_line, color='red', linestyle='dotted')  # model
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title(f'Epoch {epoch+1}')
    plt.show()

# Gradient Descent Function
def gradient_descent(x, y, learning_rate=0.01, epochs=10, *,
                     initial_w=0.01, verbose=True, plot=True):
    """
    Fit the weight of the model ``y_hat = w * x`` by gradient descent.

    Each epoch evaluates the MSE and its gradient at the current weight and
    then moves the weight one step against the gradient.

    Args:
        x: Input values.
        y: Target values.
        learning_rate: Step size multiplied with the gradient in each update.
        epochs: Number of update steps to perform.
        initial_w: Starting weight (keyword-only). Defaults to 0.01.
        verbose: If true (default), print the weight, MSE and gradient for
            every epoch (keyword-only).
        plot: If true (default), plot the data and the current model line
            after every update (keyword-only).

    Returns:
        The weight after the last update, or ``initial_w`` if ``epochs`` is 0.
    """
    w = initial_w
    for epoch in range(epochs):
        # Both values are evaluated at the weight *before* this epoch's
        # update, while the weight reported below is the one *after* it.
        mse = calculate_mse(w, x, y)
        gradient = calculate_gradient(w, x, y)

        # Rebind instead of using ``-=`` so that an array passed as
        # ``initial_w`` is never modified in place.
        w = w - learning_rate * gradient

        if verbose:
            print(f'Epoch: {epoch+1}:')
            print(f'Current weight: {w}')
            print(f'(MSE) is the average of the squares of the errors between the predicted y (w*x) and actual y.: {mse}')
            print(f'Gradient for single param variable is ordinary derivative of the MSE with respect to weight w. It tells us the direction to move on the w-axis to minimize the MSE {gradient}')
            print('The gradient is the partial derivative of the MSE with respect to weight w. It tells us the direction to move on the w-axis to minimize the MSE.')

        if plot:
            # Show the model obtained with the freshly updated weight
            plot_current_model(w, x, y, epoch)

    return w

# Perform Gradient Descent: run 5 epochs with a learning rate of 0.01 on the
# module-level (x, y) data and keep the resulting weight in `optimized_w`.
optimized_w = gradient_descent(x, y, learning_rate=0.01, epochs=5)


### Ordinary Derivative explained


In [None]:
import numpy as np

# Sample data as NumPy arrays: x runs over 1..6 and y doubles at every step,
# i.e. y = [2, 4, 8, 16, 32, 64].
# NOTE: this rebinds the module-level x, y and n defined earlier in the file.
x = np.arange(1, 7)
y = 2 ** x
n = x.size

def calculate_derivative(w, x, y):
    """
    Return the ordinary derivative dJ/dw of the MSE cost at weight ``w``.

    The cost of the model ``y_hat = w * x`` is the mean of the squared
    errors ``e_i = w * x_i - y_i``; this function walks through the
    derivation step by step and returns ``(1 / N) * sum(2 * e_i * x_i)``.

    Args:
        w: Scalar weight (slope) of the model.
        x: Input values; a NumPy array or any array-like (e.g. a list).
        y: Target values, same shape as ``x``.

    Returns:
        The mean of the per-point derivatives, i.e. the slope of J at ``w``.

    Raises:
        ValueError: If ``x`` contains no data points.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # Average over the data actually passed in, not over a module-level
    # count that may belong to a different dataset.
    num_points = x.size
    if num_points == 0:
        raise ValueError("x must contain at least one data point")

    # Step 1: Predictions of the model at the current weight: y_hat = w * x
    predictions = w * x

    # Step 2: Error for each point: e_i = y_hat_i - y_i
    errors = predictions - y

    # Step 3: Derivative of each squared error with respect to w.
    # d/dw (e_i^2) = 2 * e_i * d/dw (e_i), and since e_i = w*x_i - y_i,
    # d/dw (e_i) = x_i, which gives 2 * e_i * x_i per point.
    derivative_errors = 2 * errors * x

    # Step 4: Average the per-point derivatives to obtain the mean gradient
    return np.sum(derivative_errors) / num_points

# Evaluate the derivative once at a sample starting weight and report it
initial_w = 0.1
gradient_at_initial_w = calculate_derivative(initial_w, x, y)
print(f"The gradient at w = {initial_w} is {gradient_at_initial_w}")

# To visualise J(w), sample the MSE on an evenly spaced grid of 400 weights
# covering the interval [-10, 10]
weights = np.linspace(-10, 10, 400)
costs = np.fromiter(
    (calculate_mse(w, x, y) for w in weights),
    dtype=float,
    count=weights.size,
)

import matplotlib.pyplot as plt

# Draw the cost curve: J(w) on the vertical axis against the weight w
plt.plot(weights, costs, label='J(w)')
plt.title('Cost Function J(w) vs. weight w')
plt.xlabel('w')
plt.ylabel('J(w)')
plt.grid(True)
plt.legend()
plt.show()
