In [None]:
#imports
%matplotlib notebook 
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.animation import FuncAnimation

import ipywidgets as widgets

import numdifftools as nd
from sklearn import datasets, linear_model
import pandas as pd

from IPython.display import display
from IPython.display import clear_output

---

# Loading and Processing Data

We will use the California housing dataset from the scikit-learn library for the demo. This dataset has 20,640 samples, each with 8 feature values plus the target (median house value). We will use only a subset of this dataset and regress Median House Value on Median Income, so that the fit can be visualized in two dimensions.



In [None]:
# Fetch the California housing data through scikit-learn and keep the combined DataFrame
df = datasets.fetch_california_housing(as_frame=True).frame
df

In [None]:
# Keep only the first 1000 rows: MedInc is the predictor (X), MedHouseVal the target (Y)
X, Y = (df[column].to_numpy()[:1000] for column in ('MedInc', 'MedHouseVal'))

In [None]:
#Plotting the two variables
%matplotlib
plt.scatter(X,Y)
plt.title('Data')
plt.xlabel('Median Income')
plt.ylabel('Median House Value')

In [None]:
# Turn both arrays into column vectors of shape (n_samples, 1)
X = X.reshape(-1, 1)
Y = Y.reshape(-1, 1)

# Hold out the last 200 samples for testing; everything before them is the training set
X_train, X_test = X[:-200], X[-200:]
Y_train, Y_test = Y[:-200], Y[-200:]

---

# Linear Regression Analytical Solution

We use sklearn's `LinearRegression`, which computes the regression coefficients from the analytical (least-squares) solution.

In [None]:
# Create linear regression object
regr = linear_model.LinearRegression()

# Fit the model using training set
regr.fit(X_train, Y_train)

# Plot outputs
%matplotlib
plt.scatter(X_train, Y_train)
plt.title('Data')
plt.xlabel('Median Income')
plt.ylabel('Median House Value')
plt.xticks(())
plt.yticks(())
plt.plot(X_train, regr.predict(X_train), color='red',linewidth=2)
plt.show()

---

# Linear Regression with GD

Next, we calculate the same regression fit using different variations of the Gradient Descent algorithm. <br>
In the following function `LR_through_GD`, we can set the batch size to 1 for the Stochastic Gradient Descent algorithm, to 800 (the size of the training set) for the Batch Gradient Descent algorithm, and to any value in between for Mini-Batch Gradient Descent.

In [None]:
# Performing MiniBatch Gradient Descent 
def LR_through_GD(init, lr, epochs, X_train, Y_train, batch_size = 50):
    '''
    Perform linear regression (Y ~ w * X + c) using Mini-Batch Gradient Descent
    on the mean squared error loss.

    The training data is walked through in order, in consecutive slices of
    batch_size samples. One parameter update is made per slice, so each epoch
    produces ceil(len(X_train) / batch_size) updates. The last slice of an
    epoch may be smaller than batch_size.

    Parameters:
        init (list): Initial values for slope and intercept [w, c].
        lr (float): Learning rate for gradient descent.
        epochs (int): Number of full passes over the training data.
        X_train (numpy.ndarray): Training input data.
        Y_train (numpy.ndarray): Training target data, same shape as X_train.
        batch_size (int): Size of each mini-batch (1 = SGD,
            len(X_train) or more = batch gradient descent).

    Returns:
        history (list): List of [w, c] values, one entry per parameter update.

    Raises:
        ValueError: If batch_size is smaller than 1.
    '''
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    history = []
    w = init[0]
    c = init[1]
    n = len(X_train)  # number of training samples

    # Ceiling division: keep the trailing partial batch instead of silently
    # dropping it, and guarantee one batch per epoch when batch_size > n.
    num_batches = (n + batch_size - 1) // batch_size

    for _ in range(epochs):
        for batch_idx in range(num_batches):

            # Fetch the slice of data belonging to this batch
            start_idx = batch_idx * batch_size
            end_idx = start_idx + batch_size
            batch_X = X_train[start_idx:end_idx]
            batch_Y = Y_train[start_idx:end_idx]

            # Error of the current line on this batch
            residual = batch_Y - (w * batch_X + c)

            # Gradients of the loss mean((Y - (w*X + c))**2). np.mean divides
            # by the actual batch length, so a short final batch is averaged
            # correctly.
            D_w = -2.0 * float(np.mean(batch_X * residual))  # Derivative wrt w
            D_c = -2.0 * float(np.mean(residual))            # Derivative wrt c

            # Step against the gradient using the learning rate passed in by
            # the caller (not any global widget state).
            w = w - lr * D_w
            c = c - lr * D_c

            history.append([w, c])

    return history

In [None]:
def visualize_LR(history):
    '''
    Animate how the regression line evolves during optimization.

    The module-level training data (X_train, Y_train) is drawn once as points;
    each entry of history is then rendered as one frame showing the line
    w * X_train + c in red.

    Parameters:
        history (list): List of [w, c] values at each step.

    Returns:
        anim (FuncAnimation): Animation of the optimization process.
    '''
    figure, axes = plt.subplots(figsize=(8, 4))
    axes.set_xlabel('Income')
    axes.set_ylabel('Price')

    # Static layer: the training samples
    points, = axes.plot(X_train, Y_train, 'o')
    # Dynamic layer: starts empty and is filled in frame by frame
    fit_line, = axes.plot([], [], 'r')

    def draw_frame(step):
        slope = history[step][0]
        intercept = history[step][1]
        fit_line.set_data(X_train, slope * X_train + intercept)

    animation = FuncAnimation(
        figure,
        draw_frame,
        frames=len(history),
        interval=100,
        repeat=False,
    )
    plt.show()
    return animation




# Interactive controls for the gradient descent demo

# Module-level handle to the most recent animation, assigned by the button callback
global_animation = None

# Sliders for the hyper-parameters and the starting point
learning_rate_slider = widgets.FloatSlider(
    value=0.01, min=0.01, max=.15, step=0.01, description="Learning Rate:"
)
init_slider = widgets.FloatSlider(
    value=1.0, min=-5.0, max=5.0, step=0.1, description="Initial Point:"
)
epochs_slider = widgets.IntSlider(
    value=500, min=50, max=1000, step=50, description="Iterations:"
)
batch_slider = widgets.IntSlider(
    value=50, min=1, max=800, step=1, description="Batch Size:"
)

# Button that launches a run, and the output area the run renders into
start_button = widgets.Button(
    description="Start Visualization",
    layout=widgets.Layout(width='auto'),
)
output = widgets.Output()


def start_visualization(_):
    '''Button callback: rerun gradient descent with the slider settings and animate it.'''
    global global_animation
    with output:
        clear_output(wait=True)
        # The same slider value seeds both the slope and the intercept
        start_value = init_slider.value
        history = LR_through_GD(
            [start_value, start_value],
            learning_rate_slider.value,
            epochs_slider.value,
            X_train,
            Y_train,
            batch_slider.value,
        )
        plt.close()
        global_animation = visualize_LR(history)


start_button.on_click(start_visualization)


# Stack the controls vertically and show them above the output area
widget_box = widgets.VBox(
    [learning_rate_slider, init_slider, epochs_slider, batch_slider, start_button]
)
print("You can adjust the initial point and batch size using interactive widgets to see how they affect the optimization." +
      "\nBatch Size 1 corresponds to SGD and batch size 800 is equivalent to batch gradient descent algorithm")
display(widget_box, output)

In [None]:
# Same start point, learning rate and epoch count; only the batch size differs
# (1 = SGD, 200 = mini-batch, 800 = full batch).
history_sgd, history_mini_batch, history_batch = (
    LR_through_GD([0.0, 0.0], 0.01, 500, X_train, Y_train, size)
    for size in (1, 200, 800)
)

In [None]:
# Reference values from the analytical fit
print("Analytical Solution : c= ", regr.intercept_, " w= ", regr.coef_ )

# Final [w, c] reached by each gradient descent variant
for label, run in (
    ("Solution from SGD : c= ", history_sgd),
    ("Solution from Mini Batch GD : c= ", history_mini_batch),
    ("Solution from Batch GD : c= ", history_batch),
):
    print(label, run[-1][1], " w= ", run[-1][0])

#### Which variant of gradient descent (SGD, mini-batch, or batch) gives the estimates closest to the analytical solution?