# ⚡ Large Scale Machine Learning Examples

This notebook demonstrates **Batch Gradient Descent**, **Stochastic Gradient Descent (SGD)**, and **Mini-batch Gradient Descent** using a simple linear regression problem.

We will:
- Generate synthetic data
- Implement each gradient descent variant
- Compare convergence behavior

In [None]:
# Imports
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Synthetic regression sample scattered around the line y = 4x + 3.
np.random.seed(42)  # fixed seed so reruns reproduce the same sample
m = 200  # sample count
X = np.random.rand(m, 1) * 2
y = 3 + 4 * X + np.random.randn(m, 1)

# Design matrix with a leading column of ones (x0 = 1) for the intercept.
X_b = np.hstack((np.ones_like(X), X))  # shape (m, 2)

## 1. Batch Gradient Descent

In [None]:
def batch_gradient_descent(X, y, alpha=0.1, n_iters=100):
    """Fit linear-regression weights with full-batch gradient descent.

    Every update uses the gradient of the mean squared error computed over
    all rows of ``X``. The weights start from a standard-normal draw taken
    from NumPy's global random state.

    Args:
        X: Design matrix of shape ``(m, n)``. Include a column of ones if an
            intercept term is wanted.
        y: Targets of shape ``(m, 1)`` or ``(m,)``.
        alpha: Learning rate (step size).
        n_iters: Number of gradient steps to take.

    Returns:
        A ``(theta, cost_history)`` tuple. ``theta`` has shape ``(n, 1)`` and
        ``cost_history`` holds the MSE measured after each update.
    """
    X = np.asarray(X)
    # Force a column vector so a 1-D ``y`` cannot broadcast the residual
    # against ``X @ theta`` into an (m, m) matrix.
    y = np.asarray(y).reshape(-1, 1)
    m, n_features = X.shape
    # Size theta from X instead of assuming exactly two columns.
    theta = np.random.randn(n_features, 1)
    cost_history = []
    for _ in range(n_iters):
        gradients = (2 / m) * X.T @ (X @ theta - y)
        theta -= alpha * gradients
        cost = (1 / m) * np.sum((X @ theta - y) ** 2)
        cost_history.append(cost)
    return theta, cost_history

# Fit with batch gradient descent at its default step size and iteration
# count, then show the learned [intercept, slope] as a flat vector.
theta_bgd, cost_bgd = batch_gradient_descent(X=X_b, y=y)
print("Batch GD theta:", np.ravel(theta_bgd))

## 2. Stochastic Gradient Descent (SGD)

In [None]:
def stochastic_gradient_descent(X, y, alpha=0.1, n_epochs=50):
    """Fit linear-regression weights with stochastic gradient descent.

    Each update uses the squared-error gradient of a single example chosen
    uniformly at random. Examples are drawn with replacement, so an "epoch"
    here means ``m`` random draws rather than one pass over every row. The
    weights start from a standard-normal draw taken from NumPy's global
    random state.

    Args:
        X: Design matrix of shape ``(m, n)``. Include a column of ones if an
            intercept term is wanted.
        y: Targets of shape ``(m, 1)`` or ``(m,)``.
        alpha: Learning rate (step size), held constant for all updates.
        n_epochs: Number of epochs; each performs ``m`` single-example updates.

    Returns:
        A ``(theta, cost_history)`` tuple. ``theta`` has shape ``(n, 1)`` and
        ``cost_history`` holds the full-data MSE measured after each epoch.
    """
    X = np.asarray(X)
    # Force a column vector so a 1-D ``y`` cannot broadcast incorrectly
    # against the (1, 1) prediction of a single example.
    y = np.asarray(y).reshape(-1, 1)
    m, n_features = X.shape
    # Size theta from X instead of assuming exactly two columns.
    theta = np.random.randn(n_features, 1)
    cost_history = []
    for _ in range(n_epochs):
        for _ in range(m):
            rand_i = np.random.randint(m)
            # Slice rather than index so xi and yi stay two-dimensional.
            xi = X[rand_i:rand_i + 1]
            yi = y[rand_i:rand_i + 1]
            gradients = 2 * xi.T @ (xi @ theta - yi)
            theta -= alpha * gradients
        cost = (1 / m) * np.sum((X @ theta - y) ** 2)
        cost_history.append(cost)
    return theta, cost_history

# Fit with stochastic gradient descent at its default step size and epoch
# count, then show the learned [intercept, slope] as a flat vector.
theta_sgd, cost_sgd = stochastic_gradient_descent(X=X_b, y=y)
print("SGD theta:", np.ravel(theta_sgd))

## 3. Mini-batch Gradient Descent

In [None]:
def minibatch_gradient_descent(X, y, alpha=0.1, n_iters=50, batch_size=20):
    """Fit linear-regression weights with mini-batch gradient descent.

    Each outer iteration reshuffles the data and walks through it once in
    consecutive batches, taking one gradient step per batch. The weights
    start from a standard-normal draw taken from NumPy's global random state.

    Args:
        X: Design matrix of shape ``(m, n)``. Include a column of ones if an
            intercept term is wanted.
        y: Targets of shape ``(m, 1)`` or ``(m,)``.
        alpha: Learning rate (step size).
        n_iters: Number of full shuffled passes over the data (epochs).
        batch_size: Maximum number of rows per batch. The final batch of a
            pass is shorter when ``m`` is not a multiple of ``batch_size``.

    Returns:
        A ``(theta, cost_history)`` tuple. ``theta`` has shape ``(n, 1)`` and
        ``cost_history`` holds the full-data MSE measured after each pass.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        # A negative step would make the batch loop a silent no-op.
        raise ValueError("batch_size must be a positive integer")
    X = np.asarray(X)
    # Force a column vector so a 1-D ``y`` cannot broadcast the residual
    # into a (batch, batch) matrix.
    y = np.asarray(y).reshape(-1, 1)
    m, n_features = X.shape
    # Size theta from X instead of assuming exactly two columns.
    theta = np.random.randn(n_features, 1)
    cost_history = []
    for _ in range(n_iters):
        indices = np.random.permutation(m)
        X_shuffled = X[indices]
        y_shuffled = y[indices]
        for start in range(0, m, batch_size):
            xi = X_shuffled[start:start + batch_size]
            yi = y_shuffled[start:start + batch_size]
            # Average over the rows actually present: the last batch can be
            # shorter than batch_size, and dividing by the nominal size
            # would shrink its step.
            gradients = (2 / len(xi)) * xi.T @ (xi @ theta - yi)
            theta -= alpha * gradients
        cost = (1 / m) * np.sum((X @ theta - y) ** 2)
        cost_history.append(cost)
    return theta, cost_history

# Fit with mini-batch gradient descent at its default step size, pass count
# and batch size, then show the learned [intercept, slope] as a flat vector.
theta_mbgd, cost_mbgd = minibatch_gradient_descent(X=X_b, y=y)
print("Mini-batch GD theta:", np.ravel(theta_mbgd))

## Compare Convergence

In [None]:
# Overlay the three cost histories on one figure. The batch GD curve has one
# point per gradient step, while the SGD and mini-batch curves have one point
# per epoch, hence the combined "Iteration/Epoch" axis label.
plt.figure(figsize=(10, 6))
plt.plot(cost_bgd, label="Batch GD")
plt.plot(cost_sgd, label="SGD")
plt.plot(cost_mbgd, label="Mini-batch GD")
plt.gca().set(
    xlabel="Iteration/Epoch",
    ylabel="Cost (MSE)",
    title="Convergence of Gradient Descent Variants",
)
plt.legend()
plt.show()