In [2]:
import torch

# import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt

print(torch.__version__)

2.5.1+cu124


# Adaptive Stochastic Gradient Descent

In [3]:
def stochastic_gradient_descent(X, y, learning_rate=0.01, epochs=100, batch_size=1, verbose=True):
    """
    Stochastic Gradient Descent (SGD) for least squares optimization.

    Parameters:
    - X: numpy array of shape (m, n) - input features
    - y: numpy array of shape (m,) - target values
    - learning_rate: float - step size for parameter updates
    - epochs: int - number of full passes over the dataset
    - batch_size: int - number of samples per mini-batch
    - verbose: bool - whether to print progress

    Returns:
    - theta: numpy array - optimized parameters
    - losses: list - loss values over epochs
    """
    m, n = X.shape
    theta = np.zeros(n)  # Initialize model parameters
    losses = []

    for epoch in range(epochs):
        # Shuffle data
        indices = np.random.permutation(m)
        X_shuffled = X[indices]
        y_shuffled = y[indices]

        # Mini-batch update
        for i in range(0, m, batch_size):
            X_batch = X_shuffled[i:i+batch_size]
            y_batch = y_shuffled[i:i+batch_size]

            # Compute gradient: X_batch.T @ (X_batch @ theta - y_batch)
            gradient = X_batch.T @ (X_batch @ theta - y_batch)
            theta -= learning_rate * gradient  # Update parameters

        # Compute loss on the full dataset
        loss = 0.5 * np.mean((y - X @ theta) ** 2)
        losses.append(loss)

        if verbose and epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {loss:.6f}")

    return theta, losses


In [4]:
# Example usage: Synthetic data
np.random.seed(42)
X = np.random.rand(100, 2)  # 100 samples, 2 features
true_theta = np.array([3.0, 5.0])  # True parameters
y = X @ true_theta + np.random.normal(0, 0.1, 100)  # Noisy target

In [5]:
# Run SGD
theta, losses = stochastic_gradient_descent(X, y, learning_rate=0.01, epochs=1000, batch_size=10)
print("Optimized parameters:", theta)

Epoch 0, Loss: 2.921508
Epoch 10, Loss: 0.018266
Epoch 20, Loss: 0.007115
Epoch 30, Loss: 0.005305
Epoch 40, Loss: 0.005004
Epoch 50, Loss: 0.004956
Epoch 60, Loss: 0.004948
Epoch 70, Loss: 0.004946
Epoch 80, Loss: 0.004946
Epoch 90, Loss: 0.004946
Epoch 100, Loss: 0.004946
Epoch 110, Loss: 0.004946
Epoch 120, Loss: 0.004946
Epoch 130, Loss: 0.004946
Epoch 140, Loss: 0.004948
Epoch 150, Loss: 0.004946
Epoch 160, Loss: 0.004947
Epoch 170, Loss: 0.004947
Epoch 180, Loss: 0.004948
Epoch 190, Loss: 0.004946
Epoch 200, Loss: 0.004947
Epoch 210, Loss: 0.004947
Epoch 220, Loss: 0.004946
Epoch 230, Loss: 0.004946
Epoch 240, Loss: 0.004946
Epoch 250, Loss: 0.004946
Epoch 260, Loss: 0.004946
Epoch 270, Loss: 0.004946
Epoch 280, Loss: 0.004946
Epoch 290, Loss: 0.004946
Epoch 300, Loss: 0.004946
Epoch 310, Loss: 0.004947
Epoch 320, Loss: 0.004947
Epoch 330, Loss: 0.004946
Epoch 340, Loss: 0.004947
Epoch 350, Loss: 0.004946
Epoch 360, Loss: 0.004946
Epoch 370, Loss: 0.004946
Epoch 380, Loss: 0.0049

In [6]:
def stochastic_gradient_descent_adaptive(X, y, epochs=100, batch_size=10, alpha=0.00000001, verbose=True):
    """
    Stochastic Gradient Descent (SGD) for least squares optimization.

    Parameters:
    - X: numpy array of shape (m, n) - input features
    - y: numpy array of shape (m,) - target values
    - learning_rate: float - step size for parameter updates
    - epochs: int - number of full passes over the dataset
    - batch_size: int - number of samples per mini-batch
    - verbose: bool - whether to print progress

    Returns:
    - theta: numpy array - optimized parameters
    - losses: list - loss values over epochs
    """
    m, n = X.shape
    theta = np.zeros(n)  # Initialize model parameters
    losses = []
    learning_rate = 0.01
    past_theta = np.array([0.1, 0.1])

    for epoch in range(epochs):
        # Shuffle data
        indices = np.random.permutation(m)
        X_shuffled = X[indices]
        y_shuffled = y[indices]

        
        # Mini-batch update
        for i in range(0, m, batch_size):
            X_batch = X_shuffled[i:i+batch_size]
            y_batch = y_shuffled[i:i+batch_size]

            # Compute gradient: X_batch.T @ (X_batch @ theta - y_batch)
            gradient = X_batch.T @ (X_batch @ theta - y_batch)
            theta = theta - learning_rate * gradient  # Update parameters
            new_gradient =  X_batch.T @ (X_batch @ theta - y_batch)
            learning_rate = learning_rate - alpha * gradient @ new_gradient.T
             

        # Compute loss on the full dataset
        loss = 0.5 * np.mean((y - X @ theta) ** 2)
        losses.append(loss)

        if verbose and epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {loss:.6f}")
            print(f"Epoch {epoch}, Learning_Rate: {learning_rate:.6f}")

    return theta, losses


In [7]:
# Run SGD Adaptive
theta, losses = stochastic_gradient_descent_adaptive(X, y, epochs=1000, batch_size=10)
print("Optimized parameters:", theta)

Epoch 0, Loss: 2.946068
Epoch 0, Learning_Rate: 0.009937
Epoch 10, Loss: 0.017582
Epoch 10, Learning_Rate: 0.009906
Epoch 20, Loss: 0.007047
Epoch 20, Learning_Rate: 0.009906
Epoch 30, Loss: 0.005296
Epoch 30, Learning_Rate: 0.009906
Epoch 40, Loss: 0.005005
Epoch 40, Learning_Rate: 0.009906
Epoch 50, Loss: 0.004957
Epoch 50, Learning_Rate: 0.009906
Epoch 60, Loss: 0.004947
Epoch 60, Learning_Rate: 0.009906
Epoch 70, Loss: 0.004946
Epoch 70, Learning_Rate: 0.009906
Epoch 80, Loss: 0.004947
Epoch 80, Learning_Rate: 0.009905
Epoch 90, Loss: 0.004947
Epoch 90, Learning_Rate: 0.009905
Epoch 100, Loss: 0.004946
Epoch 100, Learning_Rate: 0.009905
Epoch 110, Loss: 0.004946
Epoch 110, Learning_Rate: 0.009905
Epoch 120, Loss: 0.004946
Epoch 120, Learning_Rate: 0.009905
Epoch 130, Loss: 0.004946
Epoch 130, Learning_Rate: 0.009905
Epoch 140, Loss: 0.004946
Epoch 140, Learning_Rate: 0.009905
Epoch 150, Loss: 0.004946
Epoch 150, Learning_Rate: 0.009905
Epoch 160, Loss: 0.004947
Epoch 160, Learning_

# AdaGrad

In [8]:
def adagrad(X, y, learning_rate=0.01, epochs=100, batch_size=10, verbose=True, epsilon=1e-8):
    """
    Adagrad optimization for least squares regression.

    Parameters:
    - X: numpy array of shape (m, n) - input features
    - y: numpy array of shape (m,) - target values
    - learning_rate: float - initial learning rate
    - epochs: int - number of full passes over the dataset
    - batch_size: int - number of samples per mini-batch
    - verbose: bool - whether to print progress
    - epsilon: float - small constant to avoid division by zero

    Returns:
    - theta: numpy array - optimized parameters
    - losses: list - loss values over epochs
    """
    m, n = X.shape
    theta = np.zeros(n)  # Initialize model parameters
    sum_grads = np.zeros(n)  # Sum of squared gradients
    losses = []

    for epoch in range(epochs):
        # Shuffle data
        indices = np.random.permutation(m)
        X_shuffled = X[indices]
        y_shuffled = y[indices]

        # Mini-batch update
        for i in range(0, m, batch_size):
            X_batch = X_shuffled[i:i+batch_size]
            y_batch = y_shuffled[i:i+batch_size]

            # Compute gradient: X_batch.T @ (X_batch @ theta - y_batch)
            gradient = X_batch.T @ (X_batch @ theta - y_batch)
            sum_grads += gradient ** 2  # Accumulate squared gradients

            # Adaptive learning rate per parameter
            adaptive_lr = learning_rate / (np.sqrt(sum_grads) + epsilon)

            # Update parameters
            theta -= adaptive_lr * gradient

        # Compute loss on the full dataset
        loss = 0.5 * np.mean((y - X @ theta) ** 2)
        losses.append(loss)

        if verbose and epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss: {loss:.6f}")

    return theta, losses


# Example usage: Synthetic data
np.random.seed(42)
X = np.random.rand(100, 2)  # 100 samples, 2 features
true_theta = np.array([3.0, 5.0])  # True parameters
y = X @ true_theta + np.random.normal(0, 0.1, 100)  # Noisy target

# Run Adagrad
theta, losses = adagrad(X, y, learning_rate=0.1, epochs=1000, batch_size=10)
print("Optimized parameters:", theta)

Epoch 0, Loss: 6.812798
Epoch 10, Loss: 2.886682
Epoch 20, Loss: 1.647228
Epoch 30, Loss: 1.014884
Epoch 40, Loss: 0.653440
Epoch 50, Loss: 0.435936
Epoch 60, Loss: 0.300977
Epoch 70, Loss: 0.215429
Epoch 80, Loss: 0.160195
Epoch 90, Loss: 0.123868
Epoch 100, Loss: 0.099478
Epoch 110, Loss: 0.082711
Epoch 120, Loss: 0.070841
Epoch 130, Loss: 0.062152
Epoch 140, Loss: 0.055567
Epoch 150, Loss: 0.050388
Epoch 160, Loss: 0.046172
Epoch 170, Loss: 0.042633
Epoch 180, Loss: 0.039584
Epoch 190, Loss: 0.036903
Epoch 200, Loss: 0.034507
Epoch 210, Loss: 0.032340
Epoch 220, Loss: 0.030365
Epoch 230, Loss: 0.028553
Epoch 240, Loss: 0.026884
Epoch 250, Loss: 0.025342
Epoch 260, Loss: 0.023913
Epoch 270, Loss: 0.022589
Epoch 280, Loss: 0.021359
Epoch 290, Loss: 0.020217
Epoch 300, Loss: 0.019155
Epoch 310, Loss: 0.018168
Epoch 320, Loss: 0.017250
Epoch 330, Loss: 0.016396
Epoch 340, Loss: 0.015601
Epoch 350, Loss: 0.014862
Epoch 360, Loss: 0.014174
Epoch 370, Loss: 0.013535
Epoch 380, Loss: 0.0129