# DATASCI 315, Homework 3: Gradient Descent and Linear Regression

To submit, please upload an HTML file to Canvas showing the results of running this notebook using the process described in Group Work Assignment 1.

In [None]:
import matplotlib.pyplot as plt
import torch

## Part 1: Gradient Descent for an Arbitrary Function

### Problem 1: Gradient Descent for a Function of One Variable

Recall that gradient descent is an optimization algorithm used to minimize some function by iteratively moving in the direction of steepest descent as defined by the negative of the function's gradient.

Namely, the procedure is as follows:
1. Start with an initial value $w$.
2. Update $w$ by moving in the direction of the negative gradient of the function at $w$: $$w \leftarrow w - \eta \nabla f(w).$$
Here, $\eta$ is the *learning rate*; it controls the size of the step we're taking in the direction of the negative gradient.
3. Repeat step 2 until the variable $w$ changes very little between iterations.

Let's say we want to minimize $$f(w) = (w - 2)^2,$$
that is, find the $w$ where $f(w)$ achieves the minimum value **using gradient descent**.

Implement `problem1()` to perform gradient descent and return the final value of $w$. The function takes the following optional parameters:
- `epsilon`: convergence threshold for the change in $w$ between iterations (default: 0.01)
- `initial_w`: starting value for $w$ (default: 10)
- `eta`: learning rate (default: 0.1)

**Hint:** The gradient of $f(w) = (w - 2)^2$ is $f'(w) = 2(w - 2) = 2w - 4$.

In [None]:
def problem1(epsilon=0.01, initial_w=10, eta=0.1):
    """Minimize f(w) = (w - 2)^2 with fixed-step gradient descent.

    Args:
        epsilon: Stop once |w_new - w_old| is no longer greater than this.
        initial_w: Starting value of w.
        eta: Learning rate (multiplier applied to the gradient).

    Returns:
        The final iterate w.
    """

    # BEGIN SOLUTION
    w = initial_w

    while True:
        # f'(w) = 2w - 4, so one descent step moves w by -eta * (2w - 4).
        w_next = w - eta * (2 * w - 4)
        step_size = abs(w_next - w)
        w = w_next

        # Phrased as `not >` (instead of `<=`) so a NaN step also ends the loop.
        if not step_size > epsilon:
            break

    return w
    # END SOLUTION

In [None]:
# Test assertions
# Visible checks: with the default settings the result should be within 0.1 of
# the minimizer w = 2, and a tighter epsilon must still return a value.
result1 = problem1()
assert abs(result1 - 2) < 0.1, f"Expected w close to 2, got {result1}"
assert problem1(epsilon=0.001) is not None, "Function should return a value"
print("All tests passed!")

# BEGIN HIDDEN TESTS
# An epsilon 100x smaller than the default must land within 0.01 of w = 2.
assert (
    abs(problem1(epsilon=0.0001, initial_w=10, eta=0.1) - 2) < 0.01
), "Should converge closer to 2 with smaller epsilon"
# Approach the minimum from the other side (initial_w < 2).
assert (
    abs(problem1(epsilon=0.01, initial_w=-5, eta=0.1) - 2) < 0.1
), "Should work with negative initial value"
# Start exactly at the minimum, where the gradient 2w - 4 is zero.
assert (
    abs(problem1(epsilon=0.01, initial_w=2, eta=0.1) - 2) < 0.1
), "Should work when starting at minimum"
# END HIDDEN TESTS

### Problem 2: Gradient Descent for a Function of Two Variables

Same as above, but now we want to minimize the [Rosenbrock function](https://en.wikipedia.org/wiki/Rosenbrock_function):
$$f(w_1, w_2) = (1 - w_1)^2 + 100(w_2 - w_1^2)^2$$

This function has a global minimum at $(w_1, w_2) = (1, 1)$.

Implement `problem2()` to perform gradient descent and return the final values of $w_1$ and $w_2$.

**Hint 1:** You can treat $w$ as a vector (e.g., a PyTorch tensor or a NumPy array), but this is not necessary.

**Hint 2:** If your algorithm is running into trouble, try adjusting the learning rate first. A smaller learning rate (e.g., 0.001 or smaller) may be needed.

**Hint 3:** The gradient is:
$$\nabla f = \begin{bmatrix} -2(1 - w_1) - 400 w_1 (w_2 - w_1^2) \\ 200(w_2 - w_1^2) \end{bmatrix}$$

In [None]:
def problem2(epsilon=0.0001, initial_w=None, eta=0.0015):
    """Minimize the Rosenbrock function with fixed-step gradient descent.

    The objective is f(w1, w2) = (1 - w1)^2 + 100 * (w2 - w1^2)^2.

    Args:
        epsilon: Stop once the summed absolute change |dw1| + |dw2| produced
            by one step is no longer greater than this.
        initial_w: Starting point (w1, w2) given as a tensor, list, tuple or
            NumPy array. Defaults to (-1.2, 1.0). The argument is not modified.
        eta: Learning rate.

    Returns:
        Tuple (w1, w2) of Python floats holding the final iterate.
    """

    # BEGIN SOLUTION
    if initial_w is None:
        initial_w = [-1.2, 1.0]

    def gradient(w):
        w1, w2 = w[0], w[1]
        # Both partial derivatives share the factor (w2 - w1^2).
        curve_gap = w2 - w1**2
        dw1 = -2 * (1 - w1) - 400 * w1 * curve_gap
        dw2 = 200 * curve_gap
        return torch.stack((dw1, dw2))

    # as_tensor accepts any array-like start point; detach + clone gives a
    # private copy so the caller's data is neither aliased nor tracked.
    w = torch.as_tensor(initial_w).detach().clone()
    diff = float("inf")

    while diff > epsilon:
        w_next = w - eta * gradient(w)
        diff = torch.sum(torch.abs(w_next - w)).item()
        w = w_next

    return w[0].item(), w[1].item()
    # END SOLUTION

In [None]:
# Test assertions
# Visible checks: the default run should end within 0.5 of the global
# minimum (1, 1) in each coordinate.
w1, w2 = problem2()
assert w1 is not None and w2 is not None, "Function should return two values"
assert abs(w1 - 1) < 0.5, f"Expected w1 close to 1, got {w1}"
assert abs(w2 - 1) < 0.5, f"Expected w2 close to 1, got {w2}"
print("All tests passed!")

# BEGIN HIDDEN TESTS
# An epsilon 10x smaller than the default must end within 0.2 of (1, 1).
w1_precise, w2_precise = problem2(epsilon=0.00001)
assert (
    abs(w1_precise - 1) < 0.2
), f"With smaller epsilon, w1 should be closer to 1, got {w1_precise}"
assert (
    abs(w2_precise - 1) < 0.2
), f"With smaller epsilon, w2 should be closer to 1, got {w2_precise}"
# END HIDDEN TESTS

## Part 2: Gradient Descent for Linear Regression

The previous functions just depended on some parameters $(w_1, w_2)$. Now we are interested in optimizing loss functions that depend on data $(X, y)$.

### Problem 3: Simple Linear Regression

We are interested in:
1. "Fitting" a linear model:
$$\hat{y}_i = \alpha + \beta x_i$$

2. By minimizing the squared error loss:
$$L(\alpha, \beta) = \frac{1}{2n}\sum_{i=1}^n (y_i - \hat{y}_i)^2 = \frac{1}{2n}\sum_{i=1}^n (y_i - \alpha - \beta x_i)^2$$

(Note: The $\frac{1}{2}$ factor is just a convention in Machine Learning that simplifies the gradient.)

3. Using gradient descent:
$$\alpha_k = \alpha_{k-1} - \eta \frac{\partial L}{\partial \alpha}$$
$$\beta_k = \beta_{k-1} - \eta \frac{\partial L}{\partial \beta}$$

The gradients are:
$$\frac{\partial L}{\partial \alpha} = -\frac{1}{n}\sum_{i=1}^n (y_i - \alpha - \beta x_i)$$
$$\frac{\partial L}{\partial \beta} = -\frac{1}{n}\sum_{i=1}^n (y_i - \alpha - \beta x_i) x_i$$

You can check the convergence of the gradient descent by checking if the changes made to $\alpha_k$ and $\beta_k$ are smaller than epsilon:
$$|\alpha_k - \alpha_{k-1}| + |\beta_k - \beta_{k-1}| < \epsilon$$

In [None]:
def problem3(x, y, eta, initial_alpha, initial_beta, epsilon):
    """Fit the line y = alpha + beta * x by gradient descent.

    The loss is L(alpha, beta) = 1/(2n) * sum((y_i - alpha - beta * x_i)^2).

    Args:
        x: Tensor of inputs.
        y: Tensor of targets, elementwise-compatible with x.
        eta: Learning rate.
        initial_alpha: Starting intercept.
        initial_beta: Starting slope.
        epsilon: Stop once |d_alpha| + |d_beta| of one step is no longer
            greater than this.

    Returns:
        Tuple (alpha, beta) with the fitted intercept and slope.
    """
    # BEGIN SOLUTION
    def as_python_number(value):
        # Objects exposing .item() (e.g. 0-dim tensors) are unwrapped;
        # anything else is passed through unchanged.
        return value.item() if hasattr(value, "item") else value

    alpha, beta = initial_alpha, initial_beta

    while True:
        residuals = y - (alpha + beta * x)

        # dL/d_alpha = -mean(residual), dL/d_beta = -mean(residual * x)
        grad_alpha = -torch.mean(residuals)
        grad_beta = -torch.mean(residuals * x)

        next_alpha = alpha - eta * grad_alpha
        next_beta = beta - eta * grad_beta

        total_change = abs(next_alpha - alpha) + abs(next_beta - beta)
        alpha, beta = next_alpha, next_beta

        # Phrased as `not >` so a NaN change also terminates the loop.
        if not total_change > epsilon:
            break

    return as_python_number(alpha), as_python_number(beta)
    # END SOLUTION

Use the following cells to test your solution.

**Note:** The obtained coefficients do not need to be exactly the same as the sklearn solution, but should be getting closer if you keep the algorithm running longer (i.e., make the $\epsilon$ smaller).

In [None]:
# Generate some random data
# Fix the seed so every run draws the same data; the order of the torch.rand
# calls below therefore determines the values.
torch.manual_seed(42)
x_data = torch.rand(100)
# Ground-truth intercept and slope, each a 1-element tensor.
true_alpha = torch.rand(1)
true_beta = torch.rand(1)
# The noise term torch.rand(100) / 10 is non-negative (not zero-mean), so a
# fitted intercept is expected to sit slightly above true_alpha.
y_data = true_alpha + x_data * true_beta + torch.rand(100) / 10

In [None]:
# Check the solution using scikit-learn
from sklearn.linear_model import LinearRegression

# Reference least-squares fit; `lr` is reused by the assertions in the next cell.
lr = LinearRegression()
# x_data[:, None] adds a trailing axis so the features form a 2-D (100, 1) array.
lr.fit(x_data[:, None], y_data)
print(f"sklearn intercept: {lr.intercept_}, sklearn coefficient: {lr.coef_[0]}")

In [None]:
# Test assertions
# Visible checks: start from alpha = beta = 0 and compare the gradient-descent
# estimates with the scikit-learn fit stored in `lr`.
eta, initial_alpha, initial_beta, epsilon = 0.1, 0, 0, 0.0001
alpha_hat, beta_hat = problem3(x_data, y_data, eta, initial_alpha, initial_beta, epsilon)
print(f"Your solution: alpha = {alpha_hat}, beta = {beta_hat}")

assert alpha_hat is not None and beta_hat is not None, "Function should return alpha and beta"
assert abs(alpha_hat - lr.intercept_) < 0.1, "Alpha should be close to sklearn intercept"
assert abs(beta_hat - lr.coef_[0]) < 0.1, "Beta should be close to sklearn coefficient"
print("All tests passed!")

# BEGIN HIDDEN TESTS
# Stricter run: larger learning rate (0.5), 10x smaller epsilon, and a
# tolerance of 0.05 instead of 0.1.
alpha_test, beta_test = problem3(x_data, y_data, 0.5, 0, 0, 0.00001)
assert abs(alpha_test - lr.intercept_) < 0.05, "With smaller epsilon, should be closer to sklearn"
assert abs(beta_test - lr.coef_[0]) < 0.05, "With smaller epsilon, should be closer to sklearn"
# END HIDDEN TESTS

### Problem 4: Multivariate Linear Regression

In the vector form, gradient descent is given by:
$$w_k = w_{k-1} - \eta \nabla L,$$

where $w$ is the weight vector of the model, $\eta$ is the learning rate, $L$ is the loss function and $\nabla L$ is the corresponding gradient.

Consider now multiple/multivariate linear regression:
$$L(w) = \frac{1}{2n}\sum_{i=1}^n (y_i - x_i^T w)^2 = \frac{1}{2n}||Y - Xw||^2,$$

where each $x_i$ is a row vector, or correspondingly, $X$ is a matrix. Likewise $y_i$ is the corresponding element of the vector $Y$.

To simplify things, we have calculated the closed-form formula of the gradient for you:
$$\nabla L = \frac{1}{n} X^T(Xw - Y)$$

The convergence criterion is:
$$||w_k - w_{k-1}||_{\infty} < \epsilon$$

where $||\cdot||_{\infty}$ is the infinity norm (maximum absolute value). You can compute this with `torch.max(torch.abs(w - old_w))`.

Unlike the previous question, please also keep track of the loss at each iteration and return the whole loss history.

In [None]:
def problem4(features, y, eta, initial_w, epsilon):
    """Fit multivariate linear regression with full-batch gradient descent.

    A column of ones is prepended to `features`, so the first weight acts as
    the intercept. The loss is L(w) = 1/(2n) * ||Y - Xw||^2.

    Args:
        features: 2-D tensor with n rows (one per sample), without an
            intercept column.
        y: Tensor of n targets; reshaped to an (n, 1) column internally.
        eta: Learning rate.
        initial_w: Starting weights. Expected to be a column of shape
            (d + 1, 1) so predictions line up with the (n, 1) target column.
            The argument is cloned, not modified.
        epsilon: Stop once the largest absolute change in any weight during
            one step is no longer greater than this.

    Returns:
        Tuple (w, losses). `losses[k]` is the loss evaluated just before the
        k-th update, so the last entry belongs to the weights that preceded
        the final step.
    """
    n = features.shape[0]

    # To fit the intercept, we pad the features matrix with a column of ones
    ones = torch.ones((n, 1))
    features_augmented = torch.hstack((ones, features))
    y_col = y.reshape((n, 1))

    losses = []
    w = initial_w.clone()

    # BEGIN SOLUTION
    # (1 / n) * X^T does not depend on w, so build it once up front.
    scaled_features_t = (1 / n) * features_augmented.T

    while True:
        # Signed prediction error Xw - Y, shared by the loss and the gradient.
        errors = features_augmented @ w - y_col
        losses.append((0.5 * torch.mean(errors**2)).item())

        # Gradient step: w <- w - eta * (1/n) X^T (Xw - Y)
        w_next = w - eta * (scaled_features_t @ errors)
        max_change = torch.max(torch.abs(w_next - w)).item()
        w = w_next

        # Phrased as `not >` so a NaN change also terminates the loop.
        if not max_change > epsilon:
            break
    # END SOLUTION

    return w, losses

We'll use the California Housing dataset to check our solution. This dataset contains housing prices and features from the 1990 California census.

**Note:** The coefficients obtained with your gradient descent implementation might not quite coincide with sklearn's solution. The important thing is that your training loss keeps decreasing, and that the final loss is comparable to sklearn's.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load California Housing dataset
# The CSV is read from the current working directory.
housing_df = pd.read_csv("housing.csv")

# Select numeric features
feature_cols = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]
# Drop rows with a missing value in any selected feature.
X_calif = housing_df[feature_cols].dropna()
# Scale to units of $100k
# Index with X_calif.index so targets stay aligned with the surviving rows.
# NOTE(review): only missing features are dropped; missing targets would pass
# through - confirm the CSV has none.
y_calif = housing_df.loc[X_calif.index, "median_house_value"] / 100000

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_calif, y_calif, test_size=0.3, random_state=42
)

# Standardize features (important for gradient descent convergence)
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics enter the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Number of feature columns, without the intercept.
n_features = X_calif.shape[1]
print(f"California Housing: {len(y_calif)} samples, {n_features} features")

In [None]:
# Test assertions
# Seed so the random initial weights drawn by torch.randn below are reproducible.
torch.manual_seed(42)
num_features = n_features + 1  # +1 for intercept
X_train_t = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_t = torch.tensor(y_train.values, dtype=torch.float32)
w_gd, losses = problem4(
    X_train_t,
    y_train_t,
    eta=0.01,
    initial_w=torch.randn(num_features, 1),
    epsilon=0.001,
)

# Plot the loss curve
plt.plot(torch.arange(1, len(losses) + 1), losses, color="red")
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.title("Training Loss over Iterations")
plt.show()

print(f"Final training loss: {losses[-1]}")

# Compute test loss
# Same augmentation (leading column of ones) and same halved mean squared
# error as used during training, applied to the held-out split.
X_test_t = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_t = torch.tensor(y_test.values, dtype=torch.float32)
X_test_augmented = torch.hstack((torch.ones((X_test_t.shape[0], 1)), X_test_t))
y_test_pred = X_test_augmented @ w_gd
y_test_col = y_test_t.reshape((-1, 1))
test_loss = 0.5 * torch.mean((y_test_col - y_test_pred) ** 2)
print(f"Test loss: {test_loss.item()}")

assert len(losses) > 0, "Should return a non-empty loss history"
assert losses[-1] < losses[0], "Loss should decrease during training"
assert losses[-1] < 0.5, f"Final training loss should be reasonable, got {losses[-1]}"
print("All tests passed!")

# BEGIN HIDDEN TESTS
# Each recorded loss may exceed its predecessor by at most 1e-10.
assert all(
    losses[i] >= losses[i + 1] - 1e-10 for i in range(len(losses) - 1)
), "Loss should be monotonically decreasing"
assert w_gd.shape == (num_features, 1), f"Weight shape should be ({num_features}, 1)"
# END HIDDEN TESTS

### Problem 5: Minibatch Gradient Descent

In minibatch gradient descent, we apply gradient descent to a subset of the data (called a "batch") at each iteration. This is more efficient for large datasets and can help escape local minima.

The algorithm is as follows:

**Inputs:** $X, Y, w_0, \eta, \epsilon, |B|$ (batch size)

**Step 0:** Set $n$ = number of rows in $X$ and calculate the number of batches as $m = \lceil n / |B| \rceil$

**Step 1:** Initialize $w_k = w_0$

**Step 2:** Repeat until $||w_k - w_{k-1}||_{\infty} < \epsilon$:

- **Step 2a:** Shuffle the data randomly and split into $m$ batches: $B_1, B_2, ..., B_m$

- **Step 2b:** For each batch $j$ in $\{1, 2, ..., m\}$, update weights:
  $$w_k = w_{k-1} - \eta \frac{1}{|B_j|} X_{B_j}^T(X_{B_j}w_{k-1} - Y_{B_j})$$

- **Step 2c:** After processing all batches, calculate and store the loss for the **whole** dataset:
  $$L_k = \frac{1}{2n}||Y - Xw_k||^2$$

**Output:** $w_k$, list of losses

**Hint:** It might be helpful to use `torch.randperm()` to shuffle the data:

```python
# Shuffle indices
perm = torch.randperm(n)
X_shuffled = X[perm]
y_shuffled = y[perm]

# Select a minibatch
X_mini = X_shuffled[start:end]
y_mini = y_shuffled[start:end]
```

Note that $X$ and $y$ must **always be shuffled together** (think about why this is important!).

If the sample size $n$ is not exactly divisible by the batch size $|B|$, make the last minibatch of size $n \bmod |B|$ and adjust the update rule accordingly.

In [None]:
def problem5(features, y, eta, initial_w, epsilon, batch_size):
    """Fit multivariate linear regression with minibatch gradient descent.

    A column of ones is prepended to `features`, so the first weight acts as
    the intercept. Each epoch reshuffles the rows (using torch's global random
    generator), takes one gradient step per minibatch, and then records the
    loss L(w) = 1/(2n) * ||Y - Xw||^2 on the whole dataset.

    Args:
        features: 2-D tensor with n rows (one per sample), without an
            intercept column.
        y: Tensor of n targets; reshaped to an (n, 1) column internally.
        eta: Learning rate.
        initial_w: Starting weights. Expected to be a column of shape
            (d + 1, 1) so predictions line up with the target column.
            The argument is cloned, not modified.
        epsilon: Stop once the largest absolute change in any weight over a
            full epoch is no longer greater than this.
        batch_size: Number of rows per minibatch (positive integer). The last
            batch of an epoch is shorter when n is not a multiple of it.

    Returns:
        Tuple (w, losses), where `losses[k]` is the full-dataset loss after
        epoch k.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    n = features.shape[0]
    ones = torch.ones((n, 1))
    features_augmented = torch.hstack((ones, features))
    y_col = y.reshape((n, 1))

    w = initial_w.clone()
    losses = []

    # BEGIN SOLUTION
    # Without this check a negative batch size yields no batches at all, so
    # the weights never move and the loop exits after one epoch as if it had
    # converged; a zero batch size fails with an unrelated low-level error.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    diff = float("inf")

    while diff > epsilon:
        # w is only ever rebound below (never modified in place), so keeping
        # a reference to the epoch's starting weights is enough.
        epoch_start_w = w

        # Shuffle features and targets with the same permutation so rows
        # stay paired with their targets.
        perm = torch.randperm(n)
        features_shuffled = features_augmented[perm]
        y_shuffled = y_col[perm]

        # range(0, n, batch_size) produces ceil(n / batch_size) start offsets;
        # slicing stops at n, which makes the last batch shorter if needed.
        for start_idx in range(0, n, batch_size):
            end_idx = start_idx + batch_size
            features_mini = features_shuffled[start_idx:end_idx]
            y_mini = y_shuffled[start_idx:end_idx]
            batch_n = features_mini.shape[0]

            # Gradient of the batch loss: (1 / |B|) X_B^T (X_B w - Y_B)
            grad = (1 / batch_n) * features_mini.T @ (features_mini @ w - y_mini)
            w = w - eta * grad

        # Compute loss on full dataset
        y_pred = features_augmented @ w
        loss = 0.5 * torch.mean((y_col - y_pred) ** 2)
        losses.append(loss.item())

        # Check convergence
        diff = torch.max(torch.abs(w - epoch_start_w)).item()
    # END SOLUTION

    return w, losses

We can reuse the test code from the previous question, as we're again solving a multivariate linear regression problem, this time with a slightly different optimization algorithm. Same caveats apply.

In [None]:
# Test assertions
# Seed so both the random initial weights and the shuffles drawn inside
# problem5 are reproducible.
torch.manual_seed(42)
num_features = n_features + 1
X_train_t = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_t = torch.tensor(y_train.values, dtype=torch.float32)
w_minibatch, losses_minibatch = problem5(
    X_train_t,
    y_train_t,
    eta=0.01,
    initial_w=torch.randn(num_features, 1),
    epsilon=0.001,
    batch_size=512,
)

# Plot the loss curve
plt.plot(torch.arange(1, len(losses_minibatch) + 1), losses_minibatch, color="red")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training Loss over Epochs (Minibatch GD)")
plt.show()

print(f"Final training loss: {losses_minibatch[-1]}")

# Compute test loss
# Same augmentation (leading column of ones) and same halved mean squared
# error as used during training, applied to the held-out split.
X_test_t = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_t = torch.tensor(y_test.values, dtype=torch.float32)
X_test_augmented = torch.hstack((torch.ones((X_test_t.shape[0], 1)), X_test_t))
y_test_pred_mb = X_test_augmented @ w_minibatch
y_test_col = y_test_t.reshape((-1, 1))
test_loss_mb = 0.5 * torch.mean((y_test_col - y_test_pred_mb) ** 2)
print(f"Test loss: {test_loss_mb.item()}")

# Unlike the full-batch checks, no decrease between first and last loss is
# asserted here; only the final loss level is checked.
assert len(losses_minibatch) > 0, "Should return a non-empty loss history"
assert (
    losses_minibatch[-1] < 0.5
), f"Final training loss should be reasonable, got {losses_minibatch[-1]}"
print("All tests passed!")

# BEGIN HIDDEN TESTS
assert w_minibatch.shape == (
    num_features,
    1,
), f"Weight shape should be ({num_features}, 1)"
# Test with different batch size
# Re-seed so this run starts from the same random initial weights as above.
torch.manual_seed(42)
w_small_batch, _ = problem5(
    X_train_t,
    y_train_t,
    eta=0.01,
    initial_w=torch.randn(num_features, 1),
    epsilon=0.001,
    batch_size=64,
)
assert w_small_batch.shape == (num_features, 1), "Should work with smaller batch size"
# END HIDDEN TESTS