# DATASCI 315, Homework 4: Multiclass Logistic Regression

**Submission instructions:** Upon completion, run the entire notebook, export as HTML, and upload to Canvas.

In [None]:
import torch

### Problem 1: One-Hot Encodings for Multiclass Data

In Group Work 4, we explored logistic regression for binary classification. This homework extends logistic regression to more than two classes.

In [None]:
# Load the iris dataset from sklearn as torch tensors (the train/test split happens later, in Problem 4)
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Fetch the iris data and convert features/targets to torch tensors.
data_iris = load_iris()
X_iris = torch.tensor(data_iris["data"], dtype=torch.float32)
y_iris = torch.tensor(data_iris["target"], dtype=torch.long)

# Prepend a column of ones so the first weight plays the role of the
# intercept/bias. Note: n and p are read BEFORE the column is added, so p
# here counts only the raw feature columns.
n, p = X_iris.shape
ones = torch.ones(n, 1)
X_iris = torch.cat((ones, X_iris), dim=1)

Suppose we have a vector $y'$ with dimensions $n \times 1$ whose entries are drawn from $m$ distinct class labels, and that these labels are stored in a vector $l$ with dimensions $m \times 1$. The one-hot encoding of $y'$ is the $n \times m$ matrix $Y$ where $Y_{ij} = 1$ if $y'_i = l_j$ and $Y_{ij} = 0$ otherwise.

For example, if
$$
y' = \begin{bmatrix} \textrm{Sahana} \\
\textrm{Eduardo} \\
\textrm{Jake} \\
\textrm{Eduardo} \\
\textrm{Jake} \\
\textrm{Jake} \\
\textrm{Eduardo}
\end{bmatrix}
\quad \text{and} \quad
l = \begin{bmatrix} \textrm{Sahana} \\
\textrm{Eduardo} \\
\textrm{Jake}
\end{bmatrix},
$$
then
$$
Y = \begin{bmatrix} 1 & 0 & 0 \\
0 & 1 & 0\\
0 & 0 & 1\\
0 & 1 & 0\\
0 & 0 & 1\\
0 & 0 & 1\\
0 & 1 & 0
\end{bmatrix}.
$$

You can think of the first column as indicating whether $y'_i$ is Sahana, the second column as indicating whether $y'_i$ is Eduardo, and the third column as indicating whether $y'_i$ is Jake.

Write a function `one_hot_encoding(y_prime, labels)` that converts class labels to a one-hot encoding:
- **Input:** `y_prime` of shape $(n,)$ or $(n, 1)$, and `labels` (the vector $l$ above) of shape $(m,)$ containing the unique labels
- **Output:** `Y` of shape $(n, m)$

**Requirements:**
- Do not use `for` or `while` loops across observations (loops over the $m$ labels are acceptable since $m$ is typically small)
- Use PyTorch vectorization for efficiency

**Hint:** You may find [`torch.where`](https://pytorch.org/docs/stable/generated/torch.where.html) useful, or use `torch.nn.functional.one_hot` directly.

In [None]:
def one_hot_encoding(y_prime, labels):
    """Convert a vector of class labels into a one-hot indicator matrix.

    Args:
        y_prime: Tensor holding n class labels, of shape (n,) or (n, 1).
        labels: The m distinct label values; column j of the result
            corresponds to labels[j].

    Returns:
        A floating-point tensor of shape (n, m) whose (i, j) entry is 1 when
        y_prime[i] equals labels[j] and 0 otherwise.
    """
    # BEGIN SOLUTION
    # Flatten once so that (n,) and (n, 1) inputs are treated identically.
    flat_targets = y_prime.ravel()
    indicator = torch.zeros((len(y_prime), len(labels)))
    # The loop runs over the m labels only; each comparison is vectorized
    # across all n observations.
    for column in range(len(labels)):
        matches = torch.eq(flat_targets, labels[column])
        indicator[:, column] = matches.to(indicator.dtype)
    return indicator
    # END SOLUTION

In [None]:
# Test assertions
# Seed the RNG so the 10 sampled row indices below are reproducible.
torch.manual_seed(42)
# `labels` is also read by a later test cell (to derive the class count m).
labels = torch.arange(3)
# Reshape y_iris to a column vector, then pick 10 random rows (with
# replacement) using indices drawn from [0, 150).
# NOTE(review): 150 is presumably the number of rows in y_iris - confirm.
y_iris_check = y_iris.reshape((y_iris.shape[0], 1))[torch.randint(150, (10,))]
one_hot_y = one_hot_encoding(y_iris_check, labels)
assert one_hot_y.shape == (10, 3), f"Expected shape (10, 3), got {one_hot_y.shape}"
# Each observation has exactly one label, so every row must contain a single 1.
assert torch.allclose(one_hot_y.sum(dim=1), torch.ones(10)), "Each row should sum to 1"
print("All tests passed!")

# BEGIN HIDDEN TESTS
# Check exact rows on a small, fixed 1-D label vector (shape (n,) input).
# The seed call below does not influence these checks: no random numbers are
# drawn before the END HIDDEN TESTS marker.
torch.manual_seed(123)
y_test = torch.tensor([0, 1, 2, 0, 1])
one_hot_test = one_hot_encoding(y_test, torch.arange(3))
assert one_hot_test.shape == (5, 3), "Shape mismatch for 5 samples"
assert torch.equal(one_hot_test[0], torch.tensor([1.0, 0.0, 0.0])), "First row should be [1, 0, 0]"
assert torch.equal(one_hot_test[2], torch.tensor([0.0, 0.0, 1.0])), "Third row should be [0, 0, 1]"
# END HIDDEN TESTS

### Problem 2: Making Predictions with Multiclass Logistic Regression

A probabilistic classifier for multiclass data returns a length-$m$ vector of probabilities for each observation. Each entry of this vector is the probability of the observation belonging to the corresponding class. However, model outputs can take arbitrary values, so we need to transform these outputs so that they are nonnegative and sum to 1.

The **softmax** function provides this transformation. Suppose that $z_i = (z_{i1}, \ldots, z_{im})$ is the output of the model for data point $i$. The softmax function $\sigma$ is defined as:
$$
\sigma(z_i) = \left[
  \frac{e^{z_{i1}}}{\sum_{j=1}^m e^{z_{ij}}},
  \ldots,
  \frac{e^{z_{im}}}{\sum_{j=1}^m e^{z_{ij}}}
\right].
$$

In multiclass logistic regression, the probability that an observation with features $x$ belongs to class $j$ is:
$$
p(y = j \mid x) = \sigma(Wx)_j,
$$
where $W$ is an $m \times p$ matrix of parameters.

Write a function `lr_predict(X, W)` that computes class probabilities:
- **Input:** Design matrix $X$ of shape $(n, p)$ and parameter matrix $W$ of shape $(m, p)$
- **Output:** Probability matrix $\hat{P}$ of shape $(n, m)$ where $\hat{P}_{ij}$ is the probability that observation $i$ belongs to class $j$

**Requirements:** Use only PyTorch functions (not SciPy).

In [None]:
def lr_predict(features, weights):
    """Compute class probabilities for multiclass logistic regression.

    Args:
        features: Design matrix X of shape (n, p).
        weights: Parameter matrix W of shape (m, p).

    Returns:
        Tensor of shape (n, m). Row i is the softmax of the logits
        features[i] @ weights.T, so its entries are non-negative and sum to 1.
    """
    # BEGIN SOLUTION
    # Compute logits Z = features @ weights.T, then apply softmax.
    logits = features @ weights.T
    # Normalize in log space before exponentiating. Exponentiating the raw
    # logits overflows to inf once a logit is large, and inf / inf is NaN;
    # after subtracting logsumexp every exponent is <= 0, so exp stays in [0, 1].
    log_normalizer = torch.logsumexp(logits, dim=1, keepdim=True)
    return torch.exp(logits - log_normalizer)
    # END SOLUTION

In [None]:
# Test assertions
# Re-read the shape now that X_iris includes the intercept column, so p
# counts that column too.
n, p = X_iris.shape
# Number of classes, taken from the `labels` tensor defined in the Problem 1 test cell.
m = labels.shape[0]
# Seed before drawing W so the random weights are reproducible.
torch.manual_seed(42)
W = torch.randn(m, p)
P_hat = lr_predict(X_iris, W)
assert P_hat.shape == (n, m), f"Expected shape ({n}, {m}), got {P_hat.shape}"
# Each row is a probability distribution over the m classes.
assert torch.allclose(torch.sum(P_hat, dim=1), torch.ones(n)), "Probabilities should sum to 1"
assert torch.all(P_hat >= 0), "Probabilities should be non-negative"
print("All tests passed!")

# BEGIN HIDDEN TESTS
# Check probabilities are in valid range
assert torch.all(P_hat <= 1), "Probabilities should be at most 1"
# Test with identity weight matrix
# torch.eye(m, p) is an m x p matrix with ones on the main diagonal, so
# logit j of each row is simply column j of X_iris.
W_identity = torch.eye(m, p)
P_identity = lr_predict(X_iris[:5], W_identity)
assert P_identity.shape == (5, m), "Shape mismatch with identity weights"
assert torch.allclose(P_identity.sum(dim=1), torch.ones(5)), "Probabilities should sum to 1"
# END HIDDEN TESTS

### Problem 3: Numerically Stable Prediction of Log Probabilities

One can naively implement the loss function for multiclass logistic regression by taking the log of `P_hat` from `lr_predict`. However, this can be numerically unstable if `P_hat` contains entries that are nearly zero; applying a log to a variable that is not guaranteed to be strictly positive is inadvisable.

To avoid this instability, we compute log probabilities directly using the **log-sum-exp** function:
$$
\mathrm{logsumexp}(x_1, \ldots, x_k) = \log \left[ \sum_{j=1}^k \exp(x_j) \right].
$$
PyTorch provides `torch.logsumexp` which implements this in a numerically stable way.

Observe that for log-softmax:
$$
\log \sigma(z_i)_j = z_{ij} - \log\left(\sum_{k=1}^m \exp(z_{ik})\right) = z_{ij} - \mathrm{logsumexp}(z_{i1}, \ldots, z_{im}).
$$

Write a function `lr_predict_log(X, W)` that computes log probabilities:
- **Input:** Design matrix $X$ of shape $(n, p)$ and parameter matrix $W$ of shape $(m, p)$
- **Output:** Log-probability matrix of shape $(n, m)$

**Hint:** Use `torch.logsumexp` with the `dim` parameter.

In [None]:
def lr_predict_log(features, weights):
    """Compute log class probabilities for multiclass logistic regression.

    Args:
        features: Design matrix X of shape (n, p).
        weights: Parameter matrix W of shape (m, p).

    Returns:
        Tensor of shape (n, m) holding the log-softmax of the logits
        features @ weights.T, taken across the m classes of each row.
    """
    # BEGIN SOLUTION
    # log softmax(z)_j = z_j - logsumexp(z): working in log space avoids ever
    # taking the log of a probability that has underflowed to zero.
    scores = torch.matmul(features, weights.T)
    log_normalizer = torch.logsumexp(scores, dim=1, keepdim=True)
    return scores - log_normalizer
    # END SOLUTION

In [None]:
# Test assertions
# Reuses W, n and m from the Problem 2 test cell.
P_hat_log = lr_predict_log(X_iris, W)
assert P_hat_log.shape == (n, m), f"Expected shape ({n}, {m}), got {P_hat_log.shape}"
# Exponentiating the log probabilities must recover a distribution per row.
assert torch.allclose(
    torch.sum(torch.exp(P_hat_log), dim=1), torch.ones(n)
), "exp(log_probs) should sum to 1"
# Probabilities are at most 1, so their logs are at most 0.
assert torch.all(P_hat_log <= 0), "Log probabilities should be non-positive"
print("All tests passed!")

# BEGIN HIDDEN TESTS
# Verify consistency with lr_predict
P_hat_from_log = torch.exp(P_hat_log)
P_hat_direct = lr_predict(X_iris, W)
assert torch.allclose(P_hat_from_log, P_hat_direct), "exp(log_probs) should match probs"
# Test with extreme values to check numerical stability
# W_extreme is 3 x 5, so this requires X_iris to have 5 columns. Class j's
# logit is 100 times column j of X_iris; column 0 is the all-ones intercept,
# so that logit is 100 and exp(100) already overflows float32.
W_extreme = torch.tensor([[100.0, 0, 0, 0, 0], [0, 100.0, 0, 0, 0], [0, 0, 100.0, 0, 0]])
P_extreme_log = lr_predict_log(X_iris[:3], W_extreme)
assert not torch.any(torch.isnan(P_extreme_log)), "Should handle extreme values without NaN"
assert not torch.any(torch.isinf(P_extreme_log)), "Should handle extreme values without Inf"
# END HIDDEN TESTS

### Problem 4: Gradient Descent for Multiclass Logistic Regression

The **cross-entropy loss** for multiclass logistic regression is:
$$
L(W) = -\sum_{i=1}^n \sum_{j=1}^{m} Y_{ij} \log \hat{P}_{ij}
$$
where $\hat{P}$ is an $n \times m$ matrix containing the class probabilities (given the parameters $W$) and $Y$ is the one-hot encoding of the responses.

The **gradient** of this loss with respect to $W$ is:
$$
\nabla L(W) = (\hat{P} - Y)^\top X
$$

Implement gradient descent to find optimal weights $W$:
- **Input:**
  - $X$: design matrix of shape $(n, p)$
  - $y$: labels of shape $(n,)$
  - `eta`: learning rate
  - `initial_W`: initial weights of shape $(m, p)$
  - `epsilon`: convergence threshold
- **Output:** A tuple `(W, loss)` containing the optimized weights and final training loss

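**Convergence criterion:** Stop when $\|W_{\text{new}} - W_{\text{old}}\|_F < \epsilon$, where $\|\cdot\|_F$ is the Frobenius norm (the 2-norm of the flattened matrix, which is what `torch.linalg.norm` computes by default).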

**Requirements:**
- Use `lr_predict_log` for computing log probabilities (for numerical stability)
- Use only PyTorch (not scikit-learn)

In [None]:
def lr_train(features, y, eta, initial_weights, epsilon):
    """Fit multiclass logistic regression weights by gradient descent.

    Args:
        features: Design matrix X of shape (n, p).
        y: Class labels of shape (n,) (or (n, 1)), encoded as the integers
            0, ..., m - 1.
        eta: Learning rate (step size).
        initial_weights: Starting weight matrix of shape (m, p); it is not
            modified. Its row count determines the number of classes m.
        epsilon: Convergence threshold on the norm of one weight update.

    Returns:
        A tuple (weights, loss): the optimized (m, p) weight tensor and the
        cross-entropy training loss at those weights as a Python float.

    Raises:
        RuntimeError: If a weight update becomes NaN or infinite, in which
            case the convergence test could never succeed.
    """
    # BEGIN SOLUTION
    # Gradient descent: update weights until convergence.
    # Take the class count from the weight matrix rather than hard-coding 3,
    # so the one-hot targets always match the shape of the predictions.
    num_classes = initial_weights.shape[0]
    one_hot_y = one_hot_encoding(y, torch.arange(num_classes))

    weights = initial_weights.clone()
    while True:
        # No copy needed: the update below builds a new tensor instead of
        # modifying this one in place.
        weights_old = weights
        # Get probabilities by exponentiating the stable log probabilities,
        # so large logits cannot overflow to inf and turn the gradient into NaN.
        p_hat = torch.exp(lr_predict_log(features, weights))
        grad = (p_hat - one_hot_y).T @ features
        weights = weights_old - eta * grad
        step_norm = torch.linalg.norm(weights - weights_old)
        # A NaN/inf step never compares as < epsilon, so without this guard
        # the loop would spin forever.
        if not torch.isfinite(step_norm):
            raise RuntimeError("Gradient descent diverged: weight update is not finite")
        if step_norm < epsilon:
            loss = -torch.sum(lr_predict_log(features, weights) * one_hot_y)
            return weights, loss.item()
    # END SOLUTION

In [None]:
# Test assertions
# Seed torch so the random initial weights drawn below are reproducible.
torch.manual_seed(42)

# Split the data (using sklearn, then convert to torch)
# 30% of the rows are held out for testing; random_state fixes the split.
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
    X_iris.numpy(), y_iris.numpy(), test_size=0.3, random_state=42
)
X_train_iris = torch.tensor(X_train_np, dtype=torch.float32)
X_test_iris = torch.tensor(X_test_np, dtype=torch.float32)
y_train_iris = torch.tensor(y_train_np, dtype=torch.long)
y_test_iris = torch.tensor(y_test_np, dtype=torch.long)

# m and p come from the Problem 2 test cell (class count and column count of X_iris).
w_init = torch.randn(m, p)
eta = 1e-3
epsilon = 0.001
W_graddescent, training_loss = lr_train(X_train_iris, y_train_iris, eta, w_init, epsilon)
# The loss is a sum over training rows (not a mean). The two bounds bracket
# the value expected for this seed, step size and threshold.
assert training_loss < 15, f"Training loss {training_loss} is too high"
assert training_loss > 5, f"Training loss {training_loss} is suspiciously low"

# Evaluate the summed cross-entropy on the held-out rows with the same formula.
pred_y_test = lr_predict_log(X_test_iris, W_graddescent)
Y_test_onehot = one_hot_encoding(y_test_iris.reshape((y_test_iris.shape[0], 1)), torch.arange(m))
# test_loss is a 0-dim tensor here (no .item()), unlike training_loss.
test_loss = -torch.sum(pred_y_test * Y_test_onehot)
assert test_loss < 5, f"Test loss {test_loss} is too high"
assert test_loss > 1, f"Test loss {test_loss} is suspiciously low"
print("All tests passed!")

# BEGIN HIDDEN TESTS
# Check weight matrix shape
assert W_graddescent.shape == (m, p), f"Expected ({m}, {p}), got {W_graddescent.shape}"
# Check that predictions are valid probabilities
P_test = lr_predict(X_test_iris, W_graddescent)
assert torch.allclose(
    P_test.sum(dim=1), torch.ones(P_test.shape[0])
), "Probabilities should sum to 1"
# Check accuracy is reasonable
# Predict the most probable class per row and compare with the true labels.
predictions = torch.argmax(P_test, dim=1)
accuracy = torch.mean((predictions == y_test_iris).float())
assert accuracy > 0.8, f"Accuracy {accuracy} is too low for Iris dataset"
# END HIDDEN TESTS