In [1]:
import numpy as np


# Generate some example data
X = 2 * np.random.rand(100, 2)
y = (X[:, 0] + X[:, 1] > 2).astype(int)  # Example: classify based on sum > 2
X = np.c_[np.ones((100, 1)), X] # Add bias term (x0 = 1)
y = (y + (np.random.rand(100) > 0.5).astype(int)) % 2 # Make it binary, but add some noise


In [2]:
# One-hot encode the target labels for Softmax
num_classes = 2
y_one_hot = np.zeros((len(y), num_classes))
y_one_hot[np.arange(len(y)), y] = 1

# Split into training and validation sets
split_ratio = 0.8
split_index = int(len(X) * split_ratio)
X_train, X_val = X[:split_index], X[split_index:]
y_train, y_one_hot_val = y_one_hot[:split_index], y_one_hot[split_index:]


In [3]:
# Batch Gradient Descent parameters
learning_rate = 0.1
n_epochs = 1000
batch_size = len(X_train) # Full batch
m = batch_size
best_loss = np.inf # Correctly using np.inf
best_theta = None
patience = 50
patience_counter = 0

# Initialize theta (weights) randomly
theta = np.random.randn(X_train.shape[1], num_classes) # Correct initialization

# Softmax function
def softmax(logits):
    exp_logits = np.exp(logits)
    return exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

# Cross-entropy loss
def cross_entropy_loss(y_true, y_pred):
    return -np.mean(np.sum(y_true * np.log(y_pred + 1e-10), axis=1))


In [4]:
# Training loop
for epoch in range(n_epochs):
    # Compute logits
    logits = X_train.dot(theta)
    # Compute predictions using softmax
    y_pred = softmax(logits)
    # Compute gradient
    error = y_pred - y_train
    gradient = (1/m) * X_train.T.dot(error)
    # Update weights
    theta -= learning_rate * gradient

    # Early stopping check on validation set
    val_logits = X_val.dot(theta)
    val_y_pred = softmax(val_logits)
    val_loss = cross_entropy_loss(y_one_hot_val, val_y_pred)

    if val_loss < best_loss:
        best_loss = val_loss
        best_theta = theta
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch}!")
            break

print("Training finished.")
if best_theta is not None:
    print("Final weights (theta):")
    print(best_theta)


Early stopping at epoch 99!
Training finished.
Final weights (theta):
[[-0.19917553 -1.04924409]
 [ 0.13365938 -0.22472968]
 [-0.71596593  0.31697102]]


In [5]:
# Example prediction
X_new = np.array([[1, 1.5, 1.5]]) # Use the same bias term format
probabilities = softmax(X_new.dot(best_theta))
prediction = np.argmax(probabilities, axis=1)
print(f"Prediction for X_new: {prediction}, probabilities: {probabilities}")


Prediction for X_new: [1], probabilities: [[0.45964962 0.54035038]]
