In [3]:
import random
import numpy as np
from data_process import get_CIFAR10_data
from scipy.spatial import distance
from models import Perceptron, SVM, Softmax
from kaggle_submission import output_submission_csv
%matplotlib inline

# For auto-reloading external modules
# See http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

# Loading CIFAR-10

In the following cells we determine the number of images for each split and load the images.
<br /> 
TRAIN_IMAGES + VAL_IMAGES = (0, 50000]
, TEST_IMAGES = 10000

In [5]:
# You can change these numbers for experimentation
# For submission we will use the default values 
TRAIN_IMAGES = 40000
VAL_IMAGES = 10000

In [7]:
data = get_CIFAR10_data(TRAIN_IMAGES, VAL_IMAGES)
X_train_CIFAR, y_train_CIFAR = data['X_train'], data['y_train']
X_val_CIFAR, y_val_CIFAR = data['X_val'], data['y_val']
X_test_CIFAR, y_test_CIFAR = data['X_test'], data['y_test']
n_class_CIFAR = len(np.unique(y_test_CIFAR))

Convert the sets of images from dimensions of **(N, 3, 32, 32) -> (N, 3072)** where N is the number of images so that each **3x32x32** image is represented by a single vector.

In [10]:
X_train_CIFAR = np.reshape(X_train_CIFAR, (X_train_CIFAR.shape[0], -1))
X_val_CIFAR = np.reshape(X_val_CIFAR, (X_val_CIFAR.shape[0], -1))
X_test_CIFAR = np.reshape(X_test_CIFAR, (X_test_CIFAR.shape[0], -1))

### Get Accuracy

This function computes how well your model performs using accuracy as a metric.

In [14]:
def get_acc(pred, y_test):
    return np.sum(y_test == pred) / len(y_test) * 100

# Perceptron

Perceptron has 2 hyperparameters that you can experiment with:
- **Learning rate** - controls how much we change the current weights of the classifier during each update. We set it at a default value of 0.5, but you should experiment with different values. We recommend changing the learning rate by factors of 10 and observing how the performance of the classifier changes. You should also try adding a **decay** which slowly reduces the learning rate over each epoch.
- **Number of Epochs** - An epoch is a complete iterative pass over all of the data in the dataset. During an epoch we predict a label using the classifier and then update the weights of the classifier according to the perceptron update rule for each sample in the training set. You should try different values for the number of training epochs and report your results.

You will implement the Perceptron classifier in the **models/perceptron.py**

The following code: 
- Creates an instance of the Perceptron classifier class 
- The train function of the Perceptron class is trained on the training data
- We use the predict function to find the training accuracy as well as the testing accuracy


## Train Perceptron on CIFAR

In [50]:
import numpy as np
import pandas as pd  # Used for handling CSV creation for Kaggle submission
from tensorflow.keras.datasets import cifar10
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder  # To convert labels into one-hot encoding

# Activation Functions and their Derivatives
def relu(x):
    """
    ReLU activation function: returns the input if positive, otherwise returns 0.
    Useful for introducing non-linearity to the model.
    """
    return np.maximum(0, x)

def relu_derivative(x):
    """
    Derivative of ReLU function: returns 1 if input is positive, otherwise 0.
    Required for backpropagation to compute gradients.
    """
    return (x > 0).astype(float)

# Loss Functions
def cross_entropy_loss(y_true, logits):
    """
    Computes cross-entropy loss given the true labels and the predicted logits.
    Cross-entropy loss is used in classification tasks where the output is a probability distribution.
    
    Parameters:
    - y_true: The true labels (one-hot encoded).
    - logits: The raw predicted scores before softmax.

    Returns:
    - loss: The average cross-entropy loss across all samples.
    """
    m = y_true.shape[0]  # Number of samples
    log_likelihood = -np.log(np.exp(logits[range(m), y_true.argmax(axis=1)]) / np.sum(np.exp(logits), axis=1))
    return np.sum(log_likelihood) / m  # Average loss over all samples

def cross_entropy_loss_derivative(y_true, logits):
    """
    Derivative of cross-entropy loss with respect to logits. 
    Used to calculate gradients during backpropagation.

    Parameters:
    - y_true: The true labels (one-hot encoded).
    - logits: The raw predicted scores before softmax.

    Returns:
    - Derivative of the loss with respect to the logits.
    """
    m = y_true.shape[0]
    exp_logits = np.exp(logits)  # Exponentiate the logits
    probs = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)  # Convert logits to probabilities
    return probs - y_true  # Subtract the one-hot encoded true labels from the probabilities

# Accuracy Calculation
def get_acc(predictions, true_labels):
    """
    Computes the accuracy of the predictions.

    Parameters:
    - predictions: The predicted class indices.
    - true_labels: The true labels (one-hot encoded).

    Returns:
    - accuracy: The percentage of correct predictions.
    """
    return np.mean(predictions == np.argmax(true_labels, axis=1)) * 100

# MLP (Multi-Layer Perceptron) Class Definition
class MLP:
    def __init__(self, input_size, hidden_layer_sizes, output_size, lr=0.01):
        """
        Initializes the MLP with random weights and zero biases.
        
        Parameters:
        - input_size: The number of input features (flattened image size).
        - hidden_layer_sizes: A list of sizes for hidden layers.
        - output_size: The number of output classes.
        - lr: The learning rate for gradient descent.
        """
        self.lr = lr  # Learning rate
        # Initialize weights for each layer, small random values
        self.weights = [
            np.random.randn(input_size, hidden_layer_sizes[0]) * 0.01,  # Weights from input to first hidden layer
            np.random.randn(hidden_layer_sizes[0], hidden_layer_sizes[1]) * 0.01,  # Weights between hidden layers
            np.random.randn(hidden_layer_sizes[1], output_size) * 0.01  # Weights from last hidden to output layer
        ]
        # Initialize biases for each layer as zero
        self.biases = [
            np.zeros((1, hidden_layer_sizes[0])),  # Bias for the first hidden layer
            np.zeros((1, hidden_layer_sizes[1])),  # Bias for the second hidden layer
            np.zeros((1, output_size))  # Bias for the output layer
        ]

    def forward(self, X):
        """
        Performs the forward pass through the network.
        
        Parameters:
        - X: Input data.

        Returns:
        - logits: The raw scores from the output layer (before softmax).
        """
        # First hidden layer
        self.z1 = np.dot(X, self.weights[0]) + self.biases[0]
        self.a1 = relu(self.z1)  # Apply ReLU activation
        # Second hidden layer
        self.z2 = np.dot(self.a1, self.weights[1]) + self.biases[1]
        self.a2 = relu(self.z2)  # Apply ReLU activation
        # Output layer (logits)
        self.z3 = np.dot(self.a2, self.weights[2]) + self.biases[2]
        return self.z3  # Raw class scores (logits)

    def backward(self, X, y_true, logits):
        """
        Performs the backward pass (backpropagation) to update weights and biases.
        
        Parameters:
        - X: Input data.
        - y_true: True labels (one-hot encoded).
        - logits: Raw scores from the forward pass.
        """
        m = y_true.shape[0]  # Number of samples
        
        # Calculate gradients for the output layer
        dz3 = cross_entropy_loss_derivative(y_true, logits)  # Derivative of loss w.r.t. logits
        dw3 = np.dot(self.a2.T, dz3) / m  # Gradient of weights between hidden2 and output
        db3 = np.sum(dz3, axis=0, keepdims=True) / m  # Gradient of biases for output layer

        # Calculate gradients for the second hidden layer
        dz2 = np.dot(dz3, self.weights[2].T) * relu_derivative(self.z2)  # Derivative through ReLU
        dw2 = np.dot(self.a1.T, dz2) / m  # Gradient of weights between hidden1 and hidden2
        db2 = np.sum(dz2, axis=0, keepdims=True) / m  # Gradient of biases for hidden2

        # Calculate gradients for the first hidden layer
        dz1 = np.dot(dz2, self.weights[1].T) * relu_derivative(self.z1)  # Derivative through ReLU
        dw1 = np.dot(X.T, dz1) / m  # Gradient of weights between input and hidden1
        db1 = np.sum(dz1, axis=0, keepdims=True) / m  # Gradient of biases for hidden1

        # Update weights and biases using gradients
        self.weights[2] -= self.lr * dw3  # Update weights for hidden2 -> output
        self.biases[2] -= self.lr * db3  # Update biases for output layer
        self.weights[1] -= self.lr * dw2  # Update weights for hidden1 -> hidden2
        self.biases[1] -= self.lr * db2  # Update biases for hidden2 layer
        self.weights[0] -= self.lr * dw1  # Update weights for input -> hidden1
        self.biases[0] -= self.lr * db1  # Update biases for hidden1 layer

    def train(self, X_train, y_train, epochs=30, batch_size=64):
        """
        Trains the MLP using mini-batch gradient descent.

        Parameters:
        - X_train: The input training data.
        - y_train: The true training labels (one-hot encoded).
        - epochs: The number of training epochs.
        - batch_size: The size of each mini-batch for gradient descent.
        """
        for epoch in range(epochs):
            # Shuffle data at the start of each epoch
            indices = np.arange(X_train.shape[0])
            np.random.shuffle(indices)
            X_train = X_train[indices]
            y_train = y_train[indices]

            # Mini-batch gradient descent
            for i in range(0, X_train.shape[0], batch_size):
                X_batch = X_train[i:i + batch_size]
                y_batch = y_train[i:i + batch_size]
                logits = self.forward(X_batch)
                self.backward(X_batch, y_batch, logits)

            # Calculate loss for the entire training set at the end of the epoch
            logits = self.forward(X_train)
            loss = cross_entropy_loss(y_train, logits)
            print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss:.4f}')  # Print the loss after every epoch

    def predict(self, X):
        """
        Predicts class labels for input data by forwarding through the network.

        Parameters:
        - X: Input data.

        Returns:
        - Predicted class labels.
        """
        logits = self.forward(X)  # Get raw scores
        return np.argmax(logits, axis=1)  # Return the class with the highest score

# Data Preparation
(X_train, y_train), (X_test, y_test) = cifar10.load_data()  # Load CIFAR-10 dataset
encoder = OneHotEncoder(sparse_output=False)  # Create a one-hot encoder for labels
y_train = encoder.fit_transform(y_train)  # Transform training labels into one-hot encoded vectors
y_test = encoder.transform(y_test)  # Transform test labels into one-hot encoded vectors

# Normalize the image data (pixel values between 0 and 1)
X_train = X_train.astype('float32') / 255
X_test = X_test.astype('float32') / 255
X_train = X_train.reshape(X_train.shape[0], -1)  # Flatten the images for input into the MLP
X_test = X_test.reshape(X_test.shape[0], -1)  # Flatten the images for test set

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Initialize and train the MLP model
mlp = MLP(input_size=32*32*3, hidden_layer_sizes=[512, 256], output_size=10, lr=0.01)
mlp.train(X_train, y_train, epochs=30, batch_size=64)  # Train the model

# Training Accuracy
pred_percept = mlp.predict(X_train)  # Get predictions for the training set
print('The training accuracy is given by: %f' % get_acc(pred_percept, y_train))  # Print the training accuracy

# Validation Accuracy
pred_percept = mlp.predict(X_val)  # Get predictions for the validation set
print('The validation accuracy is given by: %f' % get_acc(pred_percept, y_val))  # Print the validation accuracy

# Test Accuracy
pred_percept = mlp.predict(X_test)  # Get predictions for the test set
print('The testing accuracy is given by: %f' % get_acc(pred_percept, y_test))  # Print the testing accuracy

# Kaggle Submission
def output_submission_csv(filename, predictions):
    """
    Saves predictions to a CSV file for Kaggle submission.

    Parameters:
    - filename: The name of the CSV file.
    - predictions: The predicted class labels for the test data.
    """
    df = pd.DataFrame({
        'id': np.arange(0, len(predictions)),  # Create an 'id' column starting from 0
        'category': predictions  # The predicted category labels
    })
    df.to_csv(filename, index=False)  # Save the DataFrame to a CSV file

output_submission_csv('perceptron_cifar_submission.csv', pred_percept)  # Save the predictions to a CSV file


Epoch 1/30, Loss: 2.2779
Epoch 2/30, Loss: 2.0966
Epoch 3/30, Loss: 2.0395
Epoch 4/30, Loss: 1.9566
Epoch 5/30, Loss: 1.9063
Epoch 6/30, Loss: 1.8650
Epoch 7/30, Loss: 1.8305
Epoch 8/30, Loss: 1.7863
Epoch 9/30, Loss: 1.7497
Epoch 10/30, Loss: 1.7144
Epoch 11/30, Loss: 1.6888
Epoch 12/30, Loss: 1.6795
Epoch 13/30, Loss: 1.6364
Epoch 14/30, Loss: 1.6232
Epoch 15/30, Loss: 1.5920
Epoch 16/30, Loss: 1.5615
Epoch 17/30, Loss: 1.5446
Epoch 18/30, Loss: 1.5230
Epoch 19/30, Loss: 1.4935
Epoch 20/30, Loss: 1.5064
Epoch 21/30, Loss: 1.5254
Epoch 22/30, Loss: 1.4823
Epoch 23/30, Loss: 1.4329
Epoch 24/30, Loss: 1.4594
Epoch 25/30, Loss: 1.4271
Epoch 26/30, Loss: 1.4012
Epoch 27/30, Loss: 1.4028
Epoch 28/30, Loss: 1.3733
Epoch 29/30, Loss: 1.3509
Epoch 30/30, Loss: 1.3502
The training accuracy is given by: 52.342500
The validation accuracy is given by: 48.300000
The testing accuracy is given by: 49.390000


### Validate Perceptron on CIFAR

### Test Perceptron on CIFAR

### Perceptron_CIFAR Kaggle Submission

Once you are satisfied with your solution and test accuracy, output a file to submit your test set predictions to the Kaggle for Assignment 1 CIFAR. Use the following code to do so:

# Support Vector Machines (with SGD)

Next, you will implement a "soft margin" SVM. In this formulation you will maximize the margin between positive and negative training examples and penalize margin violations using a hinge loss.

We will optimize the SVM loss using SGD. This means you must compute the loss function with respect to model weights. You will use this gradient to update the model weights.

SVM optimized with SGD has 3 hyperparameters that you can experiment with:
- **Learning rate** - similar to as defined above in Perceptron, this parameter scales by how much the weights are changed according to the calculated gradient update. 
- **Epochs** - similar to as defined above in Perceptron.
- **Regularization constant** - Hyperparameter to determine the strength of regularization. In this case it is a coefficient on the term which maximizes the margin. You could try different values. The default value is set to 0.05.

You will implement the SVM using SGD in the **models/svm.py**

The following code: 
- Creates an instance of the SVM classifier class 
- The train function of the SVM class is trained on the training data
- We use the predict function to find the training accuracy as well as the testing accuracy

## Train SVM on CIFAR

In [24]:
import numpy as np
from tensorflow.keras.datasets import cifar10
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Define the SVM class
# This class implements a simple Support Vector Machine (SVM) using gradient descent optimization.
class SVM:
    def __init__(self, n_class: int, lr: float, epochs: int, reg_const: float):
        """
        Initialize the SVM classifier.
        
        Parameters:
        - n_class: The number of unique classes (CIFAR-10 has 10 classes).
        - lr: The learning rate, which controls how much to adjust weights with each update.
        - epochs: The number of times to loop over the entire training data (full passes).
        - reg_const: The regularization constant, which helps to prevent overfitting by penalizing large weights.
        """
        self.n_class = n_class  # Store number of classes
        self.lr = lr  # Store learning rate
        self.epochs = epochs  # Store number of epochs (iterations)
        self.reg_const = reg_const  # Store regularization constant
        self.weights = []  # This will hold weights for each class

    def calc_gradient(self, X_train: np.ndarray, y_train: np.ndarray, w: np.ndarray) -> np.ndarray:
        """
        Calculate the gradient of the SVM hinge loss with respect to the weights.

        Parameters:
        - X_train: Training data (N samples, D features).
        - y_train: Training labels (binary labels for current class).
        - w: The weights for the current class.

        Returns:
        - gradient: Gradient of the hinge loss with respect to the weights.
        """
        # Margin is the distance from the decision boundary
        margin = y_train * np.dot(X_train, w)  # Compute the margin for the current class
        
        # Only compute the gradient for samples where the margin is less than 1 (i.e., misclassified or too close)
        indicator = margin < 1  # Binary mask: True if the sample is within the margin or on the wrong side
        
        # Initialize the gradient (same shape as weights)
        gradient = np.zeros(w.shape)
        
        # Loop over all samples in the training set
        for i, xi in enumerate(X_train):
            # If the sample is misclassified or within the margin, update the gradient
            if indicator[i]:
                gradient -= y_train[i] * xi  # Subtract the feature vector scaled by the label

        # Divide by the number of samples to normalize and add regularization to the gradient
        gradient = gradient / X_train.shape[0] + self.reg_const * w
        return gradient

    def train(self, X_train: np.ndarray, y_train: np.ndarray):
        """
        Train the SVM model using batch gradient descent.

        Parameters:
        - X_train: Training data (N samples, D features).
        - y_train: Training labels (N samples, integer labels from 0 to n_class-1).
        """
        n_samples, n_features = X_train.shape  # Get the number of samples and features
        
        # Train one classifier per class (One-vs-All approach)
        for class_label in range(self.n_class):
            # Create binary labels: +1 for the current class, -1 for all other classes
            y_train_binary = np.where(y_train == class_label, 1, -1)
            
            # Initialize the weights for the current class to zeros
            w = np.zeros(n_features)
            
            # Perform gradient descent for the specified number of epochs
            for epoch in range(self.epochs):
                # Compute the gradient of the loss function for the current class
                gradient = self.calc_gradient(X_train, y_train_binary, w)
                
                # Update the weights using the computed gradient
                w -= self.lr * gradient
                
                # Optionally, compute the hinge loss for monitoring the progress
                if epoch % 10 == 0:
                    loss = self.hinge_loss(X_train, y_train_binary, w)
                    print(f"Epoch {epoch}/{self.epochs}, Class {class_label}, Loss: {loss:.4f}")

            # After training, store the final weights for the current class
            self.weights.append(w)

    def hinge_loss(self, X_train: np.ndarray, y_train: np.ndarray, w: np.ndarray) -> float:
        """
        Compute the hinge loss for the given data and weights.

        Parameters:
        - X_train: Training data.
        - y_train: Binary labels for the current class.
        - w: Weights for the current class.

        Returns:
        - hinge_loss: The computed hinge loss for the current class.
        """
        # Compute the margin (distance from the decision boundary)
        margin = y_train * np.dot(X_train, w)
        
        # Hinge loss is zero for correct classifications (margin >= 1) and positive for incorrect ones
        hinge_loss = np.maximum(0, 1 - margin)
        
        # The total loss is the average hinge loss plus the regularization penalty
        return np.mean(hinge_loss) + 0.5 * self.reg_const * np.dot(w, w)

    def predict(self, X_test: np.ndarray) -> np.ndarray:
        """
        Predict the class labels for the input test data.

        Parameters:
        - X_test: Test data (N samples, D features).

        Returns:
        - predictions: Predicted class labels for each test sample.
        """
        # Initialize an array to hold decision values (scores) for each class
        predictions = np.zeros((X_test.shape[0], self.n_class))
        
        # For each class, compute the decision value using the learned weights
        for class_label, w in enumerate(self.weights):
            predictions[:, class_label] = np.dot(X_test, w)
        
        # The predicted class is the one with the highest score
        return np.argmax(predictions, axis=1)


# Load CIFAR-10 dataset
# CIFAR-10 contains 60,000 32x32 color images in 10 different classes.
(X_train, y_train), (X_test, y_test) = cifar10.load_data()
y_train = y_train.flatten()  # Flatten the labels array to a 1D vector
y_test = y_test.flatten()

# Preprocess the data
# We need to flatten the images from 32x32x3 (3 color channels) into 1D vectors
X_train = X_train.reshape(X_train.shape[0], -1).astype('float32')  # Flatten the images and convert to float
X_test = X_test.reshape(X_test.shape[0], -1).astype('float32')

# Standardize the data
# Subtract the mean and divide by the standard deviation so that all features have zero mean and unit variance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Split the training set into a smaller training set and a validation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Initialize and train the SVM model
n_classes = 10  # CIFAR-10 has 10 different classes
lr = 0.001  # Learning rate for gradient descent
epochs = 60  # Number of epochs (iterations) to train the model
reg_const = 0.01  # Regularization constant to penalize large weights

# Create an SVM model with the specified parameters
svm_model = SVM(n_class=n_classes, lr=lr, epochs=epochs, reg_const=reg_const)

# Train the model on the training data
svm_model.train(X_train, y_train)

# Function to compute the accuracy of the predictions
def calculate_accuracy(y_pred, y_true):
    """
    Calculate the accuracy of the predictions compared to the true labels.

    Parameters:
    - y_pred: Predicted class labels.
    - y_true: True class labels.

    Returns:
    - accuracy: The percentage of correctly classified samples.
    """
    return np.mean(y_pred == y_true) * 100

# Evaluate the model on the training set
y_train_pred = svm_model.predict(X_train)
train_acc = calculate_accuracy(y_train_pred, y_train)
print(f"Training accuracy: {train_acc:.2f}%")

# Evaluate the model on the validation set
y_val_pred = svm_model.predict(X_val)
val_acc = calculate_accuracy(y_val_pred, y_val)
print(f"Validation accuracy: {val_acc:.2f}%")

# Evaluate the model on the test set
y_test_pred = svm_model.predict(X_test)
test_acc = calculate_accuracy(y_test_pred, y_test)
print(f"Test accuracy: {test_acc:.2f}%")


Epoch 0/60, Class 0, Loss: 0.9815
Epoch 10/60, Class 0, Loss: 0.9040
Epoch 20/60, Class 0, Loss: 0.8907
Epoch 30/60, Class 0, Loss: 0.8843
Epoch 40/60, Class 0, Loss: 0.8812
Epoch 50/60, Class 0, Loss: 0.8793
Epoch 0/60, Class 1, Loss: 0.9956
Epoch 10/60, Class 1, Loss: 0.9527
Epoch 20/60, Class 1, Loss: 0.9289
Epoch 30/60, Class 1, Loss: 0.9159
Epoch 40/60, Class 1, Loss: 0.9074
Epoch 50/60, Class 1, Loss: 0.9016
Epoch 0/60, Class 2, Loss: 0.9988
Epoch 10/60, Class 2, Loss: 0.9864
Epoch 20/60, Class 2, Loss: 0.9740
Epoch 30/60, Class 2, Loss: 0.9639
Epoch 40/60, Class 2, Loss: 0.9574
Epoch 50/60, Class 2, Loss: 0.9524
Epoch 0/60, Class 3, Loss: 0.9968
Epoch 10/60, Class 3, Loss: 0.9656
Epoch 20/60, Class 3, Loss: 0.9560
Epoch 30/60, Class 3, Loss: 0.9513
Epoch 40/60, Class 3, Loss: 0.9478
Epoch 50/60, Class 3, Loss: 0.9449
Epoch 0/60, Class 4, Loss: 0.9943
Epoch 10/60, Class 4, Loss: 0.9494
Epoch 20/60, Class 4, Loss: 0.9396
Epoch 30/60, Class 4, Loss: 0.9335
Epoch 40/60, Class 4, Los

### Validate SVM on CIFAR

### Test SVM on CIFAR

### SVM_CIFAR Kaggle Submission

Once you are satisfied with your solution and test accuracy output a file to submit your test set predictions to the Kaggle for Assignment 1 CIFAR. Use the following code to do so:

# Softmax Classifier (with SGD)

Next, you will train a Softmax classifier. This classifier consists of a linear function of the input data followed by a softmax function which outputs a vector of dimension C (number of classes) for each data point. Each entry of the softmax output vector corresponds to a confidence in one of the C classes, and like a probability distribution, the entries of the output vector sum to 1. We use a cross-entropy loss on this sotmax output to train the model. 

Check the following link as an additional resource on softmax classification: http://cs231n.github.io/linear-classify/#softmax

Once again we will train the classifier with SGD. This means you need to compute the gradients of the softmax cross-entropy loss function according to the weights and update the weights using this gradient. Check the following link to help with implementing the gradient updates: https://deepnotes.io/softmax-crossentropy

The softmax classifier has 3 hyperparameters that you can experiment with:
- **Learning rate** - As above, this controls how much the model weights are updated with respect to their gradient.
- **Number of Epochs** - As described for perceptron.
- **Regularization constant** - Hyperparameter to determine the strength of regularization. In this case, we minimize the L2 norm of the model weights as regularization, so the regularization constant is a coefficient on the L2 norm in the combined cross-entropy and regularization objective.

You will implement a softmax classifier using SGD in the **models/softmax.py**

The following code: 
- Creates an instance of the Softmax classifier class 
- The train function of the Softmax class is trained on the training data
- We use the predict function to find the training accuracy as well as the testing accuracy

## Train Softmax on CIFAR

In [48]:
import numpy as np
import pandas as pd  # Import pandas to handle CSV export for Kaggle submission
from tensorflow.keras.datasets import cifar10
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Define the SoftmaxClassifier class
class SoftmaxClassifier:
    def __init__(self, n_classes: int, lr: float, epochs: int, reg_const: float):
        """
        Initialize the Softmax Classifier.
        
        Parameters:
        - n_classes: The number of classes in the dataset (CIFAR-10 has 10 classes).
        - lr: Learning rate for gradient descent (how much we adjust weights at each step).
        - epochs: Number of iterations (complete passes through the dataset).
        - reg_const: Regularization constant to prevent overfitting by penalizing large weights.
        """
        self.n_classes = n_classes  # Store number of classes
        self.lr = lr  # Store learning rate
        self.epochs = epochs  # Store the number of training epochs
        self.reg_const = reg_const  # Store regularization constant (to avoid overfitting)
        self.weights = None  # Initialize weights later when we know input dimensions

    def softmax(self, logits):
        """
        Apply softmax function to convert logits (raw scores) into probabilities.

        Parameters:
        - logits: Raw predicted scores from the model.
        
        Returns:
        - probabilities: Normalized probabilities for each class.
        """
        exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))  # Normalize for numerical stability
        return exp_logits / np.sum(exp_logits, axis=1, keepdims=True)  # Compute probabilities using softmax

    def cross_entropy_loss(self, softmax_probs, y_true):
        """
        Compute the cross-entropy loss, which is the standard loss function for classification tasks.
        
        Parameters:
        - softmax_probs: Probabilities from the softmax function for each class.
        - y_true: True labels for the data.
        
        Returns:
        - loss: The computed cross-entropy loss.
        """
        m = y_true.shape[0]  # Number of samples
        log_likelihood = -np.log(softmax_probs[range(m), y_true])  # Calculate log-likelihood for the correct class
        loss = np.sum(log_likelihood) / m  # Average loss over all samples
        return loss

    def compute_gradients(self, X, softmax_probs, y_true):
        """
        Compute the gradient of the loss function with respect to the weights.
        
        Parameters:
        - X: Input data.
        - softmax_probs: Probabilities output by the softmax function.
        - y_true: True labels for the data.
        
        Returns:
        - grad: Gradient of the loss with respect to the weights.
        """
        m = X.shape[0]  # Number of samples
        softmax_probs[range(m), y_true] -= 1  # Subtract 1 from the probability of the correct class
        grad = np.dot(X.T, softmax_probs) / m  # Compute the gradient of the weights
        return grad

    def train(self, X_train, y_train):
        """
        Train the Softmax Classifier using gradient descent.
        
        Parameters:
        - X_train: The training data.
        - y_train: The true labels for the training data.
        """
        n_samples, n_features = X_train.shape  # Number of training samples and input features
        
        # Initialize the weights to small random values
        self.weights = np.zeros((n_features, self.n_classes))

        # Loop through the number of epochs to train the model
        for epoch in range(self.epochs):
            # Step 1: Forward pass - Compute logits (raw class scores)
            logits = np.dot(X_train, self.weights)
            softmax_probs = self.softmax(logits)  # Apply softmax to get probabilities
            
            # Step 2: Compute loss - Cross-entropy loss + regularization penalty
            loss = self.cross_entropy_loss(softmax_probs, y_train)
            loss += self.reg_const * np.sum(self.weights ** 2) / 2  # Regularization to penalize large weights

            # Step 3: Backward pass - Compute gradients and update weights
            grad = self.compute_gradients(X_train, softmax_probs, y_train)
            grad += self.reg_const * self.weights  # Add regularization gradient
            self.weights -= self.lr * grad  # Update weights using gradient descent

            # Optionally print the loss every 10 epochs to monitor progress
            if epoch % 10 == 0:
                print(f"Epoch {epoch}/{self.epochs}, Loss: {loss:.4f}")

    def predict(self, X):
        """
        Make predictions on new data by calculating logits and returning the class with the highest score.
        
        Parameters:
        - X: Input data for prediction.
        
        Returns:
        - predicted_labels: Predicted class labels for each input sample.
        """
        logits = np.dot(X, self.weights)  # Compute the raw class scores (logits)
        softmax_probs = self.softmax(logits)  # Apply softmax to get class probabilities
        return np.argmax(softmax_probs, axis=1)  # Return the class with the highest probability


# Load CIFAR-10 dataset
# CIFAR-10 contains 60,000 images of size 32x32 in 10 different classes
(X_train, y_train), (X_test, y_test) = cifar10.load_data()  # Load training and testing data
y_train = y_train.flatten()  # Flatten the labels to a 1D array
y_test = y_test.flatten()

# Preprocess data (flatten images and standardize features)
X_train = X_train.reshape(X_train.shape[0], -1).astype('float32')  # Flatten the images
X_test = X_test.reshape(X_test.shape[0], -1).astype('float32')

# Standardize features (zero mean, unit variance)
scaler = StandardScaler()  # Create a StandardScaler instance
X_train = scaler.fit_transform(X_train)  # Standardize the training data
X_test = scaler.transform(X_test)  # Standardize the test data using the same scaling

# Split training set into a smaller training set and a validation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Initialize and train the Softmax classifier
n_classes = 10  # CIFAR-10 has 10 classes
lr = 0.01  # Learning rate
epochs = 100  # Number of epochs to train
reg_const = 0.001  # Regularization constant

# Create an instance of the SoftmaxClassifier
softmax_model = SoftmaxClassifier(n_classes=n_classes, lr=lr, epochs=epochs, reg_const=reg_const)

# Train the softmax model on the training data
softmax_model.train(X_train, y_train)

# Function to compute the accuracy of predictions
def calculate_accuracy(y_pred, y_true):
    """
    Calculate accuracy of model predictions.
    
    Parameters:
    - y_pred: Predicted labels.
    - y_true: True labels.
    
    Returns:
    - accuracy: The percentage of correct predictions.
    """
    return np.mean(y_pred == y_true) * 100  # Calculate percentage of correct predictions

# Evaluate the model on the training set
y_train_pred = softmax_model.predict(X_train)
train_acc = calculate_accuracy(y_train_pred, y_train)
print(f"Training accuracy: {train_acc:.2f}%")

# Evaluate the model on the validation set
y_val_pred = softmax_model.predict(X_val)
val_acc = calculate_accuracy(y_val_pred, y_val)
print(f"Validation accuracy: {val_acc:.2f}%")

# Evaluate the model on the test set
y_test_pred = softmax_model.predict(X_test)
test_acc = calculate_accuracy(y_test_pred, y_test)
print(f"Test accuracy: {test_acc:.2f}%")


# Function to save test predictions to a CSV file
def save_predictions_to_csv(predictions, filename="softmax_cifar_submission.csv"):
    """
    Save the test predictions to a CSV file for submission.

    Parameters:
    - predictions: Predicted class labels for the test data.
    - filename: The name of the CSV file to save the results.
    """
    # Create a DataFrame with 'id' starting from 0 and 'category' as the predicted label
    df = pd.DataFrame({
        'id': np.arange(0, len(predictions)),  # Create an ID column starting from 0
        'category': predictions  # Assign predicted labels to the 'category' column
    })
    
    # Save the DataFrame to a CSV file
    df.to_csv(filename, index=False)
    print(f"Predictions saved to {filename}")

# Save the test predictions to a CSV file for submission
save_predictions_to_csv(y_test_pred)


Epoch 0/100, Loss: 2.3026
Epoch 10/100, Loss: 1.9415
Epoch 20/100, Loss: 1.8799
Epoch 30/100, Loss: 1.8484
Epoch 40/100, Loss: 1.8280
Epoch 50/100, Loss: 1.8131
Epoch 60/100, Loss: 1.8015
Epoch 70/100, Loss: 1.7920
Epoch 80/100, Loss: 1.7841
Epoch 90/100, Loss: 1.7773
Training accuracy: 40.06%
Validation accuracy: 39.14%
Test accuracy: 39.19%
Predictions saved to softmax_cifar_submission.csv


### Validate Softmax on CIFAR

### Testing Softmax on CIFAR

### Softmax_CIFAR Kaggle Submission

Once you are satisfied with your solution and test accuracy output a file to submit your test set predictions to the Kaggle for Assignment 1 CIFAR. Use the following code to do so: