In [1]:
# Problem 1: Classification of Fully Connected Layers
class FC:
    """
    Fully connected layer from n_nodes1 to n_nodes2 nodes
    Parameters
    ----------
    n_nodes1 : int
        Number of nodes in the previous layer
    n_nodes2 : int
        Number of nodes in the next layer
    initializer : instance of initialization method
    optimizer : instance of optimization method
    """
    def __init__(self, n_nodes1, n_nodes2, initializer, optimizer):
        self.optimizer = optimizer
        # Initialization
        # Use initializer's methods to initialize self.W and self.B
        self.W = initializer.W(n_nodes1, n_nodes2)
        self.B = initializer.B(n_nodes2)
        self.n_nodes1 = n_nodes1
        self.n_nodes2 = n_nodes2
        
    def forward(self, X):
        """
        Forward pass
        Parameters
        ----------
        X : ndarray of shape (batch_size, n_nodes1)
            Input
        Returns
        -------
        A : ndarray of shape (batch_size, n_nodes2)
            Output
        """
        self.X = X  # Store input for backpropagation
        # A = X·W + B
        A = X @ self.W + self.B
        return A
        
    def backward(self, dA):
        """
        Backward pass
        Parameters
        ----------
        dA : ndarray of shape (batch_size, n_nodes2)
            Gradient flowing from the next layer
        Returns
        -------
        dZ : ndarray of shape (batch_size, n_nodes1)
            Gradient to pass to the previous layer
        """
        batch_size = dA.shape[0]
        
        # Gradient with respect to W
        self.dW = self.X.T @ dA / batch_size
        
        # Gradient with respect to B
        self.dB = np.sum(dA, axis=0) / batch_size
        
        # Gradient with respect to X
        dZ = dA @ self.W.T
        
        # Update weights and biases
        self = self.optimizer.update(self)
        
        return dZ

In [2]:
# Problem 2: Classifying the Initialization Method
class SimpleInitializer:
    """
    Simple initialization using Gaussian distribution
    Parameters
    ----------
    sigma : float
        Standard deviation of the Gaussian distribution
    """
    def __init__(self, sigma):
        self.sigma = sigma
        
    def W(self, n_nodes1, n_nodes2):
        """
        Weight initialization
        Parameters
        ----------
        n_nodes1 : int
            Number of nodes in the previous layer
        n_nodes2 : int
            Number of nodes in the next layer
        
        Returns
        -------
        W : ndarray of shape (n_nodes1, n_nodes2)
            Initialized weights
        """
        W = self.sigma * np.random.randn(n_nodes1, n_nodes2)
        return W
        
    def B(self, n_nodes2):
        """
        Bias initialization
        Parameters
        ----------
        n_nodes2 : int
            Number of nodes in the next layer
        
        Returns
        -------
        B : ndarray of shape (n_nodes2,)
            Initialized biases
        """
        B = np.zeros(n_nodes2)
        return B

In [3]:
# Problem 3: Classification of Optimization Methods
class SGD:
    """
    Stochastic Gradient Descent
    Parameters
    ----------
    lr : float
        Learning rate
    """
    def __init__(self, lr):
        self.lr = lr
        
    def update(self, layer):
        """
        Update weights and biases of a layer
        Parameters
        ----------
        layer : instance of a layer
            Layer to update
        
        Returns
        -------
        layer : instance of a layer
            Updated layer
        """
        # Update weights
        layer.W -= self.lr * layer.dW
        
        # Update biases
        layer.B -= self.lr * layer.dB
        
        return layer

In [4]:
# Problem 4: Classification of Activation Functions
class Tanh:
    """
    Hyperbolic tangent activation function
    """
    def forward(self, X):
        """
        Forward pass
        Parameters
        ----------
        X : ndarray
            Input
        
        Returns
        -------
        Y : ndarray
            Output after applying tanh
        """
        self.X = X
        return np.tanh(X)
        
    def backward(self, dY):
        """
        Backward pass
        Parameters
        ----------
        dY : ndarray
            Gradient from the next layer
        
        Returns
        -------
        dX : ndarray
            Gradient to pass to the previous layer
        """
        # Derivative of tanh(x) is 1 - tanh^2(x)
        dX = dY * (1 - np.tanh(self.X)**2)
        return dX

class Softmax:
    """
    Softmax activation function with cross-entropy loss
    """
    def forward(self, X):
        """
        Forward pass
        Parameters
        ----------
        X : ndarray of shape (batch_size, n_classes)
            Input
        
        Returns
        -------
        Y : ndarray of shape (batch_size, n_classes)
            Output probabilities after applying softmax
        """
        # Subtract max for numerical stability
        self.X = X
        exp_X = np.exp(X - np.max(X, axis=1, keepdims=True))
        self.Y = exp_X / np.sum(exp_X, axis=1, keepdims=True)
        return self.Y
        
    def backward(self, T, Y=None):
        """
        Backward pass (combined with cross-entropy loss)
        Parameters
        ----------
        T : ndarray of shape (batch_size, n_classes)
            True labels (one-hot encoded)
        Y : ndarray of shape (batch_size, n_classes), optional
            Output probabilities from forward pass
        
        Returns
        -------
        dX : ndarray of shape (batch_size, n_classes)
            Gradient to pass to the previous layer
        """
        if Y is None:
            Y = self.Y
            
        batch_size = Y.shape[0]
        # Derivative of softmax with cross-entropy is (y - t)
        dX = (Y - T) / batch_size
        return dX

In [5]:
# Problem 5: Creating a ReLU Class
class ReLU:
    """
    Rectified Linear Unit activation function
    """
    def forward(self, X):
        """
        Forward pass
        Parameters
        ----------
        X : ndarray
            Input
        
        Returns
        -------
        Y : ndarray
            Output after applying ReLU
        """
        self.X = X
        return np.maximum(0, X)
        
    def backward(self, dY):
        """
        Backward pass
        Parameters
        ----------
        dY : ndarray
            Gradient from the next layer
        
        Returns
        -------
        dX : ndarray
            Gradient to pass to the previous layer
        """
        # Derivative of ReLU is 1 if x > 0, 0 otherwise
        dX = dY * (self.X > 0)
        return dX

In [6]:
# Problem 6: Initial Weight Values
class XavierInitializer:
    """
    Xavier (Glorot) initialization
    Suitable for sigmoid and tanh activation functions
    """
    def W(self, n_nodes1, n_nodes2):
        """
        Weight initialization
        Parameters
        ----------
        n_nodes1 : int
            Number of nodes in the previous layer
        n_nodes2 : int
            Number of nodes in the next layer
        
        Returns
        -------
        W : ndarray of shape (n_nodes1, n_nodes2)
            Initialized weights
        """
        # Xavier initialization: sigma = 1/sqrt(n)
        sigma = 1.0 / np.sqrt(n_nodes1)
        W = sigma * np.random.randn(n_nodes1, n_nodes2)
        return W
        
    def B(self, n_nodes2):
        """
        Bias initialization
        Parameters
        ----------
        n_nodes2 : int
            Number of nodes in the next layer
        
        Returns
        -------
        B : ndarray of shape (n_nodes2,)
            Initialized biases
        """
        B = np.zeros(n_nodes2)
        return B

class HeInitializer:
    """
    He initialization
    Suitable for ReLU activation functions
    """
    def W(self, n_nodes1, n_nodes2):
        """
        Weight initialization
        Parameters
        ----------
        n_nodes1 : int
            Number of nodes in the previous layer
        n_nodes2 : int
            Number of nodes in the next layer
        
        Returns
        -------
        W : ndarray of shape (n_nodes1, n_nodes2)
            Initialized weights
        """
        # He initialization: sigma = sqrt(2/n)
        sigma = np.sqrt(2.0 / n_nodes1)
        W = sigma * np.random.randn(n_nodes1, n_nodes2)
        return W
        
    def B(self, n_nodes2):
        """
        Bias initialization
        Parameters
        ----------
        n_nodes2 : int
            Number of nodes in the next layer
        
        Returns
        -------
        B : ndarray of shape (n_nodes2,)
            Initialized biases
        """
        B = np.zeros(n_nodes2)
        return B

In [7]:
# Problem 7: Optimization Methods
class AdaGrad:
    """
    Adaptive Gradient Algorithm (AdaGrad)
    Parameters
    ----------
    lr : float
        Learning rate
    """
    def __init__(self, lr):
        self.lr = lr
        self.h = {}  # Dictionary to store accumulated squared gradients
        
    def update(self, layer):
        """
        Update weights and biases of a layer
        Parameters
        ----------
        layer : instance of a layer
            Layer to update
        
        Returns
        -------
        layer : instance of a layer
            Updated layer
        """
        # Initialize accumulated squared gradients if not already present
        if id(layer) not in self.h:
            self.h[id(layer)] = {
                'W': np.zeros_like(layer.W),
                'B': np.zeros_like(layer.B)
            }
        
        # Update accumulated squared gradients for weights
        self.h[id(layer)]['W'] += layer.dW ** 2
        
        # Update weights
        layer.W -= self.lr * layer.dW / (np.sqrt(self.h[id(layer)]['W']) + 1e-7)
        
        # Update accumulated squared gradients for biases
        self.h[id(layer)]['B'] += layer.dB ** 2
        
        # Update biases
        layer.B -= self.lr * layer.dB / (np.sqrt(self.h[id(layer)]['B']) + 1e-7)
        
        return layer

In [8]:
# Problem 8: Completing the Deep Neural Network Class
import numpy as np

class ScratchDeepNeuralNetworkClassifier:
    """
    Deep Neural Network Classifier from scratch
    Parameters
    ----------
    hidden_layer_sizes : list
        List of integers representing the number of nodes in each hidden layer
    activation : str, default='tanh'
        Activation function to use ('tanh', 'relu')
    optimizer : str, default='sgd'
        Optimization method ('sgd', 'adagrad')
    initializer : str, default='simple'
        Weight initialization method ('simple', 'xavier', 'he')
    learning_rate : float, default=0.1
        Learning rate for optimization
    sigma : float, default=0.01
        Standard deviation for simple initialization
    max_epochs : int, default=1000
        Maximum number of epochs
    batch_size : int, default=32
        Batch size for mini-batch gradient descent
    random_state : int, default=None
        Random seed for reproducibility
    """
    def __init__(self, hidden_layer_sizes, activation='tanh', optimizer='sgd',
                 initializer='simple', learning_rate=0.1, sigma=0.01,
                 max_epochs=1000, batch_size=32, random_state=None):
        self.hidden_layer_sizes = hidden_layer_sizes
        self.activation = activation
        self.optimizer = optimizer
        self.initializer = initializer
        self.learning_rate = learning_rate
        self.sigma = sigma
        self.max_epochs = max_epochs
        self.batch_size = batch_size
        self.random_state = random_state
        
    def _get_activation(self, activation_name):
        """
        Get activation function instance
        Parameters
        ----------
        activation_name : str
            Name of the activation function
        
        Returns
        -------
        activation : instance of activation function
        """
        if activation_name == 'tanh':
            return Tanh()
        elif activation_name == 'relu':
            return ReLU()
        elif activation_name == 'softmax':
            return Softmax()
        else:
            raise ValueError(f"Unsupported activation function: {activation_name}")
    
    def _get_initializer(self, initializer_name):
        """
        Get initializer instance
        Parameters
        ----------
        initializer_name : str
            Name of the initializer
        
        Returns
        -------
        initializer : instance of initializer
        """
        if initializer_name == 'simple':
            return SimpleInitializer(self.sigma)
        elif initializer_name == 'xavier':
            return XavierInitializer()
        elif initializer_name == 'he':
            return HeInitializer()
        else:
            raise ValueError(f"Unsupported initializer: {initializer_name}")
    
    def _get_optimizer(self, optimizer_name):
        """
        Get optimizer instance
        Parameters
        ----------
        optimizer_name : str
            Name of the optimizer
        
        Returns
        -------
        optimizer : instance of optimizer
        """
        if optimizer_name == 'sgd':
            return SGD(self.learning_rate)
        elif optimizer_name == 'adagrad':
            return AdaGrad(self.learning_rate)
        else:
            raise ValueError(f"Unsupported optimizer: {optimizer_name}")
    
    def _init_layers(self, n_features, n_classes):
        """
        Initialize network layers
        Parameters
        ----------
        n_features : int
            Number of input features
        n_classes : int
            Number of output classes
        """
        self.n_features = n_features
        self.n_classes = n_classes
        
        # Get initializer and optimizer instances
        initializer = self._get_initializer(self.initializer)
        optimizer = self._get_optimizer(self.optimizer)
        
        # Create layers
        self.layers = []
        
        # Input layer to first hidden layer
        layer_sizes = [n_features] + self.hidden_layer_sizes + [n_classes]
        activation_list = [self.activation] * len(self.hidden_layer_sizes) + ['softmax']
        
        for i in range(len(layer_sizes) - 1):
            # Add fully connected layer
            fc = FC(layer_sizes[i], layer_sizes[i+1], initializer, optimizer)
            self.layers.append(('fc', fc))
            
            # Add activation function except for the last layer (handled separately)
            if i < len(layer_sizes) - 2:
                activation = self._get_activation(activation_list[i])
                self.layers.append(('activation', activation))
        
        # Output layer activation (softmax)
        self.output_activation = self._get_activation('softmax')
    
    def fit(self, X, y):
        """
        Fit the neural network to the training data
        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training data
        y : ndarray of shape (n_samples,)
            Target values
        
        Returns
        -------
        self : object
            Returns self
        """
        if self.random_state is not None:
            np.random.seed(self.random_state)
        
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))
        
        # Initialize layers
        self._init_layers(n_features, n_classes)
        
        # One-hot encode the target values
        T = np.zeros((n_samples, n_classes))
        T[np.arange(n_samples), y] = 1
        
        # Training loop
        for epoch in range(self.max_epochs):
            # Shuffle data
            indices = np.random.permutation(n_samples)
            X_shuffled = X[indices]
            T_shuffled = T[indices]
            
            # Mini-batch training
            for i in range(0, n_samples, self.batch_size):
                X_batch = X_shuffled[i:i+self.batch_size]
                T_batch = T_shuffled[i:i+self.batch_size]
                
                # Forward pass
                A = X_batch
                for layer_type, layer in self.layers:
                    A = layer.forward(A)
                
                Y = self.output_activation.forward(A)
                
                # Backward pass
                dA = self.output_activation.backward(T_batch, Y)
                
                for layer_type, layer in reversed(self.layers):
                    dA = layer.backward(dA)
            
            # Calculate and print loss occasionally
            if epoch % 100 == 0:
                loss = self._compute_loss(X, T)
                print(f"Epoch {epoch}, Loss: {loss:.6f}")
        
        return self
    
    def _compute_loss(self, X, T):
        """
        Compute the cross-entropy loss
        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Input data
        T : ndarray of shape (n_samples, n_classes)
            One-hot encoded target values
        
        Returns
        -------
        loss : float
            Cross-entropy loss
        """
        # Forward pass
        A = X
        for layer_type, layer in self.layers:
            A = layer.forward(A)
        
        Y = self.output_activation.forward(A)
        
        # Calculate cross-entropy loss
        eps = 1e-7
        loss = -np.sum(T * np.log(Y + eps)) / len(X)
        return loss
    
    def predict(self, X):
        """
        Predict class labels for samples in X
        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Input data
        
        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted class labels
        """
        # Forward pass
        A = X
        for layer_type, layer in self.layers:
            A = layer.forward(A)
        
        # Get probabilities
        Y = self.output_activation.forward(A)
        
        # Get class with highest probability
        y_pred = np.argmax(Y, axis=1)
        return y_pred
    
    def score(self, X, y):
        """
        Calculate accuracy
        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Input data
        y : ndarray of shape (n_samples,)
            True class labels
        
        Returns
        -------
        accuracy : float
            Classification accuracy
        """
        y_pred = self.predict(X)
        accuracy = np.mean(y_pred == y)
        return accuracy

In [10]:
# Problem 9: Learning and Estimation with MNIST
import numpy as np
from sklearn.datasets import fetch_openml
from tensorflow.keras.datasets import mnist
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import time

def load_mnist():
    """
    Load and preprocess the MNIST dataset
    """
    print("Loading MNIST dataset...")
    
    # Load data
    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    
    # Reshape and scale features to [0, 1]
    X_train, X_test = X_train.reshape(-1, 28*28) / 255.0, X_test.reshape(-1, 28*28) / 255.0
    
    # Combine train and test sets for splitting
    X = np.vstack((X_train, X_test))  # Stack them together
    y = np.hstack((y_train, y_test))  # Stack labels
    
    # Split data into new train-test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    print(f"Training data shape: {X_train.shape}")
    print(f"Test data shape: {X_test.shape}")
    
    return X_train, X_test, y_train, y_test


def evaluate_network(network_config, X_train, X_test, y_train, y_test):
    """
    Train and evaluate a neural network with the given configuration
    """
    # Unpack configuration
    name = network_config['name']
    hidden_layer_sizes = network_config['hidden_layer_sizes']
    activation = network_config['activation']
    optimizer = network_config['optimizer']
    initializer = network_config['initializer']
    
    print(f"\nEvaluating {name}...")
    print(f"Configuration: {hidden_layer_sizes}, {activation}, {optimizer}, {initializer}")
    
    # Create and train the network
    start_time = time.time()
    
    nn = ScratchDeepNeuralNetworkClassifier(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        optimizer=optimizer,
        initializer=initializer,
        learning_rate=0.1,
        max_epochs=300, 
        batch_size=128,
        random_state=42
    )
    

    subset_size = 10000 
    X_train_subset = X_train[:subset_size]
    y_train_subset = y_train[:subset_size]
    
    nn.fit(X_train_subset, y_train_subset)
    
    training_time = time.time() - start_time
    
    # Evaluate on test set
    test_score = nn.score(X_test, y_test)
    
    print(f"Training time: {training_time:.2f} seconds")
    print(f"Test accuracy: {test_score:.4f}")
    
    return test_score, training_time

def main():
    # Load and preprocess MNIST data
    X_train, X_test, y_train, y_test = load_mnist()
    
    # Define different network configurations
    configurations = [
        {
            'name': 'Basic Network (1 layer, Tanh, SGD)',
            'hidden_layer_sizes': [100],
            'activation': 'tanh',
            'optimizer': 'sgd',
            'initializer': 'xavier'
        },
        {
            'name': 'Deep Network (2 layers, Tanh, SGD)',
            'hidden_layer_sizes': [100, 100],
            'activation': 'tanh',
            'optimizer': 'sgd',
            'initializer': 'xavier'
        },
        {
            'name': 'ReLU Network (1 layer, ReLU, SGD)',
            'hidden_layer_sizes': [100],
            'activation': 'relu',
            'optimizer': 'sgd',
            'initializer': 'he'
        },
        {
            'name': 'Deep ReLU Network (2 layers, ReLU, SGD)',
            'hidden_layer_sizes': [100, 100],
            'activation': 'relu',
            'optimizer': 'sgd',
            'initializer': 'he'
        },
        {
            'name': 'AdaGrad Network (1 layer, ReLU, AdaGrad)',
            'hidden_layer_sizes': [100],
            'activation': 'relu',
            'optimizer': 'adagrad',
            'initializer': 'he'
        }
    ]
    
    # Evaluate each configuration
    results = []
    for config in configurations:
        accuracy, training_time = evaluate_network(config, X_train, X_test, y_train, y_test)
        results.append((config['name'], accuracy, training_time))
    
    # Print summary
    print("\nSummary of Results:")
    print("=" * 60)
    print(f"{'Network Configuration':<40} {'Accuracy':<10} {'Time (s)':<10}")
    print("-" * 60)
    for name, accuracy, time in results:
        print(f"{name:<40} {accuracy:.4f}    {time:.2f}")
    print("=" * 60)

if __name__ == "__main__":
    main()

Loading MNIST dataset...
Training data shape: (56000, 784)
Test data shape: (14000, 784)

Evaluating Basic Network (1 layer, Tanh, SGD)...
Configuration: [100], tanh, sgd, xavier
Epoch 0, Loss: 2.298116
Epoch 100, Loss: 0.623967
Epoch 200, Loss: 0.461936
Training time: 77.54 seconds
Test accuracy: 0.8923

Evaluating Deep Network (2 layers, Tanh, SGD)...
Configuration: [100, 100], tanh, sgd, xavier
Epoch 0, Loss: 2.311108
Epoch 100, Loss: 0.576857
Epoch 200, Loss: 0.413986
Training time: 123.49 seconds
Test accuracy: 0.8996

Evaluating ReLU Network (1 layer, ReLU, SGD)...
Configuration: [100], relu, sgd, he
Epoch 0, Loss: 2.325059
Epoch 100, Loss: 0.543723
Epoch 200, Loss: 0.406492
Training time: 53.14 seconds
Test accuracy: 0.9005

Evaluating Deep ReLU Network (2 layers, ReLU, SGD)...
Configuration: [100, 100], relu, sgd, he
Epoch 0, Loss: 2.355095
Epoch 100, Loss: 0.456876
Epoch 200, Loss: 0.332186
Training time: 76.61 seconds
Test accuracy: 0.9097

Evaluating AdaGrad Network (1 layer