In [1]:
import numpy as np 
import sys
import pdb
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt 
from sklearn.preprocessing import OneHotEncoder

def get_data(x_path, y_path):
    '''
    Args:
        x_path: path to x file
        y_path: path to y file
    Returns:
        x: np array of [NUM_OF_SAMPLES x n]
        y: np array of [NUM_OF_SAMPLES]
    '''
    x = np.load(x_path)
    y = np.load(y_path)

    x = x.astype('float')

    #normalize each example in x to have 0 mean and 1 std
    
    # Calculate the mean and standard deviation for each feature
    feature_means = np.mean(x, axis=0)
    feature_stds = np.std(x, axis=0)
    feature_stds = feature_stds + (feature_stds == 0)

    # Normalize each feature to have 0 mean and 1 std
    x = (x - feature_means) / (feature_stds)
    
    # Adjust labels to start from 0 if they start from 1
    y = y - 1
    
    return x, y

def get_metric(y_true, y_pred):
    '''
    Args:
        y_true: np array of [NUM_SAMPLES x r] (one hot) 
                or np array of [NUM_SAMPLES]
        y_pred: np array of [NUM_SAMPLES x r] (one hot) 
                or np array of [NUM_SAMPLES]
                
    '''
    results = classification_report(y_pred, y_true)
    print(results)

In [2]:
x_test, y_test = get_data('x_test.npy', 'y_test.npy')
x_train, y_train = get_data('x_train.npy', 'y_train.npy')

x_train.shape, y_train.shape, x_test.shape, y_test.shape

((10000, 1024), (10000,), (1000, 1024), (1000,))

In [3]:
label_encoder = OneHotEncoder(sparse_output = False)
label_encoder.fit(np.expand_dims(y_train, axis = -1))

y_train_onehot = label_encoder.transform(np.expand_dims(y_train, axis = -1))
y_test_onehot = label_encoder.transform(np.expand_dims(y_test, axis = -1))

In [33]:
class NeuralNetwork:
    def __init__(self, layer_sizes):
        """
        Initializes the neural network with the given layer sizes.
        layer_sizes is a list of integers, where the i-th integer represents
        the number of neurons in the i-th layer.
        """
        self.layer_sizes = layer_sizes
        self.weights = []
        self.biases = []
        self.activations = []

        # Initialize weights and biases for each layer
        for i in range(len(layer_sizes) - 1):
            # Weights are initialized with small random values
            self.weights.append(np.random.randn(layer_sizes[i+1], layer_sizes[i]) * 0.01)
            self.biases.append(np.zeros((layer_sizes[i+1], 1)))
            
    def relu(self, z):
        return np.maximum(0, z)

    def relu_derivative(self, z):
        return (z > 0).astype(float)


    def sigmoid(self, z):
        """
        The sigmoid activation function.
        """
        return 1 / (1 + np.exp(-z))

    def softmax(self, z):
        """
        The softmax function.
        """
        e_z = np.exp(z)  # Subtracting np.max(z) for numerical stability
        return e_z / e_z.sum(axis=0, keepdims=True)

    def feedforward(self, x):
        """
        Performs a feedforward computation.
        """
        activation = x
        self.activations = [x]  # List to store all the activations, layer by layer

        # Compute activations for each layer
        for w, b in zip(self.weights, self.biases):
            z = np.dot(w, activation) + b
            activation = self.relu(z) if w is not self.weights[-1] else self.softmax(z)
            self.activations.append(activation)

        return self.activations[-1]  # The final activation is the output of the network

# Let's test the initialization and feedforward computation with a small network
nn = NeuralNetwork([1024, 100, 5])  # A network with 1024 input features, one hidden layer with 100 neurons, and 5 output classes
sample_input = np.random.randn(1024, 1)  # A random sample input
output = nn.feedforward(sample_input)  # Perform a feedforward computation

output  # Display the output probabilities

array([[0.20171276],
       [0.19835099],
       [0.19595456],
       [0.20106116],
       [0.20292053]])

In [34]:
class NeuralNetwork(NeuralNetwork):  # Extending the previously defined NeuralNetwork class
    def cross_entropy_loss(self, y_pred, y_true):
        """
        Computes the cross-entropy loss.
        """
        m = y_true.shape[1]  # Number of examples
        # To avoid division by zero, we clip the predictions to a minimum value
        # y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
        # Compute the cross-entropy loss
        loss = -np.sum(y_true * np.log(y_pred)) / m
        return loss

    def backpropagation(self, y_true):
        """
        Performs backpropagation to compute the gradients of the loss function
        with respect to the weights and biases.
        """
        m = y_true.shape[1]  # Number of examples
        y_pred = self.activations[-1]  # The output of the last layer
        y_true = y_true.reshape(y_pred.shape)  # Ensure same shape

        # Initialize gradients for each layer
        d_weights = [np.zeros(w.shape) for w in self.weights]
        d_biases = [np.zeros(b.shape) for b in self.biases]

        # Calculate derivative of loss w.r.t. the last layer output
        d_loss = y_pred - y_true

        for i in reversed(range(len(d_weights))):
            d_activations = d_loss * self.relu_derivative(self.activations[i+1]) if i != len(d_weights) - 1 else d_loss
            d_weights[i] = np.dot(d_activations, self.activations[i].T) / m
            d_biases[i] = np.sum(d_activations, axis=1, keepdims=True) / m
            if i != 0:
                d_loss = np.dot(self.weights[i].T, d_activations)

        return d_weights, d_biases

    def sigmoid_derivative(self, s):
        """
        Derivative of the sigmoid function.
        """
        return s * (1 - s)

    def update_parameters(self, d_weights, d_biases, learning_rate):
        """
        Updates the parameters using the computed gradients.
        """
        # Update each parameter with a simple gradient descent step
        for i in range(len(self.weights)):
            self.weights[i] -= learning_rate * d_weights[i]
            self.biases[i] -= learning_rate * d_biases[i]
            
    def train(self, x_train, y_train, epochs, mini_batch_size, learning_rate, conv_threshold = 0.001, conv_epochs = 5):
        n = x_train.shape[1]  # Total number of training examples

        # Training loop
        loss_history = []
        permutation = np.random.permutation(n)
        for epoch in range(epochs):
            # Shuffle the training data for each epoch
            x_train_shuffled = x_train[:, permutation]
            y_train_shuffled = y_train[:, permutation]

            # Mini-batch loop
            for k in range(0, n, mini_batch_size):
                mini_batch_x = x_train_shuffled[:, k:k + mini_batch_size]
                mini_batch_y = y_train_shuffled[:, k:k + mini_batch_size]
                # Forward pass
                self.feedforward(mini_batch_x)
                # Backward pass
                d_weights, d_biases = self.backpropagation(mini_batch_y)
                # Update parameters
                self.update_parameters(d_weights, d_biases, learning_rate)
            
            loss = self.cross_entropy_loss(self.feedforward(x_train), y_train)
            # Optional: Print the loss after each epoch (can be commented out for speed)
            print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss:.4f}")
            
            loss_history.append(loss)
            
            if len(loss_history) > conv_epochs:
                temp = loss_history[-conv_epochs:]
                if np.std(temp) < conv_threshold:
                    print('Converged')
                    break

In [7]:
# Create a fresh instance of the neural network with the corrected train method
nn = NeuralNetwork([1024, 100, 5])
# Convert labels to one-hot encoding again
y_train_one_hot = np.eye(5)[y_train].T

# Train the neural network again with the corrected training method
nn.train(x_train.T, y_train_one_hot, epochs=10, mini_batch_size=32, learning_rate=0.01)

Epoch 1/10, Loss: 0.9515
Epoch 2/10, Loss: 0.7405
Epoch 3/10, Loss: 0.5967
Epoch 4/10, Loss: 0.5117
Epoch 5/10, Loss: 0.4558
Epoch 6/10, Loss: 0.4163
Epoch 7/10, Loss: 0.3864
Epoch 8/10, Loss: 0.3627
Epoch 9/10, Loss: 0.3430
Epoch 10/10, Loss: 0.3260


In [35]:
class NeuralNetwork(NeuralNetwork):  # Extending the previously defined NeuralNetwork class
    def train_c(self, x_train, y_train, epochs, mini_batch_size, learning_rate, conv_threshold = 0.001, conv_epochs = 5):
        n = x_train.shape[1]  # Total number of training examples

        # Training loop
        loss_history = []
        permutation = np.random.permutation(n)
        learning_rate_c = learning_rate
        for epoch in range(epochs):
            learning_rate = learning_rate_c / pow(epoch+1, 0.5)
            # Shuffle the training data for each epoch
            x_train_shuffled = x_train[:, permutation]
            y_train_shuffled = y_train[:, permutation]

            # Mini-batch loop
            for k in range(0, n, mini_batch_size):
                mini_batch_x = x_train_shuffled[:, k:k + mini_batch_size]
                mini_batch_y = y_train_shuffled[:, k:k + mini_batch_size]
                # Forward pass
                self.feedforward(mini_batch_x)
                # Backward pass
                d_weights, d_biases = self.backpropagation(mini_batch_y)
                # Update parameters
                self.update_parameters(d_weights, d_biases, learning_rate)
            
            loss = self.cross_entropy_loss(self.feedforward(x_train), y_train)
            # Optional: Print the loss after each epoch (can be commented out for speed)
            print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss:.4f}")
            
            loss_history.append(loss)
            
            if len(loss_history) > conv_epochs:
                temp = loss_history[-conv_epochs:]
                if np.std(temp) < conv_threshold:
                    print('Converged')
                    break

In [49]:
hidden_layers = [[512], [512, 256], [512, 256, 128], [512, 256, 128, 64]]
layers = [[1024] + hidden_layer + [5] for hidden_layer in hidden_layers]
models_e = {}
for layer in layers:
    print(f'layer: {layer}')
    nn = NeuralNetwork(layer)
    nn.train_c(x_train.T, y_train_one_hot, epochs=200, mini_batch_size=32, learning_rate=0.01)
    models_e[str(layer)] = nn

layer: [1024, 512, 5]
Epoch 1/200, Loss: 0.8414
Epoch 2/200, Loss: 0.6865
Epoch 3/200, Loss: 0.6037
Epoch 4/200, Loss: 0.5507
Epoch 5/200, Loss: 0.5130
Epoch 6/200, Loss: 0.4843
Epoch 7/200, Loss: 0.4614
Epoch 8/200, Loss: 0.4426
Epoch 9/200, Loss: 0.4268
Epoch 10/200, Loss: 0.4132


KeyboardInterrupt: 

In [24]:
for layer in layers[-2:]:
    print(f'layer: {layer}')
    nn = NeuralNetwork(layer)
    nn.train_c(x_train.T, y_train_one_hot, epochs=200, mini_batch_size=32, learning_rate=0.01, conv_threshold=1e-9, conv_epochs=10)
    models_e[str(layer)] = nn

layer: [1024, 100, 100, 100, 5]
Epoch 1/200, Loss: 1.6092
Epoch 2/200, Loss: 1.6091
Epoch 3/200, Loss: 1.6091
Epoch 4/200, Loss: 1.6091
Epoch 5/200, Loss: 1.6091
Epoch 6/200, Loss: 1.6090
Epoch 7/200, Loss: 1.6090
Epoch 8/200, Loss: 1.6090
Epoch 9/200, Loss: 1.6090
Epoch 10/200, Loss: 1.6090
Epoch 11/200, Loss: 1.6090
Epoch 12/200, Loss: 1.6090
Epoch 13/200, Loss: 1.6090
Epoch 14/200, Loss: 1.6089
Epoch 15/200, Loss: 1.6089
Epoch 16/200, Loss: 1.6089
Epoch 17/200, Loss: 1.6089
Epoch 18/200, Loss: 1.6089
Epoch 19/200, Loss: 1.6089
Epoch 20/200, Loss: 1.6088
Epoch 21/200, Loss: 1.6088
Epoch 22/200, Loss: 1.6088
Epoch 23/200, Loss: 1.6088
Epoch 24/200, Loss: 1.6087
Epoch 25/200, Loss: 1.6087
Epoch 26/200, Loss: 1.6087
Epoch 27/200, Loss: 1.6086
Epoch 28/200, Loss: 1.6086
Epoch 29/200, Loss: 1.6086
Epoch 30/200, Loss: 1.6085
Epoch 31/200, Loss: 1.6085
Epoch 32/200, Loss: 1.6084
Epoch 33/200, Loss: 1.6084
Epoch 34/200, Loss: 1.6083
Epoch 35/200, Loss: 1.6082
Epoch 36/200, Loss: 1.6082
Epoch

KeyboardInterrupt: 

In [48]:
layer = layers[-1]
print(f'layer: {layer}')
nn = NeuralNetwork(layer)
for
nn.train(x_train.T, y_train_one_hot, epochs=200, mini_batch_size=32, learning_rate=1, conv_threshold=1e-6, conv_epochs=10)
models_e[str(layer)] = nn

layer: [1024, 100, 100, 100, 100, 5]
Epoch 1/200, Loss: 1.6157
Epoch 2/200, Loss: 1.6157
Epoch 3/200, Loss: 1.6157
Epoch 4/200, Loss: 1.6157
Epoch 5/200, Loss: 1.6157
Epoch 6/200, Loss: 1.6157
Epoch 7/200, Loss: 1.6157
Epoch 8/200, Loss: 1.6157
Epoch 9/200, Loss: 1.6157
Epoch 10/200, Loss: 1.6157
Epoch 11/200, Loss: 1.6157
Converged


In [11]:
# save models
import pickle

with open('pickles/models_e.pickle', 'wb') as handle:
    pickle.dump(models_e, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
# report results
for layer in layers:
    nn = models_e[str(layer)]
    y_train_pred = np.argmax(nn.feedforward(x_train.T), axis=0)
    y_test_pred = np.argmax(nn.feedforward(x_test.T), axis=0)
    results = classification_report(y_train_pred, y_train)
    print(f"{layer} hidden layer size")
    print('Training')
    print(results)
    
    results = classification_report(y_test_pred, y_test)
    print('Test')
    print(results)

[1024, 100, 5] hidden layer size
Training
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1988
           1       0.95      0.95      0.95      1983
           2       0.90      0.91      0.90      1930
           3       0.88      0.88      0.88      2015
           4       0.94      0.94      0.94      2084

    accuracy                           0.93     10000
   macro avg       0.93      0.93      0.93     10000
weighted avg       0.93      0.93      0.93     10000

Test
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       223
           1       0.87      0.92      0.89       187
           2       0.79      0.81      0.80       194
           3       0.74      0.72      0.73       193
           4       0.89      0.82      0.85       203

    accuracy                           0.86      1000
   macro avg       0.85      0.85      0.85      1000
weighted avg       0.86      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
avg_f1_scores_training = []
avg_f1_scores_test = []
for layer in layers:
    nn = models_e[str(layer)]
    y_train_pred = np.argmax(nn.feedforward(x_train.T), axis=0)
    y_test_pred = np.argmax(nn.feedforward(x_test.T), axis=0)
    results = classification_report(y_train_pred, y_train, output_dict=True)
    avg_f1_scores_training.append(results['weighted avg']['f1-score'])
    
    results = classification_report(y_test_pred, y_test, output_dict=True)
    avg_f1_scores_test.append(results['weighted avg']['f1-score'])
    
plt.plot([len(layer) for layer in layers], avg_f1_scores_training, label = 'Training')
plt.plot([len(layer) for layer in layers], avg_f1_scores_test, label = 'Test')
plt.xlabel('Hidden Layers Depth')
plt.ylabel('Average F1 Score')
plt.legend()
plt.savefig('(e) relu adaptive training f1 vs hidden_depth.png')
plt.show()