In [1]:
import numpy as np

In [31]:

class NeuralNetwork:
    def __init__(self, input_size=784, hidden_layers=[512, 512], output_size=10):
        self.input_size = input_size
        self.hidden_layers = hidden_layers
        self.output_size = output_size
        self.weights = []
        self.biases = []
        self.gradientWeights = []
        self.gradientBiases = []
        self.iterations = 0  # Initialize iterations

        # Initializes the first hidden layer
        self.weights.append(0.01 * np.random.randn(input_size, hidden_layers[0]))
        self.biases.append(np.zeros((1, hidden_layers[0])))

        # Initializes additional hidden layers
        for i in range(len(hidden_layers) - 1):
            self.weights.append(0.01 * np.random.randn(hidden_layers[i], hidden_layers[i + 1]))
            self.biases.append(np.zeros((1, hidden_layers[i + 1])))

        # Connects last hidden layer to output
        self.weights.append(0.01 * np.random.randn(hidden_layers[-1], output_size))
        self.biases.append(np.zeros((1, output_size)))

    def forward_propagation(self, inputs):
        self.outputs = [inputs]
        for i in range(len(self.weights)):
            self.outputs.append(np.dot(self.outputs[-1], self.weights[i]) + self.biases[i])
            if i == (len(self.weights) - 1):
                final_output = np.exp(self.outputs[-1] - np.max(self.outputs[-1], axis=1, keepdims=True))
                final_output /= np.sum(final_output, axis=1, keepdims=True)
                self.outputs.append(final_output)
            else:
                self.outputs.append(np.maximum(0, self.outputs[-1]))  # ReLU activation
        return self.outputs[-1]

    @staticmethod
    def loss_categorical_cross_entropy(y_pred, y_true):
        y_pred = np.clip(y_pred, 1e-10, 1 - 1e-10)
        loss = -np.sum(y_true * np.log(y_pred), axis=1)
        return np.mean(loss)

    @staticmethod
    def sparse_to_one_hot(sparse_labels, num_classes):
        one_hot_encoded = np.zeros((len(sparse_labels), num_classes))
        one_hot_encoded[np.arange(len(sparse_labels)), sparse_labels] = 1
        return one_hot_encoded

    def backwards_propagation(self, y_true):
        samples = len(self.outputs[-1])

        if len(y_true.shape) == 2:
            y_true = np.argmax(y_true, axis=1)

        dSoftMaxCrossEntropy = self.outputs[-1].copy()
        dSoftMaxCrossEntropy[range(samples), y_true] -= 1
        dSoftMaxCrossEntropy /= samples  # Normalize gradient

        # Start with final layer gradients
        dInputs = np.dot(dSoftMaxCrossEntropy.copy(), self.weights[-1].T)
        dWeights = np.dot(self.outputs[-3].T, dSoftMaxCrossEntropy.copy())
        dBiases = np.sum(dSoftMaxCrossEntropy.copy(), axis=0, keepdims=True)

        self.gradientWeights = [dWeights] + self.gradientWeights
        self.gradientBiases = [dBiases] + self.gradientBiases

        for layer in range(len(self.hidden_layers), 0, -1):  # Reverse loop from last hidden layer to first
            dInputsRelu = dInputs.copy()
            dInputsRelu[self.outputs[layer] <= 0] = 0  # Apply ReLU derivative

            dInputs = np.dot(dInputsRelu, self.weights[layer].T)  # Backprop error to previous layer
            dWeights = np.dot(self.outputs[layer - 1].T, dInputsRelu)  # Compute weight gradients
            dBiases = np.sum(dInputsRelu, axis=0, keepdims=True)  # Compute bias gradients

            self.gradientWeights.insert(0, dWeights)
            self.gradientBiases.insert(0, dBiases)

        
    def SGD(self, lr=0.05, decay=1e-7):
        lr = lr * (1. / (1. + decay * self.iterations))

        for i in range(len(self.weights)):
            assert self.weights[i].shape == self.gradientWeights[i].shape
            self.weights[i] -= lr * self.gradientWeights[i]
            self.biases[i] -= lr * self.gradientBiases[i]

        self.iterations += 1

    def train(self, X_train, y_train, epochs=10, batch_size=32, lr=0.05):
        num_samples = X_train.shape[0]

        for epoch in range(epochs):
            shuffled_indices = np.random.permutation(num_samples)
            X_train, y_train = X_train[shuffled_indices], y_train[shuffled_indices]

            for i in range(0, num_samples, batch_size):
                X_batch = X_train[i:i + batch_size]
                y_batch = y_train[i:i + batch_size]

                # Forward pass
                y_pred = self.forward_propagation(X_batch)

                # Convert labels to one-hot encoding
                y_batch_one_hot = self.sparse_to_one_hot(y_batch, self.output_size)

                # Compute loss
                loss = self.loss_categorical_cross_entropy(y_pred, y_batch_one_hot)

                # Backward pass
                self.backwards_propagation(y_batch_one_hot)

                # Update weights
                self.SGD(lr)

            print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss:.4f}")



In [32]:
import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt

In [34]:
from tensorflow.keras.datasets import mnist

# Load dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Normalize pixel values to [0, 1]
x_train = x_train.reshape(x_train.shape[0], -1) / 255.0
x_test = x_test.reshape(x_test.shape[0], -1) / 255.0

# Convert labels to one-hot encoding
num_classes = 10
y_train_one_hot = np.eye(num_classes)[y_train]
y_test_one_hot = np.eye(num_classes)[y_test]

nn = NeuralNetwork()
nn.train(x_train, y_train, epochs=10, batch_size=64, lr=0.01)  # Call manually


ValueError: shapes (64,512) and (10,512) not aligned: 512 (dim 1) != 10 (dim 0)