In [None]:
#create similar micrograd library for neural networks from scratch using numpy library and data types

In [None]:
#necessary items
#neurons:
#have input x and weights w
#
#
#layers
#perceptron

In [1]:
#represent a single neuron in a neural network layer
#attributes: weights corresponding to each input value, single bias per neuron, number of input to neuron
#methods: convert input to output - forward, compute gradient - backward

import numpy as np

class Neuron:
    def __init__(self, n_input, nonlin=True):
        self.w = np.random.randn(n_input, 1) * np.sqrt(2 / n_input)
        self.b = np.zeros((1, 1))
        self.nonlin = nonlin
        self.params = self.w + self.b
        self.activ = lambda x: None
        self.activ_deriv = lambda x: None
        self.last_input = None  # should be a np array of inputs
        self.last_output = None  # should be a np array of outputs

        # Fixing the logic here: nonlin=True should be ReLU
        if self.nonlin:
            self.activ = lambda x: np.where(x > 0, x, 0.01 * x)  # Leaky ReLU activation
            self.activ_deriv = lambda x: np.where(x > 0, 1, 0.01)
        else:
            self.activ = lambda x: x  # Linear activation
            self.activ_deriv = lambda x: 1  # Linear derivative

    def __repr__(self):
        out = f"Input to neuron: {len(self.w)} Parameters: {self.params}"
        return out

    # Takes input vector, multiplies with weights, and adds bias to return scalar output
    def forward(self, input):
        self.last_input = input
        out = np.dot(input, self.w) + self.b
        self.last_output = self.activ(out)
        return self.last_output

    # Backpropagates the loss and updates parameters
    def backward(self, post_activation_loss, lr):
        #given loss of output of neuron after activation, compute gradient for input of current neuron and update parameters going of(going into) neuron
        #1. since each neuron goes through activation, need to compute derivative of activation from last output of current neuron
        #2. use gradient from post activation neuron ouput * derivative of activation = gradient of pre activation output
        d_activation = self.activ_deriv(self.last_output)
        d_out = post_activation_loss * d_activation #gradient wrt loss of x1w1 + x2w2 + b = value of neuron output before activation

        # Compute Gradients for weights and bias:
        # Now that we have the gradient with respect to the pre activation output of current neuron,
        # Compute specific gradients with respect to weight and bias using this gradient
        d_w = np.dot(self.last_input.T, d_out)  # weight gradient will be X*pre_activation_output_grad
        d_b = np.sum(d_out, axis=0, keepdims=True) # bias gradient is the same as gradient of pre activation output, sum rows for all examples in batch (bias shared across all examples equally)

        # Update weights and bias
        self.w -= lr * d_w
        self.b -= lr * d_b

        # Return gradient of input to current neuron (X), to previous layer, as gradient of OUTPUT of previous layer with updates
        # dL/dout (x=input to layer, output from previous layer) * w = dL/dout * dout/dx == dL/dx
        return np.dot(d_out, self.w.T)


class Layer:
    def __init__(self, n_input_per_neuron, n_neurons, nonlin=True):
        self.neurons = [Neuron(n_input_per_neuron, nonlin) for i in range(n_neurons)]
        self.n_input_per_neuron = n_input_per_neuron
        self.n_neurons = n_neurons

    def __repr__(self):
        output = f"Number of input to neurons: {self.n_input_per_neuron} Number of neurons: {self.n_neurons}"
        return output

    def forward(self, input):
        self.last_input = input
        self.last_output = np.hstack([neuron.forward(input) for neuron in self.neurons])#concatenated output of each neuron in list in order
        return self.last_output

    def backward(self, loss_prev_list, lr):
        #initialize grad accumulator with zeros of same shape of input from single neuron - each neuron has same input in same layer
        grad_accum = np.zeros_like(self.neurons[0].last_input)

        #for each neuron in order, perform backpropagation using loss from subsequent neurons as loss wrt. output of current neuron
        #1. From output layer we send the loss gradient - m(examples)x n output neurons column vectors ex: 3x2
        #2. For each neuron in output layer, compute the gradient of input from previous layer (X) and update weights
        for i, neuron in enumerate(self.neurons):
            grad = neuron.backward(loss_prev_list[:, [i]], lr)#all examples gradients for specific neuron
            if i == 0:
                grad_accum = grad
            else:
                grad_accum += grad
        return grad_accum


def mse_loss(y_pred, y_true):
    loss = np.mean((y_pred - y_true) ** 2)
    return loss

def mse_derivative(y_pred, y_true):
    grad = 2 * (y_pred - y_true) / y_true.shape[0]
    return grad


class Network:
    def __init__(self, layer_sizes, nonlin=True):
        self.layers = []
        for i in range(len(layer_sizes) - 1):
            self.layers.append(Layer(layer_sizes[i], layer_sizes[i + 1], nonlin))

        self.prediction = 0
        self.loss = 0
        self.grad = 0

    def forward(self, input):
        out = self.layers[0].forward(input)
        for layer in self.layers[1:]:
            out = layer.forward(out)
        self.prediction = out
        return self.prediction

    def backward(self, grad, lr):#should not call forward
        for layer in reversed(self.layers):
          grad = layer.backward(grad, lr)

    def predict(self, X):
        return self.forward(X)

    def train(self, X, y, epochs, learning_rate=.001, batch_size=16):
        for epoch in range(epochs):
            total_loss = 0
            for i in range(0, len(X), batch_size):
                X_batch = X[i:i+batch_size]
                y_batch = y[i:i+batch_size]

                y_pred = self.forward(X_batch)
                loss = mse_loss(y_pred, y_batch)
                total_loss += loss * len(X_batch)

                output_gradient = mse_derivative(y_pred, y_batch)
                self.backward(output_gradient, learning_rate)

            avg_loss = total_loss / len(X)
            print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss}")


In [14]:
y_true = np.array([[1,2],[1,1],[0,1]])
y_pred = np.array([[.67,1.1],[.44,1.5],[.77,.99]])

In [15]:
output_grad = mse_derivative(y_pred, y_true)#shape(batch_size,num_output_neurons)

In [16]:
l = Layer(3,2)

In [17]:
l.forward(np.array([[1,2,3],[2,3,4],[5,6,7]]))

array([[-0.01062592,  0.86169118],
       [-0.0293614 ,  1.81149586],
       [-0.08556783,  4.66090991]])

In [20]:
output_grad.shape

(3, 2)

In [22]:
output_grad[:, [0]], output_grad[:, [1]]

(array([[-0.22      ],
        [-0.37333333],
        [ 0.51333333]]),
 array([[-0.6       ],
        [ 0.33333333],
        [-0.00666667]]))

In [18]:
l.backward(output_grad, lr=.01).shape

(3, 3)

In [43]:
import pandas as pd
mnist_train = pd.read_csv('/content/sample_data/mnist_train_small.csv')

In [44]:
mnist_train

Unnamed: 0,6,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.581,0.582,0.583,0.584,0.585,0.586,0.587,0.588,0.589,0.590
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19994,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19995,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19996,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19997,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [97]:
y_train = mnist_train.iloc[:,0]
#y_train = y_train.reshape(len(y_train),1)#get 2d array of y labels

x_train = mnist_train.iloc[:,1:]

In [98]:
# normalize the data
    # scale all pixel values from 0-255 to 0-1
x_train = np.array(x_train.astype(np.float32) / 255.0)
#X_test = x_train.astype(np.float32) / 255.0

# one-hot encode the labels
    # represent categorical data as binary vectors
y_train = np.eye(10)[y_train]
#y_test_onehot = np.eye(10)[y_test]


In [80]:
y_train.shape, x_train.shape

((19999, 10), (19999, 784))

In [None]:
if __name__ == "__main__":
  np.random.seed(42)

  # Define network architecture: 2 inputs, 3 neurons in hidden layer, 1 output
  nn = Network(layer_sizes = [784, 128, 64, 10])
  nn.train(x_train, y_train, epochs = 10)

Epoch 1/10, Loss: 0.08174047578319589
Epoch 2/10, Loss: 0.057068117946955745
Epoch 3/10, Loss: 0.049930327996763994
Epoch 4/10, Loss: 0.04486909032324081
Epoch 5/10, Loss: 0.04140568948682911
Epoch 6/10, Loss: 0.03952990039655105
Epoch 7/10, Loss: 0.03826307127808743
Epoch 8/10, Loss: 0.037300843468398626
Epoch 9/10, Loss: 0.03645898716607613


In [None]:
nn = Network(layer_sizes = [784, 128, 64, 10])

In [4]:

# Example usage
if __name__ == "__main__":
    np.random.seed(42)

    # Define network architecture: 2 inputs, 3 neurons in hidden layer, 1 output
    nn = Network([2, 3, 1])

    # Example input and output, call with 2d arrays
    X = np.array([[1, 2], [2, 3], [3, 4]])  # Batch of 3 samples with 2 features each
    y = np.array([[1], [0], [1]])  # Binary outputs for this example

    # Forward pass
    epochs = 100
    for epoch in range(epochs):
      predictions = nn.forward(X)
      print(f"Predictions: {predictions}")
      # Backward pass
      nn.backward(y)


Predictions: [[2.08464602]
 [3.50175676]
 [4.91886751]]
MSE Loss: 9.598760001711051
Predictions: [[0.64473241]
 [1.21768521]
 [1.79063802]]
MSE Loss: 0.7446936063668862
Predictions: [[0.36138424]
 [0.76807176]
 [1.17475929]]
MSE Loss: 0.34276837726783554
Predictions: [[0.2813222 ]
 [0.63956348]
 [0.99780477]]
MSE Loss: 0.3085146839906245
Predictions: [[0.2584644 ]
 [0.60131691]
 [0.94416942]]
MSE Loss: 0.3048580429000102
Predictions: [[0.25264908]
 [0.58995929]
 [0.92726951]]
MSE Loss: 0.3039583634737935
Predictions: [[0.25197893]
 [0.58672605]
 [0.92147318]]
MSE Loss: 0.3033164832466461
Predictions: [[0.25286535]
 [0.58595808]
 [0.9190508 ]]
MSE Loss: 0.3027032732793373
Predictions: [[0.25421985]
 [0.58593886]
 [0.91765787]]
MSE Loss: 0.30209753446312737
Predictions: [[0.25571154]
 [0.58614659]
 [0.91658164]]
MSE Loss: 0.3014972524648111
Predictions: [[0.25723974]
 [0.58642242]
 [0.91560511]]
MSE Loss: 0.30090218835739385
Predictions: [[0.25877381]
 [0.58671798]
 [0.91466215]]
MSE Los

In [None]:
x = Network([3,2,1])

In [None]:
x = Layer(3,2)

In [None]:
x.forward(np.array([3,2,1]))

array([[2.54419492]])

In [None]:
x.backward(np.array([10]))

MSE Loss: [[55.58902943]]
[[-14.91161016]]
[-14.91161016]
[[2.66345899 2.16573133]]


ValueError: non-broadcastable output operand with shape (2,1) doesn't match the broadcast shape (2,2)

In [None]:
import numpy as np
import struct
import os
class Neuron:
    def __init__(self, num_inputs):
        self.weights = np.random.randn(num_inputs, 1) * 0.01
        self.bias = np.zeros((1, 1))
        self.last_input = None
        self.last_output = None

    def relu(self, z):
        return np.maximum(0, z)

    def relu_derivative(self, z):
        return np.where(z > 0, 1, 0)

    def forward(self, activations):
        self.last_input = activations
        z = np.dot(activations, self.weights) + self.bias
        self.last_output = self.relu(z)
        return self.last_output

    def backward(self, dC_da, learning_rate):
        da_dz = self.relu_derivative(self.last_output)
        dC_dz = dC_da * da_dz
        dC_dw = np.dot(self.last_input.T, dC_dz)
        dC_db = np.sum(dC_dz, axis=0, keepdims=True)

        self.weights -= learning_rate * dC_dw
        self.bias -= learning_rate * dC_db

        return np.dot(dC_dz, self.weights.T)


    # output_gradient:
        # A positive gradient means we need to decrease that output
        # A negative gradient means we need to increase that output

    # learning_rate: how big of a step is taken while updating weights and biases


class Layer:
    def __init__(self, num_neurons, num_inputs_per_neuron):
        self.neurons = [Neuron(num_inputs_per_neuron) for _ in range(num_neurons)]

    def forward(self, activations):
        return np.hstack([neuron.forward(activations) for neuron in self.neurons])

    def backward(self, output_gradient, learning_rate):
        return np.sum([neuron.backward(output_gradient[:, [i]], learning_rate) for i, neuron in enumerate(self.neurons)], axis=0)

class NeuralNetwork:
    def __init__(self, layer_sizes):
        self.layers = []
        for i in range(len(layer_sizes) - 1):
            self.layers.append(Layer(layer_sizes[i+1], layer_sizes[i]))

    def forward(self, activations):
        for layer in self.layers:
            activations = layer.forward(activations)
        return activations

    def mse_loss(self, y, activations):
        return np.mean((activations-y)**2)

    def derivative_mse_loss(self, y, activations):
        return 2*(activations-y) / y.shape[0]

    def train(self, X, y, epochs, learning_rate, batch_size=32):
        for epoch in range(epochs):
            total_loss = 0
            for i in range(0, len(X), batch_size):
                X_batch = X[i:i+batch_size]
                y_batch = y[i:i+batch_size]

                outputs = self.forward(X_batch)
                loss = self.mse_loss(y_batch, outputs)
                total_loss += loss * len(X_batch)

                output_gradient = self.derivative_mse_loss(y_batch, outputs)
                for layer in reversed(self.layers):
                    output_gradient = layer.backward(output_gradient, learning_rate)

            avg_loss = total_loss / len(X)
            print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss}")

    def predict(self, X):
        return self.forward(X)

In [None]:
x = NeuralNetwork([3,2,1])

In [None]:
x.forward([3,2,1])

array([[0.]])