In [255]:
import math
import numpy as np

class Value():
    """
    Scalar node of a reverse-mode automatic-differentiation graph.

    Attributes:
        data: the number held by this node.
        grad: accumulated gradient of the output node with respect to this
            node; starts at 0.0 and is filled in by ``backward``.
        _backward: closure that pushes this node's gradient to its children.
            It is a no-op for leaves and is replaced by each operation.
        _prev: set of the nodes this node was computed from.
    """

    def __init__(self, data, _children=()):
        self.data = data
        self.grad = 0.0 # Filled in during backprop.
        self._backward = lambda: None # Replaced by the operation that creates this node.
        self._prev = set(_children) # Duplicate children collapse; gradients still accumulate via +=.

    def __repr__(self):
        return (f"Value(data={self.data}, grad={self.grad})")

    def __add__(self, other):
        """Return ``self + other``; plain numbers are wrapped in a Value."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other))

        def _backward():
            # d(a+b)/da = d(a+b)/db = 1; += so a node used twice accumulates both paths.
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward

        return out

    def __mul__(self, other):
        """Return ``self * other``; plain numbers are wrapped in a Value."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other))

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward

        return out

    def __pow__(self, other):
        """Return ``self ** other`` for a constant int/float exponent."""
        assert isinstance(other, (int, float)), "only int/float exponents are supported"
        out = Value(self.data**other, (self,))

        def _backward():
            # Power rule combined with the chain rule.
            self.grad += other * (self.data ** (other - 1)) * out.grad
        out._backward = _backward

        return out

    # The operators below are conveniences built on __add__/__mul__/__pow__ so that
    # expressions mixing Values and plain ints/floats work in either operand order.

    def __rmul__(self, other):
        return self * other

    def __neg__(self):
        return self * -1

    def __sub__(self, other):
        return self + (-other)

    def __radd__(self, other):
        return self + other

    def __rsub__(self, other):
        return -self + other

    def __truediv__(self, other):
        """Return ``self / other`` expressed as ``self * other**-1``."""
        other = other if isinstance(other, Value) else Value(other)
        return self * other**-1

    def __rtruediv__(self, other):
        """Return ``other / self`` for a plain number on the left."""
        return other * self**-1

    def sigmoid(self):
        """Logistic sigmoid activation, evaluated without overflowing for large |data|."""
        x = self.data
        if x >= 0:
            s = 1 / (1 + math.exp(-x))
        else:
            # exp(-x) would overflow for very negative x; exp(x) only underflows to 0.
            e = math.exp(x)
            s = e / (1 + e)
        out = Value(s, (self,))

        def _backward():
            # d(sigmoid)/dx = sigmoid * (1 - sigmoid)
            self.grad += out.data * (1 - out.data) * out.grad
        out._backward = _backward

        return out

    def tanh(self):
        """Hyperbolic tangent activation."""
        # math.tanh saturates to +/-1 instead of overflowing like exp(2x) does.
        t = math.tanh(self.data)
        out = Value(t, (self, ))

        def _backward():
            self.grad += (1 - t**2) * out.grad
        out._backward = _backward

        return out

    def ReLU(self):
        """Rectified linear unit: max(data, 0)."""
        x = np.maximum(self.data, 0)
        out = Value(x, (self,))

        def _backward():
            # Gradient passes through only where the output is positive.
            self.grad += (1 if x > 0 else 0) * out.grad
        out._backward = _backward

        return out

    def exp(self):
        """Return e ** data."""
        x = self.data
        out = Value(np.exp(x), (self,))

        def _backward():
            # d(e^x)/dx = e^x, times the upstream gradient.
            self.grad += out.data * out.grad
        out._backward = _backward

        return out

    def log(self):
        """Natural logarithm; ``math.log`` raises ValueError when data <= 0."""
        x = self.data
        out = Value(math.log(x), (self,))

        def _backward():
            self.grad += (1/x) * out.grad
        out._backward = _backward

        return out

    def backward(self):
        """
        Backpropagate from this node, which is treated as the final output.

        Sets this node's grad to 1.0 and then runs every reachable node's
        ``_backward`` in reverse topological order. Gradients are accumulated
        with +=, so parameter grads must be zeroed between calls.
        """
        # Iterative post-order DFS (children before parents). An explicit stack is
        # used so deep graphs, e.g. a long chain of additions, cannot exceed
        # Python's recursion limit.
        topo = []
        visited = {self}
        stack = [(self, iter(self._prev))]
        while stack:
            node, pending = stack[-1]
            for child in pending:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._prev)))
                    break
            else:
                # All children of this node are already emitted.
                topo.append(node)
                stack.pop()

        self.grad = 1.0
        for node in reversed(topo): # Output first, leaves last.
            node._backward()

In [256]:
class Neuron:
    """
    A single neuron: weighted sum of its inputs plus a bias, followed by an activation.

    Args:
        numInputs: number of inputs, which is also the number of weights created.
        activation: "tanh" or "sigmoid"; any other value selects ReLU.
    """
    def __init__(self, numInputs, activation):
        self.activation = activation
        # Weights are drawn first and the bias last, all uniformly from [-0.1, 0.1).
        self.weights = []
        for _ in range(numInputs):
            self.weights.append(Value(np.random.uniform(-0.1, 0.1)))
        self.bias = Value(np.random.uniform(-0.1,0.1))

    def __call__(self, x):
        """Forward pass: return the activated output for the input sequence ``x``."""
        inputs = x if isinstance(x, np.ndarray) else np.array(x)

        # Start from the bias and add one weighted input at a time; the result
        # has whatever type the bias/weights have (Value in this notebook).
        pre_activate = self.bias
        for wi, xi in zip(self.weights, inputs):
            pre_activate = pre_activate + wi*xi

        if self.activation == "tanh":
            return pre_activate.tanh()
        if self.activation == "sigmoid":
            return pre_activate.sigmoid()
        return pre_activate.ReLU()

    def getParams(self):
        """Return the trainable parameters: every weight followed by the bias."""
        return self.weights + [self.bias]

In [257]:
class Layer:
    """
    A fully connected layer: a list of neurons that all receive the same input.
    """

    def __init__(self, numInputs, numNeuronsInLayer, activation):
        # One neuron per output of the layer; each takes numInputs inputs.
        self.neurons = []
        for _ in range(numNeuronsInLayer):
            self.neurons.append(Neuron(numInputs, activation))

    def __call__(self, x):
        """Evaluate every neuron on ``x``; a single-neuron layer returns its lone output unwrapped."""
        outputs = [neu(x) for neu in self.neurons]
        if len(outputs) == 1:
            return outputs[0]
        return outputs

    def getParams(self):
        """Return the parameters of all neurons, concatenated in neuron order."""
        collected = []
        for neuron in self.neurons:
            collected.extend(neuron.getParams())
        return collected

In [258]:
class MomentumOptimizer:
    """
    Gradient descent with an exponentially averaged velocity (momentum).

    A beta of 0 or less disables momentum and gives regular gradient descent.
    """

    def __init__(self, params, alpha=0.1, beta=0.9):
        self.params = params
        self.alpha = alpha
        self.beta = beta
        # One velocity slot per parameter; not allocated when momentum is disabled.
        self.velocities = None
        if beta > 0:
            self.velocities = [np.zeros_like(p.data) for p in params]

    def zero_grad(self):
        """Reset every parameter's gradient to 0.0."""
        for param in self.params:
            param.grad = 0.0

    def step(self):
        """
        Apply one update to every parameter:
           v = beta * v + (1 - beta) * grad
           p = p - lr * v
        With beta <= 0 the raw gradient is used in place of v.
        """
        use_momentum = self.beta > 0
        for idx, param in enumerate(self.params):
            if not use_momentum:
                update = param.grad
            else:
                update = self.velocities[idx] * self.beta + (1 - self.beta) * param.grad
                self.velocities[idx] = update

            param.data -= update * self.alpha

In [259]:
class RMSPropOptimizer:
    """
    RMSProp: scales each parameter's step by a running average of its squared gradients.
    """

    def __init__(self, params, alpha=0.0001, beta=0.9, eps=1e-8):
        self.params = params
        self.alpha = alpha
        self.beta = beta
        self.eps = eps

        # Running average of squared gradients, one entry per parameter.
        self.cache = [0.0] * len(params)

    def zero_grad(self):
        """Reset every parameter's gradient to 0.0."""
        for param in self.params:
            param.grad = 0.0

    def step(self):
        """Refresh the squared-gradient average and apply one update to every parameter."""
        for i, param in enumerate(self.params):
            grad = param.grad
            running = self.beta * self.cache[i] + (1.0 - self.beta) * (grad ** 2)
            self.cache[i] = running

            # eps keeps the denominator away from zero.
            param.data -= self.alpha * grad / (np.sqrt(running) + self.eps)


In [260]:
import numpy as np

class AdamOptimizer:
    """
    Adam optimizer: bias-corrected running averages of the gradient and of its square.
    """
    def __init__(self, params, alpha=1e-4, beta1=0.9, beta2=0.999, eps=1e-8):
        self.params = params
        self.alpha = alpha
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps

        # First (m) and second (v) moment estimates, one entry per parameter.
        self.m = [0.0] * len(params)
        self.v = [0.0] * len(params)

        # Number of steps taken so far; used for bias correction.
        self.t = 0

    def zero_grad(self):
        """Reset every parameter's gradient to 0.0."""
        for param in self.params:
            param.grad = 0.0

    def step(self):
        """
        Apply one Adam update to every parameter:

            t = t + 1
            m_t = beta1 * m_{t-1} + (1 - beta1) * grad
            v_t = beta2 * v_{t-1} + (1 - beta2) * (grad^2)

            m_hat = m_t / (1 - beta1^t)
            v_hat = v_t / (1 - beta2^t)

            param = param - alpha * m_hat / (sqrt(v_hat) + eps)
        """
        self.t += 1
        # The bias-correction denominators depend only on t, so compute them once.
        correction1 = 1 - self.beta1 ** self.t
        correction2 = 1 - self.beta2 ** self.t

        for i, param in enumerate(self.params):
            grad = param.grad

            first = self.beta1 * self.m[i] + (1 - self.beta1) * grad
            second = self.beta2 * self.v[i] + (1 - self.beta2) * (grad ** 2)
            self.m[i] = first
            self.v[i] = second

            m_hat = first / correction1
            v_hat = second / correction2

            param.data -= self.alpha * m_hat / (np.sqrt(v_hat) + self.eps)


In [261]:
def get_batch(xs, ys, batch_size=32):
    """
    Draw a random mini-batch of matching (x, y) pairs without replacement.

    Args:
        xs: indexable sequence of inputs.
        ys: indexable sequence of targets, aligned with ``xs`` by position.
        batch_size: number of samples wanted. If it exceeds ``len(xs)`` the
            whole dataset is returned (in random order) instead of raising.

    Returns:
        (x_batch, y_batch): two lists of equal length.
    """
    # Sampling without replacement cannot return more items than exist, so
    # clamp the request; np.random.choice raises ValueError otherwise.
    size = min(batch_size, len(xs))
    idxs = np.random.choice(len(xs), size, replace=False)
    x_batch = [xs[i] for i in idxs]
    y_batch = [ys[i] for i in idxs]
    return x_batch, y_batch


class MLP:
    """
    A trainable multi-layer perceptron built from Layer objects.

    Args:
        numInps: number of input features.
        numNeuronsPerLayer: ordered sequence; its length is the number of
            layers and each value is the number of neurons in that layer.
        activation: activation name passed to every layer.
        loss_function: "mse" (summed squared error) or "ce" (summed binary
            cross-entropy).
    """

    def __init__(self, numInps, numNeuronsPerLayer, activation='tanh', loss_function='mse'):
        # Each layer's input count equals the previous layer's output count,
        # so consecutive entries of `sizes` describe one layer.
        sizes = [numInps] + list(numNeuronsPerLayer)
        self.layers = [Layer(sizes[i], sizes[i+1], activation) for i in range(len(sizes) - 1)]
        self.activation = activation
        self.loss_function = loss_function

    def compute_loss(self, ys, yhats):
        """
        Return the loss summed over the batch.

        Raises:
            ValueError: if ``self.loss_function`` is not "mse" or "ce".
        """
        if self.loss_function == "mse":
            return sum((yhat - y)**2 for y, yhat in zip(ys, yhats))
        if self.loss_function == "ce": # binary cross-entropy
            return -sum(y * yhat.log() + (1 - y) * (1 - yhat).log() for y, yhat in zip(ys, yhats))
        # Previously this fell through and returned None, which only surfaced
        # later as an AttributeError on loss.backward().
        raise ValueError(f"Unsupported loss_function: {self.loss_function!r} (expected 'mse' or 'ce')")

    def __call__(self, x):
        """Forward pass: feed ``x`` through every layer and return the last layer's output."""
        for layer in self.layers:
            x = layer(x)
        return x

    def getParams(self):
        """Return the parameters of all layers, in layer order."""
        return [p for layer in self.layers for p in layer.getParams()]

    def train(self, xs, ys, optimizer, max_iter=500, batch_size=24):
        """
        Run mini-batch training and print the batch loss every 50 iterations.

        Args:
            xs: training inputs.
            ys: training targets aligned with ``xs``.
            optimizer: object exposing ``zero_grad()`` and ``step()``.
            max_iter: number of mini-batch updates to perform.
            batch_size: samples per update; clamped to ``len(xs)``.

        Raises:
            TypeError: if ``optimizer`` lacks a callable ``zero_grad`` or ``step``.
        """
        # The old check asserted a TypeError *instance* (always truthy) behind a
        # condition that was always true, so it could never reject anything.
        has_interface = (callable(getattr(optimizer, "zero_grad", None))
                         and callable(getattr(optimizer, "step", None)))
        if not has_interface:
            raise TypeError("Please use a pre-made optimizer")

        effective_batch = min(batch_size, len(xs))

        for epoch in range(max_iter):
            optimizer.zero_grad()

            x_b, y_b = get_batch(xs, ys, batch_size=effective_batch)

            # Forward pass: one prediction per sample in the batch.
            y_preds = [self(x) for x in x_b]

            loss = self.compute_loss(y_b, y_preds)

            loss.backward()

            optimizer.step()

            if epoch % 50 == 0:
                print(f"Epoch {epoch}, Loss = {loss.data}")

In [262]:
import pandas as pd
from sklearn.model_selection import train_test_split

# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is.
WINE_CSV_PATH = '/Users/liambouayad/Documents/Coding/Personal/Neural_Networks/micrograd/wine+quality/winequality-white.csv'
df = pd.read_csv(WINE_CSV_PATH, sep=';')

# Binarise the target: 1 when quality is above 5, otherwise 0.
df["quality"] = (df["quality"] > 5) * 1

# Split the table into the feature columns and the target column.
y = df['quality']
X = df.drop(columns=['quality'])

def scale_columns(dataframe, cols):
    """
    Return a copy of ``dataframe`` with the given columns standardised.

    Each listed column has its mean subtracted and is divided by its standard
    deviation as returned by ``Series.std()``. The input frame is not modified.

    Args:
        dataframe: source DataFrame.
        cols: iterable of column labels to standardise.

    Returns:
        A new DataFrame; columns not listed in ``cols`` are left unchanged.
    """
    scaled_df = dataframe.copy()
    for c in cols:
        mean = scaled_df[c].mean()
        std = scaled_df[c].std()
        # A constant column has std 0 and a one-row column has std NaN; dividing
        # by either would fill the column with NaN, so only centre it instead.
        if not std > 0:
            std = 1.0
        scaled_df[c] = (scaled_df[c] - mean) / std
    return scaled_df

# Standardise every feature column (zero mean, unit standard deviation).
X = scale_columns(X, X.columns)

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
# NOTE(review): the scaling above uses the mean/std of ALL rows, so test rows
# influence the statistics applied to the training rows — consider computing
# them on X_train only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# One input per feature column, a hidden layer of 8 neurons and a single output neuron;
# sigmoid activation in every layer, summed binary cross-entropy as the loss.
mlp = MLP(X_train.shape[1], [8, 1], activation='sigmoid', loss_function="ce")

In [264]:
# Adam over every weight and bias of the network.
# NOTE(review): the recorded training log hovers around 15-16 for all 1001 iterations;
# alpha=5e-5 may be too small for the model to move — try a larger step size.
optimizer = AdamOptimizer(mlp.getParams(), alpha=5e-5)

In [269]:

# Pull the training split out of pandas as nested Python lists.
Xt = X_train.values.tolist()
yt = y_train.values.tolist()

# Coerce every entry to a built-in float so the network only sees plain numbers.
X_toTrain = [[float(x_i) for x_i in x] for x in Xt]
y_toTrain = [float(y) for y in yt]

# Train on the first 100 rows only.
mlp.train(X_toTrain[:100], y_toTrain[:100], optimizer, max_iter=1001)


Epoch 0, Loss = 16.53284003465768
Epoch 50, Loss = 15.694259144907848
Epoch 100, Loss = 15.216361372850153
Epoch 150, Loss = 16.773477063925203
Epoch 200, Loss = 16.07770029090445
Epoch 250, Loss = 16.30729368217303
Epoch 300, Loss = 15.571666173658041
Epoch 350, Loss = 16.050378082677522
Epoch 400, Loss = 16.546188070499376
Epoch 450, Loss = 16.28323486266553
Epoch 500, Loss = 15.472633305603075
Epoch 550, Loss = 14.955410675013523
Epoch 600, Loss = 14.890544957621579
Epoch 650, Loss = 14.880944636240143
Epoch 700, Loss = 14.515437109601903
Epoch 750, Loss = 15.402785450814827
Epoch 800, Loss = 15.664400010347766
Epoch 850, Loss = 18.373239081322826
Epoch 900, Loss = 16.860359792145406
Epoch 950, Loss = 15.943738844325278
Epoch 1000, Loss = 15.293420500197941


In [268]:
# Share of each class in the binarised target
class_share = df['quality'].value_counts(normalize=True)
print("Class distribution:")
print(class_share)

# Correlation of every column with the target, largest first
correlations = df.corr()['quality'].sort_values(ascending=False)
print("\nFeature correlations with quality:")
print(correlations)

# Summary statistics, to spot extreme values
feature_stats = df.describe()
print("\nFeature statistics:")
print(feature_stats)

Class distribution:
quality
1    0.665169
0    0.334831
Name: proportion, dtype: float64

Feature correlations with quality:
quality                 1.000000
alcohol                 0.383280
pH                      0.083687
sulphates               0.051858
citric acid            -0.000700
free sulfur dioxide    -0.001278
fixed acidity          -0.089749
residual sugar         -0.092756
total sulfur dioxide   -0.170924
chlorides              -0.183939
volatile acidity       -0.225440
density                -0.268696
Name: quality, dtype: float64

Feature statistics:
       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    4898.000000       4898.000000  4898.000000     4898.000000   
mean        6.854788          0.278241     0.334192        6.391415   
std         0.843868          0.100795     0.121020        5.072058   
min         3.800000          0.080000     0.000000        0.600000   
25%         6.300000          0.210000     0.270000        1.700000   
50

In [267]:
# 3. Add validation metrics
def calculate_accuracy(model, X, y):
    """
    Fraction of samples whose thresholded prediction equals the label.

    ``model(x_i).data > 0.5`` is taken as the predicted class and compared with
    ``y_i`` using ``==``. Samples are paired with ``zip``, so extra items in the
    longer of ``X``/``y`` are ignored.
    """
    matches = [(model(x_i).data > 0.5) == y_i for x_i, y_i in zip(X, y)]
    # Start the sum from an int so the result type does not depend on the list contents.
    return sum(matches, 0) / len(matches)

# Evaluated once here, on the current state of `mlp`; this is not wired into the
# training loop's every-50-epochs logging.
# NOTE(review): this scores all of X_toTrain, whereas the training cell only samples
# from the first 100 rows — confirm which subset is intended.
train_acc = calculate_accuracy(mlp, X_toTrain, y_toTrain)
print(f"Training accuracy: {train_acc:.4f}")


Training accuracy: 0.6633


In [266]:
"""
Summarized TODO

Implement dropout ? (Maybe)
Visualizations
Train on real dataset and use scikit learn to document results (f1 score, confusion matrix)
Since the wine dataset isn't training well, try MNIST


Built an extensible MLP library from scratch, implementing backpropagation, optimizers (SGD, Adam), and modular activation/loss functions.
Applied the model to practical tasks, including fraudulent transaction detection (99.2% F1 score) and image classification (97% on MNIST).
Developed tools for batch processing, saving/loading models, and training visualization, competing with Scikit-learn and PyTorch on small-scale datasets.

"""

'\nSummarized TODO\n\nImplement dropout ? (Maybe)\nVisualizations\nTrain on real dataset and use scikit learn to document results (f1 score, confusion matrix)\nSince wine set isnt working well, try MNIST\n\n\nBuilt an extensible MLP library from scratch, implementing backpropagation, optimizers (SGD, Adam), and modular activation/loss functions.\nApplied the model to practical tasks, including fraudulent transaction detection (99.2% F1 score) and image classification (97% on MNIST).\nDeveloped tools for batch processing, saving/loading models, and training visualization, competing with Scikit-learn and PyTorch on small-scale datasets.\n\n'