In [47]:
import math
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [48]:
class Value:
    """A scalar that remembers how it was computed so gradients can be backpropagated.

    Every arithmetic operation returns a new ``Value`` whose ``_prev`` holds the
    operands and whose ``_backward`` closure adds the local derivative (times the
    result's ``grad``) into each operand's ``grad``.

    Attributes:
        data: The wrapped number.
        grad: Accumulated derivative of the node ``backward()`` was called on
            with respect to this node; starts at 0.0.
        label: Optional human-readable name for the node.
    """

    def __init__(self, data, _children=(), _op='', label='') -> None:
        self.data = data  # The actual data value
        self.grad = 0.0  # Gradient, filled in by backward()
        # Leaf nodes have nothing to propagate, so the default is a no-op
        self._backward = lambda: None
        self._prev = set(_children)  # Operand nodes this value was computed from
        self._op = _op  # The operation that created this node
        self.label = label  # Optional label for the node

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        """Return ``self + other``; plain numbers are wrapped in a ``Value``."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')
        def _backward():
            # d(a+b)/da = d(a+b)/db = 1, so the gradient flows through unchanged
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward
        return out

    def __radd__(self, other):
        """Return ``other + self`` (addition is commutative, so reuse ``__add__``)."""
        return self + other

    def __neg__(self):
        """Return ``-self``, implemented as multiplication by -1."""
        return self * -1

    def __sub__(self, other):
        """Return ``self - other``, implemented as ``self + (-other)``."""
        return self + (-other)

    def __rsub__(self, other):
        """Return ``other - self`` so that e.g. ``2 - Value(3)`` works."""
        return (-self) + other

    def __mul__(self, other):
        """Return ``self * other``; plain numbers are wrapped in a ``Value``."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')
        def _backward():
            # Product rule: each operand's local derivative is the other operand
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def __rmul__(self, other):
        """Return ``other * self`` (multiplication is commutative, so reuse ``__mul__``)."""
        return self * other

    def __truediv__(self, other):
        """Return ``self / other``, implemented as ``self * other**-1``."""
        return self * other**-1

    def __rtruediv__(self, other):
        """Return ``other / self`` so that e.g. ``6 / Value(3)`` works."""
        return (self**-1) * other

    def __pow__(self, other):
        """Return ``self ** other`` for a constant int/float exponent.

        Raises:
            AssertionError: If ``other`` is not an int or float.
        """
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f'**{other}')
        def _backward():
            # Power rule: d(x**n)/dx = n * x**(n-1).
            # For n == 0 the derivative is 0; skipping it avoids evaluating
            # x**-1, which raises ZeroDivisionError when x == 0.
            if other != 0:
                self.grad += other * (self.data ** (other - 1)) * out.grad
        out._backward = _backward
        return out

    def tanh(self):
        """Return ``tanh(self)`` as a new ``Value``."""
        # math.tanh saturates to +/-1 for large |x|; the explicit
        # (exp(2x) - 1) / (exp(2x) + 1) form overflows in math.exp instead.
        t = math.tanh(self.data)
        out = Value(t, (self,), 'tanh')
        def _backward():
            # d tanh(x)/dx = 1 - tanh(x)**2
            self.grad += (1 - t**2) * out.grad
        out._backward = _backward
        return out

    def exp(self):
        """Return ``e ** self`` as a new ``Value``."""
        x = self.data
        out = Value(math.exp(x), (self,), 'exp')
        def _backward():
            # d exp(x)/dx = exp(x), which is exactly out.data
            self.grad += out.data * out.grad
        out._backward = _backward
        return out

    def backward(self):
        """Backpropagate from this node, accumulating ``grad`` on every ancestor.

        Sets ``self.grad`` to 1.0 and then calls each node's ``_backward`` in
        reverse topological order. Gradients are accumulated with ``+=``, so
        any ``grad`` left over from an earlier call is added to, not replaced.
        """
        topo = []  # Nodes in topological order (operands before results)
        visited = {self}

        # Iterative post-order depth-first search. An explicit stack is used
        # instead of recursion so that long operation chains do not exceed
        # Python's recursion limit.
        stack = [(self, iter(self._prev))]
        while stack:
            node, children = stack[-1]
            for child in children:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._prev)))
                    break
            else:
                # All operands of `node` have been emitted; emit the node itself
                topo.append(node)
                stack.pop()

        self.grad = 1.0  # d(self)/d(self)
        for node in reversed(topo):
            node._backward()  # Apply the chain rule in reverse topological order

In [49]:
import random

In [50]:
class Neuron:
    """A single tanh unit: ``tanh(w . x + b)`` with randomly initialised parameters."""

    def __init__(self, nin) -> None:
        # One weight per input, each drawn uniformly from [-1, 1]
        self.w = []
        for _ in range(nin):
            self.w.append(Value(random.uniform(-1,1)))
        # The bias is drawn from the same range, after the weights
        self.b = Value(random.uniform(-1,1))

    def __call__(self, x):
        """Forward pass: weighted sum of the inputs plus the bias, squashed by tanh."""
        # Start from the bias and add one weight*input product at a time
        act = self.b
        for weight, value in zip(self.w, x):
            act = act + weight * value
        return act.tanh()

    def parameters(self):
        """Return a new list containing every weight followed by the bias."""
        return [*self.w, self.b]

In [51]:
# Define a layer of neurons
class Layer:
    """A collection of neurons that all receive the same input."""

    def __init__(self, nin, nout):
        # Build 'nout' neurons, each taking 'nin' inputs
        self.neurons = [Neuron(nin) for _ in range(nout)]

    def __call__(self, x):
        """Run every neuron on ``x``.

        Args:
            x: An iterable of inputs, or a single non-iterable input. A single
                input is accepted because a one-neuron layer returns its output
                unwrapped, and that output may be fed straight into another layer.

        Returns:
            The lone neuron's output when the layer has exactly one neuron,
            otherwise a list with one output per neuron.
        """
        # Materialise the input once: this wraps a bare scalar in a list and
        # stops a one-shot iterator from being exhausted by the first neuron.
        inputs = list(x) if hasattr(x, '__iter__') else [x]
        outs = [neuron(inputs) for neuron in self.neurons]
        return outs[0] if len(outs) == 1 else outs

    def parameters(self):
        """Return a flat list of the parameters of every neuron in the layer."""
        return [p for neuron in self.neurons for p in neuron.parameters()]

In [52]:
class MLP:
    """A multi-layer perceptron: a stack of ``Layer`` objects applied in sequence."""

    def __init__(self, nin, nouts):
        """Build the network.

        Args:
            nin: Number of inputs to the first layer.
            nouts: Iterable giving the number of neurons in each layer, in order
                (a list, tuple, or any other iterable of sizes).
        """
        # Unpacking (instead of `[nin] + nouts`) accepts any iterable, not just a list
        sz = [nin, *nouts]
        # Layer i maps sz[i] inputs to sz[i+1] outputs
        self.layers = [Layer(n_in, n_out) for n_in, n_out in zip(sz, sz[1:])]

    def __call__(self, x):
        """Forward pass: feed ``x`` through every layer and return the last output."""
        for layer in self.layers:
            x = layer(x)
        return x

    def parameters(self):
        """Return a flat list of the parameters of every layer."""
        return [p for layer in self.layers for p in layer.parameters()]

In [53]:
# Seed Python's RNG before the network is built: Neuron draws its initial
# weights and bias from random.uniform, so without a fixed seed every
# Restart & Run All produces different numbers in all the cells below.
random.seed(1337)

# Define the input vector 'x' with 3 elements
x = [2.0, 3.0, -1.0]

# Create an instance of the MLP (Multi-Layer Perceptron) class
# The MLP takes 3 inputs and has two hidden layers of 4 neurons each, followed by 1 output neuron
n = MLP(3, [4, 4, 1])

# Perform a forward pass of the MLP with the input vector 'x'
# This computes the network's output for the current (randomly initialised) weights and biases
n(x)

Value(data=-0.5766674457872752)

In [54]:
# Retrieve every weight and bias in the network as one flat list.
# Each entry is a Value instance, which holds both the parameter's data and its gradient.
# Each neuron has one weight per input plus a bias, so MLP(3, [4, 4, 1]) has
# (3+1)*4 + (4+1)*4 + (4+1)*1 = 41 parameters.
n.parameters()

[Value(data=0.1986388682369966),
 Value(data=-0.38658409972450913),
 Value(data=-0.43356535849638655),
 Value(data=0.5283833703018346),
 Value(data=0.40947882159933346),
 Value(data=-0.03544273206091342),
 Value(data=-0.8675990444069803),
 Value(data=0.13285281424044548),
 Value(data=0.9701868959297075),
 Value(data=-0.287554470155327),
 Value(data=0.05578963974264184),
 Value(data=-0.1798031880524662),
 Value(data=0.9792678427998209),
 Value(data=-0.8091181672875454),
 Value(data=0.716818927344054),
 Value(data=0.5761908449845983),
 Value(data=0.5626936883683891),
 Value(data=-0.13743583850914276),
 Value(data=0.47902489980249996),
 Value(data=-0.4736473484596744),
 Value(data=-0.7059417717912215),
 Value(data=-0.5277083367358057),
 Value(data=-0.0552780165803457),
 Value(data=-0.04081743681524319),
 Value(data=-0.6845945830647193),
 Value(data=0.39300400993246476),
 Value(data=0.27208238625568004),
 Value(data=0.9712476303988329),
 Value(data=-0.24203629498725898),
 Value(data=-0.424

In [55]:
# Example dataset: 4 input vectors for the neural net and the 4 targets it should learn to produce

# Input data: a list of 4 input vectors, each with 3 features
xs = [
    [2.0, 3.0, -1.0],  # First input vector
    [3.0, -1.0, 0.5],  # Second input vector
    [0.5, 1.0, 1.0],   # Third input vector
    [1.0, 1.0, -1.0]   # Fourth input vector
]

# Desired target outputs, one per input vector in 'xs' (same order)
# Every target is either 1.0 or -1.0, so this is a simple binary classification task
ys = [1.0, -1.0, -1.0, 1.0]  # Desired targets: 
                             # We want the neural net to output 1.0 for xs[0], 
                             # -1.0 for xs[1], -1.0 for xs[2], and 1.0 for xs[3]

In [56]:
# Training: tune the network's weights so that its outputs approach the targets.
# Performance is summarised by a single number, the 'loss'; it starts out high
# because the randomly initialised network predicts poorly.
# Run 40 steps of gradient descent.
for k in range(40):
    # Forward pass: one prediction per input vector in 'xs'.
    ypred = [n(xi) for xi in xs]

    # Loss: sum over the examples of (prediction - target) squared.
    loss = sum((yout - ygt)**2 for yout, ygt in zip(ypred, ys))

    # Gradients accumulate across backward passes, so clear them first.
    for p in n.parameters():
        p.grad = 0.0
    # Backward pass: fill in d(loss)/d(parameter) for every parameter.
    loss.backward()

    # Gradient-descent step with learning rate 0.05: move each parameter
    # a small distance against its gradient, which lowers the loss.
    for p in n.parameters():
        p.data -= 0.05 * p.grad

    # Report the step number and the loss measured before this step's update.
    print(k, loss.data)

# About the loss:
# - Each of the 4 examples contributes the squared difference between its prediction and its target.
# - A prediction close to its target contributes a small amount.
# - Squaring makes every contribution non-negative, whether the prediction is above or below the target.
# - The further a prediction is from its target, the larger its contribution.
# - The total loss is the sum of these contributions; lower loss means a better-performing network.

0 6.038323437096592
1 1.6320927822339133
2 0.5678337462970057
3 0.31243055577866463
4 0.23414969663252577
5 0.1869182019364849
6 0.15520087153140555
7 0.1324167122719434
8 0.1152641607176135
9 0.10189526400913762
10 0.09119193942471443
11 0.0824369740847832
12 0.07514880699761076
13 0.06899200280236892
14 0.06372573314320923
15 0.05917264577935922
16 0.055199275352608525
17 0.05170328784115038
18 0.04860492785402208
19 0.04584113646074944
20 0.04336141468892814
21 0.04112485685979385
22 0.03909798532684644
23 0.037253145107249815
24 0.035567296636412234
25 0.03402109616215525
26 0.032598186983783466
27 0.03128464729811663
28 0.030068555782651
29 0.02893964668428659
30 0.027889033654469344
31 0.026908986891383813
32 0.025992751984501218
33 0.02513440165302714
34 0.024328713630809833
35 0.02357106948466338
36 0.022857370306139248
37 0.022183966090943672
38 0.02154759628839608
39 0.02094533951806303


In [57]:
# 'ypred' is the list built by the forward pass inside the training loop,
# so it holds one network output per input vector in 'xs'.
# Note: it was computed at the start of the final iteration, before that iteration's
# parameter update, so these values correspond to the last printed loss.
# The goal is for these predictions to be as close as possible to the desired target outputs 'ys'.
ypred

[Value(data=0.9460600613150587),
 Value(data=-0.9204856875534216),
 Value(data=-0.9350846831651825),
 Value(data=0.9134015110460294)]

## Summary of What We Have Learned

### What Are Neural Nets?

Neural networks are mathematical expressions that:

1. **Take data as input**:
   - They accept the input data together with the network's own parameters (the weights and biases of its neurons).

2. **Perform a forward pass**:
   - This involves calculating the neuron’s mathematical expression to predict outputs.
   - A loss function then measures the accuracy of the predictions. Generally, the loss will be low when predictions closely match the targets, indicating that the network is performing well.

3. **Optimize using the loss function**:
   - The loss function is designed so that a low loss means the network is doing what we want; training then reduces to making the loss as small as possible.

4. **Backpropagate the loss**:
   - Through backpropagation, the gradient of the loss is calculated, which informs how to tune all the parameters to decrease the loss locally.
   - This process is iterated multiple times using gradient descent.

5. **Minimize the loss**:
   - By following the gradient, the loss is minimized, ensuring that the network performs the desired task correctly.

### Key Insights

- Neural networks can be seen as a "blob" of neural connections that can perform arbitrary tasks, which is the source of their power.
- Even a tiny network with 41 parameters can solve problems, but significantly more complex neural networks with billions (or even trillions) of parameters are now common.

### Complex Neural Nets and Emerging Properties

- **Example: GPT**
  - In the case of GPT, the task involves predicting the next word in a sequence based on massive amounts of internet text data. This learning problem uses:
    - A neural network with hundreds of billions of parameters.
    - Cross-entropy loss for predicting the next token, instead of the sum-of-squared-errors loss used in this notebook.
  - Despite its complexity, the underlying principles remain identical:
    - Gradient evaluation is the same.
    - Gradient descent operates similarly.

- **Remarkable Emerging Properties**:
  - Training on large datasets often reveals fascinating and unexpected behaviors in neural networks.

### Conclusion

Setting up and training a neural network works on the same fundamental principles regardless of scale. Having built one from scratch, we now have an intuitive understanding of how these processes work under the hood, which makes it easier to grasp how much larger networks can solve extremely complex problems.