# Solving the XOR problem with an MLP

In [1]:
# general imports and configuration

import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import random
import sys
import torch

# Locate the "src" directory two levels above the current working directory;
# presumably this is where the project's local packages live (TODO confirm).
WORKSPACE_PATH = Path.cwd().parent.parent
SRC_PATH = str(WORKSPACE_PATH / "src")

# Prepend the directory only once, so re-running this cell does not add duplicates.
if SRC_PATH not in sys.path:
    sys.path.insert(0, SRC_PATH)
    # NOTE(review): the label says "$PATH", but the value printed is Python's
    # sys.path (the module search path), not the shell PATH variable.
    print(f"echo $PATH={sys.path}")


%matplotlib inline

# Seed NumPy, the stdlib `random` module and PyTorch with the same value.
my_random_seed = 1337
np.random.seed(my_random_seed)
random.seed(my_random_seed)
torch.manual_seed(my_random_seed)  # Set seed for PyTorch

echo $PATH=['/Users/jeanmetz/workspace/playground-llm-pytorch/src', '/Users/jeanmetz/.pyenv/versions/3.12.2/lib/python312.zip', '/Users/jeanmetz/.pyenv/versions/3.12.2/lib/python3.12', '/Users/jeanmetz/.pyenv/versions/3.12.2/lib/python3.12/lib-dynload', '', '/Users/jeanmetz/workspace/playground-llm-pytorch/.venv/lib/python3.12/site-packages']


<torch._C.Generator at 0x119ca7bf0>

## Start with the PyTorch implementation (for reference)

In [2]:
import torch.nn as nn
import torch.optim as optim
from torchsummary import summary

In [3]:
class TorchXor(nn.Module):
    """Small multilayer perceptron for the XOR problem.

    Architecture (2 inputs -> 4 tanh hidden units -> 1 sigmoid output):

        x1 ─┐    ┌─> [ Hidden 1 (tanh) ] ─┐
            │    ├─> [ Hidden 2 (tanh) ] ─┤
            ├────┤                        ├─> [ Output Neuron (Sigmoid) ]
            │    ├─> [ Hidden 3 (tanh) ] ─┤
        x2 ─┘    └─> [ Hidden 4 (tanh) ] ─┘

    The whole stack is stored in ``self.layers`` as an ``nn.Sequential``.
    """

    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(2, 4),  # 2 inputs, 4 neurons in hidden layer
            nn.Tanh(),  # Add non-linearity to the network
            nn.Linear(4, 1),  # 4 neurons in hidden layer, 1 output
            nn.Sigmoid(),  # Activation for output layer
        )

    def forward(self, x):
        """Run the network on ``x``.

        Args:
            x: Tensor whose last dimension has size 2 (one input pair per row).

        Returns:
            Tensor whose last dimension has size 1, holding the sigmoid
            activations (values between 0 and 1).
        """
        return self.layers(x)

In [4]:
# Hyper-parameters for the PyTorch run.
# NOTE: `epochs` is assigned again at the top of each training-loop cell before
# it is read; `lr` is also reused by the micrograd training loop further down.
epochs = 1001
lr = 0.1

# Network, objective (mean squared error) and optimiser (plain SGD over all
# of the model's parameters).
model = TorchXor()
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

In [5]:
# Print a per-layer table of output shapes and parameter counts for `model`.
summary(model, (2,))  # (2,) is the shape of the input tensor

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                    [-1, 4]              12
              Tanh-2                    [-1, 4]               0
            Linear-3                    [-1, 1]               5
           Sigmoid-4                    [-1, 1]               0
Total params: 17
Trainable params: 17
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


In [6]:
# XOR truth table as float32 tensors: one input pair per row of `inputs`,
# and the matching expected output in the same row of `targets`.
inputs = torch.tensor(
    [[0, 0], [0, 1], [1, 0], [1, 1]],
    dtype=torch.float,
)
targets = torch.tensor([0, 1, 1, 0], dtype=torch.float).unsqueeze(1)

In [7]:
# Training loop: every epoch uses all four XOR samples at once.
epochs = 1000
for epoch in range(epochs):
    # Clear the gradients accumulated by the previous iteration.
    optimizer.zero_grad()

    # Forward pass and loss computation.
    predictions = model(inputs)
    loss = criterion(predictions, targets)

    # Backward pass followed by the parameter update.
    loss.backward()
    optimizer.step()

    # Report the loss once every 100 epochs.
    if not epoch % 100:
        print(f"Epoch [{epoch}/{epochs}], Loss: {loss.item():.4f}")

Epoch [0/1000], Loss: 0.2552
Epoch [100/1000], Loss: 0.2507
Epoch [200/1000], Loss: 0.2483
Epoch [300/1000], Loss: 0.2460
Epoch [400/1000], Loss: 0.2431
Epoch [500/1000], Loss: 0.2388
Epoch [600/1000], Loss: 0.2321
Epoch [700/1000], Loss: 0.2220
Epoch [800/1000], Loss: 0.2079
Epoch [900/1000], Loss: 0.1906


In [8]:
# Evaluate the trained model on the training inputs; gradients are not needed.
with torch.no_grad():
    test_outputs = model(inputs)

print("\nPredicted outputs for XOR inputs:")
print(torch.round(test_outputs))  # Round to 0 or 1 for clarity


Predicted outputs for XOR inputs:
tensor([[0.],
        [1.],
        [1.],
        [1.]])


In [9]:
# Let's try inference
# Switch to evaluation mode
model.eval()

# Define the input data (same as training XOR inputs here, but could be new data)
test_inputs = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32)

# Perform inference
with torch.no_grad():  # No need for gradients during inference
    test_outputs = model(test_inputs)
    predicted = test_outputs.round()  # Round to 0 or 1 for binary classification

print("Inference results:")
# Walk inputs and predictions together. The loop variable is deliberately not
# named `input`: at cell level that would shadow the built-in `input()` for the
# rest of the kernel session.
for sample, prediction in zip(test_inputs, predicted):
    print(f"Input: {sample.numpy()}, Output: {prediction.item()}")

Inference results:
Input: [0. 0.], Output: 0.0
Input: [0. 1.], Output: 1.0
Input: [1. 0.], Output: 1.0
Input: [1. 1.], Output: 1.0


# Build XOR model using the micrograd engine

In [None]:
import micrograd
import micrograd.nn
from micrograd.engine import Operand
from typing import Sequence

# A simple XOR network class definition

In [13]:
class MicrogradXor(micrograd.nn.Module):
    """
    XOR network assembled from two micrograd layers.

    This implementation should mimic pytorch's
    model = nn.Sequential(
            nn.Linear(2, 4),  # 2 inputs, 4 neurons in hidden layer
            nn.Tanh(),        # Add non-linearity to the network
            nn.Linear(4, 1),  # 4 neurons in hidden layer, 1 output
            nn.Sigmoid(),     # Activation for output layer
        )
    """

    def __init__(self) -> None:
        # Input layer to hidden LINEAR layer
        self.hidden = micrograd.nn.Layer(2, 4)  # 2 inputs, 4 neurons in hidden layer
        # Hidden layer to output LINEAR layer
        self.output = micrograd.nn.Layer(4, 1)  # 4 neurons in hidden layer, 1 output

    def __call__(self, inputs: Sequence[Operand | int | float]) -> Operand:
        """Run the forward pass for ONE sample.

        Args:
            inputs: The two input values of a single sample.

        Returns:
            The result of calling ``.sigmoid()`` on the output layer's result,
            i.e. a single object rather than a sequence.
        """
        x = self.hidden(inputs)
        x = [v.tanh() for v in x]  # apply non-linearity
        x = self.output(x)
        # `.sigmoid()` is called on the layer result directly, so the
        # single-neuron output layer presumably returns one Operand instead of
        # a list -- verify against micrograd.nn.Layer.
        return x.sigmoid()

    def parameters(self) -> list:
        """Return the hidden-layer parameters followed by the output-layer ones."""
        return [*self.hidden.parameters(), *self.output.parameters()]

    def __repr__(self) -> str:
        """Describe the layer stack using the sizes of the two layers."""
        hidden_size = len(self.hidden.neurons)
        output_size = len(self.output.neurons)
        return f"MLP of [Linear-1 [-1, {hidden_size}], Tanh-2 [-1, {hidden_size}], Linear-3 [-1, {output_size}], Sigmoid-4 [-1, {output_size}]]"

In [14]:
def mse_loss(predictions: Sequence[Operand], targets: Sequence[Operand]) -> float:
    squared_diffs = sum((p - t) ** 2 for p, t in zip(predictions, targets, strict=False))
    return squared_diffs / len(predictions)

In [15]:
# XOR truth table as plain Python lists for the micrograd model.
# NOTE: this rebinds `inputs` / `targets`, which held torch tensors in the
# PyTorch section above.
inputs = [[a, b] for a in (0, 1) for b in (0, 1)]
targets = [a ^ b for a, b in inputs]

In [18]:
# Instantiate the micrograd network, then report its layout and parameter count.
micrograd_model = MicrogradXor()

print(
    "The network",
    f"Architecture:{micrograd_model}",
    f"Size: {len(micrograd_model.parameters())}",
    sep="\n",
)

The network
Architecture:MLP of [Linear-1 [-1, 4], Tanh-2 [-1, 4], Linear-3 [-1, 1], Sigmore-4 [-1, 1]]
Size: 17


In [19]:
# Training loop for the micrograd model.
epochs = 1001
for epoch in range(epochs):
    # Forward pass: the model is called once per sample.
    predictions = [micrograd_model(sample) for sample in inputs]
    loss = mse_loss(predictions, targets)

    # Reset the gradients first, then backpropagate from the loss.
    micrograd_model.zero_grad()
    loss.backward()

    # Gradient-descent step; `lr` is the value set in the PyTorch setup cell.
    for param in micrograd_model.parameters():
        param.data -= lr * param.grad

    # Report the loss once every 100 epochs.
    if not epoch % 100:
        print(f"Epoch [{epoch}/{epochs}], Loss: {loss.data:.4f}")

Epoch [0/1001], Loss: 0.2438
Epoch [100/1001], Loss: 0.2271
Epoch [200/1001], Loss: 0.2111
Epoch [300/1001], Loss: 0.1932
Epoch [400/1001], Loss: 0.1770
Epoch [500/1001], Loss: 0.1637
Epoch [600/1001], Loss: 0.1543
Epoch [700/1001], Loss: 0.1480
Epoch [800/1001], Loss: 0.1433
Epoch [900/1001], Loss: 0.1400
Epoch [1000/1001], Loss: 0.1378


In [20]:
# Inference inputs (the same four XOR pairs used for training; could be new data).
test_inputs = [[0, 0], [0, 1], [1, 0], [1, 1]]

# One forward pass per sample; keep each output's `.data` value and round it
# to 0 or 1 for binary classification.
test_predictions = torch.tensor(
    [micrograd_model(sample).data for sample in test_inputs]
).round()

print("Inference results:")
for sample, pred in zip(test_inputs, test_predictions):
    print(f"Input: {sample}, Output: {pred.item()}")

Inference results:
Input: [0, 0], Output: 0.0
Input: [0, 1], Output: 1.0
Input: [1, 0], Output: 1.0
Input: [1, 1], Output: 0.0
