<a href="https://colab.research.google.com/github/kevalshah90/llms/blob/main/PyTorch_Neural_Network_with_Backpropagation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

Define the neural network architecture

In [None]:
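# Define the neural network architecture as a class.
# This class represents the "Multiple Layers" part of the image.
# The layers are applied one after another in a simple feed-forward chain
# (written out explicitly in forward() rather than wrapped in nn.Sequential).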
class SimpleNeuralNetwork(nn.Module):
    """
    Fully connected feed-forward network: three linear layers with a ReLU
    after each of the first two.

    Data flow, using the layer numbering of the accompanying diagram:

        x -> layer1 (W1) -> relu1    [Layers 1 and 2]
          -> layer3 (W3) -> relu2    [Layers 3 and 4]
          -> layer5 (W5) -> o        [Layer 5, no activation]

    Every linear layer is an ``nn.Linear`` created with default arguments.

    Args:
        input_size: Number of features in each input sample.
        hidden_size1: Output width of ``layer1``.
        hidden_size2: Output width of ``layer3``.
        output_size: Output width of ``layer5`` (the network output).
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super().__init__()

        # Layers 1 + 2: linear map input_size -> hidden_size1, then ReLU.
        self.layer1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()

        # Layers 3 + 4: linear map hidden_size1 -> hidden_size2, then ReLU.
        self.layer3 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()

        # Layer 5: final linear map hidden_size2 -> output_size.
        # No activation follows it; forward() returns its result directly.
        self.layer5 = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        """
        Run the forward pass: map the input ``x`` to the output ``o``.

        Args:
            x: Input tensor whose last dimension equals ``input_size``.

        Returns:
            The output of ``layer5``; its last dimension equals ``output_size``.
        """
        # a2 = ReLU(layer1(x))
        hidden = self.layer1(x)
        hidden = self.relu1(hidden)

        # a4 = ReLU(layer3(a2))
        hidden = self.layer3(hidden)
        hidden = self.relu2(hidden)

        # o = layer5(a4)
        return self.layer5(hidden)

Training the Network

In [None]:
# --- Training the Network ---
# End-to-end demo: build the model, fit it to random dummy data with
# full-batch SGD, plot the loss curve, and run a single inference.

# 1. Hyperparameters and Network Initialization
# We'll use some example dimensions for our network
input_size = 10     # e.g., 10 features for each data point
hidden_size1 = 64   # Size of the first hidden layer
hidden_size2 = 32   # Size of the second hidden layer
output_size = 1     # e.g., a single prediction value

learning_rate = 0.01
epochs = 10

# Instantiate the model, creating an instance of our class
model = SimpleNeuralNetwork(input_size, hidden_size1, hidden_size2, output_size)

# Define the Loss Function (L(o, y) from the diagram)
# We'll use Mean Squared Error, common for regression tasks
loss_function = nn.MSELoss()

# Define the Optimizer
# This is the algorithm that updates the model parameters
# based on the gradients calculated by backpropagation.
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# 2. Create Dummy Data
# This is our example data 'x' and 'y'.
# NOTE: Y is drawn independently of X, so there is no real relationship to
# learn; any decrease in loss comes from fitting the noise in this fixed batch.
X = torch.randn(100, input_size)  # 100 samples, each with 10 features
Y = torch.randn(100, output_size)  # 100 corresponding target values

# 3. Training Loop
# Each epoch is one full-batch gradient step over all of X.
print("Starting training loop...")
loss_history = []

# Put the model in training mode explicitly. This network has no
# mode-dependent layers, but setting the mode keeps the script correct if
# dropout or batch-norm layers are added later.
model.train()

for epoch in range(epochs):
    # Perform a forward pass to get the model's prediction (o)
    predictions = model(X)

    # Calculate the loss (L(o, y))
    loss = loss_function(predictions, Y)

    # Clear gradients left over from the previous iteration; backward()
    # accumulates into .grad rather than overwriting it.
    optimizer.zero_grad()

    # Backpropagation (the process highlighted in the image)
    # Autograd applies the chain rule to compute the gradient of the loss
    # with respect to every model parameter.
    loss.backward()

    # Update the parameters using the gradients computed by backward()
    optimizer.step()

    # Read the scalar loss once and reuse it for both history and logging.
    # This is the loss measured before this epoch's parameter update.
    loss_value = loss.item()
    loss_history.append(loss_value)

    # Print the loss periodically to monitor progress
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss_value:.4f}")

print("Training finished.")

# 4. Plotting the Loss History
# This visualizes how the training loss evolves from epoch to epoch
plt.figure(figsize=(10, 6))
plt.plot(loss_history)
plt.title("Training Loss Over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Loss (MSE)")
plt.grid(True)
plt.show()

# 5. Making a prediction with the trained model
new_data = torch.randn(1, input_size)

# Switch to evaluation mode before inference (the counterpart of
# model.train() above). Call model.train() again before any further training.
model.eval()

with torch.no_grad(): # Disable gradient calculation for inference
    prediction = model(new_data)
print(f"\nPrediction for a new data point: {prediction.item():.4f}")