In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the neural network
class AddNumbersNet(nn.Module):
    def __init__(self):
        super(AddNumbersNet, self).__init__()
        self.fc = nn.Linear(2, 1)  # Two inputs, one output

    def forward(self, x):
        return self.fc(x)

# Proper weight initialization
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)  # Xavier initialization for stability
        nn.init.zeros_(m.bias)  # Initialize biases to zero

# Generate normalized dataset
def generate_data(num_samples=1000):
    x = torch.rand((num_samples, 2)) * 100  # Random numbers between 0 and 100
    x = (x - x.mean()) / x.std()  # Standardize inputs to zero mean and unit variance
    y = x.sum(dim=1, keepdim=True)  # Sum the two numbers
    return x, y

# Initialize model, loss, and optimizer
model = AddNumbersNet()
model.apply(init_weights)  # Apply weight initialization
criterion = nn.MSELoss()  # Mean Squared Error loss
optimizer = optim.SGD(model.parameters(), lr=0.001, weight_decay=1e-4)  # Smaller learning rate and L2 regularization

# Training loop
x_train, y_train = generate_data(1000)
epochs = 1000
for epoch in range(epochs):
    # Forward pass
    predictions = model(x_train)
    loss = criterion(predictions, y_train)

    # Check for NaN loss
    if torch.isnan(loss).any():
        print(f"Loss is NaN at epoch {epoch}!")
        break

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()

    # Gradient clipping to prevent exploding gradients
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()

    # Log every 100 epochs
    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")

# Test the model
x_test = torch.tensor([[10.0, 20.0], [15.0, 25.0]])
x_test = (x_test - x_train.mean()) / x_train.std()  # Normalize test data
y_test = model(x_test)
print("Input:", x_test)
print("Predicted Sum:", y_test.detach())


Epoch 0, Loss: 0.3400421738624573
Epoch 100, Loss: 0.23187130689620972
Epoch 200, Loss: 0.15325528383255005
Epoch 300, Loss: 0.10130085796117783
Epoch 400, Loss: 0.0669640377163887
Epoch 500, Loss: 0.044269390404224396
Epoch 600, Loss: 0.029268592596054077
Epoch 700, Loss: 0.019352572038769722
Epoch 800, Loss: 0.01279726717621088
Epoch 900, Loss: 0.008463330566883087
Input: tensor([[10., 20.],
        [15., 25.]])
Predicted Sum: tensor([[28.4590],
        [38.0539]])
