In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the neural network
class AddNumbersNet(nn.Module):
    def __init__(self):
        super(AddNumbersNet, self).__init__()
        self.fc = nn.Linear(2, 1)  # Two inputs, one output

    def forward(self, x):
        return self.fc(x)

# Proper weight initialization
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)  # Xavier initialization for stability
        nn.init.zeros_(m.bias)  # Initialize biases to zero

# Generate normalized dataset
def generate_data(num_samples=5000):  # Increased dataset size
    x = torch.rand((num_samples, 2)) * 100  # Random numbers between 0 and 100
    x = (x - x.mean()) / x.std()  # Standardize inputs to zero mean and unit variance
    y = x.sum(dim=1, keepdim=True)  # Sum the two numbers
    y = y / y.std()  # Normalize target to match input scale
    return x, y

# Initialize model, loss, and optimizer
model = AddNumbersNet()
model.apply(init_weights)  # Apply weight initialization
criterion = nn.MSELoss()  # Mean Squared Error loss
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)  # Adam optimizer for better convergence

# Training loop
x_train, y_train = generate_data(5000)
epochs = 5000  # Train for more epochs
for epoch in range(epochs):
    # Forward pass
    predictions = model(x_train)
    loss = criterion(predictions, y_train)

    # Check for NaN loss
    if torch.isnan(loss).any():
        print(f"Loss is NaN at epoch {epoch}!")
        break

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()

    # Gradient clipping to prevent exploding gradients
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()

    # Log every 500 epochs
    if epoch % 500 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")

# Test the model
x_test = torch.tensor([[10.0, 20.0], [15.0, 25.0]])
x_test = (x_test - x_train.mean()) / x_train.std()  # Normalize test data
y_test = model(x_test)
y_test_rounded = y_test.round()  # Round predictions to nearest whole number
print("Input:", x_test)
print("Predicted Sum (Rounded):", y_test_rounded.detach())


Epoch 0, Loss: 3.829310178756714
Epoch 500, Loss: 1.9475975036621094
Epoch 1000, Loss: 0.78922438621521
Epoch 1500, Loss: 0.15770266950130463
Epoch 2000, Loss: 0.016104672104120255
Epoch 2500, Loss: 0.0008411712478846312
Epoch 3000, Loss: 1.7257056242669933e-05
Epoch 3500, Loss: 1.2401521587435127e-07
Epoch 4000, Loss: 3.302814466366044e-09
Epoch 4500, Loss: 2.6808528730271064e-09
Input: tensor([[10., 20.],
        [15., 25.]])
Predicted Sum (Rounded): tensor([[21.],
        [28.]])


raise the number of examples to 30,000 and made the number of epocs 10k
loss near zero around about 

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the neural network
class AddNumbersNet(nn.Module):
    def __init__(self):
        super(AddNumbersNet, self).__init__()
        self.fc = nn.Linear(2, 1)  # Two inputs, one output

    def forward(self, x):
        return self.fc(x)

# Proper weight initialization
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)  # Xavier initialization for stability
        nn.init.zeros_(m.bias)  # Initialize biases to zero

# Generate normalized dataset
def generate_data(num_samples=5000):
    x = torch.rand((num_samples, 2)) * 100  # Random numbers between 0 and 100
    x_mean = x.mean(dim=0, keepdim=True)
    x_std = x.std(dim=0, keepdim=True)
    x_normalized = (x - x_mean) / x_std  # Standardize inputs to zero mean and unit variance
    y = x.sum(dim=1, keepdim=True)  # Sum the two numbers
    y_normalized = (y - y.mean()) / y.std()  # Normalize target to match input scale
    return x_normalized, y_normalized, x_mean, x_std, y.mean(), y.std()

# Test data generation and evaluation function
def evaluate_model(model, x_train_mean, x_train_std, y_train_mean, y_train_std, test_samples=10):
    x_test = torch.rand((test_samples, 2)) * 100  # Generate test data
    y_actual = x_test.sum(dim=1, keepdim=True)  # Calculate actual sums
    x_test_normalized = (x_test - x_train_mean) / x_train_std  # Normalize test data

    # Get predictions
    with torch.no_grad():
        y_pred_normalized = model(x_test_normalized)
        y_pred = y_pred_normalized * y_train_std + y_train_mean  # Rescale predictions to original scale

    # Print results
    print(f"{'Input':<25}{'Predicted':<15}{'Actual':<15}{'Error':<10}")
    print("-" * 65)
    for i in range(test_samples):
        input_vals = f"{x_test[i,0].item():.2f}, {x_test[i,1].item():.2f}"
        predicted = y_pred[i].item()
        actual = y_actual[i].item()
        error = abs(predicted - actual)
        print(f"{input_vals:<25}{predicted:<15.2f}{actual:<15.2f}{error:<10.2f}")

# Initialize model, loss, and optimizer
model = AddNumbersNet()
model.apply(init_weights)  # Apply weight initialization
criterion = nn.MSELoss()  # Mean Squared Error loss
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)  # Adam optimizer for better convergence

# Training loop
x_train, y_train, x_mean, x_std, y_mean, y_std = generate_data(30000)
epochs = 5000  # Adjusted for faster convergence

for epoch in range(epochs):
    # Forward pass
    predictions = model(x_train)
    loss = criterion(predictions, y_train)

    # Check for NaN loss
    if torch.isnan(loss).any():
        print(f"Loss is NaN at epoch {epoch}!")
        break

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()

    # Gradient clipping to prevent exploding gradients
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()

    # Log every 100 epochs
    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

# Evaluate the model
evaluate_model(model, x_mean, x_std, y_mean, y_std, test_samples=10)


Epoch 0, Loss: 0.9295
Epoch 100, Loss: 0.6766
Epoch 200, Loss: 0.4639
Epoch 300, Loss: 0.2915
Epoch 400, Loss: 0.1647
Epoch 500, Loss: 0.0897
Epoch 600, Loss: 0.0468
Epoch 700, Loss: 0.0232
Epoch 800, Loss: 0.0108
Epoch 900, Loss: 0.0048
Epoch 1000, Loss: 0.0020
Epoch 1100, Loss: 0.0008
Epoch 1200, Loss: 0.0003
Epoch 1300, Loss: 0.0001
Epoch 1400, Loss: 0.0000
Epoch 1500, Loss: 0.0000
Epoch 1600, Loss: 0.0000
Epoch 1700, Loss: 0.0000
Epoch 1800, Loss: 0.0000
Epoch 1900, Loss: 0.0000
Epoch 2000, Loss: 0.0000
Epoch 2100, Loss: 0.0000
Epoch 2200, Loss: 0.0000
Epoch 2300, Loss: 0.0000
Epoch 2400, Loss: 0.0000
Epoch 2500, Loss: 0.0000
Epoch 2600, Loss: 0.0000
Epoch 2700, Loss: 0.0000
Epoch 2800, Loss: 0.0000
Epoch 2900, Loss: 0.0000
Epoch 3000, Loss: 0.0000
Epoch 3100, Loss: 0.0000
Epoch 3200, Loss: 0.0000
Epoch 3300, Loss: 0.0000
Epoch 3400, Loss: 0.0000
Epoch 3500, Loss: 0.0000
Epoch 3600, Loss: 0.0000
Epoch 3700, Loss: 0.0000
Epoch 3800, Loss: 0.0000
Epoch 3900, Loss: 0.0000
Epoch 4000, 