In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# 1. Set up a simple model and data
# Seed the global RNG so the initial weights and the random data below are
# identical on every Restart Kernel -> Run All.
torch.manual_seed(0)

# nn.Linear(10, 1) owns two parameter tensors: weight (1 x 10) and bias (1).
model = nn.Linear(10, 1)
print("model parameters size is :", len(list(model.parameters())))
for para in model.parameters():
    print(para)

# 5 data samples with 10 features each
input_data = torch.randn(5, 10)
print(input_data)

# Target value for each of the 5 samples. These are random noise, so there is
# no underlying "true" weight/bias for the model to recover.
target = torch.randn(5, 1)
# Loss function: mean squared error
criterion = nn.MSELoss()

# 2. Initialize the AdamW optimizer
# weight_decay: coefficient of AdamW's *decoupled* weight decay (default: 0.01).
# This is not L2 regularization: the decay is applied directly to the
# parameters in step() rather than being added to the gradient of the loss.
optimizer = optim.AdamW(model.parameters(), lr=0.01, weight_decay=0.01)

# detach() gives a grad-free view for display without touching the legacy .data attribute.
print(f"Original weight (first 3): {model.weight.detach()[0, :3]}")


# 3. Training loop
# Runs a few full-batch AdamW steps and prints the gradients and parameters at
# each stage so the effect of zero_grad(), backward() and step() is visible.
num_epochs = 3
for epoch in range(num_epochs):
    # a. Forward pass: compute predictions for the whole batch
    y_pred = model(input_data)

    # b. Compute the loss (reported at the end of the epoch)
    loss = criterion(y_pred, target)

    # c. Clear stale gradients, backpropagate, then update the parameters.
    optimizer.zero_grad()

    print("=" * 50)
    # With the default set_to_none=True (PyTorch >= 2.0), zero_grad() resets
    # every .grad to None rather than to a zero tensor; backward() below
    # allocates fresh gradient tensors.
    for para in model.parameters():
        print(para.grad)
    print("=" * 50)

    print("Before backpropagation")
    for para in model.parameters():
        print(para)

    # backward() only populates .grad; the parameter values stay exactly as
    # printed above until optimizer.step(), so show the gradients here.
    loss.backward()
    print("After backpropagation")
    for para in model.parameters():
        print(para.grad)

    # Apply the AdamW update (gradient step plus decoupled weight decay)
    optimizer.step()

    print("After optimizer step")
    for para in model.parameters():
        print(para)

    # Report every epoch: with only 3 epochs, an "every 5 epochs" condition
    # would never fire. The loss shown was computed before this epoch's update.
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Report the learned parameters. The targets are random noise, so there is no
# ground-truth weight or bias to compare these values against.
print("\nFinal Learned Values:")
print("Weight:", model.weight)
print("Bias:  ", model.bias)




model parameters size is : 2
Parameter containing:
tensor([[ 0.0357, -0.0287,  0.1724, -0.2707, -0.0137, -0.0381,  0.0329, -0.1977,
          0.0094,  0.0615]], requires_grad=True)
Parameter containing:
tensor([0.1080], requires_grad=True)
tensor([[-0.4749, -0.0625,  0.8765, -0.2465,  0.0393,  0.4007, -1.2988,  0.0104,
         -0.8718, -1.3266],
        [ 1.8593, -0.6186, -0.2988,  0.8235,  0.6518, -0.6880,  0.1031, -0.2644,
         -0.3095, -0.2543],
        [-0.2667, -1.1421, -0.3940, -0.5046,  0.2540,  2.2011, -0.0635,  1.2413,
          0.5912, -0.5063],
        [-0.1872, -1.1356, -2.1528,  2.2590,  1.3966,  0.7185,  2.3417,  0.7461,
         -0.8237,  0.6908],
        [ 1.6295,  0.7206,  2.1363, -2.6480,  2.2121,  0.5357,  1.7930, -0.1907,
         -0.6173,  0.8818]])
Original weight (first 3): tensor([ 0.0357, -0.0287,  0.1724])
None
None
Before backpropagation
Parameter containing:
tensor([[ 0.0357, -0.0287,  0.1724, -0.2707, -0.0137, -0.0381,  0.0329, -0.1977,
          0.009