In [201]:
import torch
import torch.nn as nn
import torch.optim as optim

Imports:
- `torch.nn` (imported as `nn`): layers, containers, activations and loss functions. [PyTorch docs: torch.nn](https://docs.pytorch.org/docs/stable/nn.html)
- `torch.optim` (imported as `optim`): a package implementing various optimization algorithms. [PyTorch docs: torch.optim](https://docs.pytorch.org/docs/stable/optim.html#module-torch.optim)

Containers :
- nn.Module : Base class for all neural network modules.
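- nn.Sequential() : A container that runs its modules in the order they are passed, feeding each module's output into the next. [PyTorch docs: Sequential](https://docs.pytorch.org/docs/stable/generated/torch.nn.Sequential.html#torch.nn.Sequential)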

Non-Linear Activations :
- nn.ReLU
- nn.Sigmoid

Loss Functions :
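- nn.BCELoss() : Creates a criterion that measures the Binary Cross Entropy between the target and the input probabilities. The inputs must already be probabilities, i.e. the model must end with a sigmoid.
- nn.BCEWithLogitsLoss() : This loss combines a Sigmoid layer and the BCELoss in one single class. It takes raw logits as input and is more numerically stable than a separate Sigmoid followed by BCELoss.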

# torch.tensor([])

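Converting the NumPy arrays to tensors: swap `np.array` for `torch.tensor` and `np.float32` for `torch.float32`; the rows stay the same.

```python
X = torch.tensor([          # NumPy: X = np.array([
    [0, 0],
    [0, 1],
    [1, 0],
    [1, 1]
], dtype=torch.float32)     # NumPy: ], dtype=np.float32)

y = torch.tensor([          # NumPy: y = np.array([
    [0],
    [1],
    [1],
    [0]
], dtype=torch.float32)     # NumPy: ], dtype=np.float32)
```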

In [202]:
# XOR truth table.
# X enumerates the four (a, b) bit pairs in the order 00, 01, 10, 11;
# y holds the matching label as a column vector: 1.0 when the bits differ, else 0.0.
X = torch.tensor([[a, b] for a in (0., 1.) for b in (0., 1.)])

y = torch.tensor([[float(a != b)] for a, b in X.tolist()])

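Since `nn.BCEWithLogitsLoss()` applies the sigmoid internally, the model leaves out a final sigmoid layer and outputs raw logits. To turn those logits into probabilities at prediction time, apply `torch.sigmoid` to the model output.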

In [203]:
class XORNet_simple(nn.Module):
    """Small fully connected network for the XOR problem that returns raw logits.

    Architecture: Linear(2 -> hidden_size) -> ReLU -> Linear(hidden_size -> 1).

    There is deliberately no final ``nn.Sigmoid``: the output is a logit meant
    to be fed to ``nn.BCEWithLogitsLoss``, which applies the sigmoid itself.
    Use ``torch.sigmoid`` on the output to obtain probabilities.

    Args:
        hidden_size: Number of units in the hidden layer. Defaults to 3,
            which reproduces the original fixed 2-3-1 architecture.

    Raises:
        ValueError: If ``hidden_size`` is smaller than 1.
    """

    def __init__(self, hidden_size: int = 3):
        super().__init__()
        if hidden_size < 1:
            raise ValueError(f"hidden_size must be >= 1, got {hidden_size}")
        # Kept as a single Sequential named `net` so parameter names stay
        # `net.0.*` (input -> hidden) and `net.2.*` (hidden -> output).
        self.net = nn.Sequential(
            nn.Linear(2, hidden_size),   # Input -> Hidden
            nn.ReLU(),
            nn.Linear(hidden_size, 1),   # Hidden -> Output (logit)
        )

    def forward(self, x):
        """Return the logits produced by passing ``x`` through ``self.net``.

        ``x`` must have a last dimension of size 2 (the first layer is
        ``nn.Linear(2, hidden_size)``); the result has a last dimension of size 1.
        """
        return self.net(x)
    

# Seed PyTorch's global RNG right before construction so the random initial
# weights are the same every time this cell runs (Restart & Run All gives the
# same starting point, and re-running this cell resets the model to it).
# NOTE(review): the seed value is arbitrary; a tiny ReLU network can stall on
# XOR from an unlucky initialisation, so confirm training converges with this
# seed and pick another one if it does not.
torch.manual_seed(0)

model = XORNet_simple()
print(model)


XORNet_simple(
  (net): Sequential(
    (0): Linear(in_features=2, out_features=3, bias=True)
    (1): ReLU()
    (2): Linear(in_features=3, out_features=1, bias=True)
  )
)


In [211]:
# Binary cross-entropy computed directly on logits: the sigmoid is applied
# inside the loss, which is the numerically stable form of BCE.
loss_fn = nn.BCEWithLogitsLoss()

# Plain stochastic gradient descent over every parameter of the model,
# with a learning rate of 0.1 and all other options left at their defaults.
optimizer = optim.SGD(params=model.parameters(), lr=0.1)

# Show the optimizer's parameter groups: the parameter tensors it will update
# together with the hyperparameters applied to them.
print(optimizer.param_groups)

[{'params': [Parameter containing:
tensor([[ 2.2627, -1.9063],
        [-4.0453,  4.0453],
        [-1.9786,  2.4116]], requires_grad=True), Parameter containing:
tensor([-3.4926e-05, -1.2271e-04,  1.9787e+00], requires_grad=True), Parameter containing:
tensor([[ 2.9358,  5.7194, -3.6245]], requires_grad=True), Parameter containing:
tensor([0.3238], requires_grad=True)], 'lr': 0.1, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'maximize': False, 'foreach': None, 'differentiable': False, 'fused': None}]


In [208]:
# Full-batch training: every epoch does one forward/backward pass over all of X.
# This cell does not re-create the model or the optimizer, so running it again
# continues training from the current weights instead of starting over.
epochs = 3000
log_every = 200  # print the loss once every `log_every` epochs

for epoch in range(epochs):
    # Gradients accumulate across backward() calls, so clear them first.
    optimizer.zero_grad()

    output = model(X)           # raw logits, not probabilities
    loss = loss_fn(output, y)

    loss.backward()
    optimizer.step()

    # Report periodically, and always on the last epoch so the final loss is
    # shown even when (epochs - 1) is not a multiple of log_every.
    # The printed value is the loss measured before this epoch's update.
    if epoch % log_every == 0 or epoch == epochs - 1:
        print(f"Epoch {epoch}: Loss = {loss.item():.6f}")


Epoch 0: Loss = 0.001492
Epoch 200: Loss = 0.001413
Epoch 400: Loss = 0.001341
Epoch 600: Loss = 0.001275
Epoch 800: Loss = 0.001215
Epoch 1000: Loss = 0.001160
Epoch 1200: Loss = 0.001110
Epoch 1400: Loss = 0.001063
Epoch 1600: Loss = 0.001020
Epoch 1800: Loss = 0.000980
Epoch 2000: Loss = 0.000943
Epoch 2200: Loss = 0.000908
Epoch 2400: Loss = 0.000876
Epoch 2600: Loss = 0.000845
Epoch 2800: Loss = 0.000817


In [209]:
# Print every learnable tensor of the model next to its qualified name
# (e.g. "net.0.weight"). `param.data` is the raw tensor, so the printout
# omits the `requires_grad=True` suffix.
for name, param in model.named_parameters():
    print(f"name {name} : params: {param.data}")

name net.0.weight : params: tensor([[ 2.2627, -1.9063],
        [-4.0453,  4.0453],
        [-1.9786,  2.4116]])
name net.0.bias : params: tensor([-3.4926e-05, -1.2271e-04,  1.9787e+00])
name net.2.weight : params: tensor([[ 2.9358,  5.7194, -3.6245]])
name net.2.bias : params: tensor([0.3238])


In [210]:
# Inference only: run the forward pass without recording anything for autograd.
with torch.no_grad():
    logits = model(X)

# The model emits logits, so squash them to probabilities here. `logits` was
# produced under no_grad and carries no graph, so this can sit outside the block.
preds = logits.sigmoid()

print("\nPredictions:")
# Pair each input row with its predicted probability of the positive class.
for inp, pred in zip(X, preds):
    print(f"Input: {inp.tolist()} -> Prediction: {pred.item():.4f}")


Predictions:
Input: [0.0, 0.0] -> Prediction: 0.0011
Input: [0.0, 1.0] -> Prediction: 0.9995
Input: [1.0, 0.0] -> Prediction: 0.9991
Input: [1.0, 1.0] -> Prediction: 0.0006
