In [1]:
import torch 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class MyLinear(torch.nn.Module):
    """Module that wraps a single ``torch.nn.Linear`` layer.

    The wrapped layer is stored on the ``linear`` attribute, so its
    parameters are reachable as ``linear.weight`` and ``linear.bias``.
    """

    def __init__(self, in_features, out_features, bias=True):
        """Create the wrapped layer.

        Args:
            in_features: Forwarded to ``torch.nn.Linear`` as ``in_features``.
            out_features: Forwarded to ``torch.nn.Linear`` as ``out_features``.
            bias: Forwarded to ``torch.nn.Linear`` as ``bias``.
        """
        super().__init__()
        layer = torch.nn.Linear(
            in_features=in_features,
            out_features=out_features,
            bias=bias,
        )
        self.linear = layer

    def forward(self, x):
        """Apply the wrapped linear layer to ``x`` and return its output."""
        projected = self.linear(x)
        return projected
    
def loss_fn(y_pred, y_true):
    """Return the mean squared error between ``y_pred`` and ``y_true``.

    Args:
        y_pred: Tensor passed to ``mse_loss`` as its ``input``.
        y_true: Tensor passed to ``mse_loss`` as its ``target``.

    Returns:
        The squared error averaged over all elements.
    """
    mse = torch.nn.functional.mse_loss(
        input=y_pred,
        target=y_true,
        reduction="mean",
    )
    return mse

# Synthetic regression inputs: 1000 samples with 10 features each, drawn from
# a normal distribution with mean 0 and standard deviation 1.
input = torch.normal(mean=0.0, std=1.0, size=(1000, 10)).to(device)

# Randomly drawn parameters of the linear map that generates the targets.
weight = torch.randn(10, 1).to(device)
bias = torch.randn(1).to(device)

print(weight.shape)
print(bias.shape)

# Noise-free targets. Both operands already live on `device`, so the result
# does too and needs no further transfer.
output = torch.matmul(input, weight) + bias
print(output.shape)

# Perturb every target with noise drawn from a normal distribution with
# mean 0 and standard deviation 1.
diff = torch.normal(mean=0.0, std=1.0, size=(1000, 1)).to(device)
output = output + diff
# Model to fit: a single linear layer mapping 10 input features to 1 output.
linear = MyLinear(10, 1).to(device)

print(f"linear.linear.weight.shape: {linear.linear.weight.shape}")

optimizer = torch.optim.AdamW(linear.parameters(), lr=0.01)

# Run 1000 full-batch optimisation steps, logging the loss every 100 steps.
for i in range(1000):
    optimizer.zero_grad()
    model_output = linear(input)
    # Compute the loss once per step and reuse it for logging and backprop.
    # Arguments follow loss_fn(y_pred, y_true): prediction first, then target.
    loss = loss_fn(model_output, output)
    if i % 100 == 0:
        # Detach and convert to NumPy so the value prints as a bare scalar.
        print(loss.detach().cpu().numpy())
    loss.backward()
    optimizer.step()

# Collect the trained parameters and print each one as a NumPy array.
params = [*linear.parameters()]
for param in params:
    # Detach from autograd and copy to the CPU before converting to NumPy.
    print(param.detach().cpu().numpy())


# Model: Y = X @ W + b, where X has 10 features per sample.
            
            

torch.Size([10, 1])
torch.Size([1])
torch.Size([1000, 1])
linear.linear.weight.shape: torch.Size([1, 10])
10.781499
3.2224345
1.4325144
1.0422958
0.97897696
0.97081375
0.9697856
0.9696063
0.96954936
0.96951777
[[ 0.67480725  1.1131548  -1.4926283  -0.17925562 -0.06646118  0.83809614
  -0.15099014  1.8294914  -0.11459737 -0.5928807 ]]
[0.37574327]
