In [9]:
#A2 Pytorch tutorial farrukh 9-15-25-----------
import torch
import torch.nn as nn
import numpy as np


In [4]:
# We only ever write the forward computation; autograd derives the backward
# pass for us.
x = torch.arange(3, dtype=torch.float32)
z = torch.rand(3, 3)
print(x, z)

tensor([0., 1., 2.]) tensor([[0.8889, 0.1718, 0.7258],
        [0.4293, 0.6115, 0.7300],
        [0.7343, 0.2274, 0.9296]])


In [13]:
# When using an accelerator, placement matters: the data and the weights must
# live on the same device.
# device = torch.accelerator.current_accelerator().type  #returns None
#   (per the original note, current_accelerator() gave None in this environment)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("device: ", device)

# Tensor.to() is NOT in-place: it returns the tensor on the requested device.
# The result must be rebound, otherwise z silently stays where it was.
z = z.to(device)
print(z.device)

device:  cpu
cpu


In [19]:
x = torch.tensor([2., 3.], requires_grad=True)
# Plain Python ** works on tensors, so torch.pow is not needed here.
z = x[0] ** 2 + x[1] ** 3 + 1
y = 2 * z

# x was created directly rather than computed, so there is no grad_fn to
# inspect on it. Its .grad stays None until backward() has run; intermediate
# results do not keep gradients unless explicitly asked to (saves memory).
for probe in (x.grad, y.grad_fn, y.grad_fn.next_functions, x.grad):
    print(probe)

# Treat y as the loss: this computes dy/dx at the input [2, 3].
y.backward()
print(x.grad)

None
<MulBackward0 object at 0x0000022F4CE91810>
((<AddBackward0 object at 0x0000022F4E2A5120>, 0), (None, 0))
None
tensor([ 8., 54.])


In [20]:
x = torch.tensor([-1.], requires_grad=True)
# Ordinary Python branching is fine: only the branch that actually runs is
# recorded, and the gradient follows it.
y = 2 * x if x.item() > 0 else torch.exp(x)
print(x.grad)   # nothing has been back-propagated yet
y.backward()
print(x.grad)   # derivative of the branch taken, evaluated at x = -1

None
tensor([0.3679])


In [21]:
# Values can be pulled out of the graph: a plain alias of x still requires
# grad, while x.detach() gives a tensor that does not.
for z in (x, x.detach()):
    print(z.requires_grad)

True
False


In [22]:
# Sometimes gradients are not wanted at all - typically during evaluation,
# testing and deployment.
y = 2 * x
z = y.pow(2)
print(y.requires_grad)

# Nothing computed inside this context is tracked by autograd.
with torch.no_grad():
    y = 2 * x
    print(y.requires_grad)

@torch.no_grad()
def validate(dataset):
    """Placeholder evaluation routine.

    The decorator applies torch.no_grad() to the whole call, so nothing run
    inside is tracked by autograd. The body is still a stub and returns None.
    """
    return None


True
False


In [29]:
#define network - parameters created by nn layers get requires_grad=True automatically
import torch.nn as nn

class net(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear.

    Args:
        input_dim: number of features fed to the first linear layer.
        hidden_dim: width of the hidden layer.
        output_dim: number of outputs of the second linear layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        # The output layer is created without a bias term.
        self.fc2 = nn.Linear(hidden_dim, output_dim, bias=False)
        # A conv layer would be declared the same way, e.g.
        # self.conv2 = nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3, stride=1, padding=0)

    def forward(self, x):
        """Run x through fc1, ReLU and fc2, returning the final activations."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# 10 input features -> 4 hidden units -> 1 output
model = net(10, 4, 1)
print(model)
# Uncomment to dump the raw parameter tensors:
# print(list(model.parameters()))


net(
  (fc1): Linear(in_features=10, out_features=4, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=4, out_features=1, bias=False)
)


In [34]:
# Push one random sample (batch of 1, 10 features) through the network.
x = torch.rand(size=(1, 10))
out = model(x)
print(out)

# define loss function and optimizer
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1)

# MSELoss expects the target to have the same shape as the prediction.
# out is (1, 1); a (1,)-shaped target only works through broadcasting and
# triggers a size-mismatch UserWarning, so build the target from out's shape.
target = torch.ones_like(out)
loss = loss_fn(out, target)
print(loss)

tensor([[-0.1404]], grad_fn=<MmBackward0>)
tensor(1.3006, grad_fn=<MseLossBackward0>)


In [36]:
# Gradients are None until backward() has been called; after this call every
# parameter prints together with its freshly computed .grad.
loss.backward()
for param in model.parameters():
    print(param, param.grad)

Parameter containing:
tensor([[-0.0234,  0.0420,  0.1273,  0.2781,  0.1870, -0.2529, -0.2307,  0.0967,
          0.1767, -0.2488],
        [-0.1831, -0.2042,  0.1716,  0.0372,  0.1465,  0.1246,  0.0431,  0.1743,
         -0.1790,  0.2722],
        [ 0.1703, -0.1427,  0.2583, -0.2483,  0.1085,  0.2157,  0.0765,  0.2678,
          0.0635, -0.2716],
        [ 0.2043, -0.0826, -0.0623,  0.2113, -0.2073, -0.3009,  0.0860,  0.1551,
          0.0514,  0.2888]], requires_grad=True) tensor([[ 1.0269,  0.5199,  0.9421,  0.3361,  0.4108,  0.0890,  0.3362,  0.7882,
          0.8540,  0.2119],
        [-0.5657, -0.2864, -0.5190, -0.1852, -0.2263, -0.0490, -0.1852, -0.4342,
         -0.4705, -0.1167],
        [ 0.1272,  0.0644,  0.1167,  0.0416,  0.0509,  0.0110,  0.0416,  0.0976,
          0.1058,  0.0262],
        [-0.6894, -0.3490, -0.6325, -0.2257, -0.2758, -0.0597, -0.2257, -0.5291,
         -0.5733, -0.1422]])
Parameter containing:
tensor([ 0.1345,  0.1350, -0.1275, -0.0854], requires_grad=Tru

In [37]:
# Every call to loss.backward() keeps accumulating into .grad until the
# gradients are zeroed out.

# step() uses the stored gradients to update the weights - the printed
# weights differ from the previous cell, the gradients are still there.
optimizer.step()
for param in model.parameters():
    print(param, param.grad)

Parameter containing:
tensor([[-1.0503, -0.4778, -0.8148, -0.0581, -0.2238, -0.3419, -0.5669, -0.6915,
         -0.6773, -0.4607],
        [ 0.3826,  0.0822,  0.6906,  0.2224,  0.3728,  0.1736,  0.2282,  0.6086,
          0.2915,  0.3889],
        [ 0.0431, -0.2071,  0.1417, -0.2899,  0.0576,  0.2047,  0.0349,  0.1702,
         -0.0422, -0.2979],
        [ 0.8937,  0.2664,  0.5702,  0.4369,  0.0684, -0.2412,  0.3116,  0.6843,
          0.6247,  0.4311]], requires_grad=True) tensor([[ 1.0269,  0.5199,  0.9421,  0.3361,  0.4108,  0.0890,  0.3362,  0.7882,
          0.8540,  0.2119],
        [-0.5657, -0.2864, -0.5190, -0.1852, -0.2263, -0.0490, -0.1852, -0.4342,
         -0.4705, -0.1167],
        [ 0.1272,  0.0644,  0.1167,  0.0416,  0.0509,  0.0110,  0.0416,  0.0976,
          0.1058,  0.0262],
        [-0.6894, -0.3490, -0.6325, -0.2257, -0.2758, -0.0597, -0.2257, -0.5291,
         -0.5733, -0.1422]])
Parameter containing:
tensor([-0.9510,  0.7331, -0.2619,  0.6434], requires_grad=Tru

In [38]:
# Clear the stored gradients so the next backward() starts from scratch;
# the weights themselves are left untouched.
optimizer.zero_grad()
for param in model.parameters():
    print(param, param.grad)

Parameter containing:
tensor([[-1.0503, -0.4778, -0.8148, -0.0581, -0.2238, -0.3419, -0.5669, -0.6915,
         -0.6773, -0.4607],
        [ 0.3826,  0.0822,  0.6906,  0.2224,  0.3728,  0.1736,  0.2282,  0.6086,
          0.2915,  0.3889],
        [ 0.0431, -0.2071,  0.1417, -0.2899,  0.0576,  0.2047,  0.0349,  0.1702,
         -0.0422, -0.2979],
        [ 0.8937,  0.2664,  0.5702,  0.4369,  0.0684, -0.2412,  0.3116,  0.6843,
          0.6247,  0.4311]], requires_grad=True) None
Parameter containing:
tensor([-0.9510,  0.7331, -0.2619,  0.6434], requires_grad=True) None
Parameter containing:
tensor([[0.5923, 0.5867, 0.8222, 0.8046]], requires_grad=True) None


In [None]:
#define network
#define loss function, optimizer
#push data thru network
#compute loss
#call loss.backward() to compute gradients
#optimizer.step() to use gradients to update weights
#optimizer.zero_grad() to zero out gradients for next batch


class net(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear.

    Args:
        input_dim: number of features fed to the first linear layer.
        hidden_dim: width of the hidden layer.
        output_dim: number of outputs of the second linear layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        # The output layer is created without a bias term.
        self.fc2 = nn.Linear(hidden_dim, output_dim, bias=False)
        # A conv layer would be declared the same way, e.g.
        # self.conv2 = nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3, stride=1, padding=0)

    def forward(self, x):
        """Run x through fc1, ReLU and fc2, returning the final activations."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Build the model on the same device the batches will be moved to.
# `device` is the torch.device chosen in the device-selection cell above.
model = net(input_dim=10, hidden_dim=4, output_dim=1).to(device)
loss_fn = nn.MSELoss()
# NOTE(review): lr=1 is kept from the walkthrough above; it is likely too
# large for real training - tune before using this on actual data.
optimizer = torch.optim.SGD(model.parameters(), lr=1)

# Placeholder settings and data so the loop runs end to end.
# Replace with the real epoch count and a DataLoader for the project.
num_epochs = 2
batches = [(torch.rand(8, 10), torch.rand(8, 1)) for _ in range(4)]

for _ in range(num_epochs):
    for b, y in batches:
        # .to() returns the moved tensor (it is not in-place), so rebind both
        # inputs and targets; they must sit on the model's device.
        b, y = b.to(device), y.to(device)
        out = model(b)                  # push data through the network
        loss = loss_fn(out, y)          # compute loss (was undefined loss_func)
        loss.backward()                 # compute gradients (was back_ward)
        optimizer.step()                # update weights from the gradients
        optimizer.zero_grad()           # reset gradients for the next batch

#evaluation loop

#weights and biases, tensorboard, mlflow might be useful for project