4 steps

1) Step 1: Forward pass : manual implementation, gradients computation : manual, loss computation : manual, parameter updates : manual
2) Step 2 : Forward pass : manual implementation, gradients computation : auto grad, loss computation : manual, parameter updates : manual
3) Step 3 : Forward pass : manual implementation, gradients computation : auto grad, loss computation : pytorch loss, parameter updates : pytorch optimizer
4) Step 4 : Forward pass : pytorch model, gradients computation : auto grad, loss computation : pytorch loss, parameter updates : pytorch optimizer

Step 1 is purely manual implementation, step 4 is more or less completely automated

## Step 1 (complete scratch)
Forward pass : manual implementation, gradients computation : manual, loss computation : manual, parameter updates : manual

In [3]:
import numpy as np
# f = w*x (linear regression, ignore bias)
X = np.array([1,2,3,4], dtype = np.float32) ## numpy
Y = np.array([2,4,6,8], dtype = np.float32)

w = 0.0 ## initialize weight to 0 at beginning

## model prediction
def forward(x):
    return w*x

## model loss : MSE
def loss(y, y_predicted):
    return ((y_predicted-y)**2).mean()

## gradient
## MSE  = 1/N * (w*x - y)**2
## dL/dw = 1/N * 2 * (w*x - y)*x

def gradient(x, y, y_predicted):
    return np.dot(2*x, y_predicted -y).mean()

print(f'Prediction before training : f(5) = {forward(5):.3f}')


## training
learning_rate = 0.01
n_iters = 10

for epoch in range(n_iters):
    ## forward pass
    y_pred = forward(X)
    
    ## loss
    l = loss(Y, y_pred)
    
    ## gradient
    dw = gradient(X, Y, y_pred)
    
    ## weight update (gradient descent)
    w = w - learning_rate*dw
    
    if epoch%1==0:
        print(f'epoch {epoch + 1}: w = {w:.3f}, loss = {l :.8f}')
        
print(f'Prediction after training : f(5) = {forward(5):.3f}')
    
    



    

Prediction before training : f(5) = 0.000
epoch 1: w = 1.200, loss = 30.00000000
epoch 2: w = 1.680, loss = 4.79999924
epoch 3: w = 1.872, loss = 0.76800019
epoch 4: w = 1.949, loss = 0.12288000
epoch 5: w = 1.980, loss = 0.01966083
epoch 6: w = 1.992, loss = 0.00314570
epoch 7: w = 1.997, loss = 0.00050332
epoch 8: w = 1.999, loss = 0.00008053
epoch 9: w = 1.999, loss = 0.00001288
epoch 10: w = 2.000, loss = 0.00000206
Prediction after training : f(5) = 9.999


## Step 2 
Forward pass : manual implementation, gradients computation : auto grad, loss computation : manual, parameter updates : manual

Weird !! In code below, if instead of line w -= learning_rate*(w.grad), we use w = w - learning_rate*(w.grad), we get error

In [16]:
import torch
# f = w*x (linear regression, ignore bias)
X = torch.tensor([1,2,3,4], dtype = torch.float32) ## numpy
Y = torch.tensor([2,4,6,8], dtype = torch.float32)

w = torch.tensor(0.0, dtype=torch.float32, requires_grad=True)

## model prediction
def forward(x):
    return w*x

## model loss : MSE
def loss(y, y_predicted):
    return ((y_predicted-y)**2).mean()

## gradient
## MSE  = 1/N * (w*x - y)**2
## dL/dw = 1/N * 2 * (w*x - y)*x


print(f'Prediction before training : f(5) = {forward(5):.3f}')


## training
learning_rate = 0.01
n_iters = 10

for epoch in range(n_iters):
    ## forward pass
    y_pred = forward(X)
    
    ## loss
    l = loss(Y, y_pred)
    
    ## gradient (backward pass)
    l.backward() ## dl/dw
    ##print(w.grad)
    
    
    ## weight update (gradient descent) - this is couched under torch.no_grad since this is not a part of forward pass, so shouldn't be tracked in computational graph
    with torch.no_grad():
        ##print(w.grad, "before here")
        w -= learning_rate*(w.grad)
        ##print(w.grad, "here")
        
    ## zero grad
    w.grad.zero_()
    
    if epoch%1==0:
        print(f'epoch {epoch + 1}: w = {w:.3f}, loss = {l :.8f}')
        
print(f'Prediction after training : f(5) = {forward(5):.3f}')
    
    



    

Prediction before training : f(5) = 0.000
epoch 1: w = 0.300, loss = 30.00000000
epoch 2: w = 0.555, loss = 21.67499924
epoch 3: w = 0.772, loss = 15.66018772
epoch 4: w = 0.956, loss = 11.31448650
epoch 5: w = 1.113, loss = 8.17471695
epoch 6: w = 1.246, loss = 5.90623236
epoch 7: w = 1.359, loss = 4.26725292
epoch 8: w = 1.455, loss = 3.08308983
epoch 9: w = 1.537, loss = 2.22753215
epoch 10: w = 1.606, loss = 1.60939169
Prediction after training : f(5) = 8.031


## Step 3
Forward pass : manual implementation, gradients computation : auto grad, loss computation : pytorch loss, parameter updates : pytorch optimizer





In [17]:
import torch
import torch.nn as nn

# f = w*x (linear regression, ignore bias)
X = torch.tensor([1,2,3,4], dtype = torch.float32) ## numpy
Y = torch.tensor([2,4,6,8], dtype = torch.float32)

w = torch.tensor(0.0, dtype=torch.float32, requires_grad=True)

## model prediction
def forward(x):
    return w*x

## model loss : MSE


## gradient
## MSE  = 1/N * (w*x - y)**2
## dL/dw = 1/N * 2 * (w*x - y)*x


print(f'Prediction before training : f(5) = {forward(5):.3f}')


## training
learning_rate = 0.01
n_iters = 10

loss = nn.MSELoss()
optimizer = torch.optim.SGD([w], lr = learning_rate)

for epoch in range(n_iters):
    ## forward pass
    y_pred = forward(X)
    
    ## loss
    l = loss(Y, y_pred)
    
    ## gradient (backward pass)
    l.backward() ## dl/dw
    ##print(w.grad)
    
    
    ## weight update (gradient descent) - this is couched under torch.no_grad since this is not a part of forward pass, so shouldn't be tracked in computational graph
    # with torch.no_grad():
    #     ##print(w.grad, "before here")
    #     w -= learning_rate*(w.grad)
    #     ##print(w.grad, "here")
    optimizer.step() 
    
    ## zero grad
    ##w.grad.zero_()
    optimizer.zero_grad()
    
    if epoch%1==0:
        print(f'epoch {epoch + 1}: w = {w:.3f}, loss = {l :.8f}')
        
print(f'Prediction after training : f(5) = {forward(5):.3f}')
    
    



    

Prediction before training : f(5) = 0.000
epoch 1: w = 0.300, loss = 30.00000000
epoch 2: w = 0.555, loss = 21.67499924
epoch 3: w = 0.772, loss = 15.66018772
epoch 4: w = 0.956, loss = 11.31448650
epoch 5: w = 1.113, loss = 8.17471695
epoch 6: w = 1.246, loss = 5.90623236
epoch 7: w = 1.359, loss = 4.26725292
epoch 8: w = 1.455, loss = 3.08308983
epoch 9: w = 1.537, loss = 2.22753215
epoch 10: w = 1.606, loss = 1.60939169
Prediction after training : f(5) = 8.031


## Step 4
Forward pass : pytorch based, gradients computation : auto grad, loss computation : pytorch loss, parameter updates : pytorch optimizer

In [24]:
import torch
import torch.nn as nn

# f = w*x (linear regression, ignore bias)
X = torch.tensor([[1],[2],[3],[4]], dtype = torch.float32) ## x and y have to have  a slightly different shape than before. 2D tensor instead of 1-D where no of rows is number of samples, each row has features
Y = torch.tensor([[2],[4],[6],[8]], dtype = torch.float32)

n_samples, n_features = X.shape
input_size = n_features
output_size = n_features ## in this case
# w = torch.tensor(0.0, dtype=torch.float32, requires_grad=True)

# ## model prediction
# def forward(x):
#     return w*x


## 
model = nn.Linear(input_size, output_size) ## model is simple as it is just a linear model
## model loss : MSE


## gradient
## MSE  = 1/N * (w*x - y)**2
## dL/dw = 1/N * 2 * (w*x - y)*x

X_test = torch.tensor([5], dtype = torch.float32)
print(f'Prediction before training : f(5) = {model(X_test).item():.3f}')


## training
learning_rate = 0.01
n_iters = 100

loss = nn.MSELoss()
## optimizer = torch.optim.SGD([w], lr = learning_rate)
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

for epoch in range(n_iters):
    ## forward pass
    ## y_pred = forward(X)
    y_pred = model(X)
    
    ## loss
    l = loss(Y, y_pred)
    
    ## gradient (backward pass)
    l.backward() ## dl/dw
    ##print(w.grad)
    
    
    ## weight update (gradient descent) - this is couched under torch.no_grad since this is not a part of forward pass, so shouldn't be tracked in computational graph
    # with torch.no_grad():
    #     ##print(w.grad, "before here")
    #     w -= learning_rate*(w.grad)
    #     ##print(w.grad, "here")
    optimizer.step() 
    
    ## zero grad
    ##w.grad.zero_()
    optimizer.zero_grad()
    
    if epoch%1==0:
        [w,b] = model.parameters()
        print(f'epoch {epoch + 1}: w = {w[0][0].item():.3f}, loss = {l :.8f}')
        
print(f'Prediction after training : f(5) = {model(X_test).item():.3f}')
    
    



    

Prediction before training : f(5) = -3.503
epoch 1: w = -0.489, loss = 50.18429565
epoch 2: w = -0.166, loss = 34.96062088
epoch 3: w = 0.104, loss = 24.39641190
epoch 4: w = 0.328, loss = 17.06530380
epoch 5: w = 0.516, loss = 11.97758293
epoch 6: w = 0.672, loss = 8.44650650
epoch 7: w = 0.803, loss = 5.99555588
epoch 8: w = 0.912, loss = 4.29408836
epoch 9: w = 1.003, loss = 3.11267614
epoch 10: w = 1.079, loss = 2.29212236
epoch 11: w = 1.143, loss = 1.72196805
epoch 12: w = 1.196, loss = 1.32556283
epoch 13: w = 1.241, loss = 1.04972410
epoch 14: w = 1.278, loss = 0.85754842
epoch 15: w = 1.310, loss = 0.72342956
epoch 16: w = 1.336, loss = 0.62959975
epoch 17: w = 1.359, loss = 0.56373054
epoch 18: w = 1.378, loss = 0.51726621
epoch 19: w = 1.394, loss = 0.48427182
epoch 20: w = 1.407, loss = 0.46062833
epoch 21: w = 1.419, loss = 0.44347751
epoch 22: w = 1.429, loss = 0.43083629
epoch 23: w = 1.437, loss = 0.42132896
epoch 24: w = 1.445, loss = 0.41400003
epoch 25: w = 1.451, lo

In [20]:
[w,b] = model.parameters()

In [22]:
b

Parameter containing:
tensor([0.6433], requires_grad=True)

In [23]:
w

Parameter containing:
tensor([[1.3674]], requires_grad=True)

## Step 5
Forward pass : pytorch based, gradients computation : auto grad, loss computation : pytorch loss, parameter updates : pytorch optimizer

Only difference from step 4  - instead of using nn.Linear directly for the model,
define a model class, which is a more general way of doing things

In [25]:
import torch
import torch.nn as nn

# f = w*x (linear regression, ignore bias)
X = torch.tensor([[1],[2],[3],[4]], dtype = torch.float32) ## x and y have to have  a slightly different shape than before. 2D tensor instead of 1-D where no of rows is number of samples, each row has features
Y = torch.tensor([[2],[4],[6],[8]], dtype = torch.float32)

n_samples, n_features = X.shape
input_size = n_features
output_size = n_features ## in this case
# w = torch.tensor(0.0, dtype=torch.float32, requires_grad=True)

# ## model prediction
# def forward(x):
#     return w*x


## 

class LinearRegression(nn.Module):
    
    def __init__(self, input_dim, output_dim):
        
        super(LinearRegression, self).__init__()
        self.lin = nn.Linear(input_dim, output_dim)
        
    def forward(self,x):
        return self.lin(x)
        
##model = nn.Linear(input_size, output_size) ## model is simple as it is just a linear model
model = LinearRegression(input_size, output_size)
## model loss : MSE


## gradient
## MSE  = 1/N * (w*x - y)**2
## dL/dw = 1/N * 2 * (w*x - y)*x

X_test = torch.tensor([5], dtype = torch.float32)
print(f'Prediction before training : f(5) = {model(X_test).item():.3f}')


## training
learning_rate = 0.01
n_iters = 100

loss = nn.MSELoss()
## optimizer = torch.optim.SGD([w], lr = learning_rate)
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

for epoch in range(n_iters):
    ## forward pass
    ## y_pred = forward(X)
    y_pred = model(X)
    
    ## loss
    l = loss(Y, y_pred)
    
    ## gradient (backward pass)
    l.backward() ## dl/dw
    ##print(w.grad)
    
    
    ## weight update (gradient descent) - this is couched under torch.no_grad since this is not a part of forward pass, so shouldn't be tracked in computational graph
    # with torch.no_grad():
    #     ##print(w.grad, "before here")
    #     w -= learning_rate*(w.grad)
    #     ##print(w.grad, "here")
    optimizer.step() 
    
    ## zero grad
    ##w.grad.zero_()
    optimizer.zero_grad()
    
    if epoch%1==0:
        [w,b] = model.parameters()
        print(f'epoch {epoch + 1}: w = {w[0][0].item():.3f}, loss = {l :.8f}')
        
print(f'Prediction after training : f(5) = {model(X_test).item():.3f}')
    
    



    

Prediction before training : f(5) = -5.758
epoch 1: w = -0.507, loss = 79.58591461
epoch 2: w = -0.101, loss = 55.22529984
epoch 3: w = 0.238, loss = 38.32197189
epoch 4: w = 0.521, loss = 26.59309196
epoch 5: w = 0.756, loss = 18.45466042
epoch 6: w = 0.952, loss = 12.80756187
epoch 7: w = 1.115, loss = 8.88914585
epoch 8: w = 1.251, loss = 6.17022610
epoch 9: w = 1.364, loss = 4.28361130
epoch 10: w = 1.459, loss = 2.97451663
epoch 11: w = 1.538, loss = 2.06615067
epoch 12: w = 1.603, loss = 1.43584061
epoch 13: w = 1.658, loss = 0.99846941
epoch 14: w = 1.704, loss = 0.69497383
epoch 15: w = 1.742, loss = 0.48437160
epoch 16: w = 1.773, loss = 0.33822656
epoch 17: w = 1.800, loss = 0.23680639
epoch 18: w = 1.822, loss = 0.16642073
epoch 19: w = 1.840, loss = 0.11756900
epoch 20: w = 1.856, loss = 0.08365937
epoch 21: w = 1.868, loss = 0.06011757
epoch 22: w = 1.879, loss = 0.04377011
epoch 23: w = 1.888, loss = 0.03241476
epoch 24: w = 1.896, loss = 0.02452325
epoch 25: w = 1.902, l

In [None]:
## References