In [1]:
import torch

In [2]:
t1 = torch.tensor(4.)
t1

tensor(4.)

In [3]:
t1.dtype

torch.float32

In [4]:
#vector
t2 = torch.tensor([1., 2, 3, 4])
t2

tensor([1., 2., 3., 4.])

In [5]:
# matrix
t3 = torch.tensor([[5., 6], 
                   [7, 8], 
                   [9, 10]])
t3

tensor([[ 5.,  6.],
        [ 7.,  8.],
        [ 9., 10.]])

In [6]:
# 3-dimensional array
t4 = torch.tensor([
    [[11, 12, 13], 
     [13, 14, 15]], 
    [[15, 16, 17], 
     [17, 18, 19.]]])
t4

tensor([[[11., 12., 13.],
         [13., 14., 15.]],

        [[15., 16., 17.],
         [17., 18., 19.]]])

In [7]:
print(t1)
t1.shape

tensor(4.)


torch.Size([])

In [8]:
print(t2)
t2.shape

tensor([1., 2., 3., 4.])


torch.Size([4])

In [9]:
print(t3)
t3.shape

tensor([[ 5.,  6.],
        [ 7.,  8.],
        [ 9., 10.]])


torch.Size([3, 2])

In [10]:
print(t4)
t4.shape

tensor([[[11., 12., 13.],
         [13., 14., 15.]],

        [[15., 16., 17.],
         [17., 18., 19.]]])


torch.Size([2, 2, 3])

# Tensor operations and gradients

In [11]:
# creating tensors
x = torch.tensor(3.)
w = torch.tensor(4., requires_grad = True)
b = torch.tensor(5., requires_grad = True)
x, w, b

(tensor(3.), tensor(4., requires_grad=True), tensor(5., requires_grad=True))

In [12]:
# arithmetic operations
y = w*x + b
y

tensor(17., grad_fn=<AddBackward0>)

In [13]:
# computing derivative
y.backward()

# the derivative with respect to each variable is stored in the .grad attribute of the respective variable

In [14]:
# displaying the gradients
print('dy/dx :', x.grad)
print('dy/dw:', w.grad)
print('dy/db :', b.grad)

# dy/dx is None because x was created without requires_grad=True, so no
# derivative is computed for it
# leaving requires_grad off saves computation time and resources when the derivative is not needed

dy/dx : None
dy/dw: tensor(3.)
dy/db : tensor(1.)


# Tensor functions

In [15]:
t6 = torch.full((3,2), 42.)
t6

tensor([[42., 42.],
        [42., 42.],
        [42., 42.]])

In [16]:
t7 = torch.cat((t3,t6))
t7

tensor([[ 5.,  6.],
        [ 7.,  8.],
        [ 9., 10.],
        [42., 42.],
        [42., 42.],
        [42., 42.]])

In [17]:
t8 = torch.sin(t7)
t8

tensor([[-0.9589, -0.2794],
        [ 0.6570,  0.9894],
        [ 0.4121, -0.5440],
        [-0.9165, -0.9165],
        [-0.9165, -0.9165],
        [-0.9165, -0.9165]])

In [18]:
t9 = t8.reshape(3,2,2)
t9

tensor([[[-0.9589, -0.2794],
         [ 0.6570,  0.9894]],

        [[ 0.4121, -0.5440],
         [-0.9165, -0.9165]],

        [[-0.9165, -0.9165],
         [-0.9165, -0.9165]]])

# Interoperability with numpy

In [19]:
import numpy as np

In [20]:
x = np.array([[1,2],[3,4]])
x

array([[1, 2],
       [3, 4]])

In [21]:
# convert numpy array to torch tensor
y = torch.from_numpy(x)
y

tensor([[1, 2],
        [3, 4]])

In [22]:
x.dtype, y.dtype

(dtype('int64'), torch.int64)

In [23]:
# convert torch tensor to numpy
z = y.numpy()
z

array([[1, 2],
       [3, 4]])

In [24]:
inputs = np.array([[73, 67, 43], 
                   [91, 88, 64], 
                   [87, 134, 58], 
                   [102, 43, 37], 
                   [69, 96, 70]], dtype='float32')


In [25]:
targets = np.array([[56, 70], 
                    [81, 101], 
                    [119, 133], 
                    [22, 37], 
                    [103, 119]], dtype='float32')

In [26]:
inputs = torch.from_numpy(inputs)
targets = torch.from_numpy(targets)
print(inputs)
print(targets)

tensor([[ 73.,  67.,  43.],
        [ 91.,  88.,  64.],
        [ 87., 134.,  58.],
        [102.,  43.,  37.],
        [ 69.,  96.,  70.]])
tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])


# Linear regression model from scratch


In [27]:
w = torch.randn(2, 3, requires_grad=True)
b = torch.randn(2, requires_grad=True)
print(w)
print(b)

tensor([[-0.2326,  1.3107, -0.9803],
        [ 0.3582, -1.1337,  1.8023]], requires_grad=True)
tensor([-0.9584,  0.4856], requires_grad=True)


In [28]:
def model(x):
    """Linear model: multiply ``x`` by the transposed weights and add the bias.

    Reads the notebook-level tensors ``w`` (weights) and ``b`` (bias).
    """
    # Tensor.matmul is the method form of the @ (matrix multiplication) operator
    weighted = x.matmul(w.t())
    return weighted + b

In [29]:
# generate predictions
preds = model(inputs)
print(preds)

tensor([[ 27.7255,  28.1777],
        [ 30.4765,  48.6666],
        [ 97.5808, -15.7301],
        [ -4.5938,  54.9610],
        [ 40.1967,  42.5300]], grad_fn=<AddBackward0>)


In [30]:
# compare with targets
print(targets)

tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])


In [31]:
# implementation of mse: mean of the squared element-wise differences
diff = preds - targets
torch.sum(diff*diff)/diff.numel()
# numel gives the total number of elements in the tensor

tensor(4124.1128, grad_fn=<DivBackward0>)

# Loss function

In [32]:
def mse(t1, t2):
    """Mean squared error between two tensors.

    Sums the squared element-wise differences of ``t1`` and ``t2`` and divides
    the total by the number of elements in the difference tensor.
    """
    delta = t1 - t2
    squared_total = (delta * delta).sum()
    return squared_total / delta.numel()

In [33]:
loss = mse(preds, targets)
print(loss)

tensor(4124.1128, grad_fn=<DivBackward0>)


# Computing gradients

In [34]:
loss.backward()

In [35]:
# gradient for weights
print(w)
print(w.grad)

tensor([[-0.2326,  1.3107, -0.9803],
        [ 0.3582, -1.1337,  1.8023]], requires_grad=True)
tensor([[-3114.2278, -3276.6562, -2214.3643],
        [-4839.8589, -6781.2129, -3692.4766]])


In [36]:
print(b)
print(b.grad)

tensor([-0.9584,  0.4856], requires_grad=True)
tensor([-37.9229, -60.2790])


# Adjusting weights and biases to reduce the loss

In [37]:
# apply one gradient-descent step to the weights and biases
# torch.no_grad() tells PyTorch not to track the operations inside the block, so
# the in-place updates of w and b are not recorded for gradient computation
with torch.no_grad():
    alpha= 1e-5 # learning rate
    w -= w.grad * alpha
    b -= b.grad * alpha

In [38]:
print(w)
print(b)

tensor([[-0.2014,  1.3435, -0.9582],
        [ 0.4066, -1.0659,  1.8392]], requires_grad=True)
tensor([-0.9581,  0.4862], requires_grad=True)


In [39]:
loss = mse(model(inputs), targets)
print(loss)

tensor(3136.2727, grad_fn=<DivBackward0>)


In [40]:
# we need to set the gradients to zero otherwise recomputation of gradients adds to the
# existing gradient 
w.grad.zero_()
b.grad.zero_()
print(w.grad)
print(b.grad)

tensor([[0., 0., 0.],
        [0., 0., 0.]])
tensor([0., 0.])


# Train the model using gradient descent


In [41]:
# Generate predictions
preds = model(inputs)
print(preds)

tensor([[ 33.1468,  37.8426],
        [ 37.6115,  61.4022],
        [105.9656,  -0.2904],
        [  0.8114,  64.1804],
        [ 47.0415,  54.9648]], grad_fn=<AddBackward0>)


In [42]:
# Calculate the loss
loss = mse(preds, targets)
print(loss)

tensor(3136.2727, grad_fn=<DivBackward0>)


In [43]:
# Compute gradients
loss.backward()
print(w.grad)
print(b.grad)

tensor([[-2554.6006, -2675.8174, -1843.3235],
        [-3838.6377, -5695.7383, -3024.9326]])
tensor([-31.2846, -48.3801])


In [44]:
# Adjust weights & reset gradients
with torch.no_grad():
    # 1e-5 is the learning rate
    w -= w.grad * 1e-5
    b -= b.grad * 1e-5
    # zero the gradients in place so the next backward() call does not add to them
    w.grad.zero_()
    b.grad.zero_()

In [45]:
print(w)
print(b)

tensor([[-0.1759,  1.3702, -0.9398],
        [ 0.4450, -1.0089,  1.8695]], requires_grad=True)
tensor([-0.9578,  0.4867], requires_grad=True)


In [46]:
# Calculate loss
preds = model(inputs)
loss = mse(preds, targets)
print(loss)

tensor(2466.8569, grad_fn=<DivBackward0>)


# Train for multiple epochs

In [47]:
# Train for 100 epochs
for i in range(100):
    # forward pass and loss on `inputs`
    preds = model(inputs)
    loss = mse(preds, targets)
    # compute the gradients, stored in w.grad and b.grad
    loss.backward()
    # update the parameters without gradient tracking; 1e-5 is the learning rate
    with torch.no_grad():
        w -= w.grad * 1e-5
        b -= b.grad * 1e-5
        # reset the gradients so the next backward() call does not add to them
        w.grad.zero_()
        b.grad.zero_()

In [48]:
# Calculate loss
preds = model(inputs)
loss = mse(preds, targets)
print(loss)

tensor(433.9011, grad_fn=<DivBackward0>)


In [49]:
# Predictions
preds

tensor([[ 58.0597,  76.3539],
        [ 71.4632, 114.4349],
        [141.6866,  91.9367],
        [ 26.0668,  72.6531],
        [ 80.1320, 122.3826]], grad_fn=<AddBackward0>)

In [50]:
# Targets
targets

tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])

# Why not use numpy?
- using pytorch we can easily compute the gradients
- pytorch supports gpu, whereas numpy doesn't

# Linear regression using PyTorch built-ins

In [51]:
import torch.nn as nn

In [52]:
# Input (temp, rainfall, humidity)
inputs = np.array([[73, 67, 43], 
                   [91, 88, 64], 
                   [87, 134, 58], 
                   [102, 43, 37], 
                   [69, 96, 70], 
                   [74, 66, 43], 
                   [91, 87, 65], 
                   [88, 134, 59], 
                   [101, 44, 37], 
                   [68, 96, 71], 
                   [73, 66, 44], 
                   [92, 87, 64], 
                   [87, 135, 57], 
                   [103, 43, 36], 
                   [68, 97, 70]], 
                  dtype='float32')

# Targets (apples, oranges)
targets = np.array([[56, 70], 
                    [81, 101], 
                    [119, 133], 
                    [22, 37], 
                    [103, 119],
                    [57, 69], 
                    [80, 102], 
                    [118, 132], 
                    [21, 38], 
                    [104, 118], 
                    [57, 69], 
                    [82, 100], 
                    [118, 134], 
                    [20, 38], 
                    [102, 120]], 
                   dtype='float32')

inputs = torch.from_numpy(inputs)
targets = torch.from_numpy(targets)

In [53]:
inputs

tensor([[ 73.,  67.,  43.],
        [ 91.,  88.,  64.],
        [ 87., 134.,  58.],
        [102.,  43.,  37.],
        [ 69.,  96.,  70.],
        [ 74.,  66.,  43.],
        [ 91.,  87.,  65.],
        [ 88., 134.,  59.],
        [101.,  44.,  37.],
        [ 68.,  96.,  71.],
        [ 73.,  66.,  44.],
        [ 92.,  87.,  64.],
        [ 87., 135.,  57.],
        [103.,  43.,  36.],
        [ 68.,  97.,  70.]])

# Dataset and DataLoader
- for large datasets, in order to reduce the computation time, we split our dataset into batches to make computations faster
- it also helps if our RAM is limited

In [54]:
from torch.utils.data import TensorDataset

In [55]:
# define dataset
train_ds = TensorDataset(inputs, targets)
train_ds[0:3]

(tensor([[ 73.,  67.,  43.],
         [ 91.,  88.,  64.],
         [ 87., 134.,  58.]]),
 tensor([[ 56.,  70.],
         [ 81., 101.],
         [119., 133.]]))

In [56]:
from torch.utils.data import DataLoader

In [57]:
# define dataloader
batch_size = 5
train_dl = DataLoader(train_ds, batch_size, shuffle=True)

In [58]:
train_dl

<torch.utils.data.dataloader.DataLoader at 0x7ff1dae89430>

- we can use for loop to get each batch in the dataloader
- if shuffle is set to True, the DataLoader reshuffles the dataset at every epoch before creating the batches
- shuffling is beneficial because it randomizes the composition of each batch, which in turn helps to train the model faster

In [59]:
for xb, yb in train_dl:
    print(xb)
    print(yb)
    break

tensor([[ 68.,  97.,  70.],
        [ 91.,  88.,  64.],
        [ 92.,  87.,  64.],
        [101.,  44.,  37.],
        [ 87., 135.,  57.]])
tensor([[102., 120.],
        [ 81., 101.],
        [ 82., 100.],
        [ 21.,  38.],
        [118., 134.]])


# nn.Linear
- Instead of initializing the weights & biases manually, we can define the model using the nn.Linear class from PyTorch, which does it automatically.

In [60]:
# define model
model = nn.Linear(3,2)
print(model.weight)
print(model.bias)

Parameter containing:
tensor([[ 0.3052,  0.3109, -0.5628],
        [-0.1003, -0.5455,  0.5663]], requires_grad=True)
Parameter containing:
tensor([-0.2468, -0.4212], requires_grad=True)


- PyTorch models also have a helpful .parameters method, which returns an iterator over all the weight and bias matrices present in the model (we wrap it in list() below to display them). For our linear regression model, we have one weight matrix and one bias matrix.

In [61]:
# parameters
list(model.parameters())

[Parameter containing:
 tensor([[ 0.3052,  0.3109, -0.5628],
         [-0.1003, -0.5455,  0.5663]], requires_grad=True),
 Parameter containing:
 tensor([-0.2468, -0.4212], requires_grad=True)]

- we can use the model to generate predictions in the same way as before

In [62]:
preds = model(inputs)
preds

tensor([[ 18.6563, -19.9400],
        [ 18.8580, -21.3086],
        [ 35.3144, -49.3989],
        [ 23.4218, -13.1535],
        [ 11.2546, -20.0689],
        [ 18.6506, -19.4948],
        [ 17.9843, -20.1968],
        [ 35.0567, -48.9329],
        [ 23.4275, -13.5987],
        [ 10.3866, -19.4023],
        [ 17.7826, -18.8282],
        [ 18.8522, -20.8633],
        [ 36.1881, -50.5107],
        [ 24.2898, -13.8200],
        [ 11.2603, -20.5141]], grad_fn=<AddmmBackward0>)

In [63]:
targets

tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.],
        [ 57.,  69.],
        [ 80., 102.],
        [118., 132.],
        [ 21.,  38.],
        [104., 118.],
        [ 57.,  69.],
        [ 82., 100.],
        [118., 134.],
        [ 20.,  38.],
        [102., 120.]])

# Loss function
- instead of defining the loss function manually, we can use the built-in loss function 'mse_loss'

In [64]:
import torch.nn.functional as F

In [65]:
# define loss function
loss_fn = F.mse_loss

In [66]:
# loss computation
loss = loss_fn(model(inputs), targets)
print(loss)

tensor(9870.8691, grad_fn=<MseLossBackward0>)


# Optimizer
- Instead of manually manipulating the model's weights & biases using gradients, we can use the optimizer optim.SGD. SGD is short for "stochastic gradient descent". The term stochastic indicates that samples are selected in random batches instead of as a single group.

In [67]:
opt = torch.optim.SGD(model.parameters(), lr=1e-5)

# Training the model

In [68]:
# utility function to train the model
def fit(num_epochs, model, loss_fn, opt, train_dl):
    """Train ``model`` with mini-batch gradient descent.

    Args:
        num_epochs: number of passes to make over ``train_dl``.
        model: callable that maps a batch of inputs to predictions.
        loss_fn: callable ``loss_fn(pred, yb)`` returning a scalar loss tensor.
        opt: optimizer exposing ``step()`` and ``zero_grad()``.
        train_dl: iterable yielding ``(xb, yb)`` batches.

    Every 10th epoch, prints the loss of the most recently processed batch.
    Nothing is printed if no batch has been processed yet (for example when
    ``train_dl`` is empty).
    """
    # Stays None until a batch is processed, so the progress print below
    # cannot hit an unbound ``loss`` when the loader yields nothing.
    loss = None

    # repeat for given number of epochs:
    for epoch in range(num_epochs):

        # train with batches of data
        for xb, yb in train_dl:

            # 1. generate predictions
            pred = model(xb)

            # 2. calculate loss
            loss = loss_fn(pred, yb)

            # 3. compute gradients
            loss.backward()

            # 4. perform SGD and update parameters (weights and biases) using gradients
            opt.step()

            # 5. reset gradients to zero so they do not accumulate into the next batch
            opt.zero_grad()

        # Print the progress
        if loss is not None and (epoch+1) % 10 == 0:
            print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))

Some things to note above:

- We use the data loader defined earlier to get batches of data for every iteration.

- Instead of updating parameters (weights and biases) manually, we use opt.step to perform the update and opt.zero_grad to reset the gradients to zero.

- We've also added a log statement that prints the loss from the last batch of data for every 10th epoch to track training progress. loss.item returns the actual value stored in the loss tensor.

Let's train the model for 100 epochs.

In [69]:
fit(100, model, loss_fn, opt, train_dl)

Epoch [10/100], Loss: 785.5948
Epoch [20/100], Loss: 323.2477
Epoch [30/100], Loss: 107.2406
Epoch [40/100], Loss: 249.5095
Epoch [50/100], Loss: 95.4218
Epoch [60/100], Loss: 186.9138
Epoch [70/100], Loss: 107.5544
Epoch [80/100], Loss: 117.7475
Epoch [90/100], Loss: 81.9778
Epoch [100/100], Loss: 44.4334


In [70]:
# generate predictions
preds = model(inputs)
preds

tensor([[ 58.6686,  71.3153],
        [ 77.3285, 102.1231],
        [126.2870, 127.1762],
        [ 30.3263,  44.0316],
        [ 88.0684, 117.5504],
        [ 57.4998,  70.4642],
        [ 76.2283, 102.4473],
        [126.1114, 127.9599],
        [ 31.4952,  44.8827],
        [ 88.1371, 118.7257],
        [ 57.5684,  71.6395],
        [ 76.1597, 101.2720],
        [127.3872, 126.8521],
        [ 30.2577,  42.8563],
        [ 89.2373, 118.4016]], grad_fn=<AddmmBackward0>)

In [71]:
targets

tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.],
        [ 57.,  69.],
        [ 80., 102.],
        [118., 132.],
        [ 21.,  38.],
        [104., 118.],
        [ 57.,  69.],
        [ 82., 100.],
        [118., 134.],
        [ 20.,  38.],
        [102., 120.]])

# Predicting for new inputs

In [72]:
model(torch.tensor([[75.,63,44]])) # average temperature, rainfall humidity

tensor([[54.1840, 69.2818]], grad_fn=<AddmmBackward0>)

- for the given inputs, the predicted yield is 54.18 tons per hectare of apples and 69.28 tons per hectare of oranges

# Creating feedforward neural networks

In [74]:
model2 = nn.Sequential(
    nn.Linear(3,4),
    nn.Sigmoid(),
    nn.Linear(4,2)
)

In [87]:
opt2 = torch.optim.SGD(model2.parameters(), lr = 1e-3)

In [90]:
fit(100, model2, F.mse_loss, opt2, train_dl)

Epoch [10/100], Loss: 1252.9213
Epoch [20/100], Loss: 1176.0902
Epoch [30/100], Loss: 1774.5457
Epoch [40/100], Loss: 1177.0903
Epoch [50/100], Loss: 1262.7644
Epoch [60/100], Loss: 1031.9996
Epoch [70/100], Loss: 1634.3035
Epoch [80/100], Loss: 1098.9089
Epoch [90/100], Loss: 1263.1428
Epoch [100/100], Loss: 552.1130
