# PyTorch Tutorial

In [1]:
import torch
import torch.nn

import pprint
pp = pprint.PrettyPrinter()

## Part 1: Tensors

Basically NumPy arrays, but they allow for parallelization and quick gradient computations.

In [2]:
# A plain nested Python list (2 rows x 3 columns); it is not referenced again in this cell.
list_of_lists = [
  [1, 2, 3],
  [4, 5, 6]
]

# torch.tensor builds a tensor from nested lists; integer literals produce an integer tensor.
data = torch.tensor([
  [0, 1],
  [2, 3],
  [4, 5], 
])
print(data)

tensor([[0, 1],
        [2, 3],
        [4, 5]])


In [3]:
data = torch.tensor([
  [0, 1],
  [2, 3],
  [4, 5]
], dtype=torch.float32)
data

tensor([[0., 1.],
        [2., 3.],
        [4., 5.]])

In [4]:
zeros = torch.zeros(2,5)
zeros

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])

In [5]:
ones = torch.ones(2,5)
ones

tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]])

In [6]:
rr = torch.arange(1,10)
rr

tensor([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [7]:
rr * 2

tensor([ 2,  4,  6,  8, 10, 12, 14, 16, 18])

In [8]:
rr * 4

tensor([ 4,  8, 12, 16, 20, 24, 28, 32, 36])

In [9]:
# a has shape (3, 2) and b has shape (2, 4): the inner dims (2) match, so a @ b is defined.
a = torch.tensor([[1,2], [2,3], [4,5]])
b = torch.tensor([[1,2,3,4],[5,6,7,8]]) 

# Matrix multiplication via the method form ...
print(a.matmul(b))

# ... and via the equivalent @ operator; both print the same (3, 4) result.
print(a @ b)

tensor([[11, 14, 17, 20],
        [17, 22, 27, 32],
        [29, 38, 47, 56]])
tensor([[11, 14, 17, 20],
        [17, 22, 27, 32],
        [29, 38, 47, 56]])


In [10]:
print(f'A: {a.shape}')
print(f'B: {b.shape}')
print(f'A@B: {(a@b).shape}')

A: torch.Size([3, 2])
B: torch.Size([2, 4])
A@B: torch.Size([3, 4])


Tensor Reshaping

In [11]:
# arange(1, 16) excludes the end value, giving the 15 integers 1..15.
rr = torch.arange(1, 16)
print(rr.shape)

# Reshape the flat 15-element tensor into 5 rows of 3 (5 * 3 = 15 elements, so the count is unchanged).
rr = rr.reshape(5, 3)
print(rr.shape)
print(rr)

torch.Size([15])
torch.Size([5, 3])
tensor([[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9],
        [10, 11, 12],
        [13, 14, 15]])


In [12]:
# The 35 consecutive floats 1..35 laid out as 5 rows x 7 columns.
data = torch.arange(1, 36, dtype=torch.float32).reshape(5,7)
print("Data is: ", data)


# dim=0 collapses the rows, leaving one sum per column (7 values).
print("Taking sum over columns:")
print(data.sum(dim=0))

# dim=1 collapses the columns, leaving one sum per row (5 values).
print("Taking sum over rows:")
print(data.sum(dim=1))

# std applies Bessel's correction by default (sample standard deviation), which is why
# each row of 7 consecutive values yields 2.1602 rather than the population value 2.0.
print("Taking the stdev over rows:")
print(data.std(dim=1))

Data is:  tensor([[ 1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11., 12., 13., 14.],
        [15., 16., 17., 18., 19., 20., 21.],
        [22., 23., 24., 25., 26., 27., 28.],
        [29., 30., 31., 32., 33., 34., 35.]])
Taking sum over columns:
tensor([ 75.,  80.,  85.,  90.,  95., 100., 105.])
Taking sum over rows:
tensor([ 28.,  77., 126., 175., 224.])
Taking the stdev over rows:
tensor([2.1602, 2.1602, 2.1602, 2.1602, 2.1602])


The dim we specify when performing these aggregating functions is the one that gets eliminated

In [13]:
x = torch.arange(1,13)
x = x.view(3, 2, 2)
x

tensor([[[ 1,  2],
         [ 3,  4]],

        [[ 5,  6],
         [ 7,  8]],

        [[ 9, 10],
         [11, 12]]])

In [14]:
x.shape

torch.Size([3, 2, 2])

In [15]:
x[0, :] # A colon for a dim means "take every element along that dim"

tensor([[1, 2],
        [3, 4]])

In [16]:
x[0]

tensor([[1, 2],
        [3, 4]])

In [17]:
x[:, 0]

tensor([[ 1,  2],
        [ 5,  6],
        [ 9, 10]])

In [18]:
x[:, :, 0]

tensor([[ 1,  3],
        [ 5,  7],
        [ 9, 11]])

In [19]:
y = torch.arange(1, 21)
y = y.view(5, 4)
y

tensor([[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12],
        [13, 14, 15, 16],
        [17, 18, 19, 20]])

In [20]:
y[[0, 2, 4]]

tensor([[ 1,  2,  3,  4],
        [ 9, 10, 11, 12],
        [17, 18, 19, 20]])

In [21]:
y[0,0]

tensor(1)

In [22]:
y[0,0].item()

1

## Autograd

Automatic differentiation feature. We call `backward()` for PyTorch to calculate the gradients, which are then stored in the `grad` attribute.

In [23]:
# requires_grad=True tells PyTorch to track operations on x so that gradients can be computed for it
x = torch.tensor([2.], requires_grad=True)

# No backward() call has run yet, so x.grad is still None
pp.pprint(x.grad)

None


In [24]:
y = x * x * 3
y.backward()
pp.pprint(x.grad) # d(y)/d(x) = d(3x²)/d(x) = 6x = 12

tensor([12.])


Gradients are accumulated across `backward()` calls

In [25]:
z = x * x * 3
z.backward()
# d(z)/d(x) = d(3x²)/d(x) = 6x = 12, which is added to the 12 already stored in x.grad, giving 24
pp.pprint(x.grad)

tensor([24.])


This means we should reset the gradients with `zero_grad` on every training iteration (before calling `backward()` again), so that the gradients from the previous epoch do not influence the next one.

## Neural Network Module

An `nn.Linear(H_in, H_out)` layer takes a tensor of shape `(N, *, H_in)` and outputs a tensor of shape `(N, *, H_out)`. We typically consider the first dim `N` to be the batch dimension (e.g. the number of images, sentences, or sequences). The star `*` indicates an arbitrary number of additional dimensions (e.g. with images there could be two).

In [26]:
import torch.nn as nn

In [27]:
# Named `inputs` rather than `input` so Python's built-in input() is not shadowed.
# The last dim of the tensor (6) must equal the in_features of the Linear layer.
inputs = torch.ones(2,3,6)

# Linear layer mapping 6 input features to 2 output features. It acts on the
# last dim only, so the leading (2, 3) dims are carried through to the output.
linear = nn.Linear(6,2)
linear_output = linear(inputs)
linear_output

tensor([[[0.1643, 0.1309],
         [0.1643, 0.1309],
         [0.1643, 0.1309]],

        [[0.1643, 0.1309],
         [0.1643, 0.1309],
         [0.1643, 0.1309]]], grad_fn=<ViewBackward0>)

In [28]:
list(linear.parameters()) # weight A with shape (2, 6) and bias b with shape (2,); the layer computes x @ A.T + b

[Parameter containing:
 tensor([[ 0.2248,  0.3835, -0.2953, -0.4049,  0.2233, -0.3593],
         [ 0.1114, -0.2636, -0.3103,  0.3189,  0.3827, -0.2263]],
        requires_grad=True),
 Parameter containing:
 tensor([0.3921, 0.1181], requires_grad=True)]

## Putting Layers together

In [29]:
# Chain layers: a Linear layer (4 -> 2 features) followed by an element-wise Sigmoid.
block = nn.Sequential(
  nn.Linear(4,2),
  nn.Sigmoid()
)

# Named `inputs` rather than `input` so Python's built-in input() is not shadowed.
inputs = torch.ones(2,3,4)
output = block(inputs)
output

tensor([[[0.5687, 0.4062],
         [0.5687, 0.4062],
         [0.5687, 0.4062]],

        [[0.5687, 0.4062],
         [0.5687, 0.4062],
         [0.5687, 0.4062]]], grad_fn=<SigmoidBackward0>)

## Custom Modules

Two things we need to define:
1. `__init__()`
2. `forward()`

In [30]:
class MultilayerPerceptron(nn.Module):
  """Small feed-forward network whose output has the same width as its input.

  The layers, in order, are Linear(input_size -> hidden_size), ReLU,
  Linear(hidden_size -> input_size) and Sigmoid.

  Args:
    input_size: number of features in the last dim of the input (and output).
    hidden_size: number of features in the hidden layer.
  """

  def __init__(self, input_size, hidden_size):
    super().__init__()

    self.input_size, self.hidden_size = input_size, hidden_size

    # Create the two Linear layers in the order they are applied.
    to_hidden = nn.Linear(input_size, hidden_size)
    to_output = nn.Linear(hidden_size, input_size)

    self.model = nn.Sequential(to_hidden, nn.ReLU(), to_output, nn.Sigmoid())

  def forward(self, x):
    """Run x through the sequential stack and return the sigmoid activations."""
    return self.model(x)

In [31]:
# Named `inputs` rather than `input` so Python's built-in input() is not shadowed.
# A batch of 2 random samples with 5 features each, matching the model's input_size.
inputs = torch.randn(2, 5)
model = MultilayerPerceptron(5, 3)

model(inputs)

tensor([[0.3983, 0.5325, 0.3171, 0.5198, 0.5501],
        [0.4292, 0.4734, 0.4168, 0.5798, 0.5391]], grad_fn=<SigmoidBackward0>)

In [33]:
import torch.optim as optim

In [34]:
# To train, we will need a loss function and an optimizer that is given the model's parameters.
# First, build a toy data set: the target y is all ones, and the input x is
# that target plus standard-normal noise of the same shape.

y = torch.ones(10, 5)
x = y + torch.randn_like(y)
x

tensor([[ 2.8202,  0.3334,  1.5684,  1.3698, -1.1681],
        [ 1.6039,  1.1667,  1.6924,  0.3294,  0.0083],
        [ 3.1038,  0.9461, -0.7384,  2.8090,  0.5961],
        [-0.0885,  0.6835,  0.3907, -0.5256, -0.4597],
        [ 1.8637,  1.4084,  2.0956, -0.9121,  2.5693],
        [ 1.2722,  2.8229,  1.6460,  0.0969,  0.8227],
        [ 0.1217,  1.1368,  1.1102, -0.0041,  1.4713],
        [ 0.4187,  1.4662,  0.4943,  1.6735,  1.7179],
        [ 1.3104,  1.4895,  1.9534,  0.9505,  1.5441],
        [-0.3242,  0.9450,  0.8084,  1.6520,  0.3484]])

In [35]:
# Fresh, untrained model: 5 input features, 3 hidden units
model = MultilayerPerceptron(5,3)

# The optimizer is handed the model's parameters, which it updates on each step()
adam = optim.Adam(model.parameters(), lr=1e-1)

# Binary cross-entropy; it expects predictions in [0, 1], which the model's final Sigmoid provides
loss_function = nn.BCELoss()

# Loss of the untrained model; .item() converts the 0-dim loss tensor to a Python float
y_pred = model(x)
lossy = loss_function(y_pred, y).item()
lossy

  from .autonotebook import tqdm as notebook_tqdm


0.8443832397460938

In [36]:
n_epochs = 10
for epoch in range(n_epochs):
  # Reset gradients first: backward() accumulates into .grad rather than overwriting it
  adam.zero_grad()
  # Forward pass, then the loss of the predictions against the targets y
  y_pred = model(x)
  loss = loss_function(y_pred, y)
  print(f"Epoch {epoch}: Training loss: {loss}")
  # Backpropagate to compute the gradients, then let Adam update the parameters
  loss.backward()
  adam.step()

Epoch 0: Training loss: 0.8443832397460938
Epoch 1: Training loss: 0.6940587759017944
Epoch 2: Training loss: 0.5450963377952576
Epoch 3: Training loss: 0.3767665922641754
Epoch 4: Training loss: 0.23145557940006256
Epoch 5: Training loss: 0.13198724389076233
Epoch 6: Training loss: 0.07587195187807083
Epoch 7: Training loss: 0.0481712780892849
Epoch 8: Training loss: 0.03338877856731415
Epoch 9: Training loss: 0.024316351860761642
