In [2]:
import torch
import torch.nn as nn

# Import pprint, module we use for making our print statements prettier
import pprint
pp = pprint.PrettyPrinter()

# Tensors

## Tensor Initialization

### From a Python List

In [3]:
# Initialize a tensor from a Python List
data = [
        [0, 1], 
        [2, 3],
        [4, 5]
       ]
x_python = torch.tensor(data)

# Print the tensor
x_python

tensor([[0, 1],
        [2, 3],
        [4, 5]])

In [4]:
# We are using the dtype to create a tensor of particular type
x_float = torch.tensor(data, dtype=torch.float)
x_float

tensor([[0., 1.],
        [2., 3.],
        [4., 5.]])

In [5]:
# We are using the dtype to create a tensor of particular type
x_bool = torch.tensor(data, dtype=torch.bool)
x_bool

tensor([[False,  True],
        [ True,  True],
        [ True,  True]])

In [6]:
x_python.float()

tensor([[0., 1.],
        [2., 3.],
        [4., 5.]])

In [7]:
# `torch.Tensor` defaults to float
# Same as torch.FloatTensor(data)
x = torch.Tensor(data) 
x

tensor([[0., 1.],
        [2., 3.],
        [4., 5.]])

### From a NumPy Array

In [8]:
import numpy as np

# Initialize a tensor from a NumPy array
ndarray = np.array(data)
x_numpy = torch.from_numpy(ndarray)

# Print the tensor
x_numpy


tensor([[0, 1],
        [2, 3],
        [4, 5]])

### From a Tensor

In [9]:
# Initialize a base tensor
x = torch.tensor([[1., 2.], [3., 4.]])
x

tensor([[1., 2.],
        [3., 4.]])

In [10]:
# Initialize a tensor of 0s
x_zeros = torch.zeros_like(x)
x_zeros

tensor([[0., 0.],
        [0., 0.]])

In [11]:
# Initialize a tensor of 1s
x_ones = torch.ones_like(x)
x_ones

tensor([[1., 1.],
        [1., 1.]])

In [12]:
# Initialize a tensor where each element is sampled from a uniform distribution
# between 0 and 1
x_rand = torch.rand_like(x)
x_rand

tensor([[0.1322, 0.0682],
        [0.0599, 0.9042]])

In [13]:
# Initialize a tensor where each element is sampled from a normal distribution
x_randn = torch.randn_like(x)
x_randn

tensor([[0.0542, 0.4742],
        [0.5360, 1.4363]])

### By Specifying a Shape

In [14]:
# Initialize a 4x2x2 tensor of 0s
shape = (4, 2, 2)
x_zeros = torch.zeros(shape) # x_zeros = torch.zeros(4, 2, 2) is an alternative
x_zeros

tensor([[[0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.]]])

### With torch.arange()

In [15]:
# Create a tensor with values 0-9
x = torch.arange(10)
x

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

### Tensor Properties

In [16]:
# DType Property 

# Initialize a 3x2 tensor, with 3 rows and 2 columns
x = torch.ones(3, 2)
x.dtype


torch.float32

In [17]:
# Shape
# Initialize a 3x2 tensor, with 3 rows and 2 columns
x = torch.Tensor([[1, 2], [3, 4], [5, 6]])
x

tensor([[1., 2.],
        [3., 4.],
        [5., 6.]])

In [18]:
# Print out its shape
# Same as x.size()
x.shape 

torch.Size([3, 2])

In [19]:
# Print out the number of elements in a particular dimension
# 0th dimension corresponds to the rows
x.shape[0] 

3

In [20]:
# Get the size of the 0th dimension
x.size(0)

3

In [21]:
# Example use of view()
# x_view shares the same memory as x, so changing one changes the other
x_view = x.view(3, 2)
x_view

tensor([[1., 2.],
        [3., 4.],
        [5., 6.]])

In [22]:
# We can ask PyTorch to infer the size of a dimension with -1
x_view = x.view(-1, 3)
x_view

tensor([[1., 2., 3.],
        [4., 5., 6.]])

In [23]:
# Change the shape of x to be 2x3
# x_reshaped could be a reference to or copy of x
x_reshaped = torch.reshape(x, (2, 3))
x_reshaped

tensor([[1., 2., 3.],
        [4., 5., 6.]])

In [24]:
# Initialize a 5x2 tensor, with 5 rows and 2 columns
x = torch.arange(10).reshape(5, 2)
x

tensor([[0, 1],
        [2, 3],
        [4, 5],
        [6, 7],
        [8, 9]])

In [25]:
# Add a new dimension of size 1 at the 1st dimension
x = x.unsqueeze(1)
x.shape

torch.Size([5, 1, 2])

In [26]:
x

tensor([[[0, 1]],

        [[2, 3]],

        [[4, 5]],

        [[6, 7]],

        [[8, 9]]])

In [27]:
# Get the number of elements in tensor.
x.numel()

10

## Device

In [28]:
# Initialize an example tensor
x = torch.Tensor([[1, 2], [3, 4]])
x


tensor([[1., 2.],
        [3., 4.]])

In [29]:
# Get the device of the tensor
x.device

device(type='cpu')

We can move a tensor from one device to another with the method to(device).

In [30]:
# Check if a GPU is available, if so, move the tensor to the GPU.
# Tensor.to() is NOT an in-place operation: it returns the tensor on the
# requested device, so the result has to be assigned back to x. Without the
# assignment x would silently stay on its original device.
if torch.cuda.is_available():
    x = x.to('cuda')
else:
    print("GPU not available, using CPU")
    x = x.to('cpu')

GPU not available, using CPU


In [31]:
x.device

device(type='cpu')

## Tensor Indexing

In [32]:
# Initialize an example tensor
x = torch.Tensor([
                  [[1, 2], [3, 4]],
                  [[5, 6], [7, 8]], 
                  [[9, 10], [11, 12]] 
                 ])
x

tensor([[[ 1.,  2.],
         [ 3.,  4.]],

        [[ 5.,  6.],
         [ 7.,  8.]],

        [[ 9., 10.],
         [11., 12.]]])

In [33]:
x.shape

torch.Size([3, 2, 2])

In [34]:
# Access the 0th element along dimension 0, which is the first 2x2 matrix
x[0] # Equivalent to x[0, :]

tensor([[1., 2.],
        [3., 4.]])

We can also index into multiple dimensions with :

In [35]:
# Get the top left element of each element in our tensor
x[:, 0, 0]

tensor([1., 5., 9.])

We can also access arbitrary elements in each dimension.

In [36]:
# Print x again to see our tensor
x

tensor([[[ 1.,  2.],
         [ 3.,  4.]],

        [[ 5.,  6.],
         [ 7.,  8.]],

        [[ 9., 10.],
         [11., 12.]]])

In [37]:
# Let's access the 0th and 1st elements, each twice
i = torch.tensor([0, 0, 1, 1])
x[i]


tensor([[[1., 2.],
         [3., 4.]],

        [[1., 2.],
         [3., 4.]],

        [[5., 6.],
         [7., 8.]],

        [[5., 6.],
         [7., 8.]]])

In [38]:
# Let's access the 0th row of the 1st and 2nd matrices
# The index tensors i and j are broadcast together, so this selects
# x[1, 0] and x[2, 0]
i = torch.tensor([1, 2])
j = torch.tensor([0])
x[i, j]

tensor([[ 5.,  6.],
        [ 9., 10.]])

In [39]:
x[0, 0, 0]

tensor(1.)

In [40]:
x[0, 0, 0].item()

1.0

## Operations

In [41]:
# Create an example tensor
x = torch.ones((3,2,2))
x

tensor([[[1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.]]])

In [42]:
# Perform elementwise addition
# Use - for subtraction
x + 2

tensor([[[3., 3.],
         [3., 3.]],

        [[3., 3.],
         [3., 3.]],

        [[3., 3.],
         [3., 3.]]])

In [43]:
# Perform elementwise multiplication
# Use / for division
x * 2

tensor([[[2., 2.],
         [2., 2.]],

        [[2., 2.],
         [2., 2.]],

        [[2., 2.],
         [2., 2.]]])

In [44]:
# Create a 4x3 tensor of 6s
a = torch.ones((4,3)) * 6
a

tensor([[6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.]])

In [45]:
# Create a 1D tensor of 2s
b = torch.ones(3) * 2
b

tensor([2., 2., 2.])

In [46]:
# Divide a by b
a / b

tensor([[3., 3., 3.],
        [3., 3., 3.],
        [3., 3., 3.],
        [3., 3., 3.]])

We can use tensor.matmul(other_tensor) for matrix multiplication and tensor.T for transpose. Matrix multiplication can also be performed with @.

In [47]:
# Alternative to a.matmul(b)
# a @ b.T returns the same result since b is 1D tensor and the 2nd dimension
# is inferred
a @ b 

tensor([36., 36., 36., 36.])

In [48]:
pp.pprint(a.shape)
pp.pprint(a.T.shape)

torch.Size([4, 3])
torch.Size([3, 4])


We can take the mean and standard deviation along a certain dimension with the methods mean(dim) and std(dim). That is, if we want to get the mean 3x2 matrix in a 4x3x2 matrix, we would set the dim to be 0. We can call these methods with no parameter to get the mean and standard deviation for the whole tensor. To use mean and std our tensor should be a floating point type.

In [49]:
# Create an example tensor
m = torch.tensor(
    [
     [1., 1.],
     [2., 2.],
     [3., 3.],
     [4., 4.]
    ]
)

pp.pprint("Mean: {}".format(m.mean()))
pp.pprint("Mean in the 0th dimension: {}".format(m.mean(0)))
pp.pprint("Mean in the 1st dimension: {}".format(m.mean(1)))

'Mean: 2.5'
'Mean in the 0th dimension: tensor([2.5000, 2.5000])'
'Mean in the 1st dimension: tensor([1., 2., 3., 4.])'


In [50]:
# Concatenate in dimension 0 and 1
a_cat0 = torch.cat([a, a, a], dim=0)
a_cat1 = torch.cat([a, a, a], dim=1)

print("Initial shape: {}".format(a.shape))
print("Shape after concatenation in dimension 0: {}".format(a_cat0.shape))
print("Shape after concatenation in dimension 1: {}".format(a_cat1.shape))

Initial shape: torch.Size([4, 3])
Shape after concatenation in dimension 0: torch.Size([12, 3])
Shape after concatenation in dimension 1: torch.Size([4, 9])


In [51]:
# Print our tensor
a

tensor([[6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.]])

In [52]:
# add() is not in place
a.add(a)
a

tensor([[6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.]])

In [53]:
# add_() is in place
a.add_(a)
a

tensor([[12., 12., 12.],
        [12., 12., 12.],
        [12., 12., 12.],
        [12., 12., 12.]])

## Autograd

PyTorch and other machine learning libraries are known for their automatic differentiation feature. That is, given that we have defined the set of operations that need to be performed, the framework itself can figure out how to compute the gradients. We can call the backward() method to ask PyTorch to calculate the gradients, which are then stored in the grad attribute.

In [54]:
# Create an example tensor
# requires_grad=True tells PyTorch to track operations on x so that gradients
# with respect to it can be computed and stored
x = torch.tensor([2.], requires_grad=True)

# Print the gradient if it is calculated
# Currently None since backward() has not been called yet
pp.pprint(x.grad)

None


In [55]:
# Calculating the gradient of y with respect to x
y = x * x * 3 # 3x^2
y.backward()
pp.pprint(x.grad) # d(y)/d(x) = d(3x^2)/d(x) = 6x = 12

tensor([12.])


Let's run backprop from a different tensor again to see what happens.

In [56]:
# Gradients accumulate: this backward() call adds the new gradient (12) to the
# value already stored in x.grad (12) instead of overwriting it, giving 24
z = x * x * 3 # 3x^2
z.backward()
pp.pprint(x.grad)

tensor([24.])


## Neural Network Module

In [57]:
import torch.nn as nn

### Linear Layer

In [58]:
# Create the inputs
# NOTE: the name `input` shadows the Python builtin of the same name
input = torch.ones(2,3,4)

# Make a linear layer transforming N,*,H_in dimensional inputs to N,*,H_out
# dimensional outputs (here H_in = 4 and H_out = 2)
linear = nn.Linear(4, 2)
linear_output = linear(input)
linear_output 

tensor([[[-0.4326,  1.0321],
         [-0.4326,  1.0321],
         [-0.4326,  1.0321]],

        [[-0.4326,  1.0321],
         [-0.4326,  1.0321],
         [-0.4326,  1.0321]]], grad_fn=<ViewBackward0>)

### Other Module Layers

There are several other preconfigured layers in the nn module. Some commonly used examples are nn.Conv2d, nn.ConvTranspose2d, nn.BatchNorm1d, nn.BatchNorm2d, nn.Upsample and nn.MaxPool2d among many others. We will learn more about these as we progress in the course. For now, the only important thing to remember is that we can treat each of these layers as plug and play components: we will be providing the required dimensions and PyTorch will take care of setting them up.

### Activation Function Layer

In [59]:
linear_output

tensor([[[-0.4326,  1.0321],
         [-0.4326,  1.0321],
         [-0.4326,  1.0321]],

        [[-0.4326,  1.0321],
         [-0.4326,  1.0321],
         [-0.4326,  1.0321]]], grad_fn=<ViewBackward0>)

In [60]:
sigmoid = nn.Sigmoid()
output = sigmoid(linear_output)
output

tensor([[[0.3935, 0.7373],
         [0.3935, 0.7373],
         [0.3935, 0.7373]],

        [[0.3935, 0.7373],
         [0.3935, 0.7373],
         [0.3935, 0.7373]]], grad_fn=<SigmoidBackward0>)

### Putting the Layers Together

So far we have seen that we can create layers and pass the output of one as the input of the next. Instead of creating intermediate tensors and passing them around, we can use nn.Sequential, which does exactly that.

In [61]:
block = nn.Sequential(
    nn.Linear(4, 2),
    nn.Sigmoid()
)

input = torch.ones(2,3,4)
output = block(input)
output

tensor([[[0.4219, 0.3491],
         [0.4219, 0.3491],
         [0.4219, 0.3491]],

        [[0.4219, 0.3491],
         [0.4219, 0.3491],
         [0.4219, 0.3491]]], grad_fn=<SigmoidBackward0>)

#### Custom Modules

In [62]:
class MultilayerPerceptron(nn.Module):
  """Small feed-forward network: Linear -> ReLU -> Linear -> Sigmoid.

  The whole stack is held in a single `nn.Sequential` stored as `self.model`.
  The first linear layer maps `input_size` features to `hidden_size`, and the
  second maps back to `input_size`, so the output has the same trailing
  dimension as the input.

  Args:
    input_size: number of features in the last dimension of the input
      (and of the output).
    hidden_size: number of features produced by the first linear layer.
  """

  def __init__(self, input_size, hidden_size):
    # nn.Module must be initialised before any submodule is assigned
    super().__init__()

    # Keep the constructor arguments around for later inspection
    self.input_size = input_size
    self.hidden_size = hidden_size

    # The attribute name `model` is arbitrary; assigning an nn.Module to any
    # attribute is what registers its parameters
    layers = [
        nn.Linear(input_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, input_size),
        nn.Sigmoid(),
    ]
    self.model = nn.Sequential(*layers)

  def forward(self, x):
    """Run `x` through the sequential stack and return the result."""
    return self.model(x)

In [63]:
class MultilayerPerceptron(nn.Module):
  """Small feed-forward network: Linear -> ReLU -> Linear -> Sigmoid.

  Each layer is stored as its own attribute and the layers are chained by hand
  in `forward`. The first linear layer maps `input_size` features to
  `hidden_size`, and the second maps back to `input_size`, so the output has
  the same trailing dimension as the input.

  Args:
    input_size: number of features in the last dimension of the input
      (and of the output).
    hidden_size: number of features produced by the first linear layer.
  """

  def __init__(self, input_size, hidden_size):
    # nn.Module must be initialised before any submodule is assigned
    super().__init__()

    # Keep the constructor arguments around for later inspection
    self.input_size, self.hidden_size = input_size, hidden_size

    # One attribute per layer; each assignment registers the layer's parameters
    self.linear = nn.Linear(input_size, hidden_size)
    self.relu = nn.ReLU()
    self.linear2 = nn.Linear(hidden_size, input_size)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    """Apply linear -> relu -> linear2 -> sigmoid to `x` and return the result."""
    hidden = self.relu(self.linear(x))
    return self.sigmoid(self.linear2(hidden))

Now that we have defined our class, we can instantiate it and see what it does.

In [64]:
# Make a sample input
input = torch.randn(2, 5)

# Create our model
model = MultilayerPerceptron(5, 3)

# Pass our input through our model
model(input)

tensor([[0.5514, 0.3762, 0.4956, 0.3232, 0.5243],
        [0.5711, 0.5383, 0.4982, 0.3363, 0.3744]], grad_fn=<SigmoidBackward0>)

In [65]:
list(model.named_parameters())

[('linear.weight',
  Parameter containing:
  tensor([[ 0.4041,  0.1791, -0.3538, -0.2695,  0.4321],
          [ 0.2490, -0.3557, -0.3299, -0.0379,  0.3613],
          [-0.3564, -0.2623,  0.3753, -0.1224, -0.1645]], requires_grad=True)),
 ('linear.bias',
  Parameter containing:
  tensor([0.0296, 0.3683, 0.0535], requires_grad=True)),
 ('linear2.weight',
  Parameter containing:
  tensor([[ 0.1015,  0.0289, -0.0413],
          [ 0.5406,  0.5114, -0.3159],
          [ 0.2199, -0.4689, -0.2090],
          [-0.5578, -0.1032, -0.4478],
          [-0.1840, -0.4108,  0.5023]], requires_grad=True)),
 ('linear2.bias',
  Parameter containing:
  tensor([ 0.2353, -0.2842,  0.1289, -0.4250, -0.2547], requires_grad=True))]

## Optimization

We have shown how gradients are calculated with the backward() function. Having the gradients isn't enough for our models to learn. We also need to know how to update the parameters of our models. This is where the optimizers come in. The torch.optim module contains several optimizers that we can use. Some popular examples are optim.SGD and optim.Adam. When initializing optimizers, we pass our model parameters, which can be accessed with model.parameters(), telling the optimizers which values they will be optimizing. Optimizers also have a learning rate (lr) parameter, which determines how big of an update will be made in every step. Different optimizers have different hyperparameters as well.

In [66]:
import torch.optim as optim

After we have our optimization function, we can define a loss that we want to optimize for. We can either define the loss ourselves, or use one of the predefined loss functions in PyTorch, such as nn.BCELoss(). Let's put everything together now! We will start by creating some dummy data.

In [67]:
# Create the y data
y = torch.ones(10, 5)

# Add some noise to our goal y to generate our x
# We want our model to predict our original data despite the noise
x = y + torch.randn_like(y)
x

tensor([[ 0.5522,  1.2808,  0.5177,  1.1953,  1.3007],
        [-0.2136,  1.8097,  1.9314, -0.2380,  0.2242],
        [ 1.6737,  1.0757,  1.1377,  0.4370, -0.9362],
        [ 1.3108,  3.3745,  2.1980,  0.6397,  0.3603],
        [ 0.5579,  1.2647,  0.8169,  1.1712,  1.0324],
        [ 0.5548, -0.3197,  0.3583,  0.7412,  0.7948],
        [-0.0565,  2.8970,  2.3576,  1.8476,  0.5214],
        [-0.2722,  1.2165,  1.2751,  0.9607,  2.7261],
        [ 1.2929,  1.7935,  2.6197,  1.6742, -0.2893],
        [ 1.7099,  0.3498,  1.3995,  2.2894, -0.4380]])

Now, we can define our model, optimizer and the loss function.

In [68]:
# Instantiate the model
model = MultilayerPerceptron(5, 3)

# Define the optimizer, telling it which parameters it is responsible for
adam = optim.Adam(model.parameters(), lr=1e-1)

# Define loss using a predefined loss function
# Binary cross entropy fits here: the model ends in a Sigmoid, so its
# predictions lie in [0, 1], and the targets y are all 1s
loss_function = nn.BCELoss()

# Calculate how our model is doing now, before any training
y_pred = model(x)
loss_function(y_pred, y).item()

0.7217341065406799

Let's see if we can have our model achieve a smaller loss. Now that we have everything we need, we can set up our training loop.

In [69]:
# Set the number of epochs, which determines the number of training iterations
n_epoch = 10 

for epoch in range(n_epoch):
  # Reset the gradients to 0; backward() accumulates into .grad, so without
  # this each step would also include the gradients of all earlier epochs
  adam.zero_grad()

  # Get the model predictions
  y_pred = model(x)

  # Get the loss
  loss = loss_function(y_pred, y)

  # Print stats (the loss shown is the one computed before this epoch's update)
  print(f"Epoch {epoch}: traing loss: {loss}")

  # Compute the gradients
  loss.backward()

  # Take a step to optimize the weights
  adam.step()

Epoch 0: traing loss: 0.7217341065406799
Epoch 1: traing loss: 0.5939948558807373
Epoch 2: traing loss: 0.47342708706855774
Epoch 3: traing loss: 0.34594619274139404
Epoch 4: traing loss: 0.23017439246177673
Epoch 5: traing loss: 0.14004084467887878
Epoch 6: traing loss: 0.07889825105667114
Epoch 7: traing loss: 0.042169179767370224
Epoch 8: traing loss: 0.022025126963853836
Epoch 9: traing loss: 0.011530808173120022


In [70]:
# See how our model performs on the training data
y_pred = model(x)
y_pred

tensor([[0.9992, 0.9995, 0.9972, 0.9997, 0.9933],
        [0.9918, 0.9948, 0.9854, 0.9956, 0.9675],
        [0.9871, 0.9916, 0.9798, 0.9927, 0.9560],
        [0.9999, 0.9999, 0.9992, 1.0000, 0.9980],
        [0.9990, 0.9994, 0.9967, 0.9996, 0.9922],
        [0.9878, 0.9919, 0.9803, 0.9931, 0.9585],
        [0.9999, 0.9999, 0.9991, 0.9999, 0.9978],
        [0.9998, 0.9999, 0.9991, 0.9999, 0.9977],
        [0.9996, 0.9998, 0.9983, 0.9999, 0.9959],
        [0.9985, 0.9991, 0.9957, 0.9993, 0.9898]], grad_fn=<SigmoidBackward0>)

In [71]:
# Create test data and check how our model performs on it
x2 = y + torch.randn_like(y)
y_pred = model(x2)
y_pred

tensor([[0.9975, 0.9985, 0.9937, 0.9988, 0.9854],
        [0.9999, 0.9999, 0.9993, 1.0000, 0.9982],
        [0.9875, 0.9919, 0.9803, 0.9930, 0.9571],
        [1.0000, 1.0000, 0.9996, 1.0000, 0.9990],
        [1.0000, 1.0000, 0.9998, 1.0000, 0.9995],
        [0.9992, 0.9995, 0.9971, 0.9996, 0.9930],
        [0.9997, 0.9999, 0.9987, 0.9999, 0.9968],
        [1.0000, 1.0000, 0.9998, 1.0000, 0.9994],
        [0.9952, 0.9970, 0.9900, 0.9976, 0.9774],
        [1.0000, 1.0000, 0.9997, 1.0000, 0.9992]], grad_fn=<SigmoidBackward0>)