In [90]:
import torch
import torch.nn as nn

# Import pprint, module we use for making our print statements prettier
import pprint
pp = pprint.PrettyPrinter()

In [91]:
# Initialize a tensor from a Python List
data = [
        [0, 1], 
        [2, 3],
        [4, 5]
       ]
x_python = torch.tensor(data)

# Print the tensor
x_python

tensor([[0, 1],
        [2, 3],
        [4, 5]])

In [92]:
# We are using the dtype to create a tensor of particular type
x_float = torch.tensor(data, dtype=torch.float)
x_float

tensor([[0., 1.],
        [2., 3.],
        [4., 5.]])

In [93]:
# We are using the dtype to create a tensor of particular type
x_float = torch.tensor(data, dtype=torch.float)
x_float

tensor([[0., 1.],
        [2., 3.],
        [4., 5.]])

In [94]:
# We are using the dtype to create a tensor of particular type
x_bool = torch.tensor(data, dtype=torch.bool)
x_bool

tensor([[False,  True],
        [ True,  True],
        [ True,  True]])

We can also use tensor.FloatTensor, tensor.LongTensor, tensor.Tensor classes to instantiate a tensor of particular type. LongTensors are particularly important in NLP as many methods that deal with indices require the indices to be passed as a LongTensor, which is a 64 bit integer.

In [95]:
import numpy as np

# Initialize a tensor from a NumPy array
ndarray = np.array(data)
x_numpy = torch.from_numpy(ndarray)

# Print the tensor
x_numpy

tensor([[0, 1],
        [2, 3],
        [4, 5]])

From a Tensor
We can also initialize a tensor from another tensor, using the following methods:

torch.ones_like(old_tensor): Initializes a tensor of 1s.
torch.zeros_like(old_tensor): Initializes a tensor of 0s.
torch.rand_like(old_tensor): Initializes a tensor where all the elements are sampled from a uniform distribution between 0 and 1.
torch.randn_like(old_tensor): Initializes a tensor where all the elements are sampled from a normal distribution.
All of these methods preserve the tensor properties of the original tensor passed in, such as the shape and device, which we will cover in a bit.

From a Tensor
We can also initialize a tensor from another tensor, using the following methods:

torch.ones_like(old_tensor): Initializes a tensor of 1s.
torch.zeros_like(old_tensor): Initializes a tensor of 0s.
torch.rand_like(old_tensor): Initializes a tensor where all the elements are sampled from a uniform distribution between 0 and 1.
torch.randn_like(old_tensor): Initializes a tensor where all the elements are sampled from a normal distribution.
All of these methods preserve the tensor properties of the original tensor passed in, such as the shape and device, which we will cover in a bit.

In [96]:
# Initialize a base tensor
x = torch.tensor([[1., 2.], [3., 4.]])
x

tensor([[1., 2.],
        [3., 4.]])

In [97]:
# Initialize a tensor of 0s
x_zeros = torch.zeros_like(x)
x_zeros

tensor([[0., 0.],
        [0., 0.]])

In [98]:
# Initialize a tensor of 1s
x_ones = torch.ones_like(x)
x_ones


tensor([[1., 1.],
        [1., 1.]])

In [99]:
# Initialize a tensor where each element is sampled from a uniform distribution
# between 0 and 1
x_rand = torch.rand_like(x)
x_rand

tensor([[0.5707, 0.3576],
        [0.6809, 0.1375]])

In [100]:
# Initialize a tensor where each element is sampled from a normal distribution
x_randn = torch.randn_like(x)
x_randn

tensor([[ 0.5783, -0.7311],
        [-0.5846,  0.4765]])

By Specifying a Shape
We can also instantiate tensors by specifying their shapes (which we will cover in more detail in a bit). The methods we could use follow the ones in the previous section:

torch.zeros()
torch.ones()
torch.rand()
torch.randn(

In [101]:
# Initialize a 2x3x2 tensor of 0s
shape = (4, 2, 2)
x_zeros = torch.zeros(shape) # x_zeros = torch.zeros(4, 3, 2) is an alternative
x_zeros

tensor([[[0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.]]])

# With torch.arange()


In [102]:
# Create a tensor with values 0-9
x = torch.arange(10)
x

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [103]:
# Initialize a 3x2 tensor, with 3 rows and 2 columns
x = torch.ones(3, 2)
x.dtype

torch.float32

# Tensor Properties
Tensors have a few properties that are important for us to cover. These are namely shape, and the device properties. -->

In [104]:
# Initialize a 3x2 tensor, with 3 rows and 2 columns
x = torch.Tensor([[1, 2], [3, 4], [5, 6]])
x

tensor([[1., 2.],
        [3., 4.],
        [5., 6.]])

# Shape
The shape property tells us the shape of our tensor. This can help us identify how many dimensional our tensor is as well as how many elements exist in each dimension.

In [105]:
# Initialize a 3x2 tensor, with 3 rows and 2 columns
x = torch.Tensor([[1, 2], [3, 4], [5, 6]])
x

tensor([[1., 2.],
        [3., 4.],
        [5., 6.]])

In [106]:
# Print out its shape
# Same as x.size()
x.shape

torch.Size([3, 2])

In [107]:
# Print out the number of elements in a particular dimension
# 0th dimension corresponds to the rows
x.shape[0]

3

In [108]:
# Get the size of the 0th dimension
x.size(0)


3

In [109]:
x.shape

torch.Size([3, 2])

In [110]:
# Example use of view()
# x_view shares the same memory as x, so changing one changes the other
x_view = x.view(3, 2)
x_view

tensor([[1., 2.],
        [3., 4.],
        [5., 6.]])

We can change the shape of a tensor with the view() method.



In [111]:
x_view = x.view(-1,3)
x_view

tensor([[1., 2., 3.],
        [4., 5., 6.]])

In [112]:
tensor = torch.tensor([[1, 2, 3], [4, 5, 6]])
reshaped = tensor.view(-1, 2)  # Automatically calculates rows

print(reshaped)

tensor([[1, 2],
        [3, 4],
        [5, 6]])


In [113]:
reshaped = tensor.view(-1,3) 
print(reshaped)

tensor([[1, 2, 3],
        [4, 5, 6]])


In [114]:
tensor = torch.arange(10).view(2, 5)
print(tensor)
transposed = tensor.T  # Non-contiguous tensor
print(transposed)
try:
    reshaped = transposed.view(-1)  # This will raise an error
except RuntimeError as e:
    print(e)

# Fix with contiguous
reshaped = transposed.contiguous().view(-1)
print(reshaped)


tensor([[0, 1, 2, 3, 4],
        [5, 6, 7, 8, 9]])
tensor([[0, 5],
        [1, 6],
        [2, 7],
        [3, 8],
        [4, 9]])
view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.
tensor([0, 5, 1, 6, 2, 7, 3, 8, 4, 9])


# A 3D tensor of shape (2, 3, 4)


Transposing 3d Tensor

In [115]:
import torch

# A 3D tensor of shape (2, 3, 4)
tensor = torch.arange(24).view(2, 3, 4)
print("Original Tensor:")
print(tensor)

# Transposing dimensions 1 and 2 (swap last two dimensions)
transposed = tensor.transpose(1, 2)
print("\nTransposed Tensor (Shape: 2, 4, 3):")
print(transposed)


Original Tensor:
tensor([[[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]],

        [[12, 13, 14, 15],
         [16, 17, 18, 19],
         [20, 21, 22, 23]]])

Transposed Tensor (Shape: 2, 4, 3):
tensor([[[ 0,  4,  8],
         [ 1,  5,  9],
         [ 2,  6, 10],
         [ 3,  7, 11]],

        [[12, 16, 20],
         [13, 17, 21],
         [14, 18, 22],
         [15, 19, 23]]])


We can also use torch.reshape() method for a similar purpose. There is a subtle difference between reshape() and view(): view() requires the data to be stored contiguously in the memory. You can refer to this StackOverflow answer for more information. In simple terms, contiguous means that the way our data is laid out in the memory is the same as the way we would read elements from it. This happens because some methods, such as transpose() and view(), do not actually change how our data is stored in the memory. They just change the meta information about out tensor, so that when we use it we will see the elements in the order we expect.

reshape() calls view() internally if the data is stored contiguously, if not, it returns a copy. The difference here isn't too important for basic tensors, but if you perform operations that make the underlying storage of the data non-contiguous (such as taking a transpose), you will have issues using view(). If you would like to match the way your tensor is stored in the memory to how it is used, you can use the contiguous() method.



In [116]:
# Change the shape of x to be 3x2
# x_reshaped could be a reference to or copy of x
x_reshaped = torch.reshape(x, (2, 3))
x_reshaped

tensor([[1., 2., 3.],
        [4., 5., 6.]])

We can use torch.unsqueeze(x, dim) function to add a dimension of size 1 to the provided dim, where x is the tensor. We can also use the corresponding use torch.squeeze(x), which removes the dimensions of size 1.

In [117]:
# Initialize a 5x2 tensor, with 5 rows and 2 columns
x = torch.arange(10).reshape(5, 2)
x


tensor([[0, 1],
        [2, 3],
        [4, 5],
        [6, 7],
        [8, 9]])

# UNSQUEEZE just adds a dimension means . asuume it creates a seperate dimension at that position and from that position to the right everything will be under that 1 dimension

In [118]:
# Add a new dimension of size 1 at the 1st dimension
x = x.unsqueeze(1)
x.shape


torch.Size([5, 1, 2])

In [119]:
# Squeeze the dimensions of x by getting rid of all the dimensions with 1 element
x = x.squeeze()
x.shape

torch.Size([5, 2])

In [120]:
x = x.unsqueeze(1)
x = x.unsqueeze(1)
x = x.unsqueeze(1)
x.shape

torch.Size([5, 1, 1, 1, 2])

In [121]:
x = x.squeeze()
x.shape

torch.Size([5, 2])

# understand the above example it will be clear


If we want to get the total number of elements in a tensor, we can use the numel() method.



In [122]:
x

tensor([[0, 1],
        [2, 3],
        [4, 5],
        [6, 7],
        [8, 9]])

In [123]:
x.numel()

10

# Device
Device property tells PyTorch where to store our tensor. Where a tensor is stored determines which device, GPU or CPU, would be handling the computations involving it. We can find the device of a tensor with the device property

In [124]:
# Initialize an example tensor
x = torch.Tensor([[1, 2], [3, 4]])
x

tensor([[1., 2.],
        [3., 4.]])

In [125]:
# Check if a GPU is available, if so, move the tensor to the GPU
if torch.cuda.is_available():
  x.to('cuda') 

In [126]:
x.device

device(type='cpu')

In [127]:
if torch.cuda.is_available():
    device = torch.device("cuda:"+str(2))


In [128]:
device

device(type='cuda', index=2)

In [129]:
x.to(device=2)

tensor([[1., 2.],
        [3., 4.]], device='cuda:2')

In [130]:
x.device

device(type='cpu')

In [131]:
!nvidia-smi

Sat Dec 14 15:48:14 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.154.05             Driver Version: 535.154.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-32GB           Off | 00000000:06:00.0 Off |                    0 |
| N/A   35C    P0              57W / 300W |  32047MiB / 32768MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2-32GB           Off | 00000000:07:00.0 Off |  

# Tensor Indexing


In [132]:
# Initialize an example tensor
x = torch.Tensor([
                  [[1, 2], [3, 4]],
                  [[5, 6], [7, 8]], 
                  [[9, 10], [11, 12]] 
                 ])
x

tensor([[[ 1.,  2.],
         [ 3.,  4.]],

        [[ 5.,  6.],
         [ 7.,  8.]],

        [[ 9., 10.],
         [11., 12.]]])

In [133]:
x.shape

torch.Size([3, 2, 2])

In [134]:
# Access the 0th element, which is the first row
x[0] # Equivalent to x[0, :]

tensor([[1., 2.],
        [3., 4.]])

In [135]:
# Get the top left element of each element in our tensor
x[:, 0, 0]

tensor([1., 5., 9.])

In [136]:
# Let's access the 0th and 1st elements, each twice
i = torch.tensor([0, 0, 1, 1])
result = x[i]

In [137]:
result.shape

torch.Size([4, 2, 2])

In [138]:
# Let's access the 0th elements of the 1st and 2nd elements
i = torch.tensor([1, 2])
j = torch.tensor([0])
x[i, j]

tensor([[ 5.,  6.],
        [ 9., 10.]])

3. Combined Indexing x[i, j]:
In PyTorch, x[i, j] performs element-wise indexing:





i and j must have compatible shapes for broadcasting.
Here, i = [1, 2] and j = [0] broadcast to [1, 2] and [0, 0].
Therefore, it selects:

//


x[1, 0] → 30 (from row 1, column 0)
x[2, 0] → 50 (from row 2, column 0)


In [139]:
# Create an example tensor
x = torch.ones((3,2,2))
x
# Perform elementwise addition
# Use - for subtraction
x + 2
# Perform elementwise multiplication
# Use / for division
x * 2
# Create a 4x3 tensor of 6s
a = torch.ones((4,3)) * 6
a
# Create a 1D tensor of 2s
b = torch.ones(3) * 2
b
# Divide a by b
a / b


tensor([[3., 3., 3.],
        [3., 3., 3.],
        [3., 3., 3.],
        [3., 3., 3.]])

This operation divides tensor a by tensor b.
Broadcasting rules apply since the shapes of a (4, 3) and b (3) are different:
Tensor b is "stretched" to match the shape of a along its first dimension.
Essentially, b is treated as:
tensor([[2., 2., 2.],
        [2., 2., 2.],
        [2., 2., 2.],
        [2., 2., 2.]])



In [140]:
# Alternative to a.matmul(b)
# a @ b.T returns the same result since b is 1D tensor and the 2nd dimension
# is inferred
a @ b 

tensor([36., 36., 36., 36.])

We can use tensor.matmul(other_tensor) for matrix multiplication and tensor.T for transpose. Matrix multiplication can also be performed with @.



In [141]:
# Create an example tensor
m = torch.tensor(
    [
     [1., 1.],
     [2., 2.],
     [3., 3.],
     [4., 4.]
    ]
)

pp.pprint("Mean: {}".format(m.mean()))
pp.pprint("Mean in the 0th dimension: {}".format(m.mean(0)))
pp.pprint("Mean in the 1st dimension: {}".format(m.mean(1)))

'Mean: 2.5'
'Mean in the 0th dimension: tensor([2.5000, 2.5000])'
'Mean in the 1st dimension: tensor([1., 2., 3., 4.])'


In [142]:
a
a.shape

torch.Size([4, 3])

# Conactenation big one


In [143]:
# Concatenate in dimension 0 and 1
a_cat0 = torch.cat([a, a, a], dim=0)
a_cat1 = torch.cat([a, a, a], dim=1)

print("Initial shape: {}".format(a.shape))
print("Shape after concatenation in dimension 0: {}".format(a_cat0.shape))
print("Shape after concatenation in dimension 1: {}".format(a_cat1.shape))

Initial shape: torch.Size([4, 3])
Shape after concatenation in dimension 0: torch.Size([12, 3])
Shape after concatenation in dimension 1: torch.Size([4, 9])


In [144]:
a_cat0

tensor([[6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.],
        [6., 6., 6.]])

In [145]:
a_cat1

tensor([[6., 6., 6., 6., 6., 6., 6., 6., 6.],
        [6., 6., 6., 6., 6., 6., 6., 6., 6.],
        [6., 6., 6., 6., 6., 6., 6., 6., 6.],
        [6., 6., 6., 6., 6., 6., 6., 6., 6.]])

In [146]:
# add_() is in place
a.add_(a)
a


tensor([[12., 12., 12.],
        [12., 12., 12.],
        [12., 12., 12.],
        [12., 12., 12.]])

In [147]:
# Compute a more complex function
a = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
b = a * 2 + 1  # b = 2a + 1
c = b.mean()  # Scalar output

c.backward()  # Compute gradients
print("Gradient of a:", a.grad)  # dc/da


Gradient of a: tensor([0.6667, 0.6667, 0.6667])


# Autograd

In [148]:
# Create an example tensor
# requires_grad parameter tells PyTorch to store gradients
x = torch.tensor([2.], requires_grad=True)

# Print the gradient if it is calculated
# Currently None since x is a scalar
pp.pprint(x.grad)

None


In [149]:
# Calculating the gradient of y with respect to x
y = x * x * 3 # 3x^2
y.backward()
pp.pprint(x.grad) # d(y)/d(x) = d(3x^2)/d(x) = 6x = 12

tensor([12.])


In [150]:
z = x * x * 3 # 3x^2
z.backward()
pp.pprint(x.grad)

tensor([24.])


In [151]:
z = x * x * 3  # 3x^2
z.backward()
pp.pprint(x.grad)

tensor([36.])


# Neural Network Module



So far we have looked into the tensors, their properties and basic operations on tensors. These are especially useful to get familiar with if we are building the layers of our network from scratch. We will utilize these in Assignment 3, but moving forward, we will use predefined blocks in the torch.nn module of PyTorch. We will then put together these blocks to create complex networks. Let's start by importing this module with an alias so that we don't have to type torch every time we use it.



In [152]:
import torch
import torch.nn as nn

# Input tensor: Shape (2, 3, 4)
input = torch.ones(2, 3, 4)

# Linear layer: Transforms input of size 4 to output of size 2
linear = nn.Linear(4, 2)
linear_output = linear(input)
print("Linear Output Shape:", linear_output.shape)


Linear Output Shape: torch.Size([2, 3, 2])


In [153]:
linear_output

tensor([[[-0.2955,  0.7897],
         [-0.2955,  0.7897],
         [-0.2955,  0.7897]],

        [[-0.2955,  0.7897],
         [-0.2955,  0.7897],
         [-0.2955,  0.7897]]], grad_fn=<ViewBackward0>)

In [154]:
linear_output.mean()

tensor(0.2471, grad_fn=<MeanBackward0>)

In [155]:
# Define a dummy loss (mean of output elements)
loss = linear_output.mean()

# Perform backpropagation
loss.backward()

# Gradients of weights and biases
print("Weight Gradient:", linear.weight.grad)
print("Bias Gradient:", linear.bias.grad)


Weight Gradient: tensor([[0.5000, 0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000, 0.5000]])
Bias Gradient: tensor([0.5000, 0.5000])


In [156]:
# Summary
# Batch Size: Number of independent samples processed simultaneously. It's a parallelization tool to improve computational efficiency.
# Sequence Length: The number of steps, rows, or units per sample in the batch. Its interpretation varies by context (time steps, words, etc.).
# In your tensor, 
# (
# 2
# ,
# 3
# ,
# 4
# )
# (2,3,4):
# Batch size = 2
# Sequence length = 3
# Feature dimension = 4

# Other Module Layers
There are several other preconfigured layers in the nn module. Some commonly used examples are nn.Conv2d, nn.ConvTranspose2d, nn.BatchNorm1d, nn.BatchNorm2d, nn.Upsample and nn.MaxPool2d among many others. We will learn more about these as we progress in the course. For now, the only important thing to remember is that we can treat each of these layers as plug and play components: we will be providing the required dimensions and PyTorch will take care of setting them up.



# Activation Function Layer

We can also use the nn module to apply activations functions to our tensors. Activation functions are used to add non-linearity to our network. Some examples of activations functions are nn.ReLU(), nn.Sigmoid() and nn.LeakyReLU(). Activation functions operate on each element seperately, so the shape of the tensors we get as an output are the same as the ones we pass in.




In [157]:
sigmoid = nn.Sigmoid()
output = sigmoid(linear_output)
output

tensor([[[0.4266, 0.6878],
         [0.4266, 0.6878],
         [0.4266, 0.6878]],

        [[0.4266, 0.6878],
         [0.4266, 0.6878],
         [0.4266, 0.6878]]], grad_fn=<SigmoidBackward0>)

# Putting the Layers Together


In [158]:
block = nn.Sequential(
    nn.Linear(4, 2),
    nn.Sigmoid()
)

input = torch.ones(2,3,4)
output = block(input)
output

tensor([[[0.6004, 0.3913],
         [0.6004, 0.3913],
         [0.6004, 0.3913]],

        [[0.6004, 0.3913],
         [0.6004, 0.3913],
         [0.6004, 0.3913]]], grad_fn=<SigmoidBackward0>)

# Custom Modules
Instead of using the predefined modules, we can also build our own by extending the nn.Module class. For example, we can build a the nn.Linear (which also extends nn.Module) on our own using the tensor introduced earlier! We can also build new, more complex modules, such as a custom neural network. You will be practicing these in the later assignment.

To create a custom module, the first thing we have to do is to extend the nn.Module. We can then initialize our parameters in the __init__ function, starting with a call to the __init__ function of the super class. All the class attributes we define which are nn module objects are treated as parameters, which can be learned during the training. Tensors are not parameters, but they can be turned into parameters if they are wrapped in nn.Parameter class.

All classes extending nn.Module are also expected to implement a forward(x) function, where x is a tensor. This is the function that is called when a parameter is passed to our module, such as in model(x).

# 1 st way

In [159]:
class MultilayerPerceptron(nn.Module):

  def __init__(self, input_size, hidden_size):
    # Call to the __init__ function of the super class
    super(MultilayerPerceptron, self).__init__()

    # Bookkeeping: Saving the initialization parameters
    self.input_size = input_size 
    self.hidden_size = hidden_size 

    # Defining of our model
    # There isn't anything specific about the naming of `self.model`. It could
    # be something arbitrary.
    self.model = nn.Sequential(
        nn.Linear(self.input_size, self.hidden_size),
        nn.ReLU(),
        nn.Linear(self.hidden_size, self.input_size),
        nn.Sigmoid()
    )
    
  def forward(self, x):
    output = self.model(x)
    return output

# 2 nd way

In [160]:
class MultilayerPerceptron(nn.Module):

  def __init__(self, input_size, hidden_size):
    # Call to the __init__ function of the super class
    super(MultilayerPerceptron, self).__init__()

    # Bookkeeping: Saving the initialization parameters
    self.input_size = input_size 
    self.hidden_size = hidden_size 

    # Defining of our layers
    self.linear = nn.Linear(self.input_size, self.hidden_size)
    self.relu = nn.ReLU()
    self.linear2 = nn.Linear(self.hidden_size, self.input_size)
    self.sigmoid = nn.Sigmoid()
    
  def forward(self, x):
    linear = self.linear(x)
    relu = self.relu(linear)
    linear2 = self.linear2(relu)
    output = self.sigmoid(linear2)
    return output

In [161]:
# Make a sample input
input = torch.randn(2, 5)

# Create our model
model = MultilayerPerceptron(5, 3)

# Pass our input through our model
model(input)

tensor([[0.5240, 0.3813, 0.5767, 0.4643, 0.5471],
        [0.5370, 0.3656, 0.6397, 0.5570, 0.5527]], grad_fn=<SigmoidBackward0>)

# We can inspect the parameters of our model with named_parameters() and parameters() methods.



In [162]:
list(model.named_parameters())
# list(model.parameters())


[('linear.weight',
  Parameter containing:
  tensor([[ 0.1575, -0.0194, -0.2044,  0.3691, -0.1456],
          [-0.3416, -0.0482, -0.4342,  0.2628, -0.0540],
          [-0.2163, -0.2916, -0.2140,  0.4041,  0.3812]], requires_grad=True)),
 ('linear.bias',
  Parameter containing:
  tensor([ 0.2761,  0.0789, -0.2437], requires_grad=True)),
 ('linear2.weight',
  Parameter containing:
  tensor([[-0.3080,  0.4572, -0.3109],
          [-0.0031, -0.1281, -0.0758],
          [-0.0423,  0.5671,  0.1794],
          [ 0.4506,  0.2087, -0.4294],
          [-0.0601,  0.1130,  0.0197]], requires_grad=True)),
 ('linear2.bias',
  Parameter containing:
  tensor([ 0.2264, -0.4827,  0.3270, -0.3335,  0.2145], requires_grad=True))]

# 6. Summary
Variables used in forward are not automatically parameters.
Only variables:
Defined in __init__, and
Registered as part of the model (via nn.Linear, nn.Parameter, etc.) are considered parameters.
Temporary variables or tensors defined dynamically in forward are not treated as parameters

# 7 Optimization

We have showed how gradients are calculated with the backward() function. Having the gradients isn't enought for our models to learn. We also need to know how to update the parameters of our models. This is where the optomozers comes in. torch.optim module contains several optimizers that we can use. Some popular examples are optim.SGD and optim.Adam. When initializing optimizers, we pass our model parameters, which can be accessed with model.parameters(), telling the optimizers which values it will be optimizing. Optimizers also has a learning rate (lr) parameter, which determines how big of an update will be made in every step. Different optimizers have different hyperparameters as well

In [163]:
import torch.optim as optim


In [164]:
# Create the y data
y = torch.ones(10, 5)

# Add some noise to our goal y to generate our x
# We want out model to predict our original data, albeit the noise
x = y + torch.randn_like(y)
x

tensor([[-1.1246,  0.4158,  0.6701,  2.3684,  1.6137],
        [ 1.1349,  2.9332,  0.6678,  1.3227,  2.6178],
        [-0.0701, -0.3620,  0.1470,  0.4678,  0.0390],
        [ 0.9460,  1.3305, -1.7679, -0.3706,  1.5793],
        [ 0.2037,  1.1595,  1.1309,  0.2724, -0.3707],
        [ 1.3111, -0.1565,  0.2871,  0.3701,  0.9300],
        [ 0.7448, -0.1652, -0.1627, -0.0862,  1.4703],
        [ 1.7290,  1.4431,  1.8540,  1.3332,  0.0223],
        [ 1.6582,  0.3306,  0.2948,  1.1027, -0.2368],
        [ 1.9333,  0.6692,  0.3107, -0.4042, -0.0662]])

In [165]:
# Instantiate the model
model = MultilayerPerceptron(5, 3)

# Define the optimizer
adam = optim.Adam(model.parameters(), lr=1e-1)

# Define loss using a predefined loss function
loss_function = nn.BCELoss()

# Calculate how our model is doing now
y_pred = model(x)
loss_function(y_pred, y).item()

0.5697386860847473

After we have our optimization function, we can define a loss that we want to optimize for. We can either define the loss ourselves, or use one of the predefined loss function in PyTorch, such as nn.BCELoss(). Let's put everything together now! We will start by creating some dummy data.

In [166]:
# Set the number of epoch, which determines the number of training iterations
n_epoch = 150 

for epoch in range(n_epoch):
  # Set the gradients to 0
  adam.zero_grad()

  # Get the model predictions
  y_pred = model(x)

  # Get the loss
  loss = loss_function(y_pred, y)

  # Print stats
  print(f"Epoch {epoch}: traing loss: {loss}")

  # Compute the gradients
  loss.backward()

  # Take a step to optimize the weights
  adam.step()

Epoch 0: traing loss: 0.5697386860847473
Epoch 1: traing loss: 0.4518817961215973
Epoch 2: traing loss: 0.31862181425094604
Epoch 3: traing loss: 0.20102010667324066
Epoch 4: traing loss: 0.11843087524175644
Epoch 5: traing loss: 0.06561624258756638
Epoch 6: traing loss: 0.03644599765539169
Epoch 7: traing loss: 0.02103191614151001
Epoch 8: traing loss: 0.012776395305991173
Epoch 9: traing loss: 0.008140631020069122
Epoch 10: traing loss: 0.005390300881117582
Epoch 11: traing loss: 0.003679005429148674
Epoch 12: traing loss: 0.002574361627921462
Epoch 13: traing loss: 0.0018410177435725927
Epoch 14: traing loss: 0.001343145384453237
Epoch 15: traing loss: 0.0009986020158976316
Epoch 16: traing loss: 0.0007560430676676333
Epoch 17: traing loss: 0.0005824940744787455
Epoch 18: traing loss: 0.00045641648466698825
Epoch 19: traing loss: 0.0003634699387475848
Epoch 20: traing loss: 0.00029399021877907217
Epoch 21: traing loss: 0.00024132843827828765
Epoch 22: traing loss: 0.0002009059535339

Epoch 37: traing loss: 4.286671901354566e-05
Epoch 38: traing loss: 4.079274367541075e-05
Epoch 39: traing loss: 3.8983569538686424e-05
Epoch 40: traing loss: 3.7389094359241426e-05
Epoch 41: traing loss: 3.5985445720143616e-05
Epoch 42: traing loss: 3.473805554676801e-05
Epoch 43: traing loss: 3.363734867889434e-05
Epoch 44: traing loss: 3.2652333175064996e-05
Epoch 45: traing loss: 3.1778228731127456e-05
Epoch 46: traing loss: 3.099595778621733e-05
Epoch 47: traing loss: 3.0292389055830427e-05
Epoch 48: traing loss: 2.9661574444617145e-05
Epoch 49: traing loss: 2.9093960620230064e-05
Epoch 50: traing loss: 2.858120024029631e-05
Epoch 51: traing loss: 2.8118527552578598e-05
Epoch 52: traing loss: 2.7696401957655326e-05
Epoch 53: traing loss: 2.7316013074596412e-05
Epoch 54: traing loss: 2.6965437427861616e-05
Epoch 55: traing loss: 2.664348176040221e-05
Epoch 56: traing loss: 2.635491000546608e-05
Epoch 57: traing loss: 2.6086618163390085e-05
Epoch 58: traing loss: 2.5836210625129752e

You can see that our loss is decreasing. Let's check the predictions of our model now and see if they are close to our original y, which was all 1s.



In [167]:
# See how our model performs on the training data
y_pred = model(x)
y_pred

tensor([[1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [0.9998, 0.9997, 0.9998, 0.9999, 0.9999],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000]], grad_fn=<SigmoidBackward0>)

In [168]:
# Create test data and check how our model performs on it
x2 =  torch.ones(10, 5)*7+ torch.randn_like(y)
print(x2)
y_pred = model(x2)
print(model.parameters)
y_pred

tensor([[4.4915, 6.5473, 7.8561, 6.1052, 8.9219],
        [7.7268, 6.8075, 4.2031, 9.5671, 6.1436],
        [8.8899, 6.5117, 7.1294, 8.2141, 4.7021],
        [7.2137, 8.5314, 7.9726, 8.1570, 7.0697],
        [6.6888, 7.0549, 8.4104, 6.3693, 6.3517],
        [6.0518, 7.6293, 7.9173, 7.0912, 8.0458],
        [7.7968, 6.3989, 5.6888, 5.8699, 7.3317],
        [6.9274, 7.6337, 6.4889, 6.8725, 5.6553],
        [8.5854, 6.8114, 7.4371, 8.8274, 7.4435],
        [6.6794, 7.3597, 7.0120, 7.3108, 5.6795]])
<bound method Module.parameters of MultilayerPerceptron(
  (linear): Linear(in_features=5, out_features=3, bias=True)
  (relu): ReLU()
  (linear2): Linear(in_features=3, out_features=5, bias=True)
  (sigmoid): Sigmoid()
)>


tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]], grad_fn=<SigmoidBackward0>)

# Demo: Word Window Classification
Until this part of the notebook, we have learned the fundamentals of PyTorch and built a basic network solving a toy task. Now we will attempt to solve an example NLP task. Here are the things we will learn:

Data: Creating a Dataset of Batched Tensors
Modeling
Training
Prediction

In [169]:
# Our raw data, which consists of sentences
corpus = [
          "We always come to Paris",
          "The professor is from Australia",
          "I live in Stanford",
          "He comes from Taiwan",
          "The capital of Turkey is Ankara"
         ]

In [170]:
# The preprocessing function we will use to generate our training examples
# Our function is a simple one, we lowercase the letters
# and then tokenize the words.
def preprocess_sentence(sentence):
  return sentence.lower().split()

# Create our training set
train_sentences = [sent.lower().split() for sent in corpus]
train_sentences

[['we', 'always', 'come', 'to', 'paris'],
 ['the', 'professor', 'is', 'from', 'australia'],
 ['i', 'live', 'in', 'stanford'],
 ['he', 'comes', 'from', 'taiwan'],
 ['the', 'capital', 'of', 'turkey', 'is', 'ankara']]

In [171]:
# Set of locations that appear in our corpus
locations = set(["australia", "ankara", "paris", "stanford", "taiwan", "turkey"])

# Our train labels
train_labels = [[1 if word in locations else 0 for word in sent] for sent in train_sentences]
train_labels

[[0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1, 0, 1]]

In [172]:
# Find all the unique words in our corpus 
vocabulary = set(w for s in train_sentences for w in s)
vocabulary

{'always',
 'ankara',
 'australia',
 'capital',
 'come',
 'comes',
 'from',
 'he',
 'i',
 'in',
 'is',
 'live',
 'of',
 'paris',
 'professor',
 'stanford',
 'taiwan',
 'the',
 'to',
 'turkey',
 'we'}

In [173]:
# Add the unknown token to our vocabulary
vocabulary.add("<unk>")

In [174]:

# Add the <pad> token to our vocabulary
vocabulary.add("<pad>")

# Function that pads the given sentence
# We are introducing this function here as an example
# We will be utilizing it later in the tutorial
def pad_window(sentence, window_size, pad_token="<pad>"):
  window = [pad_token] * window_size
  return window + sentence + window

# Show padding example
window_size = 2
pad_window(train_sentences[0], window_size=window_size)

['<pad>', '<pad>', 'we', 'always', 'come', 'to', 'paris', '<pad>', '<pad>']

In [175]:
# We are just converting our vocabularly to a list to be able to index into it
# Sorting is not necessary, we sort to show an ordered word_to_ind dictionary
# That being said, we will see that having the index for the padding token
# be 0 is convenient as some PyTorch functions use it as a default value
# such as nn.utils.rnn.pad_sequence, which we will cover in a bit
ix_to_word = sorted(list(vocabulary))

# Creating a dictionary to find the index of a given word
word_to_ix = {word: ind for ind, word in enumerate(ix_to_word)}
word_to_ix

{'<pad>': 0,
 '<unk>': 1,
 'always': 2,
 'ankara': 3,
 'australia': 4,
 'capital': 5,
 'come': 6,
 'comes': 7,
 'from': 8,
 'he': 9,
 'i': 10,
 'in': 11,
 'is': 12,
 'live': 13,
 'of': 14,
 'paris': 15,
 'professor': 16,
 'stanford': 17,
 'taiwan': 18,
 'the': 19,
 'to': 20,
 'turkey': 21,
 'we': 22}

In [176]:
# Given a sentence of tokens, return the corresponding indices
def convert_token_to_indices(sentence, word_to_ix):
  indices = []
  for token in sentence:
    # Check if the token is in our vocabularly. If it is, get it's index. 
    # If not, get the index for the unknown token.
    if token in word_to_ix:
      index = word_to_ix[token]
    else:
      index = word_to_ix["<unk>"]
    indices.append(index)
  return indices

# More compact version of the same function
def _convert_token_to_indices(sentence, word_to_ix):
  return [word_to_ind.get(token, word_to_ix["<unk>"]) for token in sentence]

# Show an example
example_sentence = ["we", "always", "come", "to", "kuwait"]
example_indices = convert_token_to_indices(example_sentence, word_to_ix)
restored_example = [ix_to_word[ind] for ind in example_indices]

print(f"Original sentence is: {example_sentence}")
print(f"Going from words to indices: {example_indices}")
print(f"Going from indices to words: {restored_example}")

Original sentence is: ['we', 'always', 'come', 'to', 'kuwait']
Going from words to indices: [22, 2, 6, 20, 1]
Going from indices to words: ['we', 'always', 'come', 'to', '<unk>']


In [177]:
# Converting our sentences to indices
example_padded_indices = [convert_token_to_indices(s, word_to_ix) for s in train_sentences]
example_padded_indices

[[22, 2, 6, 20, 15],
 [19, 16, 12, 8, 4],
 [10, 13, 11, 17],
 [9, 7, 8, 18],
 [19, 5, 14, 21, 12, 3]]

In [178]:
# Creating an embedding table for our words
embedding_dim = 5
embeds = nn.Embedding(len(vocabulary), embedding_dim)

# Printing the parameters in our embedding table
list(embeds.parameters())

[Parameter containing:
 tensor([[ 0.4321, -0.4427, -1.3339, -1.6981,  0.3264],
         [-0.6849,  1.0373,  0.5621,  0.1715,  0.0448],
         [-0.2237,  0.2622,  0.4394, -0.8891, -0.1788],
         [-1.2526, -0.6405,  0.8085, -0.1376, -1.9072],
         [ 1.4887,  0.8991, -0.3989, -1.2887, -0.2206],
         [ 1.9466, -1.8456,  0.3524,  0.3897,  1.2697],
         [ 1.2533,  0.5538,  0.2538, -0.4683,  0.3732],
         [ 2.6256, -0.7374, -0.3677, -0.0181,  0.4589],
         [ 0.4490, -0.1230, -0.7963,  0.5316, -0.0714],
         [ 1.0654, -0.1746,  0.8054, -1.0272, -1.6092],
         [-1.3187,  0.5855, -1.0503,  1.5619,  0.3118],
         [-0.4223, -0.6546,  0.9146,  1.7235, -0.5915],
         [-1.7244,  0.7667,  0.0973, -0.0636,  1.3586],
         [ 1.1849, -0.4574, -1.0163,  0.4566, -0.1258],
         [ 0.3742, -0.5091, -0.1079, -2.3596,  1.8711],
         [-1.1342,  0.5262,  1.7196, -0.1474, -0.8347],
         [ 0.3451, -0.1447,  0.4374,  0.0246, -1.1759],
         [ 0.9046, -0.345

In [179]:

# Get the embedding for the word Paris
index = word_to_ix["paris"]
index_tensor = torch.tensor(index, dtype=torch.long)
paris_embed = embeds(index_tensor)
paris_embed

tensor([-1.1342,  0.5262,  1.7196, -0.1474, -0.8347],
       grad_fn=<EmbeddingBackward0>)

In [180]:


# We can also get multiple embeddings at once
index_paris = word_to_ix["paris"]
index_ankara = word_to_ix["ankara"]
indices = [index_paris, index_ankara]
indices_tensor = torch.tensor(indices, dtype=torch.long)
embeddings = embeds(indices_tensor)
embeddings

tensor([[-1.1342,  0.5262,  1.7196, -0.1474, -0.8347],
        [-1.2526, -0.6405,  0.8085, -0.1376, -1.9072]],
       grad_fn=<EmbeddingBackward0>)

In [181]:
from torch.utils.data import DataLoader
from functools import partial

def custom_collate_fn(batch, window_size, word_to_ix):
  # Break our batch into the training examples (x) and labels (y)
  # We are turning our x and y into tensors because nn.utils.rnn.pad_sequence
  # method expects tensors. This is also useful since our model will be
  # expecting tensor inputs. 
  x, y = zip(*batch)

  # Now we need to window pad our training examples. We have already defined a 
  # function to handle window padding. We are including it here again so that
  # everything is in one place.
  def pad_window(sentence, window_size, pad_token="<pad>"):
    window = [pad_token] * window_size
    return window + sentence + window

  # Pad the train examples.
  x = [pad_window(s, window_size=window_size) for s in x]

  # Now we need to turn words in our training examples to indices. We are
  # copying the function defined earlier for the same reason as above.
  def convert_tokens_to_indices(sentence, word_to_ix):
    return [word_to_ix.get(token, word_to_ix["<unk>"]) for token in sentence]

  # Convert the train examples into indices.
  x = [convert_tokens_to_indices(s, word_to_ix) for s in x]

  # We will now pad the examples so that the lengths of all the example in 
  # one batch are the same, making it possible to do matrix operations. 
  # We set the batch_first parameter to True so that the returned matrix has 
  # the batch as the first dimension.
  pad_token_ix = word_to_ix["<pad>"]

  # pad_sequence function expects the input to be a tensor, so we turn x into one
  x = [torch.LongTensor(x_i) for x_i in x]
  x_padded = nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=pad_token_ix)

  # We will also pad the labels. Before doing so, we will record the number 
  # of labels so that we know how many words existed in each example. 
  lengths = [len(label) for label in y]
  lenghts = torch.LongTensor(lengths)

  y = [torch.LongTensor(y_i) for y_i in y]
  y_padded = nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=0)

  # We are now ready to return our variables. The order we return our variables
  # here will match the order we read them in our tr
    # We are now ready to return our variables. The order we return our variables
  # here will match the order we read them in our training loop.
  return x_padded, y_padded, lenghts  

In [182]:
def _custom_collate_fn(batch, window_size, word_to_ix):
  # Prepare the datapoints
  x, y = zip(*batch)  
  x = [pad_window(s, window_size=window_size) for s in x]
  x = [convert_tokens_to_indices(s, word_to_ix) for s in x]

  # Pad x so that all the examples in the batch have the same size
  pad_token_ix = word_to_ix["<pad>"]
  x = [torch.LongTensor(x_i) for x_i in x]
  x_padded = nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=pad_token_ix)

  # Pad y and record the length
  lengths = [len(label) for label in y]
  lenghts = torch.LongTensor(lengths)
  y = [torch.LongTensor(y_i) for y_i in y]
  y_padded = nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=0)

  return x_padded, y_padded, lenghts  

In [183]:
# Parameters to be passed to the DataLoader
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
collate_fn = partial(custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)

# Instantiate the DataLoader
loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

# Go through one loop
counter = 0
for batched_x, batched_y, batched_lengths in loader:
  print(f"Iteration {counter}")
  print("Batched Input:")
  print(batched_x)
  print("Batched Labels:")
  print(batched_y)
  print("Batched Lengths:")
  print(batched_lengths)
  print("")
  counter += 1

Iteration 0
Batched Input:
tensor([[ 0,  0, 22,  2,  6, 20, 15,  0,  0],
        [ 0,  0,  9,  7,  8, 18,  0,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 0, 1],
        [0, 0, 0, 1, 0]])
Batched Lengths:
tensor([5, 4])

Iteration 1
Batched Input:
tensor([[ 0,  0, 10, 13, 11, 17,  0,  0,  0],
        [ 0,  0, 19, 16, 12,  8,  4,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 1, 0],
        [0, 0, 0, 0, 1]])
Batched Lengths:
tensor([4, 5])

Iteration 2
Batched Input:
tensor([[ 0,  0, 19,  5, 14, 21, 12,  3,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 1, 0, 1]])
Batched Lengths:
tensor([6])



In [184]:
# Print the original tensor
print(f"Original Tensor: ")
print(batched_x)
print("")

# Create the 2 * 2 + 1 chunks
chunk = batched_x.unfold(1, window_size*2 + 1, 1)
print(f"Windows: ")
print(chunk)

Original Tensor: 
tensor([[ 0,  0, 19,  5, 14, 21, 12,  3,  0,  0]])

Windows: 
tensor([[[ 0,  0, 19,  5, 14],
         [ 0, 19,  5, 14, 21],
         [19,  5, 14, 21, 12],
         [ 5, 14, 21, 12,  3],
         [14, 21, 12,  3,  0],
         [21, 12,  3,  0,  0]]])


# Model

In [185]:
class WordWindowClassifier(nn.Module):

  def __init__(self, hyperparameters, vocab_size, pad_ix=0):
    super(WordWindowClassifier, self).__init__()
    
    """ Instance variables """
    self.window_size = hyperparameters["window_size"]
    self.embed_dim = hyperparameters["embed_dim"]
    self.hidden_dim = hyperparameters["hidden_dim"]
    self.freeze_embeddings = hyperparameters["freeze_embeddings"]

    """ Embedding Layer 
    Takes in a tensor containing embedding indices, and returns the 
    corresponding embeddings. The output is of dim 
    (number_of_indices * embedding_dim).

    If freeze_embeddings is True, set the embedding layer parameters to be
    non-trainable. This is useful if we only want the parameters other than the
    embeddings parameters to change. 

    """
    self.embeds = nn.Embedding(vocab_size, self.embed_dim, padding_idx=pad_ix)
    if self.freeze_embeddings:
      self.embed_layer.weight.requires_grad = False

    """ Hidden Layer
    """
    full_window_size = 2 * window_size + 1
    self.hidden_layer = nn.Sequential(
      nn.Linear(full_window_size * self.embed_dim, self.hidden_dim), 
      nn.Tanh()
    )

    """ Output Layer
    """
    self.output_layer = nn.Linear(self.hidden_dim, 1)

    """ Probabilities 
    """
    self.probabilities = nn.Sigmoid()

  def forward(self, inputs):
    """
    Let B:= batch_size
        L:= window-padded sentence length
        D:= self.embed_dim
        S:= self.window_size
        H:= self.hidden_dim
        
    inputs: a (B, L) tensor of token indices
    """
    B, L = inputs.size()

    """
    Reshaping.
    Takes in a (B, L) LongTensor
    Outputs a (B, L~, S) LongTensor
    """
    # Fist, get our word windows for each word in our input.
    token_windows = inputs.unfold(1, 2 * self.window_size + 1, 1)
    _, adjusted_length, _ = token_windows.size()

    # Good idea to do internal tensor-size sanity checks, at the least in comments!
    assert token_windows.size() == (B, adjusted_length, 2 * self.window_size + 1)

    """
    Embedding.
    Takes in a torch.LongTensor of size (B, L~, S) 
    Outputs a (B, L~, S, D) FloatTensor.
    """
    embedded_windows = self.embeds(token_windows)

    """
    Reshaping.
    Takes in a (B, L~, S, D) FloatTensor.
    Resizes it into a (B, L~, S*D) FloatTensor.
    -1 argument "infers" what the last dimension should be based on leftover axes.
    """
    embedded_windows = embedded_windows.view(B, adjusted_length, -1)

    """
    Layer 1.
    Takes in a (B, L~, S*D) FloatTensor.
    Resizes it into a (B, L~, H) FloatTensor
    """
    layer_1 = self.hidden_layer(embedded_windows)

    """
    Layer 2
    Takes in a (B, L~, H) FloatTensor.
    Resizes it into a (B, L~, 1) FloatTensor.
    """
    output = self.output_layer(layer_1)

    """
    Softmax.
    Takes in a (B, L~, 1) FloatTensor of unnormalized class scores.
    Outputs a (B, L~, 1) FloatTensor of (log-)normalized class scores.
    """
    output = self.probabilities(output)
    output = output.view(B, -1)

    return output

# Training


In [186]:
# Prepare the data
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
collate_fn = partial(custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)

# Instantiate a DataLoader
loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

# Initialize a model
# It is useful to put all the model hyperparameters in a dictionary
model_hyperparameters = {
    "batch_size": 4,
    "window_size": 2,
    "embed_dim": 25,
    "hidden_dim": 25,
    "freeze_embeddings": False,
}

vocab_size = len(word_to_ix)
model = WordWindowClassifier(model_hyperparameters, vocab_size)

# Define an optimizer
learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Define a loss function, which computes to binary cross entropy loss
def loss_function(batch_outputs, batch_labels, batch_lengths):   
    # Calculate the loss for the whole batch
    bceloss = nn.BCELoss()
    loss = bceloss(batch_outputs, batch_labels.float())

    # Rescale the loss. Remember that we have used lengths to store the 
    # number of words in each training example
    loss = loss / batch_lengths.sum().float()

    return loss

In [187]:
# Function that will be called in every epoch
def train_epoch(loss_function, optimizer, model, loader):
  
  # Keep track of the total loss for the batch
  total_loss = 0
  for batch_inputs, batch_labels, batch_lengths in loader:
    # Clear the gradients
    optimizer.zero_grad()
    # Run a forward pass
    outputs = model.forward(batch_inputs)
    # Compute the batch loss
    loss = loss_function(outputs, batch_labels, batch_lengths)
    # Calculate the gradients
    loss.backward()
    # Update the parameteres
    optimizer.step()
    total_loss += loss.item()

  return total_loss


# Function containing our main training loop
def train(loss_function, optimizer, model, loader, num_epochs=10000):

  # Iterate through each epoch and call our train_epoch function
  for epoch in range(num_epochs):
    epoch_loss = train_epoch(loss_function, optimizer, model, loader)
    if epoch % 100 == 0: print(epoch_loss)

In [188]:
num_epochs = 1000
train(loss_function, optimizer, model, loader, num_epochs=num_epochs)

0.291179895401001
0.22550997510552406
0.18485084548592567
0.15565814450383186
0.11372429132461548
0.0899219885468483
0.07194405421614647
0.06870515085756779
0.04596986621618271
0.047092363238334656


In [189]:
# Create test sentences
test_corpus = ["She comes from Paris"]
test_sentences = [s.lower().split() for s in test_corpus]
test_labels = [[0, 0, 0, 1]]

# Create a test loader
test_data = list(zip(test_sentences, test_labels))
batch_size = 1
shuffle = False
window_size = 2
collate_fn = partial(custom_collate_fn, window_size=2, word_to_ix=word_to_ix)
test_loader = torch.utils.data.DataLoader(test_data, 
                                           batch_size=1, 
                                           shuffle=False, 
                                           collate_fn=collate_fn)

In [190]:
for test_instance, labels, _ in test_loader:
  outputs = model.forward(test_instance)
  print(labels)
  print(outputs)


tensor([[0, 0, 0, 1]])
tensor([[0.0434, 0.0189, 0.2205, 0.9050]], grad_fn=<ViewBackward0>)
