In [1]:
import torch
import torch.nn as nn

import pprint
pp = pprint.PrettyPrinter()

# Using Tensors in PyTorch

# Initializing tensors
Below are demonstrations of the different ways tensors can be initialized in PyTorch:
### 1 - From a list


In [2]:
arr = [range(10)]
arr_t = torch.tensor(arr)
arr_t

# Typecasting tensors
# Way 1: use dtype argument in torch.tensor()
# Important dtypes = torch.float, torch.bool, torch.long
arr_t_float = torch.tensor(arr, dtype = torch.float)
arr_t_float
# Way 2: use <tensor name>.<data type>() method of an existing tensor
arr_t.float()
# Way 3: using the torch.Tensor() function (with capital T), which by default instantiates a tensor of type float.
# torch.FloatTensor() can be used to achieve same functionality as torch.Tensor()
# torch.LongTensor() can be used to initialize a tensor of type 64-bit int
arr_t_long = torch.LongTensor(arr)
arr_t_long
arr_t_long.shape

torch.Size([1, 10])

### 2 - From a NumPy array

In [3]:
import numpy as np

# Can convert a numpy array to a tensor using the torch.from_numpy() function
arr_np = np.array(arr)
arr_np
arr_t_np = torch.from_numpy(arr_np)
arr_t_np

tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])

### 3 - From an existing tensor

In [4]:
# Can use 1 of 4 functions to initialize a tensor from an exisitng tensor
# The torch.zeros_like(<Existing Tensor>) function initializes a tensor of 0s with the same shape and device as the existing tensor
t_zeros = torch.zeros_like(arr_t)
t_zeros

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [5]:
# The torch.ones_like(<Existing Tensor>) function initializes a tensor of 1s with the same shape and device as the existing tensor
t_ones = torch.ones_like(arr_t)
t_ones

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [6]:
# The torch.rand_like(<Existing Tensor>) function initializes a tensor with values sampled from a uniform distribution bounded by 0 and 1 having the same shape and device as the existing tensor
t_rand = torch.rand_like(arr_t.float())
t_rand

tensor([[0.2834, 0.9870, 0.0276, 0.2358, 0.9876, 0.3671, 0.1337, 0.7062, 0.9826,
         0.8990]])

In [7]:
# The torch.randn_like(<Existing Tensor>) function initializes a tensor with values sampled from the normal distribution having the same shape and device as the existing tensor
t_randn = torch.randn_like(arr_t.float())
t_randn

# Important note: torch.rand_like() and torch.randn_like() work only for existing tensors of type float

tensor([[ 1.0621, -1.2848, -0.8471, -0.3908,  1.6612,  1.4181,  0.1920,  0.2399,
          0.7167, -0.2970]])

### 4 - By specifiying tensor shape (dimensions)

In [8]:
# Can use 1 of 4 functions, each having names similar to the 4 in the above section, with the exception of the 'like' suffix
# torch.zeros(<shape>) can be used to initialize a tensor of 0s of dimensions specified shape argument
shape = (4, 3, 2)
torch.zeros(shape)

tensor([[[0., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.]]])

In [9]:
# torch.ones() can be used to initialize tensor of 1s
torch.ones(shape)

tensor([[[1., 1.],
         [1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.],
         [1., 1.]]])

In [10]:
# torch.rand() can be used to initialize tensor of values sampled from uniform distirbution between 0 and 1
torch.rand(shape)

tensor([[[0.6988, 0.4774],
         [0.6486, 0.8024],
         [0.4054, 0.3805]],

        [[0.1979, 0.1019],
         [0.1972, 0.2979],
         [0.3218, 0.7185]],

        [[0.7333, 0.3482],
         [0.2904, 0.8147],
         [0.9556, 0.6868]],

        [[0.5627, 0.4832],
         [0.2455, 0.9455],
         [0.9203, 0.0527]]])

In [11]:
# torch.randn() can be used to initialize tensor of values sampled from the normal distribution
torch.randn(shape)

tensor([[[ 0.3745, -0.7559],
         [ 1.0815, -0.0441],
         [ 0.1290,  0.5564]],

        [[ 0.1326,  2.1671],
         [ 0.9182,  0.6388],
         [ 0.1839,  0.5316]],

        [[ 0.9678,  1.1021],
         [-1.6087,  0.7781],
         [ 0.9970, -2.2567]],

        [[ 0.2437, -0.5370],
         [-1.3689,  0.7581],
         [-0.6686, -1.4012]]])

### 5 - Using torch.arange()

In [12]:
# Similar to how we use the range() function to initialize a list, can use 'start' and 'step' arguments to specify list values
# Using torch.arange() without start and step args
torch.arange(10)

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [13]:
# Using torch.arange() with start and end arg
torch.arange(start = 5, end = 10)

tensor([5, 6, 7, 8, 9])

In [14]:
# Using torch.arange() with step arg
torch.arange(5, 10, step=2)

tensor([5, 7, 9])

### Reshaping Tensors

In [15]:
# We can reshape tensors using the view() method. View takes in a set of dimensions the product of whom must be equal to the product of the dimensions of the current vector
# We can ask view() to infer a dimension by specifying a -1 character in the tensor
data = [range(10)]
data_t = torch.tensor(data)
data_t.view(5, -1)

tensor([[0, 1],
        [2, 3],
        [4, 5],
        [6, 7],
        [8, 9]])

In [16]:
# Another way to reshape vectors is using the reshape() method
# Reshape also takes a set of dimensions as input
# unsqueeze() method is used to add a dimension that has only 1 element in it and takes an index as argument which defines the index at which the new dimension is to be added
# squeeze() method is used to eliminate a dimension that has only 1 element in it
data_reshape = data_t.reshape(2, 5)
data_unsqueeze = data_reshape.unsqueeze(1)
data_squeeze = data_unsqueeze.squeeze(1)
data_squeeze

tensor([[0, 1, 2, 3, 4],
        [5, 6, 7, 8, 9]])

### Extracting Tensor information

In [17]:
# Extracting the data type
data_t.dtype

torch.int64

In [18]:
# Extracting the dimensions
data_t.shape

torch.Size([1, 10])

In [19]:
# Extracting the number of elements
data_t.numel()

10

In [20]:
# Extracting the number of dimensions
data_t.ndim

2

In [21]:
# Extracting where the tensor in stored, cpu or gpu
data_t.device

# To move a tensor to the GPU
if torch.cuda.is_available():
  data_t.to('cuda')

### Indexing a tensor

In [22]:
# The symbol ':' is used to say that all the elements of the specified dimension are to be selected
# Indexing can also be done using lists
new_tensor = torch.tensor([range(20)])
new_tensor = new_tensor.view(2, 5, 2)
new_tensor

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5],
         [ 6,  7],
         [ 8,  9]],

        [[10, 11],
         [12, 13],
         [14, 15],
         [16, 17],
         [18, 19]]])

In [23]:
# The following code takes the 0th row of the tensor, the 1st, 2nd, and 3rd columns of that row, and the 1st elements of the individual lists at each column
i = 0
j = [1, 2, 3]
k = 1
new_tensor[i, j, k]
# Example of using ':'
# The following code will take all rows and access the lists at the 1st, 2nd, and 3rd columns of each row, and the 1st element of each individual list
new_tensor[:, j, k]

tensor([[ 3,  5,  7],
        [13, 15, 17]])

In [24]:
# To convert a tensor element to a scalar, use the item attribute
new_tensor[1, 2, 1].item()

15

### Performing arithmetic operations on tensors
Tensors work very similarly to matrices when performing arithmetic operations. Addition, subtraction, multiplication, divison is performed element wise when performing them with a single number

In [25]:
new_tensor = torch.tensor(range(10))
# Performing addition
new_tensor + 5

tensor([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [26]:
# Performing subtraction
new_tensor - 3

tensor([-3, -2, -1,  0,  1,  2,  3,  4,  5,  6])

In [27]:
# Performing multiplication
new_tensor * 5

tensor([ 0,  5, 10, 15, 20, 25, 30, 35, 40, 45])

In [28]:
# Performing division
(new_tensor / 5)

tensor([0.0000, 0.2000, 0.4000, 0.6000, 0.8000, 1.0000, 1.2000, 1.4000, 1.6000,
        1.8000])

Operations between two tensors is performed similarly to how it is performed between two matrices. Specifically, matrix multiplication is performed in the following way: 

Given two tensors of the following dimensions:

rows1 x columns1

rows2 x columns2

columns1 and rows2 need to be equivalent if the tensors are to be multiplied and the resulting tensor will have dimensions rows1 * columns2

In [29]:
# Initializing a tensor of dimensions 3 x 2
tensor_1 = torch.tensor(range(6)).view(3, 2)
tensor_1

tensor([[0, 1],
        [2, 3],
        [4, 5]])

In [30]:
# Initializing another tensor of dimensions 2 x 4
tensor_2 = torch.tensor(range(8)).view(2, 4)
tensor_2

tensor([[0, 1, 2, 3],
        [4, 5, 6, 7]])

In [31]:
# Matrix multiplication can be performed using the '@' symbol or using the matmul() method
tensor_1 @ tensor_2

tensor([[ 4,  5,  6,  7],
        [12, 17, 22, 27],
        [20, 29, 38, 47]])

In [32]:
# Matrix multiplication can be performed using the '@' symbol or using the matmul() method
tensor_1.matmul(tensor_2)

tensor([[ 4,  5,  6,  7],
        [12, 17, 22, 27],
        [20, 29, 38, 47]])

In [33]:
# To transpose a tensor, use the T attribute
tensor_1.T

tensor([[0, 2, 4],
        [1, 3, 5]])

In [34]:
tensor_1 = torch.tensor(range(20)).view(2, 5, 2)
tensor_1

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5],
         [ 6,  7],
         [ 8,  9]],

        [[10, 11],
         [12, 13],
         [14, 15],
         [16, 17],
         [18, 19]]])

In [35]:
# The mean across the dimensions of a tensor can be found using the mean() method that takes as parameter the dimension across which the mean is to be commputed
# mean() method only works on tensors of dtype float
# Mean across the 0th dimension. As the tensor is 2 x 5 x 2, mean across the 0th dimension gives the mean 5 x 2 tensor
tensor_1.float().mean(0)

tensor([[ 5.,  6.],
        [ 7.,  8.],
        [ 9., 10.],
        [11., 12.],
        [13., 14.]])

In [36]:
# Mean across 1st dimension, provides mean 2 x 2 tensor
tensor_1.float().mean(1)

tensor([[ 4.,  5.],
        [14., 15.]])

In [37]:
# Mean across the 2nd dimension, provides mean 2 x 5 tensor
tensor_1.float().mean(2)

tensor([[ 0.5000,  2.5000,  4.5000,  6.5000,  8.5000],
        [10.5000, 12.5000, 14.5000, 16.5000, 18.5000]])

In [38]:
# Standard Deviation can also be calculated across dimensions of tensors of dtype float
# Standard deviation across the 0th dimension
tensor_1.float().std(0)

tensor([[7.0711, 7.0711],
        [7.0711, 7.0711],
        [7.0711, 7.0711],
        [7.0711, 7.0711],
        [7.0711, 7.0711]])

In [39]:
# Concatenation (joining) of 2 or more tensors is done using the torch.cat() function that takes the following arguments:
# 1 - A list of tensors to be concatenated
# 2 - The dimension across which the concatenation is supposed to happen
# For example, if the dim argument is set to 0 (default), the tensors will be concatenated vertically, rows of each individual tensor will be added
# If the dim argument is set to 1, the tensors will be concatenated horizontally, columns of each individual tensor will be added
tensor_1 = torch.arange(10).view(5, 2)
tensor_2 = torch.arange(10, 20).view(5, 2)
tensor_cat_0 = torch.cat([tensor_1, tensor_2], dim = 0)
tensor_cat_1 = torch.cat([tensor_1, tensor_2], dim = 1)

In [40]:
# In-place operations in PyTorch are those that modify the tensor that invokes them and are denoted by an underscore (_) as the suffix to the method name
# A normal addition operation (not in-place)
tensor_1 + tensor_2

tensor([[10, 12],
        [14, 16],
        [18, 20],
        [22, 24],
        [26, 28]])

In [41]:
# An In-place operation, tensor_1 stores the result of tensor_1 + tensor_2
tensor_1.add_(tensor_2)
tensor_1

tensor([[10, 12],
        [14, 16],
        [18, 20],
        [22, 24],
        [26, 28]])

### Calculating gradients

In [42]:
# PyTorch provides an autograd feature that is used to automatically perform backpropagation and calculate gradients
# There are 3 important steps to do this
# 1 - Set the requires_grad attribute to True when declaring tensor using torch.tensor()
# 2 - Call the backward() method on the tensor that was computed using the tensor that has the requires_grad attribute set to True
# 3 - Access the grad attribute of the original tensor to get the gradient
# Calculating the gradient of y = x^3 for x = {1, 2, 3, 5}
# Derivative of x^3 is 3x^2 so gradients should be {3, 12, 27, 75}
xs = torch.tensor([1., 2., 3., 5.], requires_grad=True)
grads = []
for x in xs:
  y = x * x * x
  y.backward()
xs.grad

tensor([ 3., 12., 27., 75.])

In [43]:
# One problem of backward() is that it adds up the gradients as we perform subsequent operations
# Therefore, the zero_grad() method is to be invoked after each arithmetic operation to prevent gradients from exploding
y = xs[0] * xs[0] * xs[0]
y.backward()
# Actual gradient should be 3, same as above, but computed gradient will be 6 because previous gradient is not discarded
xs.grad

tensor([ 6., 12., 27., 75.])

# Building Neural Networks
## Training Linear Layers using Tensors

In [44]:
# Importing PyTorch's Neural Network module torch.nn with an alias
import torch.nn as nn
# In order to train a linear layer, we use the nn.Linear() method that takes the following arguments:
# 1 - H_in : This represents the number of features per sample (per row) in the input matrix
# 2 - H_out : This represents the number of features per sample (per row) of the output matrix
# The input matrix will have shape N x * x H_in, where * represents any arbitrary number of dimensions in between
# The output matrix will have shape N x * x H_out, where * represents the same arbitrary number of dimensions in the input matrix
# The Linear Layer performs the simple operation Ax + b, where:
# 1 - A is the learnable weight matrix of shape H_out x H_in 
# 2 - x is the input matrix of shape N x * x H_in
# 3 - b is the learnable bias matrix having a single dimension H_out

# Initializing input matrix x
x = torch.randn(5, 2, 4)
# Initializing Linear Layer with H_in = 4, H_out = 2
linear = nn.Linear(4, 2)
# Applying Linear Layer to x
linear_output = linear(x)
# Printing the output matrix of shape 5 x 2 x 2
linear_output

tensor([[[-0.0082,  0.5932],
         [-0.0449, -0.3718]],

        [[ 0.3209, -0.3980],
         [-0.9270,  0.7147]],

        [[ 0.3592, -0.4881],
         [-0.3677,  0.4989]],

        [[-0.3953,  0.0140],
         [ 0.4554, -0.4031]],

        [[-0.7613,  0.3426],
         [-0.8563,  0.1040]]], grad_fn=<AddBackward0>)

In [45]:
# Printing the learnable weight matrix A of shape 2 x 4
linear.weight

Parameter containing:
tensor([[-0.1780,  0.4519,  0.0389, -0.3753],
        [ 0.3373, -0.4345,  0.0399,  0.0989]], requires_grad=True)

In [46]:
# Printing the learnable bias matrix b of length H_out (2)
linear.bias

Parameter containing:
tensor([-0.0171, -0.0724], requires_grad=True)

## Adding Activation Function Layers

In [47]:
# Activation functions are functions that add a non-linearity between 2 linear layers
# Examples of activation functions are nn.Sigmoid(), nn.ReLU(), nn.LeakyReLU()
# Applying each non-linearity to output matrix of trained linear layer
sigmoid = nn.Sigmoid()
ReLU = nn.ReLU()
LReLU = nn.LeakyReLU()
# Printing the output of applying the sigmoid non-linearity
sigmoid(linear_output)

tensor([[[0.4979, 0.6441],
         [0.4888, 0.4081]],

        [[0.5795, 0.4018],
         [0.2835, 0.6714]],

        [[0.5888, 0.3803],
         [0.4091, 0.6222]],

        [[0.4024, 0.5035],
         [0.6119, 0.4006]],

        [[0.3184, 0.5848],
         [0.2981, 0.5260]]], grad_fn=<SigmoidBackward>)

In [48]:
# Printing the output of applying the ReLU non-linearity
ReLU(linear_output)

tensor([[[0.0000, 0.5932],
         [0.0000, 0.0000]],

        [[0.3209, 0.0000],
         [0.0000, 0.7147]],

        [[0.3592, 0.0000],
         [0.0000, 0.4989]],

        [[0.0000, 0.0140],
         [0.4554, 0.0000]],

        [[0.0000, 0.3426],
         [0.0000, 0.1040]]], grad_fn=<ReluBackward0>)

In [49]:
# Printing the output of applying the LeakyReLU non-linearity
LReLU(linear_output)

tensor([[[-8.2463e-05,  5.9324e-01],
         [-4.4878e-04, -3.7175e-03]],

        [[ 3.2092e-01, -3.9800e-03],
         [-9.2695e-03,  7.1467e-01]],

        [[ 3.5920e-01, -4.8807e-03],
         [-3.6769e-03,  4.9894e-01]],

        [[-3.9533e-03,  1.4048e-02],
         [ 4.5536e-01, -4.0311e-03]],

        [[-7.6128e-03,  3.4256e-01],
         [-8.5634e-03,  1.0403e-01]]], grad_fn=<LeakyReluBackward0>)

In [50]:
# Combining layers can be done in a handy way through nn.Sequential(). This method takes in all the different layer configruations and applies them
# sequentially on the input data. For example, the above code could be re-written as the following to achieve the same output:
x = torch.randn(5, 2, 4)
block = nn.Sequential(
    nn.Linear(4, 2),
    nn.Sigmoid()
)
linear_output = block(x)
linear_output

tensor([[[0.5199, 0.8095],
         [0.5599, 0.5442]],

        [[0.5945, 0.5565],
         [0.3706, 0.6706]],

        [[0.6647, 0.4774],
         [0.6655, 0.4370]],

        [[0.6675, 0.4588],
         [0.6549, 0.7620]],

        [[0.4927, 0.6063],
         [0.4508, 0.5421]]], grad_fn=<SigmoidBackward>)

## Customizing networks using nn.Module
### Building a custom network

In [51]:
# We can create our own network classes by extending the nn.Module class
# 1 - Initialize parameters in the __init__ function of the model class
# 2 - Call the __init__ function of the super class (nn.Module) in the __init__ function of the model class
class Sample2LayerModel(nn.Module):
  def __init__(self, input_size, L1_size, L2_size):
    super(Sample2LayerModel, self).__init__()
    self.input_size = input_size
    self.L1_size = L1_size
    self.L2_size = L2_size

    self.model = nn.Sequential(
        nn.Linear(input_size, L1_size),
        nn.ReLU(),
        nn.Linear(L1_size, L2_size),
        nn.Sigmoid()
    )
  def forward(self, x):
    out_layer = self.model(x)
    return out_layer

x = torch.randn(2, 4, 5)
model = Sample2LayerModel(5, 3, 2)
model(x)
list(model.parameters())

[Parameter containing:
 tensor([[ 0.0833,  0.0202,  0.3647,  0.0758,  0.3828],
         [ 0.0812,  0.0637,  0.1803, -0.2869,  0.4356],
         [-0.0465, -0.2032, -0.3033, -0.3750,  0.1760]], requires_grad=True),
 Parameter containing:
 tensor([ 0.0440, -0.0746,  0.3534], requires_grad=True),
 Parameter containing:
 tensor([[ 0.3421, -0.4712,  0.2706],
         [ 0.1778, -0.4514, -0.0487]], requires_grad=True),
 Parameter containing:
 tensor([-0.5738, -0.5398], requires_grad=True)]

### Optimizing the custom-made network

In [52]:
# Model optimization is done using the torch.optim module available in PyTorch
# Importing the optim module using an alias
import torch.optim as optim

# There are many different types of optimizers available in PyTorch, the most popular of which are optim.SGD() and optim.Adam()
# Optimizer objects are instantiated by passing the model parameters (via model.parameters()) and the learning rate as arguments
# There are 4 important steps to optimizing a model:
# 1 - Clear out all the gradients, call zero_grad() method on the optimizer
# 2 - Use the model to compute a tensor of predictions for a given input
# 3 - Compute the loss of the generated predictions (loss functions pre-built into PyTorch - nn.L1Loss(), nn.CrossEntropyLoss(), nn.MSELoss etc.)
# 4 - Perform backpropagation on the loss (call the backward() method)
# 5 - Step the weights for the next iteration (call the step() method on the optimizer)

y = torch.ones(10, 5)
x = y + torch.randn_like(y)
model = Sample2LayerModel(5, 3, 5)
Adam = optim.Adam(model.parameters(), lr=1e-1)
loss_func = nn.BCELoss()
for i in range(10):
  Adam.zero_grad()
  y_pred = model(x)
  y_pred
  loss = loss_func(y_pred, y)
  print(f"BCE Loss at Epoch # {i}: {loss}")
  loss.backward()
  Adam.step()

BCE Loss at Epoch # 0: 0.6162057518959045
BCE Loss at Epoch # 1: 0.4578053653240204
BCE Loss at Epoch # 2: 0.29428309202194214
BCE Loss at Epoch # 3: 0.16388945281505585
BCE Loss at Epoch # 4: 0.07958802580833435
BCE Loss at Epoch # 5: 0.03495621308684349
BCE Loss at Epoch # 6: 0.01467656996101141
BCE Loss at Epoch # 7: 0.006173713598400354
BCE Loss at Epoch # 8: 0.002687691478058696
BCE Loss at Epoch # 9: 0.0012284605763852596


In [53]:
x = y + torch.randn_like(y)
y_preds = model(x)
loss_func(y_preds, y)

tensor(7.5957e-05, grad_fn=<BinaryCrossEntropyBackward>)

# Building an NLP word classification model

In [54]:
# In every NLP model, there are 4 main components or stages of development:
# 1 - Data - this refers to the creation of dataset of batched word embedding tensors
# 2 - Modelling
# 3 - Training
# 4 - Prediction

# Stage 1 - Building a dataset.
corpus = [
          "Jack is a boy",
          "Jill is a girl",
          "Josephine broke the window",
          "Joe stood watching",
          "Andy likes Martha",
          "Rachel and Ross",
          "Henry was a beast",
          "Robert is a cool guy"
]
# Defining a list of names to be recognized
names = ["jack", "jill", "josephine", "joe", "andy", "rachel", "henry", "robert", "ross"]

# Defining a pre-processing function that will lowercase all words and split on whitespace
def pre_process(corpus):
  return [sentence.lower().split() for sentence in corpus]

preprocessed_corpus = pre_process(corpus)
preprocessed_corpus

[['jack', 'is', 'a', 'boy'],
 ['jill', 'is', 'a', 'girl'],
 ['josephine', 'broke', 'the', 'window'],
 ['joe', 'stood', 'watching'],
 ['andy', 'likes', 'martha'],
 ['rachel', 'and', 'ross'],
 ['henry', 'was', 'a', 'beast'],
 ['robert', 'is', 'a', 'cool', 'guy']]

In [55]:
corpus_labels = [[1 if word in names else 0 for word in sentence] for sentence in preprocessed_corpus]
corpus_labels

[[1, 0, 0, 0],
 [1, 0, 0, 0],
 [1, 0, 0, 0],
 [1, 0, 0],
 [1, 0, 0],
 [1, 0, 1],
 [1, 0, 0, 0],
 [1, 0, 0, 0, 0]]

In [56]:
vocabulary = set(word for sentence in preprocessed_corpus for word in sentence)
vocabulary

{'a',
 'and',
 'andy',
 'beast',
 'boy',
 'broke',
 'cool',
 'girl',
 'guy',
 'henry',
 'is',
 'jack',
 'jill',
 'joe',
 'josephine',
 'likes',
 'martha',
 'rachel',
 'robert',
 'ross',
 'stood',
 'the',
 'was',
 'watching',
 'window'}

In [57]:
# Adding a '<pad>' token to vocabulary for later use
# Adding a '<unk>' token to vocabulary for later use
vocabulary.add("<pad>")
vocabulary.add("<unk>")

# Creating a sorted index for the vocabulary (word to index correspondance)
index = sorted(list(vocabulary))
# Creating a index to word correspondance
word_index_pair = {word: ind for ind, word in enumerate(index)}
word_index_pair

{'<pad>': 0,
 '<unk>': 1,
 'a': 2,
 'and': 3,
 'andy': 4,
 'beast': 5,
 'boy': 6,
 'broke': 7,
 'cool': 8,
 'girl': 9,
 'guy': 10,
 'henry': 11,
 'is': 12,
 'jack': 13,
 'jill': 14,
 'joe': 15,
 'josephine': 16,
 'likes': 17,
 'martha': 18,
 'rachel': 19,
 'robert': 20,
 'ross': 21,
 'stood': 22,
 'the': 23,
 'was': 24,
 'watching': 25,
 'window': 26}

In [58]:
# Defining function to padd a sentence to ensure it fits a given window size
def pad_sentence(sentence, window_size, window_token="<pad>"):
  window = [window_token] * window_size
  return window + sentence + window

# Defining a function that will take a tokenized sentence and return a list of the inidividual tokens
def token_to_index(sentence, word_to_index):
  indices = []
  for token in sentence:
    if token in word_to_index:
      indices.append(word_to_index[token])
    else:
      indices.append(word_to_index["<unk>"])
  return indices

# Defining a function that will take a list of indices and return a list of their corresponding words/tokens
def index_to_token(indices):
  sentence = []
  for ind in indices:
    sentence.append(index[ind])
  return sentence

new_sentence = ["cool", "guy"]
token_to_index(new_sentence, word_index_pair)

[8, 10]

In [59]:
# Converting entire corpus into indices
corpus_indices = [token_to_index(sentence, word_index_pair) for sentence in preprocessed_corpus]
corpus_indices

[[13, 12, 2, 6],
 [14, 12, 2, 9],
 [16, 7, 23, 26],
 [15, 22, 25],
 [4, 17, 18],
 [19, 3, 21],
 [11, 24, 2, 5],
 [20, 12, 2, 8, 10]]

In [60]:
reverted_corpus = [index_to_token(indices) for indices in corpus_indices]
reverted_corpus

[['jack', 'is', 'a', 'boy'],
 ['jill', 'is', 'a', 'girl'],
 ['josephine', 'broke', 'the', 'window'],
 ['joe', 'stood', 'watching'],
 ['andy', 'likes', 'martha'],
 ['rachel', 'and', 'ross'],
 ['henry', 'was', 'a', 'beast'],
 ['robert', 'is', 'a', 'cool', 'guy']]

In [61]:
# Once we have the indices of every word in the corpus, the next step is to create the word embeddings for each word in the corpus
# which will be stored in the form of a embedding table having shape N x E where N is the number of unique words in the corpus and E is the number of features per embedding
# To access the word embedding of a word, just index the embedding table using the index provided by the token_to_index() function
# An embedding matrix can be initialized randomly using the nn.Embedding() method which takes the arguments N and E
vocab_embeds = nn.Embedding(len(vocabulary), 4)
list(vocab_embeds.parameters())

[Parameter containing:
 tensor([[-4.9031e-01,  8.0741e-02,  2.0764e+00,  8.3294e-01],
         [-1.3380e+00,  2.9521e-01, -1.7673e-01,  2.7782e-01],
         [ 9.1266e-01,  1.6677e-01, -8.0263e-01,  7.0830e-01],
         [-1.7560e+00, -1.0903e+00, -7.2437e-01,  7.3535e-01],
         [-1.8526e+00,  1.4280e+00, -2.7353e-01, -1.2659e+00],
         [-1.1594e-01, -1.5239e+00, -1.2135e+00,  8.4046e-01],
         [ 2.5654e-01,  4.2652e-01, -5.2919e-01,  1.0842e+00],
         [ 2.2344e-01,  1.5700e+00, -1.6874e-01, -6.1901e-01],
         [-1.2203e-01, -2.0465e-01,  1.2390e+00, -1.2371e+00],
         [ 9.7892e-02, -2.0413e+00, -2.6087e-01, -1.5696e+00],
         [-4.5248e-01,  1.1316e+00, -9.9751e-03,  1.5222e+00],
         [ 1.3787e+00, -3.6119e-01,  9.3978e-01,  5.2157e-01],
         [-1.9032e+00,  1.4626e+00, -4.8857e-01,  1.8613e+00],
         [ 1.4181e+00,  8.1234e-01, -1.8199e+00, -1.6225e-03],
         [ 5.6616e-01, -1.0337e+00,  1.5482e-01, -3.2893e+00],
         [-4.2602e-01,  4.9563e-

In [62]:
# Mapping each word in the corpus to its respective embedding
sentence_embeds = []
for sentence in corpus_indices:
  sentence_embeds.append(vocab_embeds(torch.tensor(sentence, dtype=torch.long)))
sentence_embeds

[tensor([[ 1.4181e+00,  8.1234e-01, -1.8199e+00, -1.6225e-03],
         [-1.9032e+00,  1.4626e+00, -4.8857e-01,  1.8613e+00],
         [ 9.1266e-01,  1.6677e-01, -8.0263e-01,  7.0830e-01],
         [ 2.5654e-01,  4.2652e-01, -5.2919e-01,  1.0842e+00]],
        grad_fn=<EmbeddingBackward>),
 tensor([[ 0.5662, -1.0337,  0.1548, -3.2893],
         [-1.9032,  1.4626, -0.4886,  1.8613],
         [ 0.9127,  0.1668, -0.8026,  0.7083],
         [ 0.0979, -2.0413, -0.2609, -1.5696]], grad_fn=<EmbeddingBackward>),
 tensor([[ 1.7855, -1.7439,  1.2680, -1.4248],
         [ 0.2234,  1.5700, -0.1687, -0.6190],
         [ 1.4889,  0.6515,  0.5753,  1.9953],
         [-0.3303, -0.7326, -1.1886, -0.8301]], grad_fn=<EmbeddingBackward>),
 tensor([[-4.2602e-01,  4.9563e-01,  2.0227e+00,  1.1446e+00],
         [-8.5340e-01,  7.0993e-01,  2.4207e+00,  7.5106e-01],
         [-8.6584e-01, -1.2920e+00, -1.1977e+00,  1.4975e-03]],
        grad_fn=<EmbeddingBackward>),
 tensor([[-1.8526,  1.4280, -0.2735, -1.265

In [63]:
# Next, we use the DataLoader function from torch.utils.data to breakup the data into batches that can be used to learn the network
# The DataLoader function takes the following arguments:
# 1 - The data - this represents the entire dataset, features and labels combined
# 2 - Batch Size - this represents the number of examples to learn per batch. Over an epoch, all the batches will be iterated
# 3 - Shuffle - this represents whether the order of batches is to be shuffled before each sampling iteration
# 4 - collate_fn - this represents the collate function that will be passed to the DataLoader.
# A collate function is a function that is used either to print the stats of a batch or to perform further processing on it before it is passed as input to the network
# The collate function is applied to the current batch so it always takes it as an argument. Additional arguments may be added as needed

# Importing DataLoader and partial function
# Partial function is used to hard-code some of the arguments to a function
from torch.utils.data import DataLoader
from functools import partial

# Defining custom collate function
def collate_func(batch, window_size, word_to_index):
  x, y = zip(*batch)

  # Padding all sentences first so that each word can be windowed succesfully
  # And then converting each tokenized sentence to an indexed sentence
  x_padded = [pad_sentence(x_i, window_size=window_size) for x_i in x]
  x_indexed = [token_to_index(x_i, word_to_index=word_to_index) for x_i in x_padded]
  x_tensor = [torch.LongTensor(x_i) for x_i in x_indexed]
  pad_index = word_to_index["<pad>"]
  # Padding again to ensure all sentences are of same length
  x_pad_indexed = nn.utils.rnn.pad_sequence(x_tensor, batch_first = True, padding_value = pad_index)

  # Remembering the lengths of each sentence without padding to avoid inaccurate loss
  lengths = [len(y_i) for y_i in y]
  lengths_tensor = torch.LongTensor(lengths)
  y_tensor = [torch.LongTensor(y_i) for y_i in y]
  # Padding all y's to be of same length using nn.utils.rnn.pad_sequence()
  # y not padded to fit window size because it won't be windowed
  y_padded = nn.utils.rnn.pad_sequence(y_tensor, batch_first=True, padding_value = 0)

  return x_pad_indexed, y_padded, lengths_tensor


  


In [64]:
data = list(zip(preprocessed_corpus, corpus_labels))
batch_size = 2
window_size = 2
shuffle=True
f = partial(collate_func, window_size = window_size, word_to_index = word_index_pair)

loader = DataLoader(data, batch_size = batch_size, shuffle=shuffle, collate_fn = f)

for x_batch, y_batch, l_batch in loader:
  print("Input X batch: ")
  print(x_batch)
  print("Output Y labels: ")
  print(y_batch)
  print("Lengths: ")
  print(l_batch)

Input X batch: 
tensor([[ 0,  0, 16,  7, 23, 26,  0,  0],
        [ 0,  0, 11, 24,  2,  5,  0,  0]])
Output Y labels: 
tensor([[1, 0, 0, 0],
        [1, 0, 0, 0]])
Lengths: 
tensor([4, 4])
Input X batch: 
tensor([[ 0,  0, 20, 12,  2,  8, 10,  0,  0],
        [ 0,  0, 15, 22, 25,  0,  0,  0,  0]])
Output Y labels: 
tensor([[1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0]])
Lengths: 
tensor([5, 3])
Input X batch: 
tensor([[ 0,  0, 19,  3, 21,  0,  0,  0],
        [ 0,  0, 14, 12,  2,  9,  0,  0]])
Output Y labels: 
tensor([[1, 0, 1, 0],
        [1, 0, 0, 0]])
Lengths: 
tensor([3, 4])
Input X batch: 
tensor([[ 0,  0,  4, 17, 18,  0,  0,  0],
        [ 0,  0, 13, 12,  2,  6,  0,  0]])
Output Y labels: 
tensor([[1, 0, 0, 0],
        [1, 0, 0, 0]])
Lengths: 
tensor([3, 4])


In [65]:
# Now that we are succesfully able to create batches of data, we need to be able to split them into windows.
# For a sentence with the n non-padded tokens, the number of windows created will be n
# Each window will be of size 2N + 1, where N is the window size (2N because there'll be N tokens either side of the center token, and the center token itself counts as 1 so +1)
# PyTorch provides a function unfold() that performs this task and takes the following arguments:
# 1 - Dimension - this is the dimension in which the windows are to be created
# 2 - Window size - this is the window size and is 2N + 1
# 3 - Step - This is the step that is to be taken after each window is sliced
# The unfold() method can be called on any tensor using the dot operator
x_unfold = x_batch.unfold(1, 2*2 + 1, 1)
x_unfold

tensor([[[ 0,  0,  4, 17, 18],
         [ 0,  4, 17, 18,  0],
         [ 4, 17, 18,  0,  0],
         [17, 18,  0,  0,  0]],

        [[ 0,  0, 13, 12,  2],
         [ 0, 13, 12,  2,  6],
         [13, 12,  2,  6,  0],
         [12,  2,  6,  0,  0]]])

In [66]:
# We will use the unfold() function in the development of our model
# The model architecture is going to be follows:
# 1 - The first layer will be an embedding layer in which the following operations will be carried out:
#     a) Each input batch will be windowed using the unfold() function, therefore the input matrix will become of shape B x L~ x W, where B is the batch size, L~ represents the number of windows (L is the length of the padded sentence), and W represents the window size (2N + 1)
#     b) Next, each token of each window will be looked up in the embedding table and replaced by their respective embedding, therefore the input matrix will become of shape B x L~ x W x D, where B, L~, and W are the same as before and D represents the number of dimensions per embedding
# 2 - The second layer will be a linear layer called the hidden layer. This layer will perform a linear operation on the embeddings matrix and will then apply the tanh() non_linearity.
# The purpose of this layer will be to represent each window (W x D) collectively as a whole in a single dimension which will be the output dimension H_out of this layer and will be denoted with H. So, the output matrix of the hidden layer will be of shape B x L~ x H
# 3 - The third layer will be another linear layer called the output layer. This layer will perform a linear operation on the output of the hidden layer and will shrink the information carried in the Hth dimension of the hidden layer output to a single scalar (each window will be represented using a single scalar). Therefore, the output matrix of this layer will be of shape B x L~ x 1.
# 4 - The 4th and final layer will be a sigmoid non-linearity that will be added to convert the learnt window information into a probability. These probabilities will then be output by the model with a higher probability meaning that a name in that location is more likely than any other word
class WordWindowClassifier(nn.Module):
  def __init__(self, hyperparameters, vocab_size, padding_ind=0):
    super(WordWindowClassifier, self).__init__()
    self.window_size = hyperparameters["window_size"]
    self.embedding_dimensions = hyperparameters["embed_dim"]
    self.hidden_dimensions = hyperparameters["hidden_dim"]

    # Defining window width = 2N + 1
    self.window_width = 2 * window_size + 1
    # Defining a randomly initialized embedding matrix having dimensions n x D, where n is the number of words in the vocabulary and D is the number of embedding dimensions per word
    self.embeddings = nn.Embedding(vocab_size, self.embedding_dimensions, padding_idx = padding_ind)
    # Defining the hidden layer as a linear layer with input features W x D and output features H followed by a tanh() non-linearity
    self.hidden_layer = nn.Sequential(
        nn.Linear(self.window_width * self.embedding_dimensions, self.hidden_dimensions),
        nn.Tanh())
    # Defining the output layer as a linear layer with input features H and output features 1
    self.output_layer = nn.Linear(self.hidden_dimensions, 1)
    # Defining the sigmoid() non-linearity
    self.probability = nn.Sigmoid()
  def forward(self, inputs):
    # Getting the dimensions of the input - Batch Size x Length of each batch
    B, L = inputs.size()
    # Converting each token of each batch into its respective window, input matrix now has dimensions B x L~ x W
    tokenized_inputs_windowed = inputs.unfold(1, self.window_width, 1)
    # Extracting the number of windows of every batch example (extracting L~)
    _, unpadded_length, _ = tokenized_inputs_windowed.size()
    # Converting each window token into its respective embedding, input matrix now has dimensions B x L~ x W x D
    tokenized_windowed_embeddings = self.embeddings(tokenized_inputs_windowed)
    # Reshaping input matrix so that the last 2 dimensions can be merged into a single dimension, input matrix now has dimensions B x L~ x (W * D)
    tokenized_windowed_embeddings = tokenized_windowed_embeddings.view(B, unpadded_length, -1)
    # Passing the input into the hidden layer, output matrix of this layer has dimensions B x L~ x H, where H is the output features of the layer
    H_output = self.hidden_layer(tokenized_windowed_embeddings)
    # Passing the output of the hidden layer into the output layer, output matrix of this layer has dimensions B x L~ x 1
    F_output = self.output_layer(H_output)
    # Converting the output of the layer into probabilities
    output = self.probability(F_output)
    output = output.view(B, -1)
    return output

In [67]:
# Now that the modelling has finished and the final architecture implemented, the next step is to train the model
# To train the model, we will first need to use the DataLoader to generate batches of our data

# Initializing arguments to DataLoader
data = list(zip(preprocessed_corpus, corpus_labels))
shuffle = True
batch_size = 2
window_size = 2
f = partial(collate_func, window_size = window_size, word_to_index = word_index_pair)

# Initializing DataLoader object
loader = DataLoader(data, batch_size=batch_size, shuffle = shuffle, collate_fn=f)

#Initializing a dictionary of the hyperparameters of the model
hyperparameters = {
    "window_size": 2,
    "embed_dim": 25,
    "hidden_dim": 25,
}

# Initializing the model
model = WordWindowClassifier(hyperparameters, len(vocabulary), padding_ind = 0)
# Initializing the optimizer
lr = 0.01
SGD = optim.SGD(model.parameters(), lr = lr)
# Defining a custom loss function that will use nn.BCELoss(), cannot use BCELoss() directly because outputs are padded
def loss_func(preds, labels, batch_lengths):
  loss = nn.BCELoss()
  loss_val = loss(preds, labels.float())
  loss_val = loss_val / batch_lengths.sum().float()
  return loss_val

# Defining a training function to train the model for n number of epochs
def train(loader, optimizer, model, n_epochs=10000):
  for i in range(n_epochs):
    for x_batch, y_batch, lengths_batch in loader:
      optimizer.zero_grad()
      preds = model(x_batch)
      loss = loss_func(preds, y_batch, lengths_batch)
      loss.backward()
      optimizer.step()
    if i % 1000 == 0:
      print(loss.item())

train(loader, SGD, model)

0.10533452779054642
0.009202997200191021
0.0012022750452160835
0.001559594296850264
0.0012242839438840747
0.000683983729686588
0.0009183045476675034
0.0003013168752659112
0.000336874567437917
0.0005727809038944542


In [68]:
# Defining some test data
test_corpus = ["Jacky is a little kid", "John is a jock"]
test_corpus = [sentence.lower().split() for sentence in test_corpus]
test_labels = [[1, 0, 0, 0, 0], [1, 0, 0, 0]]
batch_size = 1
shuffle = True
window_size = 2
test_data = list(zip(test_corpus, test_labels))
f = partial(collate_func, window_size=window_size, word_to_index = word_index_pair)
loader = DataLoader(test_data, batch_size=batch_size, shuffle=shuffle, collate_fn = f)
for x_batch, y_batch, _ in loader:
  preds = model(x_batch)
  print(y_batch)
  print(preds)

tensor([[1, 0, 0, 0, 0]])
tensor([[0.9925, 0.0014, 0.0606, 0.0023, 0.0039]], grad_fn=<ViewBackward>)
tensor([[1, 0, 0, 0]])
tensor([[9.9250e-01, 1.3878e-03, 1.3681e-01, 7.0494e-04]],
       grad_fn=<ViewBackward>)
