References:
- https://www.youtube.com/watch?v=ZMxVe-HK174

In [1]:
import torch
import torch.nn as nn

# max number of words that can be passed into the transformer
# in reality this would be in the thousands
max_sequence_length = 10

# dimension of the embeddings (typically 512)
d_model = 6

$$
PE(\text{position}, 2i) = \sin\bigg( \frac{ \text{position} }{10000^\frac{2i}{d_{model}}} \bigg)
$$

$$
PE(\text{position}, 2i+1) = \cos\bigg( \frac{ \text{position} }{10000^\frac{2i}{d_{model}}} \bigg)
$$

We can rewrite these as

$$
PE(\text{position}, i) = \sin\bigg( \frac{ \text{position} }{10000^\frac{i}{d_{model}}} \bigg) \text{ when i is even}
$$

$$
PE(\text{position}, i) = \cos\bigg( \frac{ \text{position} }{10000^\frac{i-1}{d_{model}}} \bigg) \text{ when i is odd}
$$

In [2]:
# even embedding indices 0, 2, 4, ... (strictly below d_model), as floats
even_i = torch.arange(0, d_model, 2, dtype=torch.float)
even_i

tensor([0., 2., 4.])

In [3]:
# base of the exponential in the denominator (the 10000 in the formulas above)
n = 10000

even_denominator = n ** (even_i / d_model)
even_denominator

tensor([  1.0000,  21.5443, 464.1590])

In [4]:
# odd embedding indices 1, 3, 5, ... (strictly below d_model), as floats
odd_i = torch.arange(1, d_model, 2, dtype=torch.float)
odd_i

tensor([1., 3., 5.])

In [5]:
# reuse the base `n` defined earlier instead of repeating the literal 10000;
# subtracting 1 maps each odd index onto the even index just before it
odd_denominator = torch.pow(n, (odd_i - 1) / d_model)
odd_denominator

tensor([  1.0000,  21.5443, 464.1590])

In [6]:
"""
vector that we got for the even denominator and the
vector that we got for the odd denominator are exactly the same

you'll notice that the odd indices are one more than the even
indices and in the formulation we always subtract one from the odd indices so
they effectively just became the same thing
"""

"\nvector that we got for the even denominator and the\nvector that we got for the odd denominator are exactly the same\n\nyou'll notice that the odd indices are one more than the even\nindices and in the formulation we always subtract one from the odd indices so\nthey effectively just became the same thing\n"

In [7]:
# even_denominator and odd_denominator hold the same values (see the two
# outputs above), so a single tensor is used for both the sin and cos terms
denominator = even_denominator

In [8]:
"""
let's just determine every single position for the sequence 

we can define every position by just taking all the values from 0 to 9 (0 to max_sequence_length - 1) and

then we'll reshape it to be a two-dimensional Matrix with 
the second dimension as one and you'll get this two-dimensional Matrix 
here 

one for every word
"""
# column vector of positions 0 .. max_sequence_length - 1, shape (max_sequence_length, 1)
position = torch.arange(max_sequence_length, dtype=torch.float).unsqueeze(1)
position

tensor([[0.],
        [1.],
        [2.],
        [3.],
        [4.],
        [5.],
        [6.],
        [7.],
        [8.],
        [9.]])

In [9]:
"""
for even cases we're going to take sine and 
for odd instances we're going to take the cosine
"""
# (max_sequence_length, 1) / (d_model / 2,) broadcasts to (max_sequence_length, d_model / 2)
even_PE = (position / denominator).sin()
odd_PE = (position / denominator).cos()

(even_PE.shape, even_PE)

(torch.Size([10, 3]), tensor([[ 0.0000,  0.0000,  0.0000],
         [ 0.8415,  0.0464,  0.0022],
         [ 0.9093,  0.0927,  0.0043],
         [ 0.1411,  0.1388,  0.0065],
         [-0.7568,  0.1846,  0.0086],
         [-0.9589,  0.2300,  0.0108],
         [-0.2794,  0.2749,  0.0129],
         [ 0.6570,  0.3192,  0.0151],
         [ 0.9894,  0.3629,  0.0172],
         [ 0.4121,  0.4057,  0.0194]]))

In [10]:
"""
what we want though is to interleave these two matrices (even_PE, odd_PE)

so for example for the even position we want this to be the first
index and then we want this to be the second index
and then the third index and then the fourth index 

but starting at zero so it'll be the zeroth index first index
second index third index and 

so on. In order to do that I stack them together
along a new last dimension (dim=2)

so that the two that we need to stack on top of each other are right next 
to each other this will give us a 10 x 3 x 2 tensor
"""
# pair each sin value with its cos value along a new trailing axis
stacked = torch.stack((even_PE, odd_PE), dim=-1)
(stacked.shape, stacked)

(torch.Size([10, 3, 2]), tensor([[[ 0.0000,  1.0000],
          [ 0.0000,  1.0000],
          [ 0.0000,  1.0000]],
 
         [[ 0.8415,  0.5403],
          [ 0.0464,  0.9989],
          [ 0.0022,  1.0000]],
 
         [[ 0.9093, -0.4161],
          [ 0.0927,  0.9957],
          [ 0.0043,  1.0000]],
 
         [[ 0.1411, -0.9900],
          [ 0.1388,  0.9903],
          [ 0.0065,  1.0000]],
 
         [[-0.7568, -0.6536],
          [ 0.1846,  0.9828],
          [ 0.0086,  1.0000]],
 
         [[-0.9589,  0.2837],
          [ 0.2300,  0.9732],
          [ 0.0108,  0.9999]],
 
         [[-0.2794,  0.9602],
          [ 0.2749,  0.9615],
          [ 0.0129,  0.9999]],
 
         [[ 0.6570,  0.7539],
          [ 0.3192,  0.9477],
          [ 0.0151,  0.9999]],
 
         [[ 0.9894, -0.1455],
          [ 0.3629,  0.9318],
          [ 0.0172,  0.9999]],
 
         [[ 0.4121, -0.9111],
          [ 0.4057,  0.9140],
          [ 0.0194,  0.9998]]]))

In [11]:
"""
we just flatten it and effectively we're going to be 
getting that interleavement here too
"""
# merge the last two axes so each row reads sin, cos, sin, cos, ...
PE = stacked.flatten(start_dim=1)

"""
so for our first word this will be the positional encoding (first row)
for the second word it's this one (second row)
for the third word it's here (third row) and so on
"""
PE

tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0464,  0.9989,  0.0022,  1.0000],
        [ 0.9093, -0.4161,  0.0927,  0.9957,  0.0043,  1.0000],
        [ 0.1411, -0.9900,  0.1388,  0.9903,  0.0065,  1.0000],
        [-0.7568, -0.6536,  0.1846,  0.9828,  0.0086,  1.0000],
        [-0.9589,  0.2837,  0.2300,  0.9732,  0.0108,  0.9999],
        [-0.2794,  0.9602,  0.2749,  0.9615,  0.0129,  0.9999],
        [ 0.6570,  0.7539,  0.3192,  0.9477,  0.0151,  0.9999],
        [ 0.9894, -0.1455,  0.3629,  0.9318,  0.0172,  0.9999],
        [ 0.4121, -0.9111,  0.4057,  0.9140,  0.0194,  0.9998]])

## Class

In [12]:
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding table.

    For position ``p`` and embedding index ``i``:

    * even ``i``: ``sin(p / 10000 ** (i / d_model))``
    * odd ``i``:  ``cos(p / 10000 ** ((i - 1) / d_model))``

    Args:
        d_model: Size of each positional encoding vector (number of columns).
        max_sequence_length: Number of positions to encode (number of rows).
    """

    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self):
        """Build the encoding table.

        Returns:
            A float tensor of shape ``(max_sequence_length, d_model)`` whose
            row ``p`` is the encoding of position ``p``.
        """
        # Even embedding indices 0, 2, 4, ...; the odd index i shares the
        # denominator of the even index i - 1, so one denominator suffices.
        even_i = torch.arange(0, self.d_model, 2, dtype=torch.float)
        denominator = torch.pow(10000, even_i / self.d_model)

        # Column vector of positions so the division broadcasts to
        # (max_sequence_length, len(even_i)).
        position = torch.arange(
            self.max_sequence_length, dtype=torch.float
        ).reshape(self.max_sequence_length, 1)

        even_PE = torch.sin(position / denominator)
        odd_PE = torch.cos(position / denominator)

        # Stack on a new last axis then flatten so the columns interleave as
        # sin, cos, sin, cos, ...
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)

        # When d_model is odd the interleaving yields d_model + 1 columns
        # (one surplus trailing cosine); trim so the width is always d_model.
        return PE[:, :self.d_model]

In [13]:
pe = PositionalEncoding(d_model=6, max_sequence_length=10)
# call the module instance rather than .forward() directly, so that
# nn.Module.__call__ (and any registered hooks) is used
pe()

tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0464,  0.9989,  0.0022,  1.0000],
        [ 0.9093, -0.4161,  0.0927,  0.9957,  0.0043,  1.0000],
        [ 0.1411, -0.9900,  0.1388,  0.9903,  0.0065,  1.0000],
        [-0.7568, -0.6536,  0.1846,  0.9828,  0.0086,  1.0000],
        [-0.9589,  0.2837,  0.2300,  0.9732,  0.0108,  0.9999],
        [-0.2794,  0.9602,  0.2749,  0.9615,  0.0129,  0.9999],
        [ 0.6570,  0.7539,  0.3192,  0.9477,  0.0151,  0.9999],
        [ 0.9894, -0.1455,  0.3629,  0.9318,  0.0172,  0.9999],
        [ 0.4121, -0.9111,  0.4057,  0.9140,  0.0194,  0.9998]])