## Positional Encoding

This notebook will code positional encoding for Transformer neural networks with pytrch

In [2]:
import torch
import torch.nn as nn

max_sequence_length = 10 #num_words
d_model = 8 

$$
PE(\text{position}, 2i) = \sin\bigg( \frac{ \text{position} }{10000^\frac{2i}{d_{model}}} \bigg)
$$

$$
PE(\text{position}, 2i+1) = \cos\bigg( \frac{ \text{position} }{10000^\frac{2i}{d_{model}}} \bigg)
$$

We can rewrite these as

$$
PE(\text{position}, i) = \sin\bigg( \frac{ \text{position} }{10000^\frac{i}{d_{model}}} \bigg) \text{ when i is even}
$$

$$
PE(\text{position}, i) = \cos\bigg( \frac{ \text{position} }{10000^\frac{i-1}{d_{model}}} \bigg) \text{ when i is odd}
$$

In [3]:
even_i = torch.arange(0, d_model, 2).float()
even_i

tensor([0., 2., 4., 6.])

In [4]:
even_denominator = torch.pow(10000, even_i/d_model)
even_denominator

tensor([   1.,   10.,  100., 1000.])

In [5]:
odd_i = torch.arange(1, d_model, 2).float()
odd_i

tensor([1., 3., 5., 7.])

In [6]:
odd_denominator = torch.pow(10000, (odd_i - 1)/d_model)
odd_denominator

tensor([   1.,   10.,  100., 1000.])

`even_denominator` and `odd_denominator` are the same! So we can just do one of these actions and call the resulting variable `denominator`

In [7]:
denominator = even_denominator

In [11]:
position = torch.arange(max_sequence_length, dtype=torch.float)
position

tensor([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

In [12]:
position= position.reshape(max_sequence_length, 1)
position

tensor([[0.],
        [1.],
        [2.],
        [3.],
        [4.],
        [5.],
        [6.],
        [7.],
        [8.],
        [9.]])

In [13]:
even_PE = torch.sin(position / denominator)
odd_PE = torch.cos(position / denominator)

In [14]:
even_PE

tensor([[ 0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.8415,  0.0998,  0.0100,  0.0010],
        [ 0.9093,  0.1987,  0.0200,  0.0020],
        [ 0.1411,  0.2955,  0.0300,  0.0030],
        [-0.7568,  0.3894,  0.0400,  0.0040],
        [-0.9589,  0.4794,  0.0500,  0.0050],
        [-0.2794,  0.5646,  0.0600,  0.0060],
        [ 0.6570,  0.6442,  0.0699,  0.0070],
        [ 0.9894,  0.7174,  0.0799,  0.0080],
        [ 0.4121,  0.7833,  0.0899,  0.0090]])

In [15]:
even_PE.shape

torch.Size([10, 4])

In [16]:
x = torch.ones((10, 8))

In [17]:
x.shape

torch.Size([10, 8])

In [18]:
pe = torch.zeros_like(x)

In [19]:
pe.size()

torch.Size([10, 8])

In [20]:
even_PE.shape

torch.Size([10, 4])

In [55]:
pe

tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.8415,  0.0000,  0.0998,  0.0000,  0.0100,  0.0000,  0.0010,  0.0000],
        [ 0.9093,  0.0000,  0.1987,  0.0000,  0.0200,  0.0000,  0.0020,  0.0000],
        [ 0.1411,  0.0000,  0.2955,  0.0000,  0.0300,  0.0000,  0.0030,  0.0000],
        [-0.7568,  0.0000,  0.3894,  0.0000,  0.0400,  0.0000,  0.0040,  0.0000],
        [-0.9589,  0.0000,  0.4794,  0.0000,  0.0500,  0.0000,  0.0050,  0.0000],
        [-0.2794,  0.0000,  0.5646,  0.0000,  0.0600,  0.0000,  0.0060,  0.0000],
        [ 0.6570,  0.0000,  0.6442,  0.0000,  0.0699,  0.0000,  0.0070,  0.0000],
        [ 0.9894,  0.0000,  0.7174,  0.0000,  0.0799,  0.0000,  0.0080,  0.0000],
        [ 0.4121,  0.0000,  0.7833,  0.0000,  0.0899,  0.0000,  0.0090,  0.0000]])

In [21]:
odd_PE

tensor([[ 1.0000,  1.0000,  1.0000,  1.0000],
        [ 0.5403,  0.9950,  1.0000,  1.0000],
        [-0.4161,  0.9801,  0.9998,  1.0000],
        [-0.9900,  0.9553,  0.9996,  1.0000],
        [-0.6536,  0.9211,  0.9992,  1.0000],
        [ 0.2837,  0.8776,  0.9988,  1.0000],
        [ 0.9602,  0.8253,  0.9982,  1.0000],
        [ 0.7539,  0.7648,  0.9976,  1.0000],
        [-0.1455,  0.6967,  0.9968,  1.0000],
        [-0.9111,  0.6216,  0.9960,  1.0000]])

In [22]:
odd_PE.shape

torch.Size([10, 4])

In [26]:
stacked = torch.stack([even_PE, odd_PE], dim=2)
stacked.shape, stacked[0][0]

(torch.Size([10, 4, 2]), tensor([0., 1.]))

In [39]:
PE = torch.flatten(stacked, start_dim=1, end_dim=2)
PE, PE.shape

(tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
           9.9995e-01,  1.0000e-03,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
           9.9980e-01,  2.0000e-03,  1.0000e+00],
         [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
           9.9955e-01,  3.0000e-03,  1.0000e+00],
         [-7.5680e-01, -6.5364e-01,  3.8942e-01,  9.2106e-01,  3.9989e-02,
           9.9920e-01,  4.0000e-03,  9.9999e-01],
         [-9.5892e-01,  2.8366e-01,  4.7943e-01,  8.7758e-01,  4.9979e-02,
           9.9875e-01,  5.0000e-03,  9.9999e-01],
         [-2.7942e-01,  9.6017e-01,  5.6464e-01,  8.2534e-01,  5.9964e-02,
           9.9820e-01,  6.0000e-03,  9.9998e-01],
         [ 6.5699e-01,  7.5390e-01,  6.4422e-01,  7.6484e-01,  6.9943e-02,
           9.9755e-01,  6.9999e-03,  9.9998e-01],


## Class

Let's combine all the code above into a cute class

In [68]:
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):

    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self):
        even_odd_i = torch.arange(0, self.d_model, 2).float()
        denominator = torch.pow(10000, even_odd_i/self.d_model)

        position = torch.arange(self.max_sequence_length).reshape(self.max_sequence_length, 1)

        even_PE = torch.sin(position / denominator)
        print(even_PE.shape)
        odd_PE = torch.cos(position / denominator)

        stacked = torch.stack([even_PE, odd_PE], dim=2)
        print(stacked.shape)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        print(PE.shape)
        return PE

In [67]:
pe = PositionalEncoding(d_model=6, max_sequence_length=10)
pe.forward()

torch.Size([10, 3, 2])
torch.Size([10, 6])


tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000],
        [ 0.8415,  0.5403,  0.0464,  0.9989,  0.0022,  1.0000],
        [ 0.9093, -0.4161,  0.0927,  0.9957,  0.0043,  1.0000],
        [ 0.1411, -0.9900,  0.1388,  0.9903,  0.0065,  1.0000],
        [-0.7568, -0.6536,  0.1846,  0.9828,  0.0086,  1.0000],
        [-0.9589,  0.2837,  0.2300,  0.9732,  0.0108,  0.9999],
        [-0.2794,  0.9602,  0.2749,  0.9615,  0.0129,  0.9999],
        [ 0.6570,  0.7539,  0.3192,  0.9477,  0.0151,  0.9999],
        [ 0.9894, -0.1455,  0.3629,  0.9318,  0.0172,  0.9999],
        [ 0.4121, -0.9111,  0.4057,  0.9140,  0.0194,  0.9998]])

Happy Coding!