# Understanding Positional Encoding

In [16]:
import torch
import torch.nn as nn

# Number of positions to encode; becomes the row count of the encoding table.
max_sequence_length = 10
# Width of each position's encoding vector; becomes the column count of the table.
d_model = 512

> The index of each word (0 to max_sequence_length - 1) is transformed into a sinusoid, because a sinusoid:
1. keeps the output bounded within [-1, 1]
2. generates a unique encoding for each position
3. is easy to compute, and therefore easy to extrapolate to longer sequences

In [17]:
# Sinusoidal positional encoding:
#   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
#   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
# Columns 2i and 2i+1 share the same angle, so it is computed once below.
two_i = torch.arange(0, d_model, 2, dtype=torch.float)  # even column indices: 0, 2, 4, ...
denominator = torch.pow(10000, two_i / d_model)
# Column vector of positions, so dividing by `denominator` broadcasts to a
# (max_sequence_length, len(two_i)) grid of angles.
position = torch.arange(max_sequence_length, dtype=torch.float).unsqueeze(1)
angles = position / denominator
# sin feeds the even-indexed columns, cos the odd-indexed ones.
even_PE = angles.sin()
odd_PE = angles.cos()

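> Combine the even and odd positional encodings by interleaving them, so that even-indexed columns hold the sine values and odd-indexed columns hold the cosine values.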

In [18]:
# Pair each sin column with its matching cos column along a new trailing axis,
# then merge that axis into the column axis so the values interleave as
# [sin_0, cos_0, sin_1, cos_1, ...] within every row.
stacked = torch.stack((even_PE, odd_PE), dim=-1)
PE = stacked.flatten(start_dim=1)

# Class Representation

In [14]:
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding table.

    Builds a ``(max_sequence_length, d_model)`` tensor where, for position
    ``pos`` and column pair ``(2i, 2i + 1)``::

        PE[pos, 2i]     = sin(pos / 10000 ** (2i / d_model))
        PE[pos, 2i + 1] = cos(pos / 10000 ** (2i / d_model))

    Args:
        d_model: Number of columns in the encoding table.
        max_sequence_length: Number of positions (rows) to encode.
    """

    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self):
        """Compute the positional encoding table.

        Returns:
            A float tensor of shape ``(max_sequence_length, d_model)``.
        """
        # Read the sizes from the instance, not from same-named notebook
        # globals, so the module honours its constructor arguments and works
        # on a fresh kernel or when imported elsewhere.
        d_model = self.d_model
        max_sequence_length = self.max_sequence_length

        # Both 2i and 2i+1 use the same position / denominator value.
        two_i = torch.arange(0, d_model, 2).float()
        denominator = torch.pow(10000, two_i / d_model)
        position = torch.arange(max_sequence_length, dtype=torch.float).reshape(max_sequence_length, 1)
        # Split the cases: sin for even columns, cos for odd columns.
        even_PE = torch.sin(position / denominator)
        odd_PE = torch.cos(position / denominator)
        # Interleave as [sin_0, cos_0, sin_1, cos_1, ...] along the column axis.
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        # For an odd d_model the interleaved table has one surplus cos column;
        # trim it so the width is always d_model (no-op when d_model is even).
        return PE[:, :d_model]