In [93]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

import lightning as L

# What a positional encoding matrix will look like

In [94]:
# Start from an all-zero table with 4 rows and 6 columns; later cells fill it in.
pos_encode_matrix = torch.zeros((4, 6))
pos_encode_matrix

tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])

# The positions of the rows 

In [95]:
# Row indices 0..3 as a float column vector of shape (4, 1).
positions = torch.arange(4, dtype=torch.float32).reshape(-1, 1)
positions

tensor([[0.],
        [1.],
        [2.],
        [3.]])

# How to calculate the scaling term
$$
\left(\frac{1}{10000.0 ^ {\frac{2d}{d_{\text{model}}}}}\right)
$$
of the Positional Encoding
$$
PE(i, 2d) = \sin\left(\frac{i}{10000^{\frac{2d}{d_{\text{model}}}}}\right)
$$

$$
PE(i, 2d+1) = \cos\left(\frac{i}{10000^{\frac{2d}{d_{\text{model}}}}}\right)
$$

Where:

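* $PE(i, k)$ is the positional encoding for the position $i$ at embedding column $k$ (here $k = 2d$ for the sine and $k = 2d+1$ for the cosine)
* $i$ is the position of the word in the sequence (starting from 0)
* $d$ indexes the sine/cosine pairs (starting from 0), so the even column $2d$ and the odd column $2d+1$ share the same scaling term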
* $d_{\text{model}}$ is the total dimensionality of the model's input (embedding size, e.g., 512, 1024)

In [96]:
# Doubling the indices 1, 2, 3 gives the numerators 2, 4, 6 used in the next cell.
torch.tensor([1, 2, 3]) * 2

tensor([2, 4, 6])

In [97]:
# Scaling term: 1 / (10000 ** [2/6, 4/6, 6/6]).
# NOTE(review): the PositionalEncoding class further down builds its exponents
# from torch.arange(0, d_model, 2), i.e. [0/6, 2/6, 4/6] for d_model=6, so the
# values computed here are shifted by one step relative to the class.
div_term = 1.0 / torch.pow(torch.tensor(10000.0), torch.tensor([2, 4, 6]) / 6)
div_term

tensor([4.6416e-02, 2.1544e-03, 1.0000e-04])

$$
\frac{1}{\left(10000^{\left[\frac{2}{6}, \frac{4}{6}, \frac{6}{6}\right]}\right)}
$$

### Step-by-Step Breakdown:

1. **Exponentiation**:
   We're calculating $10000^{\left[\frac{2}{6}, \frac{4}{6}, \frac{6}{6}\right]}$, which simplifies to:

   * $10000^{0.3333}$, $10000^{0.6667}$,  $10000^{1}$

2. **Calculating each power**:

   * $10000^{0.3333} \approx 21.5443$
   * $10000^{0.6667} \approx 464.1589$
   * $10000^{1} = 10000$

3. **Where the division by 6 happens**:
   The 6 is $d_{\text{model}}$, and it has already been applied inside the exponent in step 1, so the powers are not divided again:

   * $2 / 6 \approx 0.3333$
   * $4 / 6 \approx 0.6667$
   * $6 / 6 = 1$

4. **Inversion**:
   Finally, we take the inverse of the powers from step 2:

   * $\frac{1}{21.5443} \approx 0.046416$
   * $\frac{1}{464.1589} \approx 0.0021544$
   * $\frac{1}{10000} = 0.0001$

The output tensor is:

```
tensor([4.6416e-02, 2.1544e-03, 1.0000e-04])
```

* $4.6416e-02$ is the result of $\frac{1}{21.5443}$
* $2.1544e-03$ is the result of $\frac{1}{464.1589}$
* $1.0000e-04$ is the result of $\frac{1}{10000}$


# How to put the sin values in the even positions and the cos values in the odd positions of the positional_encoding_matrix

In [98]:
# Sine of every position * scaling-term product: the values for the even columns.
(positions * div_term).sin()

tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00],
        [4.6399e-02, 2.1544e-03, 1.0000e-04],
        [9.2698e-02, 4.3089e-03, 2.0000e-04],
        [1.3880e-01, 6.4633e-03, 3.0000e-04]])

In [99]:
# Cosine of every position * scaling-term product: the values for the odd columns.
(positions * div_term).cos()

tensor([[1.0000, 1.0000, 1.0000],
        [0.9989, 1.0000, 1.0000],
        [0.9957, 1.0000, 1.0000],
        [0.9903, 1.0000, 1.0000]])

In [100]:
# Show the matrix before the following cells write the sine/cosine values into it.
pos_encode_matrix

tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])

In [101]:
# Write the sine values into the even-indexed columns (0, 2, 4) of every row.
pos_encode_matrix[:, ::2] = (positions * div_term).sin()
pos_encode_matrix

tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [4.6399e-02, 0.0000e+00, 2.1544e-03, 0.0000e+00, 1.0000e-04, 0.0000e+00],
        [9.2698e-02, 0.0000e+00, 4.3089e-03, 0.0000e+00, 2.0000e-04, 0.0000e+00],
        [1.3880e-01, 0.0000e+00, 6.4633e-03, 0.0000e+00, 3.0000e-04, 0.0000e+00]])

In [102]:

# Write the cosine values into the odd-indexed columns (1, 3, 5) of every row.
pos_encode_matrix[:, 1::2] = (positions * div_term).cos()
pos_encode_matrix

tensor([[0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00],
        [4.6399e-02, 9.9892e-01, 2.1544e-03, 1.0000e+00, 1.0000e-04, 1.0000e+00],
        [9.2698e-02, 9.9569e-01, 4.3089e-03, 9.9999e-01, 2.0000e-04, 1.0000e+00],
        [1.3880e-01, 9.9032e-01, 6.4633e-03, 9.9998e-01, 3.0000e-04, 1.0000e+00]])

In [103]:
# Even column indices 0, 2, 4 as floats.
embedding_index = torch.arange(0, 6, 2, dtype=torch.float32)
embedding_index

tensor([0., 2., 4.])

In [106]:
class PositionalEncoding(nn.Module):
    """Adds a fixed sinusoidal position table to word embeddings.

    A ``(max_len, d_model)`` table is computed once in the constructor. For row
    ``pos`` and even column ``k`` the entry is ``sin(pos / 10000 ** (k / d_model))``;
    column ``k + 1`` holds the cosine of the same angle. The table is registered
    as the buffer ``pe``.
    """

    def __init__(self, d_model=2, max_len=6):
        """Precompute the encoding table.

        Args:
            d_model: Number of columns in the table. Odd values are supported;
                the last column then holds a sine with no paired cosine.
            max_len: Number of rows in the table (positions ``0 .. max_len - 1``).
        """
        super().__init__()

        positional_encoding_matrix = torch.zeros(max_len, d_model)
        # Column vector of positions so it broadcasts against the row of scaling terms.
        positions = torch.arange(start=0, end=max_len).float().unsqueeze(1)
        embedding_index = torch.arange(start=0, end=d_model, step=2).float()

        # Plain tensor arithmetic: wrapping an existing tensor in torch.tensor()
        # emits a copy-construct UserWarning and adds nothing here.
        div_term = 1.0 / (10000.0 ** (embedding_index / d_model))
        angles = positions * div_term

        positional_encoding_matrix[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so only the first d_model // 2 cosine columns are written.
        positional_encoding_matrix[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        self.register_buffer("pe", positional_encoding_matrix)

    def forward(self, word_embeddings):
        """Return ``word_embeddings`` plus the first ``word_embeddings.size(0)`` rows of ``pe``.

        NOTE(review): the slice is taken along dimension 0 of the input, so this
        presumably expects a ``(sequence_length, d_model)`` tensor — confirm
        against callers before passing batched input.
        """
        return word_embeddings + self.pe[: word_embeddings.size(0), :]

In [None]:
class Attention(nn.Module):
    """Attention block built on ``nn.Module``."""

    def __init__(self, d_model=2):
        """Initialise the module.

        Args:
            d_model: Accepted by the constructor; not referenced in the part of
                the constructor shown here.
        """
        # The method must be spelled __init__ (with trailing underscores);
        # otherwise Python name-mangles it and it never runs on construction.
        super().__init__()
        