In [1]:
#| default_exp components
# Development convenience: have IPython re-import edited modules automatically
# so changes to source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


# Transformer components to be assembled into a model

In [2]:
#| export
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

# Use Apple's Metal (MPS) backend when it is available and fall back to the CPU
# otherwise, so the notebook still runs end-to-end on machines without MPS.
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
# Left as the last expression so the cell still displays whether MPS was found.
torch.backends.mps.is_available()

True

## Embeddings

- Token embeddings, using torch embedding lookup
- Positional embedding, which can be fixed or learned.

In [3]:
#| export

class TokenEmbeddings(nn.Module):
    """Lookup table that maps integer token ids to dense vectors.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
    """

    def __init__(self, vocab_size: int, embedding_dim: int) -> None:
        super().__init__()
        # Keep the sizes around so they can be inspected on the module.
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )

    def forward(self, x):
        """Return the embedding vectors for the token ids in `x`."""
        token_vectors = self.embedding(x)
        return token_vectors

In [4]:
# Smoke test: embed a short sequence in which token id 1 appears twice.
vocab_size, embedding_dim = 6, 50
emb = TokenEmbeddings(vocab_size, embedding_dim=embedding_dim).to(device)
x = torch.tensor([0, 1, 2, 1], dtype=torch.long, device=device)
emb_x = emb(x)
# One row of size embedding_dim per input token.
assert tuple(emb_x.shape) == (len(x), embedding_dim)
# Positions 1 and 3 hold the same token id, so their rows must match exactly.
assert torch.equal(emb_x[1], emb_x[3])

### Positional Encoding

As each word in a sentence simultaneously flows through the Transformer’s encoder/decoder stack, the model itself doesn’t have any sense of position/order for each word. Consequently, there’s still the need for a way to incorporate the order of the words into our model.
So we give the model some sense of position of the token in the sequence. 

#### Potential solutions:

The first idea that might come to mind is to assign a number to each time-step within the [0, 1] range in which 0 means the first word and 1 is the last time-step. One of the problems it will introduce is that you can’t figure out how many words are present within a specific range. In other words, time-step delta doesn’t have consistent meaning across different sentences.

Another idea is to assign a number to each time-step linearly. That is, the first word is given “1”, the second word is given “2”, and so on. The problem with this approach is that not only could the values get quite large, but our model could also face sentences longer than the ones seen in training.

Ideally, the following criteria should be satisfied:

- It should output a unique encoding for each time-step (word’s position in a sentence)
- Distance between any two time-steps should be consistent across sentences with different lengths.
- Our model should generalize to longer sentences without any effort. Its values should be bounded.
- It must be deterministic.

Proposed solutions:

The initial solution that was proposed isn’t a single number. Instead, it’s a d-dimensional vector that contains information about a specific position in a sentence. This vector is used to equip each word with information about its position in the sentence.

$P(k,2i) = \sin\left(\frac{k}{n^{2i/d}}\right)$

$P(k,2i+1) = \cos\left(\frac{k}{n^{2i/d}}\right)$

where : 
- L: sequence length
- k: position of token in input sequence, $0 \le k < L$
- d: dimension of embedding
- P(k,j): position function to map a position k in sequence to index (k,j) in positional matrix
- n: user defined scalar (ex: 10'000)
- i: index of the column pair in the positional matrix, $0 \le i < d/2$ (it maps to columns $2i$ and $2i+1$)


In [5]:
#| export

class PositionalEmbeddings(nn.Module):
    """Positional embeddings, either learned or fixed (sinusoidal).

    With `is_learned=True` the positions are looked up in a trainable
    `nn.Embedding`. Otherwise a fixed table is built once from

        P(k, 2i)   = sin(k / n^(2i/d))
        P(k, 2i+1) = cos(k / n^(2i/d))

    and registered as a non-persistent buffer, so it follows the module across
    `.to(device)` calls but is not written to the state dict.

    Args:
        max_seq_len: number of positions the table covers.
        embedding_dim: size `d` of each positional vector.
        is_learned: choose the trainable table (True) or the fixed one (False).
        n: base scalar of the sinusoidal formula; only used when
            `is_learned` is False.
    """

    def __init__(self, max_seq_len: int, embedding_dim: int, is_learned: bool = True,
                 n: float = 10_000.0) -> None:
        super().__init__()
        self.max_seq_len = max_seq_len
        self.embedding_dim = embedding_dim
        self.is_learned = is_learned
        self.n = n
        if self.is_learned:
            self.pos_embedding = nn.Embedding(max_seq_len, embedding_dim)
        else:
            # A buffer (rather than a plain tensor attribute) moves together
            # with the module; persistent=False keeps the state dict unchanged.
            self.register_buffer(
                'pos_embedding',
                self._sinusoidal_table(max_seq_len, embedding_dim, n),
                persistent=False,
            )

    @staticmethod
    def _sinusoidal_table(max_seq_len: int, embedding_dim: int, n: float) -> torch.Tensor:
        """Build the (max_seq_len, embedding_dim) fixed sin/cos position table."""
        positions = torch.arange(max_seq_len, dtype=torch.float32).unsqueeze(1)
        # Values 2i for every even column: 0, 2, 4, ...
        even_columns = torch.arange(0, embedding_dim, 2, dtype=torch.float32)
        angles = positions / (float(n) ** (even_columns / embedding_dim))
        table = torch.zeros(max_seq_len, embedding_dim)
        table[:, 0::2] = torch.sin(angles)
        # For an odd embedding_dim there is one fewer odd column than even ones.
        table[:, 1::2] = torch.cos(angles[:, : embedding_dim // 2])
        return table

    def forward(self, x):
        """Return the positional embeddings for the positions covered by `x`.

        The sequence length is taken from the last axis of `x`
        (NOTE(review): this treats `x` as token ids shaped (..., seq_len) —
        confirm against the intended caller).

        Returns:
            Tensor of shape (seq_len, embedding_dim) for positions
            0 .. seq_len - 1.

        Raises:
            ValueError: if the sequence is longer than `max_seq_len`.
        """
        seq_len = x.shape[-1]
        if seq_len > self.max_seq_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_len {self.max_seq_len}"
            )
        if self.is_learned:
            positions = torch.arange(seq_len, device=self.pos_embedding.weight.device)
            return self.pos_embedding(positions)
        return self.pos_embedding[:seq_len]
        


# Fixed (non-learned) positional embeddings covering 5 positions.
pos_emb = PositionalEmbeddings(
    max_seq_len=5,
    embedding_dim=embedding_dim,
    is_learned=False,
)

In [24]:
#| hide
# Export the cells marked for export into the library module.
import nbdev
nbdev.nbdev_export()