## Positional Embedding
Positional embeddings give a model information about the position or order of tokens in a sequence. This is needed because models like Transformers process all input tokens in parallel and have no inherent notion of sequence order. The two main types are absolute and relative positional embeddings. Absolute embeddings use a fixed or learned vector for each specific position, while relative embeddings encode the distance or relationship between tokens.

In [6]:
import torch

vocab_size = 50257 ## GPT-2 vocabulary size (number of distinct token ids)
output_dim = 768 ## embedding dimension: GPT-2 uses 768 (GPT-3 uses 12288)

## Trainable lookup table with one output_dim-sized row per token id
## (weight shape: vocab_size x output_dim); the rows are optimized during training.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [7]:
## Display the layer's repr to confirm its (num_embeddings, embedding_dim) configuration.
token_embedding_layer

Embedding(50257, 768)

In [8]:
import tiktoken
## Load tiktoken's "gpt2" encoding.
## NOTE(review): create_dataloader builds its own encoding, so this module-level
## tokenizer appears unused by the cells shown here.
tokenizer = tiktoken.get_encoding("gpt2")

In [9]:


## Read the entire text file into memory as a single string.
## NOTE(review): the absolute "/content/..." path looks environment-specific
## (e.g. a hosted notebook) -- adjust it when running elsewhere.
with open("/content/the-verdict.txt", encoding ="utf-8") as f:
    raw_text = f.read()
## Sanity check: report the size and preview the first 99 characters.
print("Total number of characters:", len(raw_text))
print(raw_text[:99])


Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [10]:
## Dataset and DataLoader factory producing sliding-window (input, target) pairs,
## where each target is its input shifted one token to the right.
from torch.utils.data import DataLoader, Dataset
max_length = 4 ## number of tokens per sliding window (passed to create_dataloader later)
class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. Every window of ``max_length`` token ids
    becomes an input, and the same window shifted one token to the right
    becomes its target.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        """Tokenize ``text`` and cut it into (possibly overlapping) windows.

        Args:
            text: Raw string to tokenize.
            tokenizer: Object exposing ``encode(text, allowed_special=...)``
                that returns a sequence of token ids.
            max_length: Number of token ids in each window.
            stride: Step between the starts of consecutive windows.
        """
        encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        # Stop early enough that the one-token-shifted target window is
        # always complete.
        window_starts = range(0, len(encoded) - max_length, stride)

        self.input_ids = [
            torch.tensor(encoded[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(encoded[start + 1:start + 1 + max_length])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input, target)`` tensor pair stored at ``index``."""
        window = self.input_ids[index]
        shifted_window = self.target_ids[index]
        return window, shifted_window

def create_dataloader(text, batch_size=4, max_length=256, stride=128,
                      shuffle=True, drop_last=True, num_worker=0):
    """Build a DataLoader of sliding-window (input, target) batches from ``text``.

    Args:
        text: Raw string to tokenize with tiktoken's "gpt2" encoding.
        batch_size: Number of windows per batch.
        max_length: Number of token ids in each window.
        stride: Step between the starts of consecutive windows.
        shuffle: Forwarded to ``DataLoader`` as ``shuffle``.
        drop_last: Forwarded to ``DataLoader`` as ``drop_last``.
        num_worker: Forwarded to ``DataLoader`` as ``num_workers``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDataset`` built from ``text``.
    """
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_worker,
    }
    windows = GPTDataset(text, tiktoken.get_encoding("gpt2"), max_length, stride)
    return DataLoader(windows, **loader_options)




In [13]:
import torch
## Build a loader over raw_text: batches of 8 windows, each max_length tokens long.
## stride=1 advances the window one token at a time; shuffle=False keeps text order.
dataloader = create_dataloader(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=1,
    shuffle=False,
)
data_iter = iter(dataloader)
## Each batch is a pair [inputs, targets]; targets are the inputs shifted by one token.
first_batch = next(data_iter)
print(first_batch)

[tensor([[   40,   367,  2885,  1464],
        [  367,  2885,  1464,  1807],
        [ 2885,  1464,  1807,  3619],
        [ 1464,  1807,  3619,   402],
        [ 1807,  3619,   402,   271],
        [ 3619,   402,   271, 10899],
        [  402,   271, 10899,  2138],
        [  271, 10899,  2138,   257]]), tensor([[  367,  2885,  1464,  1807],
        [ 2885,  1464,  1807,  3619],
        [ 1464,  1807,  3619,   402],
        [ 1807,  3619,   402,   271],
        [ 3619,   402,   271, 10899],
        [  402,   271, 10899,  2138],
        [  271, 10899,  2138,   257],
        [10899,  2138,   257,  7026]])]


In [14]:
## Pull the next batch from the same iterator (the first batch was already consumed
## into first_batch) and unpack it into input and target tensors.
inputs, target = next(data_iter)

In [16]:
inputs.shape ## (batch_size, max_length): 8 windows per batch, 4 token ids per window

torch.Size([8, 4])

In [17]:
## Embedding each token id as a 768-dimensional vector turns the (8, 4) batch of
## token ids into a tensor of shape (8, 4, 768).
token_embedding = token_embedding_layer(inputs)
print(token_embedding.shape)

torch.Size([8, 4, 768])


In [20]:
## Position indices 0 .. max_length - 1, one per token slot in a window.
torch.arange(max_length)

tensor([0, 1, 2, 3])

In [18]:
## Creating the absolute positional embedding

## One trainable output_dim-sized vector per position in the context window.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim )
## Look up the vectors for positions 0 .. max_length - 1 -> shape (max_length, output_dim).
pos_embedding = pos_embedding_layer(torch.arange(max_length))
print(pos_embedding.shape)

torch.Size([4, 768])


In [21]:
## Add the positional embedding to the token embedding. pos_embedding has shape
## (4, 768) and is broadcast across the batch dimension of token_embedding
## (8, 4, 768), so every window in the batch receives the same position vectors.
input_embedding = token_embedding + pos_embedding
print(input_embedding.shape)

torch.Size([8, 4, 768])
