# 0. Import important libraries (modules)

In [2]:
import torch
print(torch.__version__)

2.8.0+cpu


In [6]:
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split
from torch.utils.tensorboard import SummaryWriter
# Math
import math
# Huggingface libraries
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
# Pathlib
from pathlib import Path
# Typing
from typing import  Any
# Library that shows progress bars in loops
from tqdm import tqdm
# Importing library of warnings
import warnings


## 1. Input Embedding


English sentence:
The animal didn't cross the street because it was too tired


**Tokens**

The, animal, didn't, cross, the, street, because, it, was, too, tired


**Vocabulary**

the=0, animal=1, didn't=2, cross=3, street=4, because=5, it=6, was=7, too=8, tired=9

**Token IDs** (tokens are lowercased before lookup, so "The" and "the" both map to 0)

0, 1, 2, 3, 0, 4, 5, 6, 7, 8, 9

In [7]:
class InputEmbedding(nn.Module):
    """Learned token-embedding table whose output is scaled by sqrt(d_model).

    Args:
        d_model: Size of each embedding vector.
        vocab_size: Number of rows in the embedding table (one per token id).
    """

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        # One trainable d_model-sized vector per vocabulary entry.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=d_model,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Look up the embeddings for the token ids in ``x`` and scale them.

        Args:
            x: Integer tensor of token ids.

        Returns:
            The embedded tensor (``x``'s shape plus a trailing ``d_model``
            dimension), multiplied by ``sqrt(d_model)``.
        """
        scale = math.sqrt(self.d_model)
        vectors = self.embedding(x)
        return vectors * scale

## 2. Positional Encoding

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    A table of shape ``(1, seq_len, d_model)`` is precomputed once: even
    feature columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``, with ``freq = 10000 ** (-2i / d_model)`` for the
    i-th sin/cos pair. The table is stored as a buffer, so it follows the
    module across devices and is saved in ``state_dict`` but is not trained.

    Args:
        d_model: Size of the feature (last) dimension of the inputs.
        seq_len: Number of positions to precompute (maximum sequence length).
        dropout: Dropout probability applied after the encoding is added.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # Matrix of shape (seq_len, d_model) that holds the positional encodings.
        pe = torch.zeros(seq_len, d_model)
        # Column vector of positions, shape (seq_len, 1).
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        # Frequencies computed in log space, shape (ceil(d_model / 2),).
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term  # (seq_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns,
        # so trim the angles to the number of cosine slots.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)  # (1, seq_len, d_model), broadcasts over the batch
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # (batch_size, seq_len, d_model)
        """Add the encodings for the first ``x.shape[1]`` positions, then apply dropout.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, d_model)``.

        Returns:
            A tensor of the same shape as ``x``.
        """
        # `pe` is a buffer, not a parameter, so autograd never tracks it.
        x = x + self.pe[:, :x.shape[1], :]  # (1, seq_len, d_model)
        return self.dropout(x)