In [11]:
import torch
from torch.nn.utils.rnn import pad_sequence

import re


In [29]:
# Example sentences: English sources and French targets
# (presumably aligned by index: source_sentences[i] <-> target_sentences[i]).
source_sentences = ["I eat bread", "I like coffee"]
target_sentences = ["Je mange du pain de mie", "Je aime le café"]


In [30]:
# Toy tokenizer standing in for a real one (e.g. Hugging Face)
class SimpleTokenizer:
    """Minimal word-level tokenizer backed by a ``token -> id`` mapping.

    The vocabulary must contain the special tokens ``"<pad>"``, ``"<start>"``
    and ``"<end>"`` (a ``KeyError`` is raised at construction otherwise).
    If it also contains ``"<unk>"``, out-of-vocabulary tokens are mapped to
    that id; without it, encoding an unknown token raises ``KeyError``.

    Args:
        vocab: Mapping from token string to integer id.
    """

    def __init__(self, vocab):
        self.vocab = vocab
        self.pad_token = "<pad>"
        self.start_token = "<start>"
        self.end_token = "<end>"
        self.unk_token = "<unk>"
        self.pad_id = vocab[self.pad_token]
        self.start_id = vocab[self.start_token]
        self.end_id = vocab[self.end_token]
        # Optional fallback id; None when the vocabulary defines no "<unk>".
        self.unk_id = vocab.get(self.unk_token)

    def tokenize(self, sentence):
        """Lower-case ``sentence`` and split it into words and punctuation marks.

        Returns:
            A list of strings: runs of word characters, plus every
            non-word, non-space character as its own token.
        """
        return re.findall(r"\w+|[^\w\s]", sentence.lower())

    def encode(self, sentence):
        """Convert ``sentence`` to a list of ids wrapped in start/end markers.

        Returns:
            ``[start_id, id_1, ..., id_n, end_id]``.

        Raises:
            KeyError: A token is missing from the vocabulary and the
                vocabulary has no ``"<unk>"`` entry to fall back on.
        """
        tokens = [self.start_token] + self.tokenize(sentence) + [self.end_token]
        ids = []
        for token in tokens:
            token_id = self.vocab.get(token, self.unk_id)
            if token_id is None:
                raise KeyError(
                    f"token {token!r} is not in the vocabulary and no "
                    f"{self.unk_token!r} entry is defined"
                )
            ids.append(token_id)
        return ids


In [31]:
# Toy vocabulary (token -> id) shared by the English and French examples.
# Ids 0-2 are the special tokens that SimpleTokenizer looks up at construction.
# "." (id 6) is defined but does not occur in the example sentences.
vocab = {
    "<pad>": 0,
    "<start>": 1,
    "<end>": 2,
    "i": 3,
    "eat": 4,
    "bread": 5,
    ".": 6,
    "je": 7,
    "mange": 8,
    "du": 9,
    "pain": 10,
    "like": 11,
    "coffee": 12,
    "aime": 13,
    "le": 14,
    "café": 15,
    "de": 16,
    "mie": 17,
}

tokenizer = SimpleTokenizer(vocab)


In [32]:
# Encode every sentence into a 1-D tensor of token ids
source_encoded = [
    torch.tensor(token_ids) for token_ids in map(tokenizer.encode, source_sentences)
]
target_encoded = [
    torch.tensor(token_ids) for token_ids in map(tokenizer.encode, target_sentences)
]


In [33]:
# Inspect the encoded source sentences (one id tensor per sentence)
source_encoded

[tensor([1, 3, 4, 5, 2]), tensor([ 1,  3, 11, 12,  2])]

In [34]:
# Inspect the encoded target sentences (lengths differ between sentences)
target_encoded

[tensor([ 1,  7,  8,  9, 10, 16, 17,  2]), tensor([ 1,  7, 13, 14, 15,  2])]

In [35]:
# Pad each batch to the length of its longest sequence with the pad id
source_padded, target_padded = (
    pad_sequence(sequences, batch_first=True, padding_value=tokenizer.pad_id)
    for sequences in (source_encoded, target_encoded)
)


In [36]:
# Inspect the padded source batch (batch x seq_len)
source_padded

tensor([[ 1,  3,  4,  5,  2],
        [ 1,  3, 11, 12,  2]])

In [37]:
# Inspect the padded target batch (the shorter sentence is right-padded with the pad id)
target_padded

tensor([[ 1,  7,  8,  9, 10, 16, 17,  2],
        [ 1,  7, 13, 14, 15,  2,  0,  0]])

In [38]:
# Shift the targets for teacher forcing
# Decoder input (shifted): drop the last column of the padded targets
decoder_input = target_padded[:, :-1]
# Expected target: drop the first column (the <start> token)
decoder_target = target_padded[:, 1:]
# NOTE: for sequences shorter than the batch maximum, decoder_input still
# contains <end> followed by padding; those positions are presumably meant to
# be masked out downstream (padding mask / ignored in the loss) - confirm.

# Display
print("Source Padded (Inputs pour l'encodeur) :\n", source_padded)
print("Decoder Input (Inputs pour le décodeur) :\n", decoder_input)
print("Decoder Target (Cibles pour l'entraînement) :\n", decoder_target)


Source Padded (Inputs pour l'encodeur) :
 tensor([[ 1,  3,  4,  5,  2],
        [ 1,  3, 11, 12,  2]])
Decoder Input (Inputs pour le décodeur) :
 tensor([[ 1,  7,  8,  9, 10, 16, 17],
        [ 1,  7, 13, 14, 15,  2,  0]])
Decoder Target (Cibles pour l'entraînement) :
 tensor([[ 7,  8,  9, 10, 16, 17,  2],
        [ 7, 13, 14, 15,  2,  0,  0]])


In [39]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Global hyper-parameters
SOURCE_VOCAB_SIZE = 20  # Dummy size of the English vocabulary
TARGET_VOCAB_SIZE = 20  # Dummy size of the French vocabulary
EMBEDDING_DIM = 512  # Embedding dimension
NUM_HEADS = 8  # Number of heads in multi-head attention
NUM_ENCODER_LAYERS = 6  # Number of encoder layers
NUM_DECODER_LAYERS = 6  # Number of decoder layers
FFN_HIDDEN_DIM = 2048  # Width of the feed-forward hidden layers
MAX_SEQ_LEN = 100  # Maximum sequence length
PAD_IDX = 0  # Index of the padding token


# Full encoder-decoder Transformer model for translation
class TranslationTransformer(nn.Module):
    """Encoder-decoder translation model wrapping ``nn.Transformer``.

    Token ids are embedded (separate tables for source and target), summed
    with positional encodings, passed through ``nn.Transformer`` and projected
    to one logit per target-vocabulary entry.

    Args:
        source_vocab_size: Number of rows of the source embedding table.
        target_vocab_size: Number of rows of the target embedding table and
            size of the output projection.
        embedding_dim: Embedding width; also the Transformer ``d_model``.
        num_heads: Number of attention heads.
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        ffn_hidden_dim: Width of the feed-forward sub-layers.
        pad_idx: Padding token id, used as ``padding_idx`` of both embeddings.
        max_seq_len: Number of positions the positional encoding is built for.
        dropout: Dropout probability forwarded to ``nn.Transformer``
            (defaults to 0.1, the value that used to be hard-coded).
    """

    def __init__(
        self,
        source_vocab_size,
        target_vocab_size,
        embedding_dim,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        ffn_hidden_dim,
        pad_idx,
        max_seq_len=100,
        dropout=0.1,
    ):
        super().__init__()
        self.embedding_src = nn.Embedding(
            source_vocab_size, embedding_dim, padding_idx=pad_idx
        )
        self.embedding_tgt = nn.Embedding(
            target_vocab_size, embedding_dim, padding_idx=pad_idx
        )
        # One positional-encoding module is shared by source and target.
        self.positional_encoding = PositionalEncoding(embedding_dim, max_seq_len)

        self.transformer = nn.Transformer(
            d_model=embedding_dim,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=ffn_hidden_dim,
            dropout=dropout,
        )

        self.fc_out = nn.Linear(embedding_dim, target_vocab_size)

    def forward(
        self,
        src,
        tgt,
        src_mask,
        tgt_mask,
        src_padding_mask,
        tgt_padding_mask,
        memory_key_padding_mask=None,
    ):
        """Run the encoder-decoder and return per-token vocabulary logits.

        Args:
            src: Source token ids, shape (batch, src_len).
            tgt: Decoder input token ids, shape (batch, tgt_len).
            src_mask: Attention mask for the encoder, or ``None``.
            tgt_mask: Attention mask for the decoder self-attention
                (typically a causal mask), or ``None``.
            src_padding_mask: Key-padding mask for ``src``, shape
                (batch, src_len), marking padded positions; or ``None``.
            tgt_padding_mask: Key-padding mask for ``tgt``, shape
                (batch, tgt_len), marking padded positions; or ``None``.
            memory_key_padding_mask: Key-padding mask applied to the encoder
                output in the decoder's cross-attention. Defaults to
                ``src_padding_mask`` so the decoder does not attend to the
                encoder states of padded source tokens.

        Returns:
            Unnormalised logits of shape (batch, tgt_len, target_vocab_size).
            Apply softmax over the last dimension to obtain probabilities.
        """
        if memory_key_padding_mask is None:
            # Encoder output positions line up with source positions, so the
            # source padding mask is the right mask for cross-attention.
            memory_key_padding_mask = src_padding_mask

        # Embedding + positional encoding (still batch-first here)
        src_emb = self.positional_encoding(self.embedding_src(src))
        tgt_emb = self.positional_encoding(self.embedding_tgt(tgt))

        # nn.Transformer is built sequence-first: (seq_len, batch, embedding)
        transformer_out = self.transformer(
            src_emb.transpose(0, 1),
            tgt_emb.transpose(0, 1),
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )

        # Back to batch-first, then project to vocabulary logits
        output = self.fc_out(transformer_out.transpose(0, 1))
        return output


# Sinusoidal positional encoding
class PositionalEncoding(nn.Module):
    """Add fixed sine/cosine position signals to a batch of embeddings.

    Even feature columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``. The table is precomputed once for ``max_seq_len``
    positions and stored as a non-trainable, non-persistent buffer, so it
    follows the module across ``.to()`` calls without adding a key to the
    ``state_dict``.

    Args:
        embedding_dim: Size of the last dimension of the inputs (even or odd).
        max_seq_len: Number of positions to precompute.
    """

    def __init__(self, embedding_dim, max_seq_len):
        super().__init__()

        position = torch.arange(0, max_seq_len).unsqueeze(1).float()
        div_term = torch.exp(
            torch.arange(0, embedding_dim, 2).float()
            * (-torch.log(torch.tensor(10000.0)) / embedding_dim)
        )
        angles = position * div_term  # (max_seq_len, ceil(embedding_dim / 2))

        encoding = torch.zeros(max_seq_len, embedding_dim)
        encoding[:, 0::2] = torch.sin(angles)
        # With an odd embedding_dim there is one fewer cosine column than sine
        # column, so trim the angles to the number of odd-indexed columns.
        encoding[:, 1::2] = torch.cos(angles[:, : embedding_dim // 2])

        # Buffer: never trained, moved together with the module.
        self.register_buffer("encoding", encoding, persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor whose dimension 1 is the sequence axis and whose last
                dimension equals ``embedding_dim`` (e.g. batch x seq x dim).

        Raises:
            ValueError: The sequence is longer than ``max_seq_len``.
        """
        seq_len = x.size(1)
        max_seq_len = self.encoding.size(0)
        if seq_len > max_seq_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_len {max_seq_len}"
            )
        # .to() is a no-op when the buffer already lives on x's device.
        return x + self.encoding[:seq_len, :].to(x.device)


# Square "subsequent positions" (causal) attention mask
def generate_square_subsequent_mask(sz):
    """Build an additive ``sz x sz`` mask hiding later positions.

    Entries strictly above the main diagonal are ``-inf``; the diagonal and
    everything below it are ``0.0``.
    """
    all_blocked = torch.full((sz, sz), float("-inf"))
    # triu zeroes the diagonal and lower triangle, leaving -inf above it.
    return all_blocked.triu(diagonal=1)


# Usage example
# NOTE(review): the model is used in its default training mode (dropout is
# active) and no random seed is set, so the printed values change on every
# run; consider model.eval() / torch.no_grad() / torch.manual_seed for a
# reproducible demo.
if __name__ == "__main__":
    # Build the model
    model = TranslationTransformer(
        source_vocab_size=SOURCE_VOCAB_SIZE,
        target_vocab_size=TARGET_VOCAB_SIZE,
        embedding_dim=EMBEDDING_DIM,
        num_heads=NUM_HEADS,
        num_encoder_layers=NUM_ENCODER_LAYERS,
        num_decoder_layers=NUM_DECODER_LAYERS,
        ffn_hidden_dim=FFN_HIDDEN_DIM,
        pad_idx=PAD_IDX,
        max_seq_len=MAX_SEQ_LEN,
    )

    # Example data
    src = torch.tensor(
        [[1, 2, 3, 4, 5], [6, 7, 8, 0, 0]]
    )  # Batch x SeqLen (source-vocabulary indices)
    tgt = torch.tensor(
        [[1, 7, 8, 9, 0], [6, 14, 15, 0, 0]]
    )  # Batch x SeqLen (target-vocabulary indices)

    # Masks
    src_mask = None  # No attention mask on the encoder: it sees the full sequence
    tgt_mask = generate_square_subsequent_mask(
        tgt.size(1)
    )  # Triangular (causal) mask for the decoder
    src_padding_mask = src == PAD_IDX  # True where the source token is padding
    tgt_padding_mask = tgt == PAD_IDX  # True where the target token is padding

    # Forward pass through the model
    out = model(src, tgt, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)
    print("Output shape (Batch x SeqLen x VocabSize):", out.shape)

    # Softmax probabilities for the first batch element, first token position
    print("Probabilités pour le premier token:", F.softmax(out[0, 0], dim=-1))




Output shape (Batch x SeqLen x VocabSize): torch.Size([2, 5, 20])
Probabilités pour le premier token: tensor([0.0361, 0.0194, 0.0603, 0.0614, 0.0728, 0.1461, 0.0292, 0.0710, 0.0499,
        0.0499, 0.0241, 0.0223, 0.0328, 0.0715, 0.0459, 0.0172, 0.0665, 0.0728,
        0.0255, 0.0254], grad_fn=<SoftmaxBackward0>)


