### **Import Library**

Sebelum membuat Transformer from scratch, import library terlebih dahulu

In [88]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

### **Word Embedding**

Seperti task NLP lainnya, kita memetakan token input ke dalam sebuah matrix. Dalam hal ini kita memerlukan Word Embedding

In [89]:
class WordEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model, padding_idx=0):
        super(WordEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=padding_idx)
        self.d_model = d_model
        self.scale = math.sqrt(d_model)

    def forward(self, x):
        x = self.embedding(x)
        return x * self.scale

token_ids = torch.tensor([
    [1, 2, 3, 4, 5],    
    [6, 7, 8, 9, 10]      
])  # shape: [batch_size=2, seq_len=5]  

vocab_size = 10000
d_model = 512

embedding_layer = WordEmbedding(vocab_size, d_model)
word_embedding_output = embedding_layer(token_ids)
print(word_embedding_output.shape) 
print(word_embedding_output)  # Output the word embeddings

torch.Size([2, 5, 512])
tensor([[[ 18.0635,  28.5057, -25.3359,  ...,  28.3018,  18.6842, -40.6766],
         [  5.7509,  14.3622,  -6.0185,  ...,  27.5548,  13.9551,   3.0788],
         [-37.3428, -22.3181,   1.7631,  ...,  18.8375,  -7.4120, -36.4528],
         [-32.6975, -26.0815, -18.5710,  ...,  56.4819, -29.7464, -15.3596],
         [-23.8286, -20.0306,  31.4403,  ..., -31.8298,  39.9772,  29.4495]],

        [[ 13.7110,  16.9010,  -0.5562,  ...,  -2.8209,  10.2214,  -1.1217],
         [-16.0309, -35.5362,   9.4748,  ...,  27.0622,   0.7215, -25.1355],
         [  9.8960,  35.8026, -32.6231,  ..., -10.0365,  36.1730,  -5.7838],
         [ 28.1151, -17.5125,  -5.2629,  ..., -18.0260,   2.9462,  25.7082],
         [  2.6106, -29.2682,  11.5294,  ...,  -7.8573, -15.2650,   4.5868]]],
       grad_fn=<MulBackward0>)


### **Positional Encoding**

Karena Transformer tidak mengandalkan urutan alami dari data (seperti urutan waktu pada RNN), informasi posisi token ditambahkan secara eksplisit melalui positional encoding. Vektor ini menggunakan fungsi sinus dan cosinus untuk memberikan “koordinat” yang unik pada tiap token sehingga model dapat memahami urutan kata dalam kalimat. 

Setiap token dalam input ditambahkan dengan vektor posisi yang dihitung menggunakan fungsi sinus dan cosinus. Vektor ini memiliki dimensi yang sama dengan embedding sehingga penjumlahan antara embedding dan positional encoding dapat dilakukan secara langsung.

Positional encoding memberikan representasi unik berbasis pola periodik, mirip dengan bagaimana gelombang sinusoidal dapat merepresentasikan informasi dalam sinyal, sehinnga bisa dianggap seperti memberikan `koordinat` untuk setiap token. Kenapa fungsi sinus dan cosinus? Karena Penggunaan fungsi sinus dan cosinus memastikan bahwa posisi relatif antara kata-kata tetap terjaga, bahkan ketika panjang urutan bervariasi. Hal ini penting karena Transformer tidak memiliki mekanisme memori seperti RNN.

Rumus dari Postional Encoding adalah sebagai berikut:
$$
PE_{(pos, 2i)} = \sin\left(\frac{pos}{10000^{\frac{2i}{d}}}\right)
$$

$$
PE_{(pos, 2i+1)} = \cos\left(\frac{pos}{10000^{\frac{2i}{d}}}\right)
$$

- `d` adalah dimensi embedding
- `pos` adalah index dari posisi
- `i` adalah index dari dimensi

In [90]:
class PositionalEncoding(torch.nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        # Buat tensor posisi dengan ukuran (max_len, d_model)
        pe = torch.zeros(max_len, d_model)
        print(pe.shape)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        print(position.shape)
        # Hitung pembagi frekuensi
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # Aplikasi rumus sinus dan cosinus
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        # Tambahkan dimensi batch agar dapat langsung ditambahkan ke embedding
        pe = pe.unsqueeze(0)  # Shape: [1, max_len, d_model]
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: [batch_size, seq_len, d_model]
        x = x + self.pe[:, :x.size(1), :]
        return x
    
# Contoh penggunaan:
d_model = 512
pe_layer = PositionalEncoding(d_model)
pe_output = pe_layer(word_embedding_output)
print("Shape input setelah positional encoding:", pe_output.shape)
print("Output:", pe_output)

torch.Size([5000, 512])
torch.Size([5000, 1])
Shape input setelah positional encoding: torch.Size([2, 5, 512])
Output: tensor([[[ 18.0635,  29.5057, -25.3359,  ...,  29.3018,  18.6842, -39.6766],
         [  6.5924,  14.9025,  -5.1967,  ...,  28.5548,  13.9552,   4.0788],
         [-36.4335, -22.7343,   2.6995,  ...,  19.8375,  -7.4118, -35.4528],
         [-32.5564, -27.0715, -18.3259,  ...,  57.4819, -29.7461, -14.3596],
         [-24.5854, -20.6842,  30.7832,  ..., -30.8298,  39.9776,  30.4495]],

        [[ 13.7110,  17.9010,  -0.5562,  ...,  -1.8209,  10.2214,  -0.1217],
         [-15.1895, -34.9959,  10.2967,  ...,  28.0622,   0.7216, -24.1355],
         [ 10.8053,  35.3865, -31.6867,  ...,  -9.0365,  36.1732,  -4.7838],
         [ 28.2562, -18.5025,  -5.0178,  ..., -17.0260,   2.9465,  26.7082],
         [  1.8538, -29.9218,  10.8723,  ...,  -6.8573, -15.2646,   5.5868]]],
       grad_fn=<AddBackward0>)


### **Attention Mechanism**

Bagaimana cara transformer mengerti konteks dari suatu sequence - kata per kata?. Contohnya pada kalimat "Sigra membeli sate dan rasanya enak sekali". Kata "enak" bisa saja merujuk ke "sate" ataupun "Sigra". Attention Mechanism memungkinkan model untuk mengerti hubungan kata per kata dengan cara melihat bagaimana mirip setiap kata yang ada di dalam sequence. Kata "sate" dan "enak" akan memiliki kemiripan lebih tinggi dibanding kata yang lain, hal ini akan mengakibatkan bagaimana kata "enak" akan di encode di dalam transformer.

#### **Single Head Attention**

In [91]:
class SelfAttention(nn.Module):
    def __init__(self, d_model):
        super(SelfAttention, self).__init__()
        self.d_model = d_model

        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out_linear = nn.Linear(d_model, d_model)
        self.softmax = nn.Softmax(dim=-1)
        # self.dropout = nn.Dropout(0.1)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        # Q, K, V: [batch_size, seq_len, d_model]

        # Hitung dot product attention
        dp_attention = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_model)
        # dp_attention: [batch_size, seq_len, seq_len]
    
        if mask is not None:
             # Mask: [batch_size, 1, seq_len, seq_len] or [1, 1, seq_len, seq_len] or [seq_len, seq_len] (broadcastable)
            dp_attention = dp_attention.masked_fill(mask == 0, -1e9) # Asumsi mask adalah 0 untuk posisi yang di-mask

        # Normalisasi
        attn_weights = self.softmax(dp_attention)
        # attn_weights: [batch_size, seq_len, seq_len]

        # Hitung output
        output = torch.matmul(attn_weights, V)
        # output: [batch_size, seq_len, d_model]

        return output, attn_weights
        

    def forward(self, x, mask=None):
        # x: [batch_size, seq_len, d_model]

        # Hitung Q, K, V
        Q = self.q_linear(x)
        K = self.k_linear(x)
        V = self.v_linear(x)
        # Q, K, V: [batch_size, seq_len, d_model]

        # Scaled Dot product attention
        attn_output, attn_weights = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.out_linear(attn_output)
        # output: [batch_size, seq_len, d_model]
        
        return output, attn_weights
    
# Contoh penggunaan SelfAttention
batch_size = 2
seq_len = 5

self_attention_layer = SelfAttention(d_model)
x_self_attention = pe_output.view(batch_size, seq_len, d_model)  # [batch_size, seq_len, d_model]
attn_output, attn_weights = self_attention_layer(x_self_attention)

print("Shape output SelfAttention:", attn_output.shape)  # [batch_size, seq_len, d_model]
print("Shape attention weights:", attn_weights.shape)  # [batch_size, seq_len, seq_len]

Shape output SelfAttention: torch.Size([2, 5, 512])
Shape attention weights: torch.Size([2, 5, 5])


In [92]:
class MultiheadSelfAttention(nn.Module):
    def __init__(self, d_model, n_heads=8):
        super(MultiheadSelfAttention, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        assert d_model % n_heads == 0, "d_model harus dibagi habis oleh n_heads"

        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out_proj = nn.Linear(d_model, d_model)

        self.softmax = nn.Softmax(dim=-1)
        # self.dropout = nn.Dropout(0.1) # Optional

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        # Q, K, V: [batch_size, n_heads, seq_len, d_k]

        # Hitung dot product attention
        dp_attention = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        # dp_attention: [batch_size, n_heads, seq_len, seq_len]
    
        if mask is not None:
             # Mask: [batch_size, 1, seq_len, seq_len] or [batch_size, n_heads, seq_len, seq_len] (broadcastable)
            dp_attention = dp_attention.masked_fill(mask == 0, -1e9)

        # Normalisasi
        attn_weights = self.softmax(dp_attention)
        # attn_weights: [batch_size, n_heads, seq_len, seq_len]

        # Hitung output
        output = torch.matmul(attn_weights, V)
        # output: [batch_size, n_heads, seq_len, d_k]

        return output, attn_weights

    def split_heads(self, x):
        # x: [batch_size, seq_len, d_model]
        batch_size, seq_length, d_model = x.size()
        x = x.view(batch_size, seq_length, self.n_heads, self.d_k).permute(0, 2, 1, 3)
        # x: [batch_size, n_heads, seq_len, d_k]
        return x
    
    def concat_heads(self, x):
        # x: [batch_size, n_heads, seq_len, d_k]
        batch_size, n_heads, seq_length, d_k = x.size()
        x = x.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_length, self.d_model)
        # x: [batch_size, seq_len, d_model]

        return x

    def forward(self, q, k, v, mask=None):
        # x: [batch_size, seq_len, d_model]
        Q = self.split_heads(self.q_linear(q))
        K = self.split_heads(self.k_linear(k))
        V = self.split_heads(self.v_linear(v))
        
        attn_output, attention_weights = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.concat_heads(attn_output)
        output = self.out_proj(output)
        # output: [batch_size, seq_len, d_model]

        return output, attention_weights
    
# Contoh penggunaan MultiheadSelfAttention
batch_size = 2
seq_len = 5

multihead_attention_layer = MultiheadSelfAttention(d_model, n_heads=8)
x_multihead_attention = pe_output.view(batch_size, seq_len, d_model)  # [batch_size, seq_len, d_model]
attn_output_multihead, attn_weights_multihead = multihead_attention_layer(x_multihead_attention, x_multihead_attention, x_multihead_attention) # [batch_size, seq_len, d_model]

print("Shape output MultiheadSelfAttention:", attn_output_multihead.shape)  # [batch_size, seq_len, d_model]
print("Shape attention weights MultiheadSelfAttention:", attn_weights_multihead.shape)  # [batch_size, n_heads, seq_len, seq_len]

Shape output MultiheadSelfAttention: torch.Size([2, 5, 512])
Shape attention weights MultiheadSelfAttention: torch.Size([2, 8, 5, 5])


In [93]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(FeedForward, self).__init__()
        # Biasanya d_ff lebih besar dari d_model
         # d_model = 512
        # d_ff = 4 * d_model
       
        self.linear1 = nn.Linear(d_model, d_ff)
        self.relu = nn.ReLU() # bisa diganti dengan fungsi aktivasi lain tapi yg linear, misalnya GELU
        # self.dropout = nn.Dropout(0.1) # Opsional
        self.linear2 = nn.Linear(d_ff, d_model)
        

    def forward(self, x):
        # x: [batch_size, seq_len, d_model]
        output = self.linear1(x)
        output = self.relu(output)
        # output = self.dropout(output)
        output = self.linear2(output)

        return output
        

In [94]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.attention = MultiheadSelfAttention(d_model, n_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)

        self.ffn = FeedForward(d_model, d_ff)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # x: [batch_size, seq_len, d_model]

        # Multi-head self-attention
        attn_output, attn_weights = self.attention(x, x, x, mask=mask)  # -> [batch, seq_len, d_model]

        # Residual connection + LayerNorm
        x = self.norm1(x + self.dropout1(attn_output))

        # Feedforward network
        ffn_output = self.ffn(x)  # -> [batch, seq_len, d_model]

        # Residual connection + LayerNorm
        x = self.norm2(x + self.dropout2(ffn_output))

        return x, attn_weights
    
# Contoh penggunaan
d_model = 512
n_heads = 8
d_ff = 2048  # Biasanya d_ff lebih besar dari d_model
transformer_encoder = EncoderLayer(d_model, n_heads, d_ff)

# Input untuk TransformerEncoder
x_transformer_encoder = pe_output.view(batch_size, seq_len, d_model)  # [batch_size, seq_len, d_model]
encoder_output, encoder_attn_weights = transformer_encoder(x_transformer_encoder)
print("Shape output EncoderLayer:", encoder_output.shape)  # [batch_size, seq_len, d_model]
print("Shape attention weights EncoderLayer:", encoder_attn_weights.shape)  # [batch_size, n_heads, seq_len, seq_len]

Shape output EncoderLayer: torch.Size([2, 5, 512])
Shape attention weights EncoderLayer: torch.Size([2, 8, 5, 5])


In [None]:
class TransformerEncoder(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, d_ff, num_layers, max_len=5000, dropout=0.1):
        super(TransformerEncoder, self).__init__()
        self.embedding = WordEmbedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList([EncoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(num_layers)])
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        # x: [batch_size, seq_len]
        x = self.embedding(x)  # -> [batch_size, seq_len, d_model]
        x = self.positional_encoding(x)  # -> [batch_size, seq_len, d_model]

        for layer in self.layers:
            x, _ = layer(x, mask=mask)

        x = self.norm(x)  # Normalisasi akhir
        return x
    
# Contoh penggunaan TransformerEncoder
vocab_size = 10000
d_model = 512
n_heads = 8
d_ff = 2048
num_layers = 6  # Jumlah layer encoder
transformer_encoder = TransformerEncoder(vocab_size, d_model, n_heads, d_ff, num_layers)

# Input untuk TransformerEncoder
x_transformer_encoder = token_ids  # [batch_size, seq_len]
encoder_output = transformer_encoder(x_transformer_encoder)
print("Shape output TransformerEncoder:", encoder_output.shape)  # [batch_size, seq_len, d_model]
print("Output TransformerEncoder:", encoder_output)  # Output dari TransformerEncoder
transformer_encoder

torch.Size([5000, 512])
torch.Size([5000, 1])
torch.Size([2, 5, 512])
Shape output TransformerEncoder: torch.Size([2, 5, 512])
Output TransformerEncoder: tensor([[[-0.0671,  0.0982,  0.6902,  ..., -0.6005,  1.9808,  1.4554],
         [ 0.5984,  0.3918,  1.1752,  ...,  0.0959, -1.3847, -1.4753],
         [ 0.1981,  1.3724, -0.0871,  ...,  0.8550, -0.1228,  0.0693],
         [ 0.8066,  2.1204,  0.5496,  ...,  0.8747,  0.4876,  0.2374],
         [ 0.9034,  2.2935, -1.6426,  ...,  0.6958,  0.5359, -1.5127]],

        [[-2.7358,  1.7557, -1.9190,  ...,  1.3079, -0.4140,  0.3887],
         [-0.6214,  0.7021, -0.4069,  ...,  1.3759, -1.4253, -0.1855],
         [-0.8720, -0.6711, -0.9085,  ...,  0.0365, -0.2451,  0.6557],
         [-0.7131,  0.4458, -0.7062,  ...,  1.2501, -0.9426, -0.9888],
         [-0.2770,  1.6805, -0.5702,  ...,  0.8715, -1.9425,  0.3065]]],
       grad_fn=<NativeLayerNormBackward0>)


TransformerEncoder(
  (embedding): WordEmbedding(
    (embedding): Embedding(10000, 512, padding_idx=0)
  )
  (positional_encoding): PositionalEncoding()
  (layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (attention): MultiheadSelfAttention(
        (q_linear): Linear(in_features=512, out_features=512, bias=True)
        (k_linear): Linear(in_features=512, out_features=512, bias=True)
        (v_linear): Linear(in_features=512, out_features=512, bias=True)
        (out_proj): Linear(in_features=512, out_features=512, bias=True)
        (softmax): Softmax(dim=-1)
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (ffn): FeedForward(
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (relu): ReLU()
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
      )
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout2): Dropout(

In [96]:
import torch
import torch.nn as nn

class DecoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super(DecoderLayer, self).__init__()

        self.self_attn = MultiheadSelfAttention(d_model, n_heads)
        self.cross_attn = MultiheadSelfAttention(d_model, n_heads)
        self.ffn = FeedForward(d_model, d_ff)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, x, y, tgt_mask=None, cross_att_mask=None):
        # x: [batch_size, tgt_seq_len, d_model]
        # y/encoder_ouput: [batch_size, src_seq_len, d_model]

        # Masked self-attention (self attention punya decoder)
        self_attn_output, _ = self.self_attn(q=x, k=x, v=x, mask=tgt_mask)

        # Residual connection + LayerNorm (Self-Attention Block)
        # Residual ditambahkan ke input self-attention (yaitu tgt)
        x = self.norm1(x + self.dropout1(self_attn_output))

        # Cross-attention (attention antara decoder dan encoder)
        cross_attn_output, _ = self.cross_attn(q=x,  # query = decoder Q
                                               k=y,  # key = encoder K
                                               v=y,    # value = encoder V
                                               mask=cross_att_mask)  
        
        # Residual connection + LayerNorm (Cross-Attention Block)
        # Residual ditambahkan ke input cross-attention (yaitu tgt setelah norm1)
        x = self.norm2(x + self.dropout2(cross_attn_output)) # tgt di sisi kanan adalah input ke cross-attn

        # Feed Forward Network
        ffn_output = self.ffn(x)

        # Residual connection + LayerNorm (FFN Block)
        # Residual ditambahkan ke input FFN (yaitu tgt setelah norm2)
        x = self.norm3(x + self.dropout3(ffn_output)) # tgt di sisi kanan adalah input ke FFN

        return x


# Contoh penggunaan DecoderLayer
d_model = 512
n_heads = 8
d_ff = 2048  # Biasanya d_ff lebih besar dari d_model
batch_size = 2
seq_len = 5
decoder_layer = DecoderLayer(d_model, n_heads, d_ff)

# Input untuk DecoderLayer
x_decoder = pe_output.view(batch_size, seq_len, d_model)  # [batch_size, seq_len, d_model]
encoder_output = encoder_output  # Output dari TransformerEncoder # [batch_size, seq_len, d_model]
decoder_output = decoder_layer(x_decoder, encoder_output)
print("Shape output DecoderLayer:", decoder_output.shape)  # [batch_size, seq_len, d_model]
print("Output DecoderLayer:", decoder_output)  # Output dari DecoderLayer

Shape output DecoderLayer: torch.Size([2, 5, 512])
Output DecoderLayer: tensor([[[ 1.3042,  1.0292, -1.4429,  ...,  1.0451,  1.0885, -1.5961],
         [-0.0095,  0.4970, -0.1694,  ...,  0.9962,  1.0766,  0.1079],
         [-1.4791, -0.9519,  0.9839,  ...,  0.8316,  0.4817, -1.8652],
         [-0.8098, -0.9606, -1.0625,  ...,  2.1481, -0.5122, -0.1365],
         [-0.6694, -1.0202,  1.3164,  ..., -0.9272,  2.2489,  0.4197]],

        [[ 0.4735, -0.1348, -0.4349,  ..., -0.1836,  0.9704, -0.5184],
         [-0.9053, -1.5357, -0.1046,  ...,  1.5170,  0.1962, -0.8287],
         [-0.1218,  1.5366, -2.0503,  ..., -0.9942,  2.2141, -0.2618],
         [ 0.6441, -0.8123, -0.7478,  ..., -1.2165,  0.3667,  1.2559],
         [ 0.4269, -1.3934, -0.2759,  ..., -0.0729, -0.5538, -0.2161]]],
       grad_fn=<NativeLayerNormBackward0>)


In [97]:
class TransformerDecoder(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, d_ff, num_layers, max_len=5000, dropout=0.1):
        super(TransformerDecoder, self).__init__()
        self.embedding = WordEmbedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList([DecoderLayer(d_model, n_heads, d_ff) for _ in range(num_layers)])
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, encoder_output, tgt_mask=None, cross_att_mask=None):
        # x: [batch_size, tgt_seq_len]
        x = self.embedding(x)  # -> [batch_size, tgt_seq_len, d_model]
        x = self.positional_encoding(x)  # -> [batch_size, tgt_seq_len, d_model]

        for layer in self.layers:
            x = layer(x, encoder_output, tgt_mask=tgt_mask, cross_att_mask=cross_att_mask)

        x = self.norm(x)  # Normalisasi akhir
        return x
    
# Contoh penggunaan TransformerDecoder
vocab_size = 10000
d_model = 512
n_heads = 8
d_ff = 2048
num_layers = 6  # Jumlah layer decoder
transformer_decoder = TransformerDecoder(vocab_size, d_model, n_heads, d_ff, num_layers)

# Input untuk TransformerDecoder
x_transformer_decoder = token_ids  # [batch_size, tgt_seq_len]
decoder_output = transformer_decoder(x_transformer_decoder, encoder_output)
print("Shape output TransformerDecoder:", decoder_output.shape)  # [batch_size, tgt_seq_len, d_model]
print("Output TransformerDecoder:", decoder_output)  # Output dari TransformerDecoder

torch.Size([5000, 512])
torch.Size([5000, 1])
Shape output TransformerDecoder: torch.Size([2, 5, 512])
Output TransformerDecoder: tensor([[[-1.6022, -0.5740, -0.9025,  ...,  0.9632, -0.9350, -0.3362],
         [-2.5412, -0.8257, -0.1859,  ...,  0.9046, -0.7554,  0.0827],
         [-1.9336, -0.4975, -0.1599,  ...,  0.5034,  0.0300, -0.3450],
         [-1.9409, -0.7423, -0.3439,  ...,  0.3450,  0.1998, -0.5473],
         [-0.6115, -2.0115, -0.9331,  ...,  1.2873,  0.0926,  0.2538]],

        [[ 0.1636, -1.0241,  0.1872,  ...,  1.7562, -2.5314, -0.4062],
         [-1.6630, -0.3286,  1.5659,  ...,  0.9768, -0.5189, -0.7102],
         [ 0.1092, -0.7213,  0.6141,  ...,  1.4317, -1.4953,  0.8493],
         [-0.8203, -0.2893,  0.6903,  ...,  1.1725, -1.5588, -0.2430],
         [-0.0904,  0.5841,  0.9267,  ...,  0.1071, -1.9865, -1.5323]]],
       grad_fn=<NativeLayerNormBackward0>)


In [99]:
class Transformer(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, d_ff, num_layers, max_len=5000, dropout=0.1):
        super(Transformer, self).__init__()
        self.encoder = TransformerEncoder(vocab_size, d_model, n_heads, d_ff, num_layers, max_len, dropout)
        self.decoder = TransformerDecoder(vocab_size, d_model, n_heads, d_ff, num_layers,  max_len, dropout)
        self.output_layer = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        """
        src: Source sequence token IDs [batch_size, src_seq_len]
        tgt: Target sequence token IDs [batch_size, tgt_seq_len]
        """
        
        # Encoder processes the source sequence
        encoder_output = self.encoder(src, mask=src_mask)

        # Decoder processes the target sequence and the encoder's output
        # Note: The cross-attention mask is typically the source mask (src_mask)
        decoder_output = self.decoder(tgt, encoder_output, tgt_mask=tgt_mask, cross_att_mask=src_mask)

        # Final linear layer to get vocabulary scores
        output = self.output_layer(decoder_output)

        return output
    
vocab_size = 10000
d_model = 512
n_heads = 8
d_ff = 2048
num_layers = 6

transformer_model = Transformer(vocab_size, d_model, n_heads, d_ff, num_layers)

# Example source and target tensors
src_token_ids = torch.tensor([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]) # (batch_size=2, seq_len=5)
tgt_token_ids = torch.tensor([[11, 12, 13, 0, 0], [14, 15, 16, 17, 0]]) # (batch_size=2, seq_len=5)


# Input for the corrected Transformer
transformer_output = transformer_model(src=src_token_ids, tgt=tgt_token_ids)
print("Shape of final Transformer output:", transformer_output.shape) # Should be [batch_size, seq_len, vocab_size]

torch.Size([5000, 512])
torch.Size([5000, 1])
torch.Size([5000, 512])
torch.Size([5000, 1])
torch.Size([2, 5, 512])
Shape of final Transformer output: torch.Size([2, 5, 10000])
