<a href="https://colab.research.google.com/github/kla55/transformer/blob/main/transformer_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torch.nn as nn
import math

In [3]:
class InputEmbeddings(nn.Module):
    def __init__(self, d_model: int, vocab_size: int): # constructor - needs dimensions and vocab size
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model) # mapping between numbers and vector size - 512

    def forward(self, x):
        return self.embedding(x) * math.sqrt(self.d_model) # sqrt

In [4]:
# Example usage
d_model = 512
vocab_size = 10000
input_embeddings = InputEmbeddings(d_model=d_model, vocab_size=vocab_size)

# Sample input: batch of sequences with token IDs
sample_input = torch.randint(0, vocab_size, (4, 10))  # Batch of 4 sequences, each with 10 tokens
embedded_output = input_embeddings(sample_input)

print("Input shape:", sample_input.shape)  # (batch_size, sequence_length)
print("Output shape:", embedded_output.shape)  # (batch_size, sequence_length, d_model)

Input shape: torch.Size([4, 10])
Output shape: torch.Size([4, 10, 512])


In [5]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, seq_len: int, dropout: float):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # create an array
        pe = torch.zeros(seq_len, d_model)
        print(pe.shape)
        # create a position tensor
        # - Adds an additional dimension to the tensor at index 1, converting the 1D tensor into a 2D tensor - (seq_len, 1)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        print(position.shape)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        print(div_term.shape)
        # apply the sin to even positions and cos to odd positions
        # pe[all vocab, starting at position 0/1 and for every 2]
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0) # (seq_len, d_model) -> (1, seq_len, d_model)
        # A buffer is a persistent tensor in the model that is not considered a learnable parameter (i.e., it won't be updated during backpropagation).
        self.register_buffer('pe', pe)

    def forward(self, x):
        # to add this positional encoding to every word inside the sentence
        # extracts the part of the positional encoding needed for the current input and locks it so it won't change during training
        # :x.size(1): Selects elements up to the length of the sequence dimension of x. This means that the operation is selecting a subset of self.pe that matches the sequence length of the input tensor x.
        x = x + (self.pe[:, :x.size(1), :]).requires_grad_(False)
        return self.dropout(x)


In [6]:
# Example usage
d_model = 512
seq_len = 10
dropout = 0.1
pos_encoding = PositionalEncoding(d_model=d_model, seq_len=seq_len, dropout=dropout)

# Sample input: batch of embeddings
batch_size = 4
sample_embeddings = torch.randn(batch_size, seq_len, d_model)  # Random embeddings for a batch of 4 sequences

# Apply positional encoding
encoded_output = pos_encoding(sample_embeddings)

print("Input shape:", sample_embeddings.shape)  # (batch_size, seq_len, d_model)
print("Output shape:", encoded_output.shape)     # (batch_size, seq_len, d_model)

torch.Size([10, 512])
torch.Size([10, 1])
torch.Size([256])
Input shape: torch.Size([4, 10, 512])
Output shape: torch.Size([4, 10, 512])


In [7]:
class LayerNormalization(nn.Module):
  """
  Layer normalization is a technique used in neural networks to stabilize and accelerate the training process.
  It normalizes the inputs across the features of each layer, which helps in making the model more robust and easier to train.

  In layer normalization, the mean and variance are computed for each individual sample across all the features (or neurons) within a layer.
  The input to a particular layer is normalized by subtracting the mean and dividing by the standard deviation calculated over the features of that input. This results in inputs that have a mean of 0 and a standard deviation of 1.
  After normalization, the output is typically scaled and shifted using learnable parameters (gamma and beta) so that the network can still represent a wide range of inputs if needed.
  Need Epislon for stability - if sigma is close to 0 then the mew value becomes big - so we do not want big or small values
  """
  def __init__(self, eps: float = 10**6):
    super().__init__()
    self.eps = eps
    self.alpha = nn.Parameter(torch.ones(1)) # multiplier
    self.beta = nn.Parameter(torch.zeros(1)) # additive

  def forward(self, x):
    mean = x.mean(-1, keepdim=True)
    std = x.std(-1, keepdim=True)
    return self.alpha * (x - mean) / (std + self.eps)

In [52]:
class LayerNormalisation(nn.Module):
    def __init__(self, epsilon=1e-6):
        super().__init__()
        self.epsilon = epsilon
        self.alpha = nn.Parameter(torch.ones(1))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        x = x.float()
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        return self.alpha * (x - mean) / (std + self.epsilon) + self.bias

In [8]:
# Example usage
layer_norm = LayerNormalization(eps=1e-6)

# Create a batch of input data
input_data = torch.randn(4, 6)  # Batch of 4 samples, each with 6 features

# Apply layer normalization
normalized_output = layer_norm(input_data)

print("Input data:\n", input_data)
print("\nNormalized output:\n", normalized_output)
print("\nOutput mean (per sample):", normalized_output.mean(-1))  # Should be close to 0
print("Output std (per sample):", normalized_output.std(-1))    # Should be close to 1

Input data:
 tensor([[ 0.2011, -2.4617,  0.8124, -0.1935, -0.6042,  1.1987],
        [-0.5959,  0.7405, -0.2917,  0.5042,  0.4152, -0.0412],
        [ 0.6398, -0.3841, -0.0936,  0.0431,  0.2456,  0.7659],
        [-1.9061,  0.8484,  0.6571, -2.1188, -1.6789, -1.0248]])

Normalized output:
 tensor([[ 0.2895, -1.7628,  0.7607, -0.0146, -0.3311,  1.0584],
        [-1.3928,  1.2005, -0.8025,  0.7420,  0.5692, -0.3164],
        [ 0.9934, -1.3340, -0.6738, -0.3630,  0.0974,  1.2800],
        [-0.7898,  1.3109,  1.1651, -0.9520, -0.6165, -0.1177]],
       grad_fn=<DivBackward0>)

Output mean (per sample): tensor([-1.1021e-08,  0.0000e+00,  9.9341e-09,  1.9868e-08],
       grad_fn=<MeanBackward1>)
Output std (per sample): tensor([1.0000, 1.0000, 1.0000, 1.0000], grad_fn=<StdBackward0>)


In [21]:
class FeedForwardBlock(nn.Module):

  def __init__(self, d_model: int, d_ff: int, dropout: float):
    super().__init__()
    self.linear_1 = nn.Linear(d_model, d_ff)
    self.dropout = nn.Dropout(dropout)
    self.linear_2 = nn.Linear(d_ff, d_model)

  def forward(self, x):
    # (Batch, Sequence, d_model) -> (Batch, Sequence, dff) -> (Batch, Sequence, d_model)
    return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))

In [55]:
d_model = 512
d_ff = 2048
dropout = 0.1
ff_block = FeedForwardBlock(d_model=d_model, d_ff=d_ff, dropout=dropout)

# Input data: batch of 4 sequences, each with 10 tokens and an embedding dimension of 512
input_data = torch.randn(4, 10, d_model)

# Pass through the feedforward block
output_data = ff_block(input_data)

print("Input shape:", input_data.shape)  # (Batch, Sequence, d_model)
print("Output shape:", output_data.shape)  # Should also be (Batch, Sequence, d_model)

Input shape: torch.Size([4, 10, 512])
Output shape: torch.Size([4, 10, 512])


In [80]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout):
        super().__init__()
        self.attention_scores = None
        self.d_model = d_model
        self.num_heads = num_heads
        self.dropout = nn.Dropout(dropout)

        self.d_k = d_model // self.num_heads

        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)

        self.wo = nn.Linear(d_model, d_model)

        self.layer_norm1 = LayerNormalisation()
        self.layer_norm2 = LayerNormalisation()
        self.layer_norm3 = LayerNormalisation()

    @staticmethod
    def attention(q, k, v, mask, dropout):
        d_k = q.shape[-1]
        attention_scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            attention_scores = attention_scores.masked_fill(mask == 0, -1e9)
        attention_scores = attention_scores.softmax(dim=-1)  # (Batch, num_heads, Seq_Len,  Seq_Len)

        if dropout is not None:
            attention_scores = dropout(attention_scores)

        attn = torch.matmul(attention_scores, v)  # (Batch, num_heads, Seq_Len, d_k)
        return attn, attention_scores

    def forward(self, q, k, v, mask):

        q = self.wq(q)

        k = self.wk(k)

        v = self.wv(v)

        q = q.view(q.shape[0], q.shape[1], self.num_heads, self.d_k).transpose(1, 2)

        k = k.view(k.shape[0], k.shape[1], self.num_heads, self.d_k).transpose(1, 2)

        v = v.view(v.shape[0], v.shape[1], self.num_heads, self.d_k).transpose(1, 2)

        x, self.attention_scores = MultiHeadAttention.attention(q, k, v, mask, self.dropout)

        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.num_heads * self.d_k)

        x = self.wo(x)

        return x

In [75]:
class MultiHeadAttentionBlock(nn.Module):

  def __init__(self, d_model: int, h: int, dropout: float):
    super().__init__()
    self.d_model = d_model
    self.h = h
    assert d_model % h == 0, "d_model is not divisible by h"

    self.d_k = d_model // h
    self.w_q = nn.Linear(d_model, d_model)
    self.w_k = nn.Linear(d_model, d_model)
    self.w_v = nn.Linear(d_model, d_model)

    self.w_o = nn.Linear(d_model, d_model)
    self.dropout = nn.Dropout(dropout)

  @staticmethod
  def attention(query, key, value, mask, dropout: nn.Dropout):
    d_k = query.shape[-1]
    attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
      attention_scores.masked_fill_(mask == 0, -1e9)
    if dropout is not None:
      attention_scores = dropout(torch.softmax(attention_scores, dim=-1))
    return (attention_scores @ value), attention_scores

  def forward(self, q, k, v, mask=None):
    # q = [batch size, query len, hid dim]
    # k = [batch size, key len, hid dim]
    # v = [batch size, value len, hid dim]
    query = self.w_q(q)
    key = self.w_k(k)
    value = self.w_v(v)

    query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1,2) #.permute(0, 2, 1, 3)
    key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1,2)
    value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1,2)

    x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)
    x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)
    x = self.w_o(x)
    return x


In [57]:
d_model = 512
h = 8
dropout = 0.1
mha_block = MultiHeadAttentionBlock(d_model=d_model, h=h, dropout=dropout)

# Define input tensors for a batch of sequences
batch_size = 4
sequence_length = 10
q = torch.randn(batch_size, sequence_length, d_model)
k = torch.randn(batch_size, sequence_length, d_model)
v = torch.randn(batch_size, sequence_length, d_model)
mask = None  # Example without mask

# Pass through the multi-head attention block
output = mha_block(q, k, v, mask)

print("Output shape:", output.shape)  # (Batch, Sequence, d_model)

Output shape: torch.Size([4, 10, 512])


In [58]:
class ResidualConnection(nn.Module):
  def __init__(self, dropout: float):
    super().__init__()
    self.dropout = nn.Dropout(dropout)
    self.norm = LayerNormalization()

  def forward(self, x, sublayer):
    return x + self.dropout(sublayer(self.norm(x))) # residual connection

In [59]:
class FeedForwardLayer(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.linear2(self.relu(self.linear1(x)))

In [60]:
# Input dimensions
batch_size = 2
seq_len = 5
d_model = 10
d_ff = 20

# Instantiate modules
dropout_rate = 0.1
residual_connection = ResidualConnection(dropout=dropout_rate)
feedforward = FeedForwardLayer(d_model=d_model, d_ff=d_ff)
# Create a sample input
x = torch.rand(batch_size, seq_len, d_model)
# Apply ResidualConnection with the FeedForwardLayer as the sublayer
output = residual_connection(x, feedforward)

print("Input Shape:", x.shape)
print("Output Shape:", output.shape)

Input Shape: torch.Size([2, 5, 10])
Output Shape: torch.Size([2, 5, 10])


In [61]:
class EncoderBlock(nn.Module):
  def __init__(self, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: Feedforwardblock, dropout: float):
    super().__init__()
    self.self_attention_block = self_attention_block
    self.feed_forward_block = feed_forward_block
    self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(2)])

  def forward(self, x, src_mask):
    x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, src_mask))
    x = self.residual_connections[1](x, self.feed_forward_block)
    return x


In [62]:
# Input parameters
d_model = 64
num_heads = 8
d_ff = 256
dropout = 0.1

# Instantiate components
self_attention_block = MultiHeadAttentionBlock(d_model, num_heads, dropout)
feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
encoder_block = EncoderBlock(self_attention_block, feed_forward_block, dropout)

# Example input
batch_size = 2
seq_len = 10
x = torch.rand(seq_len, batch_size, d_model)  # Transformer input is (seq_len, batch_size, d_model)
src_mask = None  # Example without masking

# Forward pass
output = encoder_block(x, src_mask)
print("Input Shape:", x.shape)
print("Output Shape:", output.shape)

Input Shape: torch.Size([10, 2, 64])
Output Shape: torch.Size([10, 2, 64])


In [63]:
class Encoder(nn.Module):
  def __init__(self, layers: nn.ModuleList):
    super().__init__()
    self.layers = layers
    self.norm = LayerNormalization()

  def forward(self, x, mask):
    for layer in self.layers:
      x = layer(x, mask)
    return self.norm(x)

In [64]:
# Example configuration
features = 64
num_layers = 6
dropout = 0.1
d_ff = 256
self_attention_block = MultiHeadAttentionBlock(d_model, num_heads, dropout)
feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)

# Create multiple EncoderBlocks
encoder_layers = nn.ModuleList([
    EncoderBlock(self_attention_block, feed_forward_block, dropout) for _ in range(num_layers)
])

# Instantiate the Encoder
encoder = Encoder(layers=encoder_layers)

# Input tensor (sequence length, batch size, feature size)
seq_len = 10
batch_size = 2
x = torch.rand(seq_len, batch_size, features)
mask = None  # Example without masking

# Forward pass
output = encoder(x, mask)

print("Input Shape:", x.shape)
print("Output Shape:", output.shape)

Input Shape: torch.Size([10, 2, 64])
Output Shape: torch.Size([10, 2, 64])


In [65]:
class DecoderBlock(nn.Module):

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, tgt_mask))
        x = self.residual_connections[1](x, lambda x: self.cross_attention_block(x, encoder_output, encoder_output, src_mask))
        x = self.residual_connections[2](x, self.feed_forward_block)
        return x

In [78]:
class DecoderLayer(nn.Module):

    def __init__(self, d_model, num_heads, dff, dropout):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.dff = dff  # Feed Forward Neural Network Output Size
        self.dropout = nn.Dropout(dropout)

        self.mha = MultiHeadAttention(d_model, num_heads, dropout)
        self.cross_mha = MultiHeadAttention(d_model, num_heads, dropout)
        self.ffn = FeedForward(d_model, dff, dropout)
        self.residual_mha = ResidualConnection(dropout)
        self.residual_cross_mha = ResidualConnection(dropout)
        self.residual_ffn = ResidualConnection(dropout)

    def forward(self, x, encoder_output, source_mask, target_mask):
        # Multi-Head Attention sub-layer
        attn_output = self.residual_mha(x, lambda x: self.mha(x, x, x, target_mask))

        # Cross-Attention sub-layer
        cross_attn_output = self.residual_cross_mha(attn_output,
                                                    lambda x: self.mha(x, encoder_output, encoder_output, source_mask))

        # FeedForward sub-layer
        ffn_output = self.residual_ffn(cross_attn_output, self.ffn)

        return ffn_output

In [82]:
class InputEmbeddings(nn.Module):
    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.word_embeddings = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        print("InputEmbeddings - Input x shape:", x.shape)
        embeddings = self.word_embeddings(x) * math.sqrt(self.d_model)
        print("InputEmbeddings - Output embeddings shape:", embeddings.shape)
        return embeddings


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, seq_len, dropout):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(seq_len, d_model)

        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        print("PositionalEncoding - Input x shape:", x.shape)
        x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False)
        print("PositionalEncoding - Output x shape:", x.shape)
        return self.dropout(x)


class LayerNormalisation(nn.Module):
    def __init__(self, epsilon=1e-6):
        super().__init__()
        self.epsilon = epsilon
        self.alpha = nn.Parameter(torch.ones(1))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        x = x.float()
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        return self.alpha * (x - mean) / (std + self.epsilon) + self.bias


class FeedForward(nn.Module):
    def __init__(self, d_model, dff, dropout):
        super().__init__()
        self.d_model = d_model
        self.dff = dff
        self.dropout = nn.Dropout(dropout)
        self.linear1 = nn.Linear(d_model, dff)
        self.linear2 = nn.Linear(dff, d_model)

    def forward(self, x):
        print("FeedForward - Input x shape:", x.shape)
        x = self.linear2(self.dropout(torch.relu(self.linear1(x))))
        print("FeedForward - Output x shape:", x.shape)
        return x

In [83]:
# Parameters
num_heads = 8   # Number of attention heads
dff = 256       # Feed-forward network hidden dimension
dropout = 0.1   # Dropout rate
d_model = 512
h = 8
# Instantiate the DecoderLayer
decoder_layer = DecoderLayer(
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    dropout=dropout
)


The Decoder class represents the full decoder stack in a Transformer architecture. It is responsible for sequentially applying multiple DecoderLayer instances (like the one we previously discussed) to process input data, usually in tasks like machine translation, text generation, or other sequence-to-sequence tasks.

num_layers: Number of DecoderLayer instances in the stack.\
d_model: Dimensionality of the model's embeddings and hidden states.\
num_heads: Number of attention heads in the multi-head attention mechanism.\
dff: Hidden layer size in the feed-forward network.\
dropout: Dropout rate for regularization.\

Components:

self.layer: A ModuleList of DecoderLayer instances, each with its own attention and feed-forward blocks.\
self.layer_norm: A final layer normalization step applied to stabilize the output.

In [84]:
class Decoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, dff, dropout):
        super().__init__()
        self.num_layers = num_layers
        self.d_model = d_model
        self.num_heads = num_heads
        self.dff = dff
        self.dropout = nn.Dropout(dropout)

        self.layer = nn.ModuleList([DecoderLayer(d_model, num_heads, dff, dropout) for _ in range(num_layers)])
        self.layer_norm = LayerNormalisation()

    def forward(self, x, encoder_output, source_mask, target_mask):
        for i in range(self.num_layers):
            x = self.layer[i](x, encoder_output, source_mask, target_mask)
        return self.layer_norm(x)

In [85]:
# Ensure d_model is divisible by num_heads
d_model = 512  # Embedding size
num_heads = 8  # Attention heads
assert d_model % num_heads == 0, "d_model must be divisible by num_heads"


In [87]:
q = torch.randn(batch_size, sequence_length, d_model)
k = torch.randn(batch_size, sequence_length, d_model)
v = torch.randn(batch_size, sequence_length, d_model)
mask = None  # Example without mask

# Pass through the multi-head attention block
output = mha_block(q, k, v, mask)

print("Output shape:", output.shape)  # (Batch, Sequence, d_model)

Output shape: torch.Size([2, 10, 512])


In [88]:
import torch

# Define the decoder
num_layers = 6
d_model = 512
num_heads = 8
dff = 256
dropout = 0.1

# Assuming Decoder is implemented as defined earlier
decoder = Decoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, dropout=dropout)

# Example inputs
batch_size = 2
target_seq_len = 10
source_seq_len = 8

x = torch.rand(batch_size, target_seq_len, d_model)  # Target sequence embeddings
encoder_output = torch.rand(batch_size, source_seq_len, d_model)  # Encoder output
source_mask = torch.ones(batch_size, 1, 1, source_seq_len).bool()  # Mask for encoder output

# Causal mask for target sequence
target_mask = torch.tril(torch.ones(target_seq_len, target_seq_len)).bool()  # Shape: (target_seq_len, target_seq_len)
target_mask = target_mask.unsqueeze(0).unsqueeze(1)  # Add batch and head dimensions
target_mask = target_mask.expand(batch_size, num_heads, -1, -1)  # Shape: (batch_size, num_heads, target_seq_len, target_seq_len)

# Forward pass
output = decoder(x, encoder_output, source_mask, target_mask)
print("Decoder output shape:", output.shape)


FeedForward - Input x shape: torch.Size([2, 10, 512])
FeedForward - Output x shape: torch.Size([2, 10, 512])
FeedForward - Input x shape: torch.Size([2, 10, 512])
FeedForward - Output x shape: torch.Size([2, 10, 512])
FeedForward - Input x shape: torch.Size([2, 10, 512])
FeedForward - Output x shape: torch.Size([2, 10, 512])
FeedForward - Input x shape: torch.Size([2, 10, 512])
FeedForward - Output x shape: torch.Size([2, 10, 512])
FeedForward - Input x shape: torch.Size([2, 10, 512])
FeedForward - Output x shape: torch.Size([2, 10, 512])
FeedForward - Input x shape: torch.Size([2, 10, 512])
FeedForward - Output x shape: torch.Size([2, 10, 512])
Decoder output shape: torch.Size([2, 10, 512])
