In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
# Expose F.softmax under the additional name F.softargmax. This monkey-patches
# the torch.nn.functional module object for the lifetime of the kernel, so it
# must run before any cell that calls F.softargmax.
F.softargmax = F.softmax

In [2]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## Dense - Fully Connected (FC) Layer - nn.Linear()

```nn.Linear(in_features, out_features)```

is equivalent to 

$ y = xW^T+b $ shape [1]

- $ x = [3,4,5] $ shape [1x3]

- $ w = [w_1, w_2, w_3] $ shape [1x3]

- $ b = b $ shape [1]

```in_features = 3```
```out_features = 1```




In [3]:

# Instantiate the layer
layer = nn.Linear(in_features=3, out_features=1, bias=True)  # weight shape = [1×3], bias = [1]
print(f"w = {layer.weight}\n")  # [1 × 3] torch tensor randomly initialized
print(f"b = {layer.bias}\n")    # [1] torch tensor randomly initialized

# Use the layer and compare against the explicit formula y = x W^T + b
x = torch.tensor([1.0, 2.0, 3.0])            # input shape = [3]
y = layer(x)                                 # output shape = [1]
y_manual = x @ layer.weight.T + layer.bias   # output shape = [1]

# Compare with a tolerance: the layer and the manual expression may evaluate
# the same formula with different floating-point rounding, so exact equality
# is not guaranteed.
assert torch.allclose(y, y_manual), f"layer output {y} != manual result {y_manual}"
print(f"y = {y}\n")
print(f"y_manual = {y_manual}\n")

w = Parameter containing:
tensor([[ 0.1873, -0.5529, -0.0038]], requires_grad=True)

b = Parameter containing:
tensor([0.4424], requires_grad=True)

y = tensor([-0.4876], grad_fn=<ViewBackward0>)

y_manual = tensor([-0.4876], grad_fn=<AddBackward0>)



## MLP (MultiLayer Perceptron)

**1D convolution with kernel_size = 1**

This is basically an MLP with one hidden layer and ReLU activation applied to each and every element in the set.

A 1D convolution with kernel_size=1 behaves like a position-wise feedforward network, or equivalently, an MLP applied independently to each position in a sequence.


- nn.Linear applies the transformation to the last dimension of the tensor.

- In transformer-style models or any sequence model, each element in the sequence can be treated independently across the sequence length when applying such layers.

This models:

MLP(x) = Linear(d_model → hidden_dim) → ReLU → Linear(hidden_dim → d_model)

Each vector x[i] (e.g., word embedding or token representation at position i) is transformed independently.
- Extract richer features from the input vector.
- Introduce nonlinearity so the model can represent more complex functions.
- Transform the input representation into a new space that's better suited for the task (e.g., classification, generation, prediction, etc.).
- Lets you project the input to a higher-dimensional space, manipulate it there, and then bring it back. This adds depth and nonlinearity, giving the network more flexibility.
- It's like giving the network more "thinking room" for each token.
- While attention layers mix information across tokens (global context), these MLP blocks: Do not mix sequence positions. Instead, they refine each token's internal structure (its features) independently.

So, self-attention = communication,
MLP = introspection.

The MLP is what gives the model depth and nonlinear transformation power at the token level. Without it, the model would be shallow and mostly linear.

In [4]:
class FeedForward(nn.Module):
    """Two-layer MLP applied to the last dimension: d_model -> hidden_dim -> d_model.

    A ReLU sits between the two linear layers.
    """

    def __init__(self, d_model, hidden_dim):
        super().__init__()
        # Expand to the hidden width, then project back to the model width.
        self.linear1 = nn.Linear(in_features=d_model, out_features=hidden_dim)
        self.linear2 = nn.Linear(in_features=hidden_dim, out_features=d_model)
        self.activation = nn.ReLU()

    def forward(self, x):
        """Return linear2(ReLU(linear1(x)))."""
        hidden = self.activation(self.linear1(x))
        return self.linear2(hidden)

## **Attention**

### **Self-Attention**

#### **Explanation: Self-Attention**  

**Input**

Let  
$$
x_i \in \mathbb{R}^n \quad \text{be the input token vector} \quad\text{for } i = 1, \ldots, t
$$  
Define the input sequence matrix:  
$$
X \doteq \{x_i\}_{i=1}^{t} \in \mathbb{R}^{t \times n}
$$

---

**Linear Projections**  

Transformation (rotation) of each token $x_i$ into query, key, and value vectors:  
$$
q_i = W_q x_i \quad \text{(Query)} \qquad Q \doteq \{q_i\}_{i=1}^{t} \in \mathbb{R}^{t \times d}
$$
$$
k_i = W_k x_i \quad \text{(Key)} \qquad K \doteq \{k_i\}_{i=1}^{t} \in \mathbb{R}^{t \times d}
$$
$$
v_i = W_v x_i \quad \text{(Value)} \qquad V \doteq \{v_i\}_{i=1}^{t} \in \mathbb{R}^{t \times d}
$$

---

**Attention Weights**  

Compute attention scores and normalize using softargmax:  
$$
\text{Score i:} \quad s_i = K q_i \in \mathbb{R}^{t}
$$
$$
\text{Scores:} \quad S = QK^\top \in \mathbb{R}^{t \times t}
$$
$$
\text{Attention Weights:} \quad A = \text{[soft](arg)max}_\beta(S) \in \mathbb{R}^{t \times t}
$$

---

**Output**  

Compute weighted sum of value vectors:  
$$
h_{i} = V^\top a_{i} \qquad H \doteq \{h_{i}\}_{i=1}^{t} \in \mathbb{R}^{t\times d}
$$

$$
H = A V
$$



#### **Minimal Chronological Implementation** (Single forward pass of one layer)

##### Numpy

In [5]:
# Step 1 - toy input: 3 tokens, each described by 4 features
X = np.array([
    [1, 0, 1, 0],  # Token 1
    [0, 2, 0, 2],  # Token 2
    [1, 1, 1, 1],  # Token 3
])  # (seq_len=3, d_model=4)

# Step 2 - identity projections keep the example deterministic
W_q, W_k, W_v = np.eye(4), np.eye(4), np.eye(4)

# Step 3 - project the tokens into queries, keys and values, each (3, 4)
Q, K, V = X @ W_q, X @ W_k, X @ W_v

# Step 4 - raw similarity of every query with every key, (3, 3)
scores = np.matmul(Q, K.T)

# Step 5 - divide by sqrt of the key width
d_k = Q.shape[1]
scaled_scores = scores / np.sqrt(d_k)

# Step 6 - row-wise softmax turns the scores into attention weights
def softmax(x):
    """Row-wise softmax; the row maximum is subtracted first for numerical stability."""
    shifted = x - np.max(x, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)

attention_weights = softmax(scaled_scores)  # (3, 3)

# Step 7 - each output row is a weighted sum of the value rows, (3, 4)
output = np.matmul(attention_weights, V)

# Step 8 - show the results
print("Attention Weights:\n", attention_weights)
print("Self-Attention Output:\n", output)

Attention Weights:
 [[0.4223188  0.1553624  0.4223188 ]
 [0.01587624 0.86681333 0.11731043]
 [0.1553624  0.4223188  0.4223188 ]]
Self-Attention Output:
 [[0.8446376  0.73304361 0.8446376  0.73304361]
 [0.13318667 1.85093709 0.13318667 1.85093709]
 [0.5776812  1.26695639 0.5776812  1.26695639]]


##### Torch

In [6]:
# 1. Input: 3 tokens, each with 4 features
X = torch.tensor([
    [1.0, 0.0, 1.0, 0.0],  # Token 1
    [0.0, 2.0, 0.0, 2.0],  # Token 2
    [1.0, 1.0, 1.0, 1.0],  # Token 3
])  # Shape: (seq_len=3, d_model=4)

# 2. Linear layers for Q, K, V. For reproducibility, overwrite the random
#    initialization with the identity. nn.init.eye_ fills the parameter in
#    place, so the weights stay regular Parameters and no `.data` assignment
#    is needed.
d_model = 4
linear_q = nn.Linear(d_model, d_model, bias=False)
linear_k = nn.Linear(d_model, d_model, bias=False)
linear_v = nn.Linear(d_model, d_model, bias=False)
nn.init.eye_(linear_q.weight)
nn.init.eye_(linear_k.weight)
nn.init.eye_(linear_v.weight)

# 3. Compute Q, K, V
Q = linear_q(X)  # Shape: (3, 4)
K = linear_k(X)  # Shape: (3, 4)
V = linear_v(X)  # Shape: (3, 4)

# 4. Compute attention scores (Q @ K^T)
scores = Q @ K.T  # Shape: (3, 3)

# 5. Scale scores by sqrt of the key width (equal to d_model here: one head)
scaled_scores = scores / (K.shape[-1] ** 0.5)

# 6. Softmax over the key axis to get attention weights
attention_weights = F.softmax(scaled_scores, dim=-1)  # Shape: (3, 3)

# 7. Compute weighted sum of V
output = attention_weights @ V  # Shape: (3, 4)

# 8. Done — print results
print("Attention Weights:\n", attention_weights)
print("Self-Attention Output:\n", output)

Attention Weights:
 tensor([[0.4223, 0.1554, 0.4223],
        [0.0159, 0.8668, 0.1173],
        [0.1554, 0.4223, 0.4223]], grad_fn=<SoftmaxBackward0>)
Self-Attention Output:
 tensor([[0.8446, 0.7330, 0.8446, 0.7330],
        [0.1332, 1.8509, 0.1332, 1.8509],
        [0.5777, 1.2670, 0.5777, 1.2670]], grad_fn=<MmBackward0>)


### **Cross-Attention**

#### **Explanation: Cross-Attention**  

In self-attention, the queries (Q), keys (K), and values (V) all come from the same input $X \doteq \{x_i\}_{i=1}^{t}$.

In cross-attention, the queries come from one sequence (e.g., the decoder), and the keys and values come from another sequence (e.g., the encoder). This is commonly used in encoder-decoder models like transformers for machine translation.

$X \doteq \{x_i\}_{i=1}^{t}$ is the query input (e.g., decoder tokens), used to compute Q

$\Xi \doteq \{\xi_i\}_{i=1}^{\tau}$ is the key/value input (e.g., encoder outputs), used to compute K, V

The key/value sequence does not need to have the same length as the query sequence, and it is often much longer. This lets each query search a larger context for relevant information.

$$\tau \gg t$$


**Input**

Let  
$$
x_i \in \mathbb{R}^n \quad \text{be the input token vector (decoder tokens)} \quad \text{for } i = 1, \ldots, t
$$  
$$
\xi_i \in \mathbb{R}^n \quad \text{be the key and value input token vector (encoder outputs)} \quad \text{for } i = 1, \ldots, \tau
$$  


Define the input sequence matrices:  
$$
X \doteq \{x_i\}_{i=1}^{t} \in \mathbb{R}^{t \times n}
$$

$$
\Xi \doteq \{\xi_i\}_{i=1}^{\tau} \in \mathbb{R}^{\tau \times n}
$$

---

**Linear Projections**  

Transformation (rotation) of each token $x_i$ and $\xi_i$ into query, key, and value vectors:  
$$
q_i = W_q x_i \quad \text{(Query)} \qquad Q \doteq \{q_i\}_{i=1}^{t} \in \mathbb{R}^{t \times d}
$$
$$
k_i = W_k \xi_i \quad \text{(Key)} \qquad K \doteq \{k_i\}_{i=1}^{\tau} \in \mathbb{R}^{\tau \times d}
$$
$$
v_i = W_v \xi_i \quad \text{(Value)} \qquad V \doteq \{v_i\}_{i=1}^{\tau} \in \mathbb{R}^{\tau \times d}
$$

---

**Attention Weights**  

Compute attention scores and normalize using softargmax:  
$$
\text{Score i:} \quad s_i = K q_i \in \mathbb{R}^{\tau}
$$

$$
\text{Scores:} \quad S = QK^\top \in \mathbb{R}^{t \times \tau}
$$
$$
\text{Attention Weights:} \quad A = \text{[soft](arg)max}_\beta(S) \in \mathbb{R}^{t \times \tau}
$$

---

**Output**  

Compute weighted sum of value vectors:  
$$
h_{i} = V^\top a_{i} \qquad H \doteq \{h_{i}\}_{i=1}^{t} \in \mathbb{R}^{t\times d}
$$

$$
H = A V
$$

#### **Minimal Chronological Implementation** (Single forward pass of one layer)

#### Numpy

In [7]:
# Step 1 - two different sequences: one supplies queries, the other keys/values
X_q = np.array([
    [1, 0, 1, 0],  # Query 1
    [0, 1, 0, 1],  # Query 2
])  # (2, 4)

X_kv = np.array([
    [0, 2, 0, 2],  # Key/Value 1
    [1, 1, 1, 1],  # Key/Value 2
    [1, 0, 1, 0],  # Key/Value 3
])  # (3, 4)

# Step 2 - identity projections for queries, keys and values
W_q, W_k, W_v = np.eye(4), np.eye(4), np.eye(4)

# Step 3 - queries come from X_q; keys and values both come from X_kv
Q = np.matmul(X_q, W_q)    # (2, 4)
K = np.matmul(X_kv, W_k)   # (3, 4)
V = np.matmul(X_kv, W_v)   # (3, 4)

# Step 4 - similarity of every query with every key, (2, 3)
scores = np.matmul(Q, K.T)

# Step 5 - divide by sqrt of the key width
d_k = Q.shape[1]  # or K.shape[1]
scaled_scores = scores / np.sqrt(d_k)

# Step 6 - numerically stable row-wise softmax
def softmax(x):
    """Row-wise softmax; the row maximum is subtracted before exponentiating."""
    exps = np.exp(x - np.max(x, axis=1, keepdims=True))
    normaliser = np.sum(exps, axis=1, keepdims=True)
    return exps / normaliser

attention_weights = softmax(scaled_scores)  # (2, 3)

# Step 7 - weighted sum of the value rows, (2, 4)
output = np.matmul(attention_weights, V)

# Step 8 - show the results
print("Attention Weights:\n", attention_weights)
print("Cross-Attention Output:\n", output)

Attention Weights:
 [[0.1553624  0.4223188  0.4223188 ]
 [0.66524096 0.24472847 0.09003057]]
Cross-Attention Output:
 [[0.8446376  0.73304361 0.8446376  0.73304361]
 [0.33475904 1.57521038 0.33475904 1.57521038]]


#### Torch

In [8]:

# 1. Two different inputs for cross-attention
X_q = torch.tensor([
    [1.0, 0.0, 1.0, 0.0],  # Query 1 (e.g., from decoder)
    [0.0, 1.0, 0.0, 1.0],  # Query 2
])  # Shape: (2, 4)

X_kv = torch.tensor([
    [0.0, 2.0, 0.0, 2.0],  # Key/Value 1 (e.g., from encoder)
    [1.0, 1.0, 1.0, 1.0],  # Key/Value 2
    [1.0, 0.0, 1.0, 0.0],  # Key/Value 3
])  # Shape: (3, 4)

# 2. Linear layers for Q, K, V — identity weights for simplicity.
#    nn.init.eye_ fills each weight in place, so the weights stay regular
#    Parameters and no `.data` assignment is needed.
d_model = 4
linear_q = nn.Linear(d_model, d_model, bias=False)
linear_k = nn.Linear(d_model, d_model, bias=False)
linear_v = nn.Linear(d_model, d_model, bias=False)

nn.init.eye_(linear_q.weight)
nn.init.eye_(linear_k.weight)
nn.init.eye_(linear_v.weight)

# 3. Compute Q from query input, K and V from key/value input
Q = linear_q(X_q)     # Shape: (2, 4)
K = linear_k(X_kv)    # Shape: (3, 4)
V = linear_v(X_kv)    # Shape: (3, 4)

# 4. Compute attention scores: Q @ K^T
scores = Q @ K.T      # Shape: (2, 3)

# 5. Scale by sqrt of the key width (equal to d_model here: one head)
scaled_scores = scores / (K.shape[-1] ** 0.5)

# 6. Softmax over the key axis
attention_weights = F.softmax(scaled_scores, dim=-1)  # Shape: (2, 3)

# 7. Output: weighted sum of values
output = attention_weights @ V  # Shape: (2, 4)

# 8. Done
print("Attention Weights:\n", attention_weights)
print("Cross-Attention Output:\n", output)

Attention Weights:
 tensor([[0.1554, 0.4223, 0.4223],
        [0.6652, 0.2447, 0.0900]], grad_fn=<SoftmaxBackward0>)
Cross-Attention Output:
 tensor([[0.8446, 0.7330, 0.8446, 0.7330],
        [0.3348, 1.5752, 0.3348, 1.5752]], grad_fn=<MmBackward0>)


#### **MHA (Multi Head Attention) Module**

Putting it all together in PyTorch, with batched inputs and multiple attention heads.

In [9]:

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each projected to ``d_model`` features,
    split into ``num_heads`` heads of width ``d_k = d_model // num_heads``,
    attended per head, concatenated, and passed through a final linear layer.

    Args:
        d_model: width of the projected queries/keys/values and of the output.
        num_heads: number of attention heads; must divide ``d_model``.
        d_input: optional ``(d_xq, d_xk, d_xv)`` giving the feature size of
            the query, key and value inputs. Defaults to ``d_model`` for all.
    """

    def __init__(self, d_model, num_heads, d_input=None):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        if d_input is None:
            d_xq = d_xk = d_xv = d_model
        else:
            d_xq, d_xk, d_xv = d_input

        # The embedding dimension must be a multiple of the number of heads
        assert d_model % self.num_heads == 0, (
            f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
        )

        self.d_k = d_model // self.num_heads

        # These are still of dimension d_model. They will be split into number of heads
        self.W_q = nn.Linear(d_xq, d_model, bias=False)
        self.W_k = nn.Linear(d_xk, d_model, bias=False)
        self.W_v = nn.Linear(d_xv, d_model, bias=False)

        # Outputs of all sub-layers need to be of dimension d_model
        self.W_h = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V):
        """Attend ``Q`` over ``K``/``V`` along the last two dimensions.

        The last two axes of each argument are (length, features); any
        leading axes (batch, heads, ...) are handled by ``torch.matmul``
        broadcasting.

        Returns:
            ``(H, A)``: the weighted values ``(..., q_length, v_features)``
            and the attention weights ``(..., q_length, k_length)``.
        """
        # Scaling by sqrt(d_k) so that the softargmax doesn't saturate
        Q = Q / (self.d_k ** 0.5)                          # (..., q_length, dim_per_head)
        # transpose(-2, -1) instead of fixed axes (2, 3): works for any rank >= 2
        scores = torch.matmul(Q, K.transpose(-2, -1))      # (..., q_length, k_length)

        # softargmax (called softmax in PyTorch) over the key axis
        A = F.softmax(scores, dim=-1)                      # (..., q_length, k_length)

        # Get the weighted average of the values
        H = torch.matmul(A, V)                             # (..., q_length, dim_per_head)

        return H, A

    def split_heads(self, x, batch_size):
        """
        Split the last dimension into (heads X depth)
        Return after transpose to put in shape (batch_size X num_heads X seq_length X d_k)
        """
        return x.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

    def group_heads(self, x, batch_size):
        """
        Combine the heads again to get (batch_size X seq_length X (num_heads times d_k))
        """
        return x.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_k)

    def forward(self, X_q, X_k, X_v):
        """Run multi-head attention.

        Args:
            X_q: query input, 3-D ``(batch, q_length, d_xq)``.
            X_k: key input, ``(batch, k_length, d_xk)``.
            X_v: value input, ``(batch, k_length, d_xv)``.

        Returns:
            ``(H, A)``: output ``(batch, q_length, d_model)`` and per-head
            attention weights ``(batch, num_heads, q_length, k_length)``.
        """
        # Unpacking three values keeps the requirement that X_q is 3-D
        batch_size, _, _ = X_q.size()

        # After transforming, split into num_heads
        Q = self.split_heads(self.W_q(X_q), batch_size)  # (bs, n_heads, q_length, dim_per_head)
        K = self.split_heads(self.W_k(X_k), batch_size)  # (bs, n_heads, k_length, dim_per_head)
        V = self.split_heads(self.W_v(X_v), batch_size)  # (bs, n_heads, v_length, dim_per_head)

        # Calculate the attention weights for each of the heads
        H_cat, A = self.scaled_dot_product_attention(Q, K, V)

        # Put all the heads back together by concat
        H_cat = self.group_heads(H_cat, batch_size)    # (bs, q_length, dim)

        # Final linear layer
        H = self.W_h(H_cat)          # (bs, q_length, dim)

        return H, A

##### Sanity checks for the MHA

To check our self attention works - if the query matches with one of the key values, it should have all the attention focused there, with the value returned as the value at that index

In [10]:
def testing_MHA(Q, K, V):
    """Run scaled dot-product attention of a freshly built MHA module on Q, K, V and print the result.

    Only ``scaled_dot_product_attention`` is called, so the module's linear
    projections are not applied to the inputs.
    """
    attention = MultiHeadAttention(d_model=512, num_heads=8)
    weighted_values, weights = attention.scaled_dot_product_attention(Q, K, V)
    print('Attention weights A =', weights.squeeze())
    print('Output H =', weighted_values.squeeze())
    
    
# Four keys along the coordinate axes; the last two are identical.
# Two leading singleton axes are added: (1, 1, 4, 3).
test_K = torch.tensor(
    [[10, 0, 0],   # strong in "x" direction
     [0, 10, 0],   # strong in "y" direction ✅
     [0, 0, 10],   # strong in "z" direction
     [0, 0, 10]],  # strong in "z" direction
    dtype=torch.float32,
).unsqueeze(0).unsqueeze(0)

# One value row per key, same (1, 1, 4, 3) layout.
test_V = torch.tensor(
    [[1, 0, 0],
     [10, 0, 0],   # ✅
     [100, 5, 0],
     [1000, 6, 0]],
    dtype=torch.float32,
).unsqueeze(0).unsqueeze(0)


In [11]:
# Single query pointing in the "y" direction, shaped (1, 1, 1, 3)
test_Q = torch.tensor([[0, 10, 0]], dtype=torch.float32).unsqueeze(0).unsqueeze(0)

testing_MHA(test_Q, test_K, test_V)

Attention weights A = tensor([3.7266e-06, 9.9999e-01, 3.7266e-06, 3.7266e-06])
Output H = tensor([1.0004e+01, 4.0993e-05, 0.0000e+00])


The query [0,10,0] matches the second key [0,10,0] and so the second value is returned [10,0,0]. 

In [12]:
# Single query pointing in the "z" direction. The two leading singleton axes
# give it the same 4-D layout as test_K / test_V, so the call does not depend
# on implicit broadcasting of a 2-D query.
test_Q = torch.tensor(
    [[0, 0, 10]]
).float()[None,None]
testing_MHA(test_Q, test_K, test_V)

Attention weights A = tensor([1.8633e-06, 1.8633e-06, 5.0000e-01, 5.0000e-01])
Output H = tensor([549.9979,   5.5000,   0.0000])


If we give a query that matches two keys exactly, it should return the averaged value of the two values for those two keys. We see that it focuses equally on the third and fourth key and returns the average of their values.

Now giving all the queries at the same time:

In [13]:

# Three queries evaluated in one call, shaped (1, 1, 3, 3)
test_Q = torch.tensor(
    [[0, 0, 10],
     [0, 10, 0],
     [10, 10, 0]],
    dtype=torch.float32,
).unsqueeze(0).unsqueeze(0)
testing_MHA(test_Q, test_K, test_V)

Attention weights A = tensor([[1.8633e-06, 1.8633e-06, 5.0000e-01, 5.0000e-01],
        [3.7266e-06, 9.9999e-01, 3.7266e-06, 3.7266e-06],
        [5.0000e-01, 5.0000e-01, 1.8633e-06, 1.8633e-06]])
Output H = tensor([[5.5000e+02, 5.5000e+00, 0.0000e+00],
        [1.0004e+01, 4.0993e-05, 0.0000e+00],
        [5.5020e+00, 2.0497e-05, 0.0000e+00]])


## Transformer encoder

### Embeddings

Self attention by itself does not have any recurrence or convolutions so to make it sensitive to position we must provide additional positional encodings. These are calculated as follows:

$$
E(p,2i) = \sin\left(p/10000^{2i/d}\right)
$$

$$
E(p,2i+1) = \cos\left(p/10000^{2i/d}\right)
$$

The embeddings are the sum of positional embedding + token embedding.

In [14]:
def create_sinusoidal_embeddings(nb_p, dim, E):
    """Overwrite ``E`` in place with sinusoidal positional encodings and freeze it.

    For position ``p`` and column ``j`` the angle is
    ``p / 10000 ** (2 * (j // 2) / dim)``; even columns receive its sine and
    odd columns its cosine.

    Args:
        nb_p: number of positions (rows of ``E``).
        dim: embedding dimension (columns of ``E``).
        E: tensor or parameter of shape ``(nb_p, dim)``. It is modified in
            place and its ``requires_grad`` flag is set to ``False``.

    Returns:
        None. ``E`` keeps its device; to move the table, move the owning
        module with ``module.to(device)``.
    """
    positions = np.arange(nb_p)[:, None]               # (nb_p, 1)
    exponents = 2 * (np.arange(dim) // 2) / dim        # (dim,)
    theta = positions / np.power(10000, exponents)     # (nb_p, dim)

    # E is typically a leaf Parameter that requires grad; writing into a
    # slice of it outside no_grad raises a RuntimeError.
    with torch.no_grad():
        E[:, 0::2] = torch.as_tensor(np.sin(theta[:, 0::2]), dtype=E.dtype, device=E.device)
        E[:, 1::2] = torch.as_tensor(np.cos(theta[:, 1::2]), dtype=E.dtype, device=E.device)
    E.requires_grad = False

class Embeddings(nn.Module):
    """Token embeddings plus position embeddings, followed by LayerNorm.

    The position table is an ``nn.Embedding`` whose weight is filled by
    ``create_sinusoidal_embeddings`` at construction time.
    """

    def __init__(self, d_model, vocab_size, max_position_embeddings):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, d_model, padding_idx=1)
        self.position_embeddings = nn.Embedding(max_position_embeddings, d_model)
        # Replace the random position table with the sinusoidal one
        create_sinusoidal_embeddings(
            E=self.position_embeddings.weight,
            nb_p=max_position_embeddings,
            dim=d_model,
        )
        self.LayerNorm = nn.LayerNorm(d_model, eps=1e-12)

    def forward(self, input_ids):
        """Embed ``input_ids`` of shape (bs, seq_length) into (bs, seq_length, dim)."""
        n_tokens = input_ids.size(1)

        # Positions 0..seq_length-1, repeated for every row of the batch
        positions = torch.arange(n_tokens, dtype=torch.long, device=input_ids.device)
        positions = positions.unsqueeze(0).expand_as(input_ids)        # (bs, seq_length)

        token_part = self.word_embeddings(input_ids)                   # (bs, seq_length, dim)
        position_part = self.position_embeddings(positions)            # (bs, seq_length, dim)

        # Sum of both embeddings, then layer normalisation
        return self.LayerNorm(token_part + position_part)              # (bs, seq_length, dim)

### Encoder Layer

In [15]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and a feed-forward network.

    Each sub-layer is wrapped in a residual connection followed by LayerNorm.
    """

    def __init__(self, d_model, num_heads, conv_hidden_dim):
        super().__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForward(d_model, conv_hidden_dim)
        self.layernorm1 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
        self.layernorm2 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)

    def forward(self, x):
        """Map x of shape (batch_size, input_seq_len, d_model) to the same shape."""
        # Self-attention: x serves as query, key and value; the weights are discarded
        attended = self.mha(x, x, x)[0]

        # Residual connection, then the first layer norm
        normed = self.layernorm1(x + attended)

        # Feed-forward sub-layer with its own residual connection and layer norm
        return self.layernorm2(normed + self.ffn(normed))

### Encoder

Input embedding (token + positional) + Blocks of N Encoder Layers

In [16]:
class Encoder(nn.Module):
    """Input embedding followed by a stack of ``num_layers`` encoder layers."""

    def __init__(self, num_layers, d_model, num_heads, ff_hidden_dim, input_vocab_size,
               maximum_position_encoding):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = Embeddings(d_model, input_vocab_size, maximum_position_encoding)

        # Build the stack in one go; layers are registered in creation order
        self.enc_layers = nn.ModuleList(
            [EncoderLayer(d_model, num_heads, ff_hidden_dim) for _ in range(num_layers)]
        )

    def forward(self, x):
        """Embed the input ids, then pass the result through each encoder layer in turn."""
        hidden = self.embedding(x)  # (batch_size, input_seq_length, d_model)

        for layer_idx in range(self.num_layers):
            hidden = self.enc_layers[layer_idx](hidden)

        return hidden  # (batch_size, input_seq_len, d_model)