# Encoder

![Alt Text](b.png)

### Input Embedding:
The input to the encoder is first passed through an embedding layer that converts each token into a fixed-size dense vector representation.

### Positional Encoding:
Since the Transformer architecture doesn’t have a sense of word order, positional encoding is added to the input embeddings to give the model information about the position of each token in the sequence.

### Multi-Head Attention:
Multi-head attention allows the model to focus on different parts of the input sequence simultaneously. It involves multiple self-attention mechanisms, allowing the model to capture relationships between words irrespective of their positions.

### Add & Norm:
A residual connection is added around each sub-layer (multi-head attention and feed-forward), and the result is passed through layer normalization to stabilize training.

### Feed-Forward Network:
A fully connected feed-forward network is applied after the attention layer, which processes each token individually.

### Stacking Layers (Nx):
The above operations (multi-head attention and feed-forward, each followed by Add & Norm) are repeated N times, where N is the number of layers in the encoder.

### 1. Tokenization & Input Embedding


In [75]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
text = """
 The training process for language models (i.e., both encoder-only and decoder-only models) includes pretraining and finetuning. During pretraining, we train the model via a self-supervised objective over a large amount of unlabeled text. Although pretraining is expensive, 
 we can reuse the resulting model numerous times as a starting point for finetuning on various tasks. Due to the public availability of many high-quality pretrained LLMs, most practitioners can simply download a pretrained model and focus upon the finetuning process without ever having to pretrain a model from scratch.

"""

# Tokenize the text into model inputs.
#   return_tensors="pt": return PyTorch tensors (e.g. inputs['input_ids']) rather than Python lists.
#   padding=True: pad to the longest sequence in the batch (only one sequence is passed here).
#   truncation=True: automatically truncate sequences longer than the model's maximum input length.
inputs= tokenizer(text , return_tensors = "pt",padding=True,truncation=True)



In [76]:
# Pull the token-id tensor out of the tokenizer output and display it.
input_ids = inputs['input_ids']
input_ids

tensor([[  101,  1996,  2731,  2832,  2005,  2653,  4275,  1006,  1045,  1012,
          1041,  1012,  1010,  2119,  4372, 16044,  2099,  1011,  2069,  1998,
         21933,  4063,  1011,  2069,  4275,  1007,  2950,  3653, 23654,  2075,
          1998,  2986,  8525,  5582,  1012,  2076,  3653, 23654,  2075,  1010,
          2057,  3345,  1996,  2944,  3081,  1037,  2969,  1011, 13588,  7863,
          2058,  1037,  2312,  3815,  1997,  4895, 20470, 12260,  2094,  3793,
          1012,  2348,  3653, 23654,  2075,  2003,  6450,  1010,  2057,  2064,
          2128,  8557,  1996,  4525,  2944,  3365,  2335,  2004,  1037,  3225,
          2391,  2005,  2986,  8525,  5582,  2006,  2536,  8518,  1012,  2349,
          2000,  1996,  2270, 11343,  1997,  2116,  2152,  1011,  3737,  3653,
         23654,  2098,  2222,  5244,  1010,  2087, 14617,  2064,  3432,  8816,
          1037,  3653, 23654,  2098,  2944,  1998,  3579,  2588,  1996,  2986,
          8525,  5582,  2832,  2302,  2412,  2383,  

In [77]:
import torch
import torch.nn as nn

class EmbeddingLayer(nn.Module):
    """Lookup table that maps integer token ids to dense vectors.

    Args:
        vocab_size: number of rows in the embedding table (one per token id).
        d_model: size of each embedding vector.
    """

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Return the embedding vectors for the token ids in ``x``.

        The output has the shape of ``x`` with a trailing ``d_model`` axis.
        """
        table = self.embedding
        return table(x)

# NOTE(review): the embedding table needs a row for every token id the tokenizer
# can emit; 50000 is presumably chosen to exceed the tokenizer's vocabulary
# size - confirm against tokenizer.vocab_size.
vocab_size =50000
d_model = 512 # Dimension of each token embedding vector

# Look up one d_model-sized vector per token id.
embedding_layer = EmbeddingLayer(vocab_size,d_model)
embedded_input = embedding_layer(input_ids)

In [78]:
print(embedded_input.shape) # (1, 135, 512): batch of 1, 135 tokens in the sequence, 512-dimensional embeddings

torch.Size([1, 135, 512])


### 2. Positional Encoding

![Alt Text](a.png)

In [79]:
import math

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch of embeddings.

    A ``(1, max_len, d_model)`` table is precomputed once and stored as the
    non-trainable buffer ``pe``. Even feature columns hold
    ``sin(pos * 10000^(-i / d_model))`` and odd columns hold the matching
    cosine, where ``i`` is the even column index.

    Args:
        d_model: size of the feature axis of the inputs. Both even and odd
            values are supported.
        max_len: number of positions to precompute; inputs passed to
            ``forward`` must not be longer than this along axis 1.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )  # one frequency per even column: ceil(d_model / 2) entries
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # When d_model is odd there is one fewer odd column than even column,
        # so drop the last frequency for the cosine half to keep shapes aligned.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading axis of size 1 lets the table broadcast over the batch axis.
        # A buffer follows the module across devices without being trained.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Args:
            x: tensor whose axis 1 indexes positions and whose last axis has
                size ``d_model``.
        """
        return x + self.pe[:, :x.size(1), :]

# Add position information to the token embeddings.
pos_encoding_layer = PositionalEncoding(d_model)
positioned_input = pos_encoding_layer(embedded_input)  # shape: (1, 135, 512)

In [82]:
# Adding the positional encoding leaves the shape of the embeddings unchanged.
positioned_input.shape

torch.Size([1, 135, 512])

### 3. Multi-Head Attention


In [101]:
class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention computed over several heads in parallel.

    Args:
        d_model: size of the feature axis of the inputs and of the output.
        num_heads: number of attention heads; must divide ``d_model`` evenly.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        # Input projections for Q, K and V, then the output projection.
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.fc_out = nn.Linear(d_model, d_model)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` to ``key``/``value``.

        Args:
            query, key, value: tensors of shape (batch, length, d_model).
            mask: optional tensor broadcastable to the
                (batch, heads, query_len, key_len) score tensor; positions
                where ``mask == 0`` are filled with ``-inf`` before softmax.

        Returns:
            Tensor of shape (batch, query_len, d_model).
        """
        batch_size = query.shape[0]

        def split_heads(projected):
            # (batch, length, d_model) -> (batch, heads, length, head_dim)
            length = projected.shape[1]
            per_head = projected.view(batch_size, length, self.num_heads, self.head_dim)
            return per_head.transpose(1, 2)

        q = split_heads(self.query(query))
        k = split_heads(self.key(key))
        v = split_heads(self.value(value))

        # Scaled dot-product scores: (batch, heads, query_len, key_len).
        scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.head_dim)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        weights = torch.softmax(scores, dim=-1)

        # Weighted sum of values, then merge the heads back into d_model.
        context = torch.matmul(weights, v).transpose(1, 2).contiguous()
        merged = context.view(batch_size, q.shape[2], self.d_model)
        return self.fc_out(merged)


        

# Example usage: self-attention, so the same tensor is passed as query, key and value.
num_heads = 8
multi_head_attention = MultiHeadAttention(d_model, num_heads)

attention_out = multi_head_attention(positioned_input, positioned_input, positioned_input)
print(attention_out.shape)  # same shape as the input: (1, 135, 512)

torch.Size([1, 135, 512])


In [102]:
# Display the attention output values.
attention_out

tensor([[[-0.0280, -0.0293,  0.0969,  ..., -0.3313,  0.2431, -0.2711],
         [-0.0512, -0.0586,  0.1172,  ..., -0.3182,  0.2282, -0.2613],
         [-0.0291, -0.0266,  0.0683,  ..., -0.3149,  0.2242, -0.2864],
         ...,
         [-0.0049, -0.0351,  0.0683,  ..., -0.3615,  0.2324, -0.2865],
         [-0.0193, -0.0222,  0.0572,  ..., -0.3455,  0.2195, -0.2909],
         [-0.0104, -0.0133,  0.0789,  ..., -0.3309,  0.2186, -0.2976]]],
       grad_fn=<ViewBackward0>)

### 4. Add & Norm

In [103]:
class AddNorm(nn.Module):
    """Residual addition followed by layer normalization.

    Args:
        d_model: size of the last axis, over which ``nn.LayerNorm`` normalizes.
    """

    def __init__(self, d_model):
        super().__init__()
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, sublayer):
        """Return ``LayerNorm(x + sublayer)``.

        Args:
            x: the sub-layer's input (the residual branch).
            sublayer: the sub-layer's already-computed output tensor.
        """
        residual_sum = x + sublayer
        return self.norm(residual_sum)

# Example usage: add the attention output back onto its input, then layer-normalize.
add_norm =AddNorm(d_model)
attention_with_norm = add_norm(positioned_input, attention_out)
print(attention_with_norm.shape)  # shape is unchanged: (1, 135, 512)

torch.Size([1, 135, 512])


### 5. Feed Forward Network


In [104]:
class FeedForwardNetwork(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Linear.

    Args:
        d_model: size of the input and output feature axis.
        d_ff: size of the hidden layer between the two linear maps.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)   # expand d_model -> d_ff
        self.fc2 = nn.Linear(d_ff, d_model)   # project d_ff -> d_model
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the network along the last axis of ``x``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

# Example usage: run the normalized attention output through the feed-forward network.
d_ff = 2048
feed_forward = FeedForwardNetwork(d_model, d_ff)

ffn_out = feed_forward(attention_with_norm)
print(ffn_out.shape)  # Output shape: (1, 135, 512)


torch.Size([1, 135, 512])


### 6. Complete Encoder Layer


In [106]:
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention and a feed-forward network, each
    followed by a residual add and layer normalization.

    Args:
        d_model: size of the feature axis of the inputs and outputs.
        num_heads: number of attention heads.
        d_ff: hidden size of the feed-forward network.
    """

    def __init__(self, d_model, num_heads, d_ff):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.add_norm1 = AddNorm(d_model)
        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.add_norm2 = AddNorm(d_model)

    def forward(self, x, mask=None):
        """Run ``x`` through the layer.

        Args:
            x: input tensor, used as query, key and value of the attention.
            mask: optional attention mask forwarded to the attention module.
        """
        # Sub-layer 1: self-attention with residual + norm.
        attended = self.add_norm1(x, self.attention(x, x, x, mask))
        # Sub-layer 2: feed-forward with residual + norm.
        return self.add_norm2(attended, self.ffn(attended))

# Example usage: pass the position-encoded embeddings through one encoder layer (no mask).
encoder_layer = EncoderLayer(d_model, num_heads, d_ff)

output = encoder_layer(positioned_input)
print(output.shape)  # Output shape: (1, 135, 512)


torch.Size([1, 135, 512])
