In [1]:
import torch
import torch.nn as nn

Coding the entire transformer block

In [2]:
# GPT-2 configuration (124M variant); read by FeedForward and TransformerBlock.
GPT_CONFIG_124M = {
    # Vocabulary size
    "vocab_size": 50257,
    # Context length: maximum number of input tokens used to predict the next token
    "context_length": 1024,
    # Embedding dimension
    "emb_dim": 768,
    # Number of attention heads
    "n_heads": 12,
    # Number of transformer layers
    "n_layers": 12,
    # Dropout rate
    "drop_rate": 0.1,
    # Whether the query/key/value projections use a bias term
    "qkv_bias": False,
}

In [14]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    The input is projected to queries, keys and values of width ``d_out``,
    split into ``num_heads`` heads of width ``d_out // num_heads``, attended
    under a causal (upper-triangular) mask, and the concatenated heads are
    passed through a final linear layer.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total output width across all heads; must be divisible by
            ``num_heads``.
        context_length: Side length of the pre-computed causal mask, i.e. the
            longest sequence ``forward`` accepts.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections use a bias term.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        # Each head needs an equal share of the output dimensions.
        assert (d_out % num_heads == 0), "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # width of a single head

        # Trainable projections for query, key and value.
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

        # Linear layer that mixes the concatenated head outputs.
        self.out_proj = nn.Linear(self.d_out, self.d_out)
        # Dropout applied to the attention weights.
        self.dropout = nn.Dropout(dropout)
        # Causal mask: ones strictly above the diagonal mark the "future"
        # positions a token is not allowed to attend to.
        self.register_buffer(
            'mask', torch.triu(torch.ones(context_length, context_length), diagonal=1)
        )

    def forward(self, x):
        """Compute causal multi-head self-attention.

        Args:
            x: Tensor of shape ``(batch_size, num_tokens, d_in)``.

        Returns:
            Tensor of shape ``(batch_size, num_tokens, d_out)``.

        Raises:
            ValueError: If ``num_tokens`` exceeds the ``context_length`` the
                causal mask was built for.
        """
        b, num_token, _ = x.shape

        # Slicing the mask cannot grow it, so a longer sequence would only
        # fail later with an opaque broadcasting error; fail early instead.
        max_tokens = self.mask.shape[0]
        if num_token > max_tokens:
            raise ValueError(
                f"Sequence length {num_token} exceeds context_length {max_tokens}"
            )

        # Project the input; each result has shape (b, num_tokens, d_out).
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        # Split the last dimension into heads:
        # (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_token, self.num_heads, self.head_dim)
        values = values.view(b, num_token, self.num_heads, self.head_dim)
        queries = queries.view(b, num_token, self.num_heads, self.head_dim)

        # Move the head axis in front of the token axis for head-wise matmuls:
        # (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        values = values.transpose(1, 2)
        queries = queries.transpose(1, 2)

        # Raw attention scores: one (num_tokens, num_tokens) matrix per head.
        attn_scores = queries @ keys.transpose(2, 3)

        # Causal mask truncated to the current sequence length, as booleans.
        mask_bool = self.mask[:num_token, :num_token].bool()

        # Masked positions get -inf so softmax assigns them zero weight.
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        # Scale by sqrt(head_dim) and normalize over the key axis.
        attn_weights = torch.softmax(attn_scores / (keys.shape[-1]**0.5), dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Weighted sum of values, then back to (b, num_tokens, num_heads, head_dim).
        context_vector = (attn_weights @ values).transpose(1, 2)

        # Concatenate the heads, where self.d_out = self.num_heads * self.head_dim.
        context_vector = context_vector.contiguous().view(b, num_token, self.d_out)
        context_vector = self.out_proj(context_vector)

        return context_vector


THE BUILDING BLOCKS: LAYER NORM, GELU AND FEED FORWARD NEURAL NETWORK

In [11]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift."""

    def __init__(self, emb_dim):
        super().__init__()
        # Small constant added to the variance so the division cannot hit zero.
        self.eps = 1e-5
        # Trainable per-feature scale (initialized to 1) and shift (initialized to 0).
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        # Statistics are taken along the last axis; keepdim lets them broadcast back.
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)

        centered = x - mean
        std = torch.sqrt(var + self.eps)
        normalized = centered / std

        return self.scale * normalized + self.shift

# GELU activation function (tanh approximation)
class GELU(nn.Module):
    """Applies GELU element-wise using the tanh-based approximation."""

    def forward(self, x):
        # sqrt(2 / pi), built as a tensor matching x's dtype and device.
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi, dtype=x.dtype, device=x.device))
        cubic_term = 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(coeff * (x + cubic_term))
        return 0.5 * x * gate
    
# FeedForward module of the transformer architecture
class FeedForward(nn.Module):
    """Two linear layers with a GELU in between: expand to 4x ``emb_dim``, then project back."""

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim

        expand = nn.Linear(emb_dim, hidden_dim)    # expansion
        activation = GELU()                        # non-linearity
        contract = nn.Linear(hidden_dim, emb_dim)  # contraction

        self.layers = nn.Sequential(expand, activation, contract)

    def forward(self, x):
        return self.layers(x)

In [12]:
# Transformer block: attention and feed-forward sub-layers, each with a shortcut connection
class TransformerBlock(nn.Module):
    """One transformer block built from the ``cfg`` dictionary.

    Each of the two sub-layers (multi-head attention, then feed-forward) is
    applied as ``x + dropout(sublayer(layer_norm(x)))``, so the output has the
    same shape as the input.

    Args:
        cfg: Mapping with the keys ``emb_dim``, ``context_length``,
            ``n_heads``, ``drop_rate`` and ``qkv_bias``.
    """

    def __init__(self, cfg):
        super().__init__()
        # Attention maps embedding vectors to context vectors of the same width
        # (d_in == d_out == emb_dim), so the tensor shape is preserved.
        self.attn = MultiHeadAttention(
            d_in=cfg['emb_dim'],
            d_out=cfg['emb_dim'],
            context_length=cfg['context_length'],
            num_heads=cfg['n_heads'],
            dropout=cfg['drop_rate'],
            qkv_bias=cfg['qkv_bias']
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg['emb_dim'])
        self.norm2 = LayerNorm(cfg['emb_dim'])
        self.drop_shortcut = nn.Dropout(cfg['drop_rate'])

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers with shortcut connections.

        Args:
            x: Input tensor; the last dimension must be ``emb_dim``.

        Returns:
            A new tensor with the same shape as ``x``.
        """
        # Shortcut connection around the attention sub-layer.
        shortcut = x
        x = self.norm1(x)
        x = self.attn(x)  # shape: [batch_size, num_tokens, emb_dim]
        x = self.drop_shortcut(x)
        # Out-of-place add: dropout can hand back the sub-layer's own output
        # tensor (eval mode / p == 0), and `+=` would overwrite it in place.
        x = x + shortcut

        # Shortcut connection around the feed-forward sub-layer.
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        return x


In [13]:
# Smoke-test the transformer block on a random batch
torch.manual_seed(123)  # fixed seed so the random input and weight init are reproducible
# Input of shape (batch_size=2, num_tokens=4, emb_dim); the width is read from the
# config so it always matches what the block was built with.
x = torch.rand(2, 4, GPT_CONFIG_124M["emb_dim"])
block = TransformerBlock(GPT_CONFIG_124M)
output = block(x)
print("Input shape: ", x.shape)
print("Output shape: ", output.shape)
# Fail loudly on a fresh run if the block stops preserving the input shape.
assert output.shape == x.shape, "TransformerBlock must preserve the input shape"

Input shape:  torch.Size([2, 4, 768])
Output shape:  torch.Size([2, 4, 768])
