# **Creating GPT model architecture**

## GPT-2-124M Info

In [None]:
# Hyperparameters of the GPT-2 124M configuration used by the models in this notebook.
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # number of rows in the token-embedding table
    context_length=1024,  # number of rows in the positional-embedding table
    emb_dim=768,          # width of every token / position embedding
    n_layers=12,          # how many transformer blocks are stacked
    n_heads=12,           # attention heads inside each transformer block
    drop_rate=0.1,        # dropout probability
    qkv_bias=False,       # whether the query/key/value projections use a bias
)

## Dummy GPT Model

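This is just a placeholder model: it wires up the overall GPT structure, but the transformer blocks and the layer norm are dummies that return their input unchanged. Don't worry about that for now.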

In [None]:
import torch
import torch.nn as nn


class DummyGPTModel(nn.Module):
    """Skeleton of the GPT model whose transformer blocks and final norm are stand-ins.

    Args:
        cfg: Mapping that provides ``vocab_size``, ``context_length``,
            ``emb_dim``, ``n_layers`` and ``drop_rate``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]

        # Lookup tables: one row per vocabulary entry and one row per position.
        self.tok_emb = nn.Embedding(cfg["vocab_size"], width)
        self.pos_emb = nn.Embedding(cfg["context_length"], width)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        # A stack of n_layers placeholder blocks, applied one after another.
        placeholder_blocks = (DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"]))
        self.trf_blocks = nn.Sequential(*placeholder_blocks)

        # Placeholder normalization, then a bias-free projection to vocabulary size.
        self.final_norm = DummyLayerNorm(width)
        self.out_head = nn.Linear(width, cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        """Return a ``vocab_size``-wide vector of logits for every token position.

        Args:
            in_idx: 2-D tensor of token ids, unpacked as (batch, sequence).
        """
        _, n_tokens = in_idx.shape

        # Input embedding = token embedding + embedding of positions 0..n_tokens-1.
        positions = torch.arange(n_tokens, device=in_idx.device)
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)

        # Dropout, the (placeholder) transformer stack, then the (placeholder) final norm.
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)

        # Unnormalized scores over the vocabulary; no softmax is applied here.
        return self.out_head(hidden)


class DummyTransformerBlock(nn.Module):
    """Stand-in for a transformer block: holds no layers and passes data through."""

    def __init__(self, cfg):
        # cfg is accepted but never read; this placeholder creates no sub-modules.
        super().__init__()

    def forward(self, x):
        """Return ``x`` unchanged."""
        return x


class DummyLayerNorm(nn.Module):
    """Stand-in for layer normalization: same constructor shape, no computation."""

    def __init__(self, normalized_shape, eps=1e-5):
        # Both arguments exist only to mimic the LayerNorm interface; neither is stored.
        super().__init__()

    def forward(self, x):
        """Return ``x`` unchanged."""
        return x

## Layer normalization

In [None]:
import torch

We use a small `Sequential` (a linear layer followed by ReLU) and pass one example batch through it once, just to get some activations to normalize.

In [None]:
# A toy batch: 2 rows of 5 random features each.
batch_example = torch.rand((2, 5))

# Linear (5 inputs, 6 outputs) followed by ReLU gives 6 non-negative activations per row.
layer = torch.nn.Sequential(
    torch.nn.Linear(in_features=5, out_features=6),
    torch.nn.ReLU(),
)
output = layer(batch_example)
print(output)

tensor([[0.0000, 0.7230, 0.2197, 0.0000, 0.8547, 0.0000],
        [0.0000, 0.6109, 0.1887, 0.0141, 1.1073, 0.0000]],
       grad_fn=<ReluBackward0>)


Compute the mean and variance of each row (i.e. over the last dimension).

In [None]:
# Per-row statistics: reduce over the last dimension but keep it (size 1) so the
# results broadcast against `output` later.
mean = torch.mean(output, dim=-1, keepdim=True)
var = torch.var(output, dim=-1, keepdim=True)

mean, var

(tensor([[0.2996],
         [0.3202]], grad_fn=<MeanBackward1>),
 tensor([[0.1526],
         [0.2040]], grad_fn=<VarBackward0>))

We normalize each row as $(x - \mu) / \sqrt{\mathrm{var}}$.

In [None]:
# Standardize each row: subtract its mean, then divide by its standard deviation.
centered = output - mean
res = centered / var.sqrt()

In [None]:
# Each normalized row should report a mean of ~0 and a variance of ~1.
for row in res:
  print(row.mean(), row.var(), sep="\n")

tensor(-5.9605e-08, grad_fn=<MeanBackward0>)
tensor(1.0000, grad_fn=<VarBackward0>)
tensor(4.9671e-08, grad_fn=<MeanBackward0>)
tensor(1., grad_fn=<VarBackward0>)


Now the same steps wrapped up as a `LayerNorm` class:

In [None]:
import torch
import torch.nn as nn

class LayerNorm(nn.Module):
  """Layer normalization over the last dimension with a trainable scale and shift.

  Args:
    emb_dim: Size of the last dimension of the inputs; ``scale`` and ``shift``
      each hold this many values.
    eps: Constant added to the variance before the square root so the division
      stays finite when a row has zero variance. Defaults to 1e-5.
  """

  def __init__(self, emb_dim, eps=1e-5):
    super().__init__()
    self.eps = eps
    # Start as the identity affine map (scale 1, shift 0); both are trainable.
    self.scale = nn.Parameter(torch.ones(emb_dim))
    self.shift = nn.Parameter(torch.zeros(emb_dim))

  def forward(self, x):
    """Normalize ``x`` along its last dimension, then apply scale and shift."""
    mean = x.mean(dim=-1, keepdim=True)
    # Biased variance: divides by n rather than n - 1.
    var = x.var(dim=-1, keepdim=True, unbiased=False)
    # eps keeps the denominator away from zero.
    norm_x = (x - mean) / torch.sqrt(var + self.eps)

    return self.scale * norm_x + self.shift

Heck yeah !

In [None]:
# Sanity check: after LayerNorm every row should have mean ~0 and (biased) variance ~1.
ln = LayerNorm(emb_dim=5)
out_ln = ln(batch_example)
row_mean = out_ln.mean(dim=-1, keepdim=True)
row_var = out_ln.var(dim=-1, keepdim=True, unbiased=False)
row_mean, row_var

(tensor([[-2.3842e-08],
         [ 3.5763e-07]], grad_fn=<MeanBackward1>),
 tensor([[0.9999],
         [0.9996]], grad_fn=<VarBackward0>))

## GeLU activation function

In [None]:
import torch
import torch.nn as nn

In [None]:
class GeLU(nn.Module):
  """GELU activation computed with the tanh-based formula."""

  def forward(self, x):
    """Apply 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) element-wise."""
    sqrt_2_over_pi = torch.sqrt(torch.tensor(2 / torch.pi))
    cubic = x + 0.044715 * x**3
    gate = 1 + torch.tanh(sqrt_2_over_pi * cubic)
    return 0.5 * x * gate

In [None]:
class FeedForward(nn.Module):
  """Two linear layers with a GeLU in between: emb_dim to 4 * emb_dim and back.

  Args:
    cfg: Mapping that provides ``emb_dim``.
  """

  def __init__(self, cfg):
    super().__init__()
    width = cfg["emb_dim"]
    hidden_width = 4 * width

    # Expand, apply the non-linearity, then project back to the original width.
    expand = nn.Linear(width, hidden_width)
    contract = nn.Linear(hidden_width, width)
    self.layers = nn.Sequential(expand, GeLU(), contract)

  def forward(self, x):
    """Run ``x`` through the expand / GeLU / contract stack."""
    return self.layers(x)

In [None]:
# Push one random batch through the feed-forward network and show input next to output.
ff = FeedForward(GPT_CONFIG_124M)
sample_shape = (2, 3, 768)
ones = torch.rand(sample_shape)  # NOTE: despite the name, these are random values, not ones
out_ff = ff(ones)
ones, out_ff

(tensor([[[0.1575, 0.4331, 0.1776,  ..., 0.3492, 0.0244, 0.2455],
          [0.9566, 0.5933, 0.6393,  ..., 0.6167, 0.0435, 0.2261],
          [0.4319, 0.2730, 0.2293,  ..., 0.6945, 0.5290, 0.0227]],
 
         [[0.9675, 0.9138, 0.5699,  ..., 0.9252, 0.4692, 0.9113],
          [0.7298, 0.9616, 0.9957,  ..., 0.1108, 0.0734, 0.1066],
          [0.5184, 0.4737, 0.3395,  ..., 0.4596, 0.9735, 0.7594]]]),
 tensor([[[ 0.1558,  0.0788,  0.0144,  ..., -0.1708,  0.0179, -0.0577],
          [ 0.0983,  0.0398, -0.0635,  ..., -0.1152,  0.0316, -0.1472],
          [ 0.1592,  0.0591, -0.0759,  ..., -0.1235, -0.1080, -0.0653]],
 
         [[ 0.2360,  0.0097, -0.0355,  ..., -0.0814,  0.0767, -0.0155],
          [ 0.1177, -0.0355, -0.0075,  ..., -0.0553, -0.0595, -0.1363],
          [ 0.1997,  0.0751,  0.0651,  ..., -0.1827,  0.0676, -0.1640]]],
        grad_fn=<ViewBackward0>))

## ShortCut Connection

In [None]:
import torch
import torch.nn as nn

In [None]:
class ExampleDeepNeuralNetwork(nn.Module):
    """Stack of Linear + GeLU layers with optional shortcut connections.

    Args:
        layer_sizes: Sequence of layer widths. Each consecutive pair
            ``(layer_sizes[i], layer_sizes[i + 1])`` defines one Linear + GeLU
            layer, so ``len(layer_sizes) - 1`` layers are created.
        use_shortcut: When true, a layer's input is added to its output
            whenever the two have the same shape.
    """

    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut

        # Pair every width with its successor instead of hard-coding five layers,
        # so the depth of the network follows the length of layer_sizes.
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(n_in, n_out), GeLU())
            for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:])
        ])

    def forward(self, x):
        """Apply the layers in order, adding shortcuts where enabled and shape-compatible."""
        for layer in self.layers:
            # Compute the output of the current layer
            layer_output = layer(x)
            # A shortcut is only possible when input and output shapes match;
            # otherwise the layer output simply replaces x.
            if self.use_shortcut and x.shape == layer_output.shape:
                x = x + layer_output
            else:
                x = layer_output
        return x

In [None]:
# Two networks with the same layer widths, one without and one with shortcut
# connections, evaluated on the same random input.
layers = [3] * 5 + [1]
dnn_1, dnn_2 = (
    ExampleDeepNeuralNetwork(layers, use_shortcut=flag) for flag in (False, True)
)
r = torch.rand(1, 3)
dnn_1(r), dnn_2(r)

(tensor([[0.2493]], grad_fn=<MulBackward0>),
 tensor([[-0.0259]], grad_fn=<MulBackward0>))

## **Complete transformer block**

In [None]:
import torch
import torch.nn as nn

# Hyperparameters of the GPT-2 124M configuration (repeated so this section runs on its own).
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # vocabulary size
    context_length=1024,  # longest sequence the causal mask is built for
    emb_dim=768,          # embedding width, used as both d_in and d_out of attention
    n_layers=12,          # number of transformer blocks
    n_heads=12,           # attention heads inside each transformer block
    drop_rate=0.1,        # dropout probability
    qkv_bias=False,       # whether the query/key/value projections use a bias
)

In [None]:
# Multi-head causal self-attention.
class MultiHeadAttention(nn.Module):
  """Multi-head self-attention with a causal mask.

  Args:
    d_in: Size of the last dimension of the input.
    d_out: Total output width across all heads; must be divisible by ``num_heads``.
    context_length: Side length of the square causal mask, i.e. the longest
      sequence the mask can cover.
    dropout: Dropout probability applied to the attention weights.
    num_heads: Number of attention heads.
    qkv_bias: Whether the query/key/value projections use a bias term.

  Raises:
    AssertionError: If ``d_out`` is not divisible by ``num_heads``.
  """

  def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
    super().__init__()
    assert (d_out % num_heads == 0), \
        "d_out must be divisible by num_heads"
    self.d_out = d_out
    self.num_heads = num_heads
    self.head_dim = d_out // num_heads  # each head gets an equal slice of d_out
    self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
    self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
    self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
    self.dropout = nn.Dropout(dropout)
    # Ones strictly above the diagonal mark (query, key) pairs where the key
    # comes after the query.
    self.register_buffer(
        "mask",
        torch.triu(torch.ones(context_length, context_length), diagonal=1),
    )

  def forward(self, x):
    """Return the context vectors for ``x``.

    Args:
      x: Tensor unpacked as (batch, num_token, d_in).

    Returns:
      Tensor of shape (batch, num_token, d_out).
    """
    b, num_token, _ = x.shape

    # Project, split d_out into heads, and move the head axis forward:
    # (b, num_token, d_out) becomes (b, num_heads, num_token, head_dim).
    head_shape = (b, num_token, self.num_heads, self.head_dim)
    keys = self.W_key(x).view(head_shape).transpose(1, 2)
    queries = self.W_query(x).view(head_shape).transpose(1, 2)
    values = self.W_value(x).view(head_shape).transpose(1, 2)

    # Pairwise scores per head: (b, num_heads, num_token, num_token).
    attn_scores = queries @ keys.transpose(2, 3)

    # masked_fill is out-of-place, so its result must be assigned back;
    # otherwise the causal mask is silently ignored and tokens see the future.
    masked_bool = self.mask.bool()[:num_token, :num_token]
    attn_scores = attn_scores.masked_fill(masked_bool, -torch.inf)

    # Scale by sqrt(head_dim) and normalize over the key axis; masked entries
    # (-inf) receive a weight of exactly zero.
    attn_weights = torch.softmax(attn_scores / self.head_dim**0.5, dim=-1)
    attn_weights = self.dropout(attn_weights)

    # Merge the heads back: (b, num_heads, num_token, head_dim) becomes (b, num_token, d_out).
    context_vec = (attn_weights @ values).transpose(1, 2)
    context_vec = context_vec.contiguous().view(b, num_token, self.d_out)

    return context_vec

# Normalizes the last dimension to mean ~0 and variance ~1, then applies a trainable scale and shift.
class LayerNorm(nn.Module):
  """Layer normalization over the last dimension with a trainable scale and shift.

  Args:
    emb_dim: Size of the last dimension of the inputs; ``scale`` and ``shift``
      each hold this many values.
    eps: Constant added to the variance before the square root so the division
      stays finite when a row has zero variance. Defaults to 1e-5.
  """

  def __init__(self, emb_dim, eps=1e-5):
    super().__init__()
    self.eps = eps
    # Start as the identity affine map (scale 1, shift 0); both are trainable.
    self.scale = nn.Parameter(torch.ones(emb_dim))
    self.shift = nn.Parameter(torch.zeros(emb_dim))

  def forward(self, x):
    """Normalize ``x`` along its last dimension, then apply scale and shift."""
    mean = x.mean(dim=-1, keepdim=True)
    # Biased variance: divides by n rather than n - 1.
    var = x.var(dim=-1, keepdim=True, unbiased=False)
    # eps keeps the denominator away from zero.
    norm_x = (x - mean) / torch.sqrt(var + self.eps)

    return self.scale * norm_x + self.shift

# GeLU activation function
class GeLU(nn.Module):
  """GELU activation computed with the tanh-based formula."""

  def forward(self, x):
    """Apply 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) element-wise."""
    sqrt_2_over_pi = torch.sqrt(torch.tensor(2 / torch.pi))
    cubic = x + 0.044715 * x**3
    gate = 1 + torch.tanh(sqrt_2_over_pi * cubic)
    return 0.5 * x * gate

# The classic feed-forward neural network
class FeedForward(nn.Module):
  """Two linear layers with a GeLU in between: emb_dim to 4 * emb_dim and back.

  Args:
    cfg: Mapping that provides ``emb_dim``.
  """

  def __init__(self, cfg):
    super().__init__()
    width = cfg["emb_dim"]
    hidden_width = 4 * width

    # Expand, apply the non-linearity, then project back to the original width.
    expand = nn.Linear(width, hidden_width)
    contract = nn.Linear(hidden_width, width)
    self.layers = nn.Sequential(expand, GeLU(), contract)

  def forward(self, x):
    """Run ``x`` through the expand / GeLU / contract stack."""
    return self.layers(x)

In [None]:
class TransformerBlock(nn.Module):
  """Attention and feed-forward sub-layers, each normalized first and wrapped in a shortcut.

  Args:
    cfg: Mapping that provides ``emb_dim``, ``context_length``, ``drop_rate``,
      ``n_heads`` and ``qkv_bias`` (and whatever ``FeedForward`` reads from it).
  """

  def __init__(self, cfg):
    super().__init__()
    emb_dim = cfg["emb_dim"]
    drop_rate = cfg["drop_rate"]

    # Attention keeps the embedding width: d_in and d_out are both emb_dim.
    self.att = MultiHeadAttention(
        d_in=emb_dim,
        d_out=emb_dim,
        context_length=cfg["context_length"],
        dropout=drop_rate,
        num_heads=cfg["n_heads"],
        qkv_bias=cfg["qkv_bias"],
    )
    self.ff = FeedForward(cfg)
    self.norm1 = LayerNorm(emb_dim)
    self.norm2 = LayerNorm(emb_dim)
    # One dropout module is reused on both shortcut branches.
    self.drop_shortcut = nn.Dropout(drop_rate)

  def forward(self, x):
    """Apply both sub-layers to ``x``; each adds its input back onto its output."""
    # Sub-layer 1: normalize, attend, drop out, then add the untouched input back.
    attended = self.drop_shortcut(self.att(self.norm1(x)))
    x = attended + x

    # Sub-layer 2: same pattern with the feed-forward network.
    transformed = self.drop_shortcut(self.ff(self.norm2(x)))
    return transformed + x

In [None]:
# Run one random batch through a transformer block and show input next to output.
# NOTE: `input` shadows the Python builtin of the same name within this notebook.
input = torch.rand((2, 3, 768))
trfmblck = TransformerBlock(cfg=GPT_CONFIG_124M)
block_output = trfmblck(input)
input, block_output

(tensor([[[0.8630, 0.3479, 0.8217,  ..., 0.6449, 0.2349, 0.7564],
          [0.8813, 0.8789, 0.8641,  ..., 0.1024, 0.1728, 0.0935],
          [0.8460, 0.5911, 0.9952,  ..., 0.0910, 0.1392, 0.5239]],
 
         [[0.0131, 0.6674, 0.7115,  ..., 0.3334, 0.0476, 0.3675],
          [0.4079, 0.4333, 0.5122,  ..., 0.9597, 0.1936, 0.6188],
          [0.0487, 0.6183, 0.3731,  ..., 0.3049, 0.3739, 0.1200]]]),
 tensor([[[ 1.6255, -0.1657,  0.8310,  ...,  0.5347,  0.2914,  0.8870],
          [ 1.3251,  0.8969,  1.1177,  ...,  0.1688,  0.3840,  0.5984],
          [ 1.4692,  0.4756,  1.3227,  ..., -0.1366,  0.4886,  1.3945]],
 
         [[ 0.2987,  0.9146,  0.3924,  ...,  0.0164, -0.1391, -0.0574],
          [ 0.6332,  0.7505,  0.0119,  ...,  0.6402,  0.2357, -0.0370],
          [ 0.0024,  0.7039,  0.0350,  ...,  0.6196,  0.2175, -0.5970]]],
        grad_fn=<AddBackward0>))