## **Model Architecture**

## Params

In [13]:
!pip install tiktoken
import tiktoken
# Load tiktoken's "gpt2" encoding; it is used further down to encode the prompt
# and to decode the generated token ids back to text.
tokenizer = tiktoken.get_encoding("gpt2")



In [14]:
import torch
import torch.nn as nn

# Hyperparameters consumed by the model classes defined below.
GPT_CONFIG_124M = {
    "vocab_size": 50257,  # Vocabulary size (number of rows in the token-embedding table)
    "context_length": 1024,  # Context length (side length of the causal attention mask)
    "emb_dim": 768,  # Embedding dimension
    "n_layers": 12,  # Number of transformer layers. NOTE(review): not read by the model code in this section
    "n_heads": 12,  # Number of attention heads per transformer block
    "drop_rate": 0.1,  # Dropout rate
    "qkv_bias": False,  # Whether the query/key/value projections use a bias term
}

## Complete transformer block

In [15]:
# Multi-head causal self-attention.
class MultiHeadAttention(nn.Module):
  """Causal multi-head self-attention over a (batch, num_tokens, d_in) input.

  Args:
    d_in: Size of the last dimension of the input.
    d_out: Size of the last dimension of the output; split evenly across heads.
    context_length: Side length of the square causal mask, i.e. the longest
      sequence the mask can cover.
    dropout: Dropout probability applied to the attention weights.
    num_heads: Number of attention heads; must divide ``d_out``.
    qkv_bias: Whether the query/key/value projections carry a bias term.

  The per-head results are concatenated and returned as-is; this module has no
  output projection layer.
  """

  def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
    super().__init__()
    assert (d_out % num_heads == 0), \
        "d_out must be divisible by num_heads"
    self.d_out = d_out
    self.num_heads = num_heads
    self.head_dim = d_out // num_heads  # per-head slice of the d_out-wide projection
    self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
    self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
    self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
    self.dropout = nn.Dropout(dropout)
    # Strictly upper-triangular ones: entry (i, j) is 1 when j > i, i.e. when
    # position j lies in the future of position i and must be hidden from it.
    self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))

  def forward(self, x):
    """Return context vectors of shape (batch, num_tokens, d_out) for ``x``."""
    b, num_token, _ = x.shape

    keys = self.W_key(x)
    queries = self.W_query(x)
    values = self.W_value(x)

    # Split the projection into heads and move the head axis forward:
    # (b, num_token, d_out) -> (b, num_heads, num_token, head_dim)
    keys = keys.view(b, num_token, self.num_heads, self.head_dim).transpose(1, 2)
    values = values.view(b, num_token, self.num_heads, self.head_dim).transpose(1, 2)
    queries = queries.view(b, num_token, self.num_heads, self.head_dim).transpose(1, 2)

    # (b, num_heads, num_token, num_token)
    attn_scores = queries @ keys.transpose(2, 3)

    # Hide future positions. masked_fill is out-of-place, so its result must be
    # kept; discarding it leaves the scores unmasked and lets every token
    # attend to tokens that come after it.
    mask_bool = self.mask.bool()[:num_token, :num_token]
    attn_scores = attn_scores.masked_fill(mask_bool, -torch.inf)

    # Scaled softmax over the key axis; masked (-inf) entries get weight 0.
    attn_scores = attn_scores / keys.shape[-1]**0.5
    attn_weights = torch.softmax(attn_scores, dim=-1)
    attn_weights = self.dropout(attn_weights)

    # (b, num_heads, num_token, head_dim) -> (b, num_token, num_heads, head_dim)
    context_vec = (attn_weights @ values).transpose(1, 2)
    # Merge the heads back into one d_out-wide vector per token.
    context_vec = context_vec.contiguous().view(b, num_token, self.d_out)

    return context_vec

# Layer normalization over the last dimension, with a learnable scale and shift.
class LayerNorm(nn.Module):
  """Normalize each vector along the last axis to zero mean and unit variance.

  The normalized values are then multiplied by ``scale`` (initialised to ones)
  and offset by ``shift`` (initialised to zeros), both trainable and of size
  ``emb_dim``.
  """

  def __init__(self, emb_dim):
    super().__init__()
    self.eps = 1e-5  # added to the variance so the denominator is never zero
    self.scale = nn.Parameter(torch.ones(emb_dim))
    self.shift = nn.Parameter(torch.zeros(emb_dim))

  def forward(self, x):
    centered = x - x.mean(dim=-1, keepdim=True)
    # unbiased=False: divide by N rather than N - 1
    variance = x.var(dim=-1, keepdim=True, unbiased=False)
    normalized = centered / torch.sqrt(variance + self.eps)
    return self.scale * normalized + self.shift

# GELU activation (tanh approximation)
class GeLU(nn.Module):
  """Element-wise GELU computed with the tanh approximation.

  gelu(x) = 0.5 * x * (1 + tanh( sqrt(2 / pi) * (x + 0.044715 * x^3) ))
  """

  def forward(self, x):
    coeff = torch.sqrt(torch.tensor(2 / torch.pi))
    inner = x + 0.044715 * x**3
    return 0.5 * x * (1 + torch.tanh(coeff * inner))

# Position-wise feed-forward network
class FeedForward(nn.Module):
  """Two linear layers with a GeLU in between.

  The hidden layer is four times as wide as ``cfg["emb_dim"]``; the output has
  the same width as the input.
  """

  def __init__(self, cfg):
    super().__init__()
    emb_dim = cfg["emb_dim"]
    hidden_dim = 4 * emb_dim

    # expand -> non-linearity -> project back
    self.layers = nn.Sequential(
        nn.Linear(emb_dim, hidden_dim),
        GeLU(),
        nn.Linear(hidden_dim, emb_dim),
    )

  def forward(self, x):
    return self.layers(x)

In [16]:
class TransformerBlock(nn.Module):
  """Token embedding followed by one attention + feed-forward block.

  ``cfg`` is a dict that provides "emb_dim", "context_length", "drop_rate",
  "n_heads", "qkv_bias" and "vocab_size". Each sub-layer is applied as
  ``dropout(sublayer(norm(x))) + x``.

  NOTE(review): this block owns its own token embedding and returns features
  of width cfg["emb_dim"]. It applies no positional embedding and no
  projection to vocabulary size, so its output is not a set of vocabulary
  logits.
  """

  def __init__(self, cfg):
    super().__init__()
    emb_dim = cfg["emb_dim"]

    self.att = MultiHeadAttention(
        d_in=emb_dim,
        d_out=emb_dim,
        context_length=cfg["context_length"],
        dropout=cfg["drop_rate"],
        num_heads=cfg["n_heads"],
        qkv_bias=cfg["qkv_bias"],
    )
    self.emb = nn.Embedding(cfg["vocab_size"], emb_dim)
    self.ff = FeedForward(cfg)
    self.norm1 = LayerNorm(emb_dim)
    self.norm2 = LayerNorm(emb_dim)
    # One dropout module, shared by both residual branches.
    self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

  def forward(self, x):
    # Token ids -> embedding vectors.
    hidden = self.emb(x)

    # Attention sub-layer with residual connection.
    attended = self.drop_shortcut(self.att(self.norm1(hidden)))
    hidden = attended + hidden

    # Feed-forward sub-layer with residual connection.
    transformed = self.drop_shortcut(self.ff(self.norm2(hidden)))
    return transformed + hidden

In [17]:
# Build the model used for generation below: a single TransformerBlock.
# NOTE(review): only one block is created; GPT_CONFIG_124M["n_layers"] is not consulted here.
model = TransformerBlock(GPT_CONFIG_124M)

## Generating new Output Tokens

In [18]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
  """Greedily append ``max_new_tokens`` token ids to ``idx``.

  Args:
    model: Callable taking a (batch, n_tokens) id tensor and returning scores
      of shape (batch, n_tokens, n_classes).
    idx: (batch, n_tokens) tensor of token ids forming the current context.
    max_new_tokens: Number of ids to append.
    context_size: At most this many trailing ids are fed to the model per step.

  Returns:
    Tensor of shape (batch, n_tokens + max_new_tokens).
  """
  for _step in range(max_new_tokens):
    # Keep only the most recent `context_size` ids as model input.
    window = idx[:, -context_size:]

    # Inference only: no gradients needed for the forward pass.
    with torch.no_grad():
      scores = model(window)

    # Scores at the final position decide the next token: (batch, n_classes).
    last_scores = scores[:, -1, :]
    next_probs = torch.softmax(last_scores, dim=-1)

    # Greedy choice: the index with the highest probability, shape (batch, 1).
    next_id = torch.argmax(next_probs, dim=-1, keepdim=True)

    # Grow the running sequence by one column.
    idx = torch.cat((idx, next_id), dim=1)

  return idx

In [22]:
# Prompt to continue; the trailing space is part of the string that gets tokenized.
start = "Hello, I am "
# Encode the prompt into a list of token ids.
encoded_start = tokenizer.encode(start)
# Add a leading batch dimension: (n_tokens,) -> (1, n_tokens).
encoded_tensor = torch.tensor(encoded_start).unsqueeze(0)

# Display the raw id list next to the batched tensor.
encoded_start, encoded_tensor

([15496, 11, 314, 716, 220], tensor([[15496,    11,   314,   716,   220]]))

In [23]:
# Switch to evaluation mode so the nn.Dropout layers become no-ops. The custom
# LayerNorm has no train/eval-dependent behaviour, so it keeps normalizing as usual.
model.eval()
# NOTE(review): `model` returns emb_dim-wide features rather than vocabulary logits,
# so the argmax inside generate_text_simple can only select ids below emb_dim.
output = generate_text_simple(
    model = model,
    idx=encoded_tensor,
    max_new_tokens=6,
    context_size=GPT_CONFIG_124M['context_length']
)
output

tensor([[15496,    11,   314,   716,   220,   454,     8,   427,   744,   367,
           737]])

In [24]:
# Drop the batch dimension and convert to a plain list of ids, then decode to text.
decoded_text = tokenizer.decode(output.squeeze().tolist())
print(decoded_text)

Hello, I am our) shround H).
