In [3]:
from importlib.metadata import version

import matplotlib
import tiktoken
import torch
import torch.nn as nn

print("matplotlib version:", version("matplotlib"))
print("torch version:", version("torch"))
print("tiktoken version:", version("tiktoken"))

matplotlib version: 3.9.2
torch version: 2.4.1
tiktoken version: 0.7.0


Define GPT Config:
- Vocabulary size: 50,257
- Context length: 1024
- Embedding dimensions: 768
- Number of attention heads: 12
- Number of transformer blocks: 12
- Dropout rate
- QKV Bias (true/false)

In [42]:
GPT_CONFIG_124M = {
    'vocab_size': 50_257,
    'context_length': 1_024,
    'emb_dim': 768,
    'n_heads': 12,
    'n_trans': 12,
    'drop_rate': 0.1,
    'qkv_bias': False
}

Define GPT Model.

Layers:
- Embedding x 1
- Positional Encoding x 1
- Dropout x 1
- Transformer blocks x 12
- Normalization x 1
- Linear layer x 1

Input shape: (n_sentence, n_tokens)

Output: (n_sentence, n_tokens, vocab_size)

In [43]:
class DummyTransformer(nn.Module):
    def forward(self, x):
        return x

class DummyNorm(nn.Module):
    def forward(self, x):
        return x

class DummyGPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.emb = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        self.pos = nn.Embedding(cfg['context_length'],     cfg['emb_dim'])
        self.dropout = nn.Dropout(cfg['drop_rate'])
        self.transform = nn.ModuleList([
            DummyTransformer() for _ in range(cfg['n_trans'])
        ])
        self.norm = DummyNorm()
        self.linear = nn.Linear(cfg['emb_dim'], cfg['vocab_size'], cfg['qkv_bias'])

    def forward(self, x):
        batch_size, seq_len = x.shape
        x = self.emb(x)

        pos_idx = torch.arange(seq_len, device = x.device)
        pos_emb = self.pos(pos_idx)

        x = x + pos_emb
        x = self.dropout(x)

        for trf in self.transform:
            x = trf(x)

        x = self.norm(x)
        x = self.linear(x)

        return x

Given 2 sentences of `cfg['seq_len']` tokens each:
- Tokenize them using `tiktoken`'s `gpt2` tokenizer
- Create a tensor of size `(2, sequence length, gpt2's vocab size)`

In [44]:
import tiktoken

tokenizer = tiktoken.get_encoding('gpt2')

texts = [
    'Every effort moves you',
    'Every day holds a'
]

input = torch.tensor([
    tokenizer.encode(text) for text in texts
])

print(input)
print(input.shape)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])
torch.Size([2, 4])


In [45]:
model = DummyGPTModel(GPT_CONFIG_124M)
model.eval()

DummyGPTModel(
  (emb): Embedding(50257, 768)
  (pos): Embedding(1024, 768)
  (dropout): Dropout(p=0.1, inplace=False)
  (transform): ModuleList(
    (0-11): 12 x DummyTransformer()
  )
  (norm): DummyNorm()
  (linear): Linear(in_features=768, out_features=50257, bias=False)
)

In [46]:
torch.manual_seed(123)
output = model(input)

In [47]:
print(output)
print(output.shape)

tensor([[[-1.1947,  0.1392, -0.8616,  ..., -1.4987, -0.0314, -0.4490],
         [ 0.0497,  0.3861, -0.3281,  ..., -0.1826,  1.3084,  0.9867],
         [ 0.7005,  1.4747, -0.4149,  ...,  1.7756, -0.2280,  0.5384],
         [ 0.4885,  1.7545, -0.6707,  ...,  1.1501, -0.1143, -0.9368]],

        [[-1.1947,  0.1392, -0.8616,  ..., -1.4987, -0.0314, -0.4490],
         [-0.5591,  0.5797, -0.1296,  ...,  0.2691,  0.3151,  1.4046],
         [ 0.8524,  1.2833, -0.1786,  ..., -0.1982,  0.1097,  0.2812],
         [-0.0190, -0.8277,  0.2299,  ...,  1.7974, -0.1646, -0.1049]]],
       grad_fn=<UnsafeViewBackward0>)
torch.Size([2, 4, 50257])


Implement a Normalization layer with: epsilon, scale and shift matrices.

In [68]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)

        x = (x-mean) / torch.sqrt(var + self.eps)
        return self.scale * x + self.shift

In [76]:
norm_input = torch.randn(4, 20) # sample sentence of 4 tokens each with 20 dimensions
print(norm_input.shape)

torch.Size([4, 20])


In [77]:
torch.set_printoptions(sci_mode=False)

norm_layer = LayerNorm(emb_dim=20)
norm_output = norm_layer(norm_input)

mean = norm_output.mean(dim=-1, keepdim=True)
var = norm_output.var(dim=-1, keepdim=True, unbiased=False)

print('Mean:', mean)
print('Variance:', var)

Mean: tensor([[    -0.0000],
        [    -0.0000],
        [    -0.0000],
        [     0.0000]], grad_fn=<MeanBackward1>)
Variance: tensor([[1.0000],
        [1.0000],
        [1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


GELU

In [78]:
class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) * 
            (x + 0.044715 * torch.pow(x, 3))
        ))

Implement FeedForward module that applies these layers sequentially:
- Linear emb_dim, 4 x emb_dim
- GELU
- Linear 4 x emb_dim, emb_dim

In [86]:
class FeedForward(nn.Module):
    def __init__(self, emb_dim) -> None:
        super().__init__()

        self.layers = nn.Sequential(
            nn.Linear(emb_dim, 4 * emb_dim),
            nn.GELU(),
            nn.Linear(4 * emb_dim, emb_dim)
        )

    def forward(self, x):
        return self.layers(x)

In [88]:
ff = FeedForward(GPT_CONFIG_124M['emb_dim'])
ff.eval()

FeedForward(
  (layers): Sequential(
    (0): Linear(in_features=768, out_features=3072, bias=True)
    (1): GELU(approximate='none')
    (2): Linear(in_features=3072, out_features=768, bias=True)
  )
)

Transformer block:
- Layer norm 1
- Masked multi head attention
- Dropout
- Shortcut
- Layer norm 2
- Feed forward
- Dropout
- Shortcut

In [83]:
class Transformer(nn.Module):
    def __init__(self,  emb_dim, n_heads, drop_rate) -> None:
        super().__init__()

        self.norm1 = nn.LayerNorm(emb_dim)
        self.mha = nn.MultiheadAttention(emb_dim, n_heads, dropout=drop_rate)
        self.dropout1 = nn.Dropout(drop_rate)

        self.norm2 = nn.LayerNorm(emb_dim)
        self.ff = FeedForward(emb_dim)
        self.dropout2 = nn.Dropout(drop_rate)

    def forward(self, x):
        norm_x = self.norm1(x)
        attn_out, _ = self.mha(norm_x, norm_x, norm_x)
        attn_out = self.dropout1(attn_out)
        x = x + attn_out

        norm_x = self.norm2(x)
        ff_out = self.ff(norm_x)
        ff_out = self.dropout2(ff_out)
        x = x + ff_out

        return x

In [85]:
torch.manual_seed(123)

x = torch.rand(2, 4, 768)  # Shape: [batch_size, num_tokens, emb_dim]
block = Transformer(GPT_CONFIG_124M['emb_dim'], GPT_CONFIG_124M['n_heads'], GPT_CONFIG_124M['drop_rate'])
output = block(x)

print("Input shape:", x.shape)
print("Output shape:", output.shape)

Input shape: torch.Size([2, 4, 768])
Output shape: torch.Size([2, 4, 768])


In [124]:
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.emb = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        self.pos = nn.Embedding(cfg['context_length'],     cfg['emb_dim'])
        self.dropout = nn.Dropout(cfg['drop_rate'])
        self.transform = nn.ModuleList([
            Transformer(cfg['emb_dim'], cfg['n_heads'], cfg['drop_rate']) for _ in range(cfg['n_trans'])
        ])
        self.norm = nn.LayerNorm(cfg['emb_dim'])
        self.linear = nn.Linear(cfg['emb_dim'], cfg['vocab_size'], cfg['qkv_bias'])

    def forward(self, x):
        batch_size, seq_len = x.shape
        x = self.emb(x)

        pos_idx = torch.arange(seq_len, device = x.device)
        pos_emb = self.pos(pos_idx)

        x = x + pos_emb
        x = self.dropout(x)

        for trf in self.transform:
            x = trf(x)

        x = self.norm(x)
        x = self.linear(x)

        return x

In [125]:
model = GPTModel(GPT_CONFIG_124M)
output = model(input)
output.shape

torch.Size([2, 4, 50257])

In [126]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    # idx is (batch, n_tokens) array of indices in the current context
    for _ in range(max_new_tokens):
        # truncate the input to fit context size
        idx_cond = idx[:, -context_size:]
        
        # Get the predictions
        with torch.no_grad():
            logits = model(idx_cond)
        
        # Focus only on the last time step
        # (batch, n_tokens, vocab_size) becomes (batch, vocab_size)
        logits = logits[:, -1, :]  

        # Apply softmax to get probabilities
        probas = torch.softmax(logits, dim=-1)  # (batch, vocab_size)

        # Get the idx of the vocab entry with the highest probability value
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)  # (batch, 1)

        # Append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens+1)

    return idx

In [134]:
def generate_text(input, n_tokens):
    input_tensor = torch.tensor(tokenizer.encode(input)).unsqueeze(0)
    output_tensor = generate_text_simple(model, input_tensor, n_tokens, GPT_CONFIG_124M['context_length'])
    decoded_text = tokenizer.decode(output_tensor.squeeze(0).tolist())
    print(decoded_text)

In [135]:
generate_text('hell nah', 10)

hell nah590 Citadel CoastAbsolutelyresources Franco proper Explosive Germ Petty
