# Import

In [1]:
import torch
from llm.components import MultiHeadAttention, GPTModel
from torch import nn
import tiktoken

# GPT config

In [3]:
# Hyperparameters shared by every model component defined in this notebook.
GPT_CONFIG = dict(
    vocab_size=50257,     # rows of the token-embedding table and width of the output head
    context_length=1024,  # rows of the positional-embedding table
    embed_dim=768,        # width of the embeddings and of every transformer block
    n_heads=12,           # forwarded to MultiHeadAttention
    n_layers=12,          # number of TransformerBlock instances stacked in GPTModel
    drop_rate=0.1,        # probability used by every nn.Dropout
    qkv_bias=False,       # forwarded to MultiHeadAttention as `qkv_bias`
)

In [5]:
# BPE tokenizer using the 'gpt2' encoding
tokenizer = tiktoken.get_encoding('gpt2')

# Sample prompts, each encoded into a 1-D tensor of token ids
sample_texts = ("Every effort moves you", "Every day holds a")
batch = [torch.tensor(tokenizer.encode(text)) for text in sample_texts]

# Stack the per-prompt tensors along a new leading dimension (dim = 0);
# this requires both prompts to encode to the same number of tokens.
batch = torch.stack(batch, dim=0)
print(batch.shape)

torch.Size([2, 4])


# Layer normalization

In [9]:
# Random (5, 3) input: five rows of three features each
x = torch.rand((5, 3))
print(x.shape, x, sep='\n')

# Normalise each row over its last dimension (size 3)
layer_norm = nn.LayerNorm(normalized_shape=3)
x = layer_norm(x)

print(x)

torch.Size([5, 3])
tensor([[0.4697, 0.0384, 0.9540],
        [0.5054, 0.3758, 0.5965],
        [0.8573, 0.3089, 0.2593],
        [0.4583, 0.9476, 0.1771],
        [0.7077, 0.8432, 0.2631]])
tensor([[-0.0472, -1.2004,  1.2476],
        [ 0.1415, -1.2886,  1.1471],
        [ 1.4102, -0.6137, -0.7965],
        [-0.2178,  1.3190, -1.1012],
        [ 0.4159,  0.9626, -1.3784]], grad_fn=<NativeLayerNormBackward0>)


# GELU activation

In [17]:
# Compare GELU with ReLU on the same random input
x = torch.randn((2, 3, 4))
gelu = nn.GELU()

for label, values in (('original', x), ('gelu', gelu(x)), ('relu', torch.relu(x))):
    print(f'{label} = \n', values)

original = 
 tensor([[[-1.0694,  0.8192, -0.6045, -1.9654],
         [ 0.8322,  0.8429, -0.9024, -0.4621],
         [-0.9452, -0.4569, -1.3467,  1.4069]],

        [[-0.9557,  0.6398, -0.5620,  0.4425],
         [-0.4646, -0.0563,  1.1198,  0.1933],
         [-0.7056,  0.1581, -1.5766,  0.5918]]])
gelu = 
 tensor([[[-0.1523,  0.6502, -0.1649, -0.0485],
         [ 0.6636,  0.6747, -0.1655, -0.1488],
         [-0.1628, -0.1480, -0.1199,  1.2947]],

        [[-0.1621,  0.4727, -0.1613,  0.2969],
         [-0.1492, -0.0269,  0.9726,  0.1114],
         [-0.1695,  0.0890, -0.0906,  0.4278]]])
relu = 
 tensor([[[0.0000, 0.8192, 0.0000, 0.0000],
         [0.8322, 0.8429, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000, 1.4069]],

        [[0.0000, 0.6398, 0.0000, 0.4425],
         [0.0000, 0.0000, 1.1198, 0.1933],
         [0.0000, 0.1581, 0.0000, 0.5918]]])


In [40]:
class FeedForward(nn.Module):
    """Feed-forward sub-network: Linear -> GELU -> Linear.

    The first linear layer widens the last dimension from ``d_in`` to
    ``expansion * d_in``; the second projects it back to ``d_in``, so the
    output has the same shape as the input.

    Args:
        d_in: Size of the last dimension of the input (and of the output).
        expansion: Multiplier for the hidden width. Defaults to 4, which
            reproduces the previously hard-coded ``4 * d_in`` hidden size.
    """

    def __init__(self, d_in, expansion=4) -> None:
        super().__init__()
        d_hidden = expansion * d_in
        # Attribute name and layer order are kept so state_dict keys
        # ("layers.0.*", "layers.2.*") stay unchanged.
        self.layers = nn.Sequential(
            nn.Linear(d_in, d_hidden),
            nn.GELU(),
            nn.Linear(d_hidden, d_in),
        )

    def forward(self, x):
        """Apply the feed-forward stack to ``x`` along its last dimension."""
        return self.layers(x)

In [42]:
# Smoke test: push a random (2, 3, 768) tensor through FeedForward and show the output shape
x = torch.randn((2, 3, 768))
ffn = FeedForward(768)

y = ffn(x)
print(y.shape)

torch.Size([2, 3, 768])


In [43]:
class TransformerBlock(nn.Module):
    """One transformer block: attention and feed-forward, each with a residual shortcut.

    Both sub-layers follow the same pattern: LayerNorm -> sub-layer ->
    dropout -> add the sub-layer's input back (normalisation is applied
    *before* the sub-layer).

    Args:
        cfg: Mapping that must provide ``'embed_dim'``, ``'context_length'``,
            ``'n_heads'``, ``'drop_rate'`` and ``'qkv_bias'``. Other keys are
            ignored by this block.
    """

    def __init__(self, cfg) -> None:
        super().__init__()
        # Only the keys this block actually uses are read from cfg.
        context_length = cfg['context_length']
        embed_dim = cfg['embed_dim']
        n_heads = cfg['n_heads']
        drop_rate = cfg['drop_rate']
        qkv_bias = cfg['qkv_bias']

        self.ff = FeedForward(embed_dim)
        # MultiHeadAttention comes from llm.components; its internals are not
        # shown in this notebook. The first two arguments are both embed_dim.
        self.attn = MultiHeadAttention(embed_dim, embed_dim, context_length, n_heads, drop_rate, qkv_bias = qkv_bias)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        # A single Dropout module is shared by both sub-layers.
        self.dropout = nn.Dropout(drop_rate)

    def forward(self, x):
        """Run ``x`` through the attention and feed-forward sub-layers.

        The result is ``x`` plus two residual updates, so the output shape
        matches the input shape whenever the sub-layers preserve shape.
        """
        # Attention sub-layer with residual connection
        shortcut = x
        x = self.norm1(x)
        x = self.attn(x)
        x = self.dropout(x)
        x = x + shortcut

        # Feed-forward sub-layer with residual connection
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.dropout(x)
        x = x + shortcut

        return x

In [44]:
# Smoke test: build one block from the shared config and show the output shape
test_block = TransformerBlock(cfg=GPT_CONFIG)

x = torch.rand((2, 3, 768))
y = test_block(x)

print(y.shape)

torch.Size([2, 3, 768])


In [45]:
class GPTModel(nn.Module):
    """Token + positional embeddings -> stacked TransformerBlocks -> LayerNorm -> linear head.

    NOTE: this definition shadows the ``GPTModel`` imported from
    ``llm.components`` in the import cell; after this cell runs, the name
    ``GPTModel`` refers to this class.

    Args:
        cfg: Mapping that must provide ``'vocab_size'``, ``'context_length'``,
            ``'embed_dim'``, ``'n_layers'`` and ``'drop_rate'``. The same
            mapping is passed unchanged to every ``TransformerBlock``, which
            reads its own keys from it.
    """

    def __init__(self, cfg):
        super().__init__()
        # Only the keys this class itself uses are read here.
        vocab_size = cfg['vocab_size']
        context_length = cfg['context_length']
        embed_dim = cfg['embed_dim']
        n_layers = cfg['n_layers']
        drop_rate = cfg['drop_rate']

        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = nn.Embedding(context_length, embed_dim)
        self.drop_emb = nn.Dropout(drop_rate)

        self.transformer_blocks = nn.Sequential(*[TransformerBlock(cfg) for _ in range(n_layers)])
        self.final_norm = nn.LayerNorm(embed_dim)
        self.out_head = nn.Linear(embed_dim, vocab_size, bias = False)

    def forward(self, x: torch.Tensor)->torch.Tensor:
        """Map token ids to logits.

        Args:
            x: 2-D tensor of token ids, unpacked as ``(batch, num_tokens)``.
                ``num_tokens`` must not exceed ``cfg['context_length']``,
                because positions index into the positional-embedding table.

        Returns:
            Tensor of shape ``(batch, num_tokens, vocab_size)``.
        """
        _, num_tokens = x.shape

        token_embeddings = self.token_emb(x)

        # Positions 0..num_tokens-1, created on the same device as the input
        pos_embeddings = self.pos_emb(torch.arange(num_tokens, device = x.device))

        # pos_embeddings is (num_tokens, embed_dim) and broadcasts over the batch dimension
        x = token_embeddings + pos_embeddings
        x = self.drop_emb(x)
        x = self.transformer_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [46]:
# Instantiate the full model from the shared configuration
model = GPTModel(cfg=GPT_CONFIG)

In [50]:
# Forward a random batch: 2 sequences of 4 token ids drawn from [0, 1000)
x = torch.randint(low=0, high=1000, size=(2, 4))

out = model(x)
print(out.shape)


torch.Size([2, 4, 50257])


In [52]:
# Count every element of every parameter tensor registered on the model
total_params = 0
for param in model.parameters():
    total_params += param.numel()

print(f"total number of params; {total_params:,}")

total number of params; 163,009,536


# Text generation

In [55]:
# Slicing demo: `[:, -3:]` keeps the last three columns of every row
a = torch.rand((2, 5))
print(a, a[:, -3:], sep='\n')

tensor([[0.0856, 0.6913, 0.4619, 0.3784, 0.4054],
        [0.6121, 0.0259, 0.9582, 0.5707, 0.4206]])
tensor([[0.4619, 0.3784, 0.4054],
        [0.9582, 0.5707, 0.4206]])


In [61]:
# Toy version of the generation loop: random ids stand in for model predictions
idx = torch.randint(low=0, high=100, size=(1, 4))
print(idx)

for _ in range(10):
    # Crop to the three most recent ids (shown for illustration; not used below)
    idx_cond = idx[:, -3:]
    # Random stand-in for the next predicted token id
    idx_next = torch.randint(low=0, high=100, size=(1, 1))

    # Append the new id as one more column
    idx = torch.cat((idx, idx_next), dim=1)
    print(idx)

tensor([[ 8, 41, 89, 69]])
tensor([[ 8, 41, 89, 69, 53]])
tensor([[ 8, 41, 89, 69, 53, 76]])
tensor([[ 8, 41, 89, 69, 53, 76, 90]])
tensor([[ 8, 41, 89, 69, 53, 76, 90, 29]])
tensor([[ 8, 41, 89, 69, 53, 76, 90, 29, 10]])
tensor([[ 8, 41, 89, 69, 53, 76, 90, 29, 10, 20]])
tensor([[ 8, 41, 89, 69, 53, 76, 90, 29, 10, 20, 22]])
tensor([[ 8, 41, 89, 69, 53, 76, 90, 29, 10, 20, 22, 15]])
tensor([[ 8, 41, 89, 69, 53, 76, 90, 29, 10, 20, 22, 15,  8]])
tensor([[ 8, 41, 89, 69, 53, 76, 90, 29, 10, 20, 22, 15,  8, 95]])


In [66]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily append ``max_new_tokens`` token ids to ``idx``.

    At every step the most recent ``context_size`` ids are fed to ``model``,
    the logits of the last position are taken, and the id with the highest
    score is appended.

    Args:
        model: Callable module mapping a 2-D tensor of token ids to logits
            indexed as ``[batch, position, vocab]``.
        idx: 2-D tensor of token ids; new ids are concatenated along the
            last dimension.
        max_new_tokens: Number of ids to append.
        context_size: Maximum number of trailing ids passed to the model
            at each step.

    Returns:
        ``idx`` extended by ``max_new_tokens`` columns.

    The model is switched to eval mode while generating. Afterwards its
    previous mode is restored, also when an exception is raised: a model
    that was already in eval mode stays in eval mode instead of being
    forced into training mode.
    """
    # Objects without a `training` flag are treated as "was training", which
    # matches the earlier behaviour of always calling train() afterwards.
    was_training = getattr(model, 'training', True)
    model.eval()
    try:
        for _ in range(max_new_tokens):
            # Keep only the most recent `context_size` ids
            idx_cond = idx[:, -context_size:]

            with torch.no_grad():
                logits = model(idx_cond)

            # Only the prediction for the last position is needed
            logits = logits[:, -1, :]
            # Softmax is monotonic, so it preserves the ordering of the logits;
            # it is not necessary for argmax and is kept to make the
            # probabilities explicit.
            probas = torch.softmax(logits, dim = -1)

            idx_next = torch.argmax(probas, dim = -1, keepdim = True)

            idx = torch.cat((idx, idx_next), dim = -1)
    finally:
        if was_training:
            model.train()

    return idx

In [6]:
# Generate a continuation for a sample prompt with a freshly initialised model
sample_input = "hello, my name is"
model = GPTModel(GPT_CONFIG)

# Encode the prompt and add a leading batch dimension of size 1
idx = tokenizer.encode(sample_input)
idx = torch.tensor(idx).unsqueeze(0)
print('original input = ', idx)

# The GPTModel class defined in this notebook has no `generate_text_simple`
# method, so call the notebook-level function and pass the model explicitly.
pred_idx = generate_text_simple(model, idx, max_new_tokens=10, context_size=100)
print(pred_idx)

# Drop the batch dimension and decode the ids back to text
decoded = tokenizer.decode(pred_idx.squeeze(0).tolist())
print(decoded)

original input =  tensor([[31373,    11,   616,  1438,   318]])
tensor([[31373,    11,   616,  1438,   318,  2499, 47413,  3903,  4336, 21597,
         48297,  4549, 34509, 29029, 34430]])
hello, my name is workshenyband fanmissible EmergingicagoLINherty cartel
