# Import

In [1]:
import torch
from llm.components import MultiHeadAttention, GPTModel
from torch import nn
import tiktoken

# GPT config

In [2]:
# Hyperparameters shared by every model component defined in this notebook.
GPT_CONFIG = dict(
    vocab_size=50257,      # rows of the token embedding / width of the output head
    context_length=1024,   # rows of the positional embedding table
    embed_dim=768,         # embedding width used throughout the model
    n_heads=12,            # forwarded to MultiHeadAttention
    n_layers=12,           # number of stacked TransformerBlock instances
    drop_rate=0.1,         # probability used by every nn.Dropout
    qkv_bias=False,        # forwarded to MultiHeadAttention as qkv_bias
)

In [3]:
# BPE tokenizer using the 'gpt2' encoding.
tokenizer = tiktoken.get_encoding('gpt2')

# Sample batch: encode each prompt to a 1-D tensor of token ids, then stack
# the tensors along dim 0 so every row of `batch` is one prompt.
batch = torch.stack(
    [
        torch.tensor(tokenizer.encode(text))
        for text in ("Every effort moves you", "Every day holds a")
    ],
    dim=0,
)
print(batch.shape)

torch.Size([2, 4])


# Layer normalization

In [4]:
# Layer normalization demo: show a random (5, 3) tensor before and after it
# is normalized over its last dimension (size 3).
x = torch.rand(size=(5, 3))
print(x.shape, x, sep='\n')

layer_norm = nn.LayerNorm(normalized_shape=3)
x = layer_norm(x)

print(x)

torch.Size([5, 3])
tensor([[0.1954, 0.3820, 0.5577],
        [0.4217, 0.0082, 0.6459],
        [0.4851, 0.2461, 0.4296],
        [0.8600, 0.8492, 0.4592],
        [0.2715, 0.8509, 0.4746]])
tensor([[-1.2366,  0.0247,  1.2119],
        [ 0.2389, -1.3265,  1.0876],
        [ 0.9606, -1.3783,  0.4177],
        [ 0.7358,  0.6778, -1.4136],
        [-1.0866,  1.3270, -0.2405]], grad_fn=<NativeLayerNormBackward0>)


# GELU activation

In [5]:
# Compare GELU and ReLU on the same random input: ReLU zeroes every negative
# entry, whereas GELU lets small negative values through.
x = torch.randn(size=(2, 3, 4))
print('original = \n', x)

gelu = nn.GELU()
print('gelu = \n', gelu(x))
print('relu = \n', x.relu())

original = 
 tensor([[[ 0.8634,  0.6201,  1.2964, -0.4779],
         [ 1.1506, -1.5850, -0.4435, -0.3155],
         [ 0.8499, -0.2465, -0.1780, -0.2128]],

        [[-0.1979,  1.1857,  0.2796, -0.0232],
         [ 0.2557, -1.3692, -0.3413, -0.7217],
         [ 0.4607, -0.1092,  0.8432,  0.7599]]])
gelu = 
 tensor([[[ 0.6959,  0.4541,  1.1701, -0.1512],
         [ 1.0069, -0.0895, -0.1458, -0.1187],
         [ 0.6819, -0.0992, -0.0764, -0.0885]],

        [[-0.0834,  1.0460,  0.1706, -0.0114],
         [ 0.1537, -0.1170, -0.1251, -0.1698],
         [ 0.3121, -0.0498,  0.6749,  0.5899]]])
relu = 
 tensor([[[0.8634, 0.6201, 1.2964, 0.0000],
         [1.1506, 0.0000, 0.0000, 0.0000],
         [0.8499, 0.0000, 0.0000, 0.0000]],

        [[0.0000, 1.1857, 0.2796, 0.0000],
         [0.2557, 0.0000, 0.0000, 0.0000],
         [0.4607, 0.0000, 0.8432, 0.7599]]])


In [6]:
class FeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> GELU -> Linear.

    The hidden layer is four times as wide as the input, and the second
    linear layer maps back to ``d_in`` so the output's last dimension matches
    the input's.

    Args:
        d_in: Size of the last dimension of the input (and of the output).
    """

    def __init__(self, d_in) -> None:
        super().__init__()
        hidden_dim = 4 * d_in
        expand = nn.Linear(d_in, hidden_dim)
        activation = nn.GELU()
        project = nn.Linear(hidden_dim, d_in)
        self.layers = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the result."""
        return self.layers(x)

In [7]:
# Shape check: the feed-forward network must return a tensor with the same
# shape as its input.
x = torch.randn(size=(2, 3, 768))
ffn = FeedForward(d_in=768)
y = ffn(x)
print(y.shape)

torch.Size([2, 3, 768])


In [8]:
class TransformerBlock(nn.Module):
    """One transformer block: an attention sub-layer followed by a feed-forward sub-layer.

    Each sub-layer is applied as LayerNorm -> sub-layer -> dropout, and the
    result is added back to the sub-layer's input (residual connection).
    The same ``nn.Dropout`` instance is used after both sub-layers.

    Args:
        cfg: Mapping that provides 'embed_dim', 'context_length', 'n_heads',
            'drop_rate' and 'qkv_bias'. Other keys are ignored.
    """

    def __init__(self, cfg) -> None:
        super().__init__()
        embed_dim = cfg['embed_dim']
        drop_rate = cfg['drop_rate']

        self.ff = FeedForward(embed_dim)
        # embed_dim is passed as both of the first two positional arguments;
        # presumably these are the attention's input and output widths --
        # confirm against llm.components.MultiHeadAttention.
        self.attn = MultiHeadAttention(
            embed_dim,
            embed_dim,
            cfg['context_length'],
            cfg['n_heads'],
            drop_rate,
            qkv_bias=cfg['qkv_bias'],
        )
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(drop_rate)

    def forward(self, x):
        """Apply both sub-layers with residual connections and return the result."""
        # Attention sub-layer: normalize, attend, drop out, add the input back.
        x = x + self.dropout(self.attn(self.norm1(x)))

        # Feed-forward sub-layer: normalize, transform, drop out, add the input back.
        x = x + self.dropout(self.ff(self.norm2(x)))
        return x

In [9]:
# Shape check: a transformer block must preserve the shape of its input.
test_block = TransformerBlock(cfg=GPT_CONFIG)
x = torch.rand(size=(2, 3, 768))
y = test_block(x)
print(y.shape)

torch.Size([2, 3, 768])


In [10]:
class GPTModel(nn.Module):
    """GPT-style language model.

    Token and positional embeddings are summed, passed through dropout, a
    stack of ``TransformerBlock`` layers and a final LayerNorm, and projected
    to vocabulary-sized logits by a bias-free linear head.

    Args:
        cfg: Mapping that provides 'vocab_size', 'context_length',
            'embed_dim', 'n_layers' and 'drop_rate'. The same mapping is
            passed unchanged to every ``TransformerBlock``.
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size = cfg['vocab_size']
        context_length = cfg['context_length']
        embed_dim = cfg['embed_dim']
        n_layers = cfg['n_layers']
        drop_rate = cfg['drop_rate']

        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = nn.Embedding(context_length, embed_dim)
        self.drop_emb = nn.Dropout(drop_rate)

        self.transformer_blocks = nn.Sequential(*[TransformerBlock(cfg) for _ in range(n_layers)])
        self.final_norm = nn.LayerNorm(embed_dim)
        self.out_head = nn.Linear(embed_dim, vocab_size, bias = False)

    def forward(self, x: torch.Tensor)->torch.Tensor:
        """Map token ids of shape (batch, num_tokens) to logits.

        Args:
            x: 2-D integer tensor of token ids. ``num_tokens`` must not
                exceed the configured context length, because positions are
                looked up in ``pos_emb``.

        Returns:
            Tensor of shape (batch, num_tokens, vocab_size).
        """
        # Unpacking also enforces that the input is exactly 2-D.
        _, num_tokens = x.shape

        token_embeddings = self.token_emb(x)
        # Positions 0..num_tokens-1, created on the same device as the input.
        pos_embeddings = self.pos_emb(torch.arange(num_tokens, device = x.device))

        # pos_embeddings is (num_tokens, embed_dim) and broadcasts over the batch.
        x = token_embeddings + pos_embeddings
        x = self.drop_emb(x)
        x = self.transformer_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

    def generate_text_simple(self, ids, max_new_tokens, context_size):
        """Greedily append ``max_new_tokens`` token ids to ``ids``.

        Generation runs in eval mode under ``torch.no_grad()``. The mode the
        model was in before the call (train or eval) is restored afterwards,
        even if generation raises.

        Args:
            ids: Tensor of shape (batch, num_tokens) with the starting token ids.
            max_new_tokens: Number of tokens to append.
            context_size: Only the last ``context_size`` tokens are fed to
                the model at each step.

        Returns:
            Tensor of shape (batch, num_tokens + max_new_tokens).
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(max_new_tokens):
                    idx_cond = ids[:, -context_size:]

                    # Keep only the logits of the last position.
                    logits = self(idx_cond)[:, -1, :]

                    # Softmax is monotonic, so the argmax of the raw logits is
                    # the same token as the argmax of the probabilities.
                    idx_next = torch.argmax(logits, dim = -1, keepdim = True)

                    ids = torch.cat((ids, idx_next), dim = -1)
        finally:
            # Restore the caller's mode instead of forcing train mode.
            self.train(was_training)

        return ids
    


In [11]:
# Build the full model from the shared configuration.
model = GPTModel(cfg=GPT_CONFIG)

In [12]:
# Shape check: 2 sequences of 4 random token ids (drawn from [0, 1000))
# should produce one row of vocabulary-sized logits per token.
x = torch.randint(low=0, high=1000, size=(2, 4))
out = model(x)
print(out.shape)


torch.Size([2, 4, 50257])


In [13]:
# Count every element of every parameter tensor registered on the model.
total_params = sum(map(torch.Tensor.numel, model.parameters()))
print(f"total number of params; {total_params:,}")

total number of params; 163,009,536


# Text generation

In [14]:
# Slicing demo: `[:, -3:]` keeps the last three columns of every row. The
# generation loops use the same pattern to crop the context.
a = torch.rand(size=(2, 5))
print(a, a[:, -3:], sep='\n')

tensor([[0.1443, 0.4197, 0.9350, 0.4903, 0.1319],
        [0.3416, 0.0168, 0.6511, 0.4401, 0.4722]])
tensor([[0.9350, 0.4903, 0.1319],
        [0.6511, 0.4401, 0.4722]])


In [15]:
# Mock generation loop: random ids stand in for model predictions, to show
# how the sequence grows by one token per step while the context window
# (`idx_cond`) only ever holds the last three tokens.
idx = torch.randint(low=0, high=100, size=(1, 4))
print(idx)

for _ in range(10):
    idx_cond = idx[:, -3:]
    idx_next = torch.randint(low=0, high=100, size=(1, 1))

    idx = torch.cat((idx, idx_next), dim=1)
    print(idx)

tensor([[22, 46, 81, 76]])
tensor([[22, 46, 81, 76, 15]])
tensor([[22, 46, 81, 76, 15, 48]])
tensor([[22, 46, 81, 76, 15, 48, 96]])
tensor([[22, 46, 81, 76, 15, 48, 96, 43]])
tensor([[22, 46, 81, 76, 15, 48, 96, 43, 93]])
tensor([[22, 46, 81, 76, 15, 48, 96, 43, 93, 12]])
tensor([[22, 46, 81, 76, 15, 48, 96, 43, 93, 12, 25]])
tensor([[22, 46, 81, 76, 15, 48, 96, 43, 93, 12, 25, 43]])
tensor([[22, 46, 81, 76, 15, 48, 96, 43, 93, 12, 25, 43, 98]])
tensor([[22, 46, 81, 76, 15, 48, 96, 43, 93, 12, 25, 43, 98, 23]])


In [16]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily append ``max_new_tokens`` token ids to ``idx``.

    Generation runs in eval mode under ``torch.no_grad()``. The mode the
    model was in before the call (train or eval) is restored afterwards,
    even if generation raises.

    Args:
        model: Module that maps (batch, num_tokens) token ids to
            (batch, num_tokens, vocab_size) logits.
        idx: Tensor of shape (batch, num_tokens) with the starting token ids.
        max_new_tokens: Number of tokens to append.
        context_size: Only the last ``context_size`` tokens are fed to the
            model at each step.

    Returns:
        Tensor of shape (batch, num_tokens + max_new_tokens).
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(max_new_tokens):
                idx_cond = idx[:, -context_size:]

                # Keep only the logits of the last position.
                logits = model(idx_cond)[:, -1, :]

                # Softmax is monotonic, so the argmax of the raw logits is the
                # same token as the argmax of the probabilities.
                idx_next = torch.argmax(logits, dim = -1, keepdim = True)

                idx = torch.cat((idx, idx_next), dim = -1)
    finally:
        # Restore the caller's mode instead of forcing train mode.
        model.train(was_training)

    return idx

In [19]:
def generate(model, ids, max_new_tokens, context_size, temperature = 0.0, top_k = None):
    """Append ``max_new_tokens`` token ids to ``ids``, with optional top-k and temperature sampling.

    Generation runs in eval mode under ``torch.no_grad()``. The mode the
    model was in before the call is restored afterwards.

    Args:
        model: Module that maps (batch, num_tokens) token ids to
            (batch, num_tokens, vocab_size) logits.
        ids: Tensor of shape (batch, num_tokens) with the starting token ids.
        max_new_tokens: Number of tokens to append.
        context_size: Only the last ``context_size`` tokens are fed to the
            model at each step.
        temperature: If greater than 0, logits are divided by it and the next
            token is sampled from the resulting softmax distribution.
            Otherwise the highest-scoring token is chosen (greedy).
        top_k: If truthy, only the ``top_k`` highest logits of each row are
            kept; all others are set to -inf before sampling or argmax.
            Values larger than the vocabulary size are clamped.

    Returns:
        Tensor of shape (batch, num_tokens + max_new_tokens).
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # Crop the growing sequence (the `ids` argument, not a global)
                # to the supported context window.
                idx_cond = ids[:, -context_size:]

                # Keep only the logits of the last position.
                logits = model(idx_cond)[:, -1, :]

                if top_k:
                    k = min(top_k, logits.size(-1))
                    top_logits = torch.topk(logits, k).values
                    # topk sorts descending, so the last column is each row's
                    # cut-off. Slicing with -1: keeps shape (batch, 1) so it
                    # broadcasts per row for any batch size.
                    kth_largest = top_logits[:, -1:]
                    logits = logits.masked_fill(logits < kth_largest, float('-inf'))

                if temperature > 0.0:
                    # Temperature scaling, then sample from the distribution.
                    probs = torch.softmax(logits / temperature, dim = -1)
                    idx_next = torch.multinomial(probs, num_samples=1)
                else:
                    idx_next = torch.argmax(logits, dim = -1, keepdim = True)

                # NOTE: early stopping on an end-of-sequence token is not implemented.
                ids = torch.cat((ids, idx_next), dim = 1)
    finally:
        model.train(was_training)

    return ids

In [20]:
sample_input = "hello, my name is"
model = GPTModel(GPT_CONFIG)

# Encode the prompt and add a leading batch dimension of size 1.
idx = torch.tensor(tokenizer.encode(sample_input)).unsqueeze(0)
print('original input = ', idx)

# Greedy decoding (default temperature of 0.0). The result is not printed
# and is overwritten by the sampled run below.
pred_idx = generate(model, idx, 10, 100)

# Sampling with temperature scaling and top-k filtering. The model is
# untrained, so the decoded continuation is expected to be gibberish.
pred_idx = generate(model, idx, 10, 100, temperature=2.0, top_k = 10)
print(pred_idx)
decoded = tokenizer.decode(pred_idx[0].tolist())
print(decoded)

original input =  tensor([[31373,    11,   616,  1438,   318]])
tensor([[31373,    11,   616,  1438,   318,  9691, 32850,  9691, 14878, 32850,
         32850, 20788, 14838, 27446, 27446]])
hello, my name is mining MF mining Pokemon MF MF appetite unlockedepheph


# Decoding strategies: this is from chapter 5, but I put it here for convenience

In [None]:
# Turn random logits into probabilities, then pick the next token two ways.
next_token_logits = torch.randn(size=(10,))
print('next token logits = ', next_token_logits)
probas = next_token_logits.softmax(dim=0)
print('probas = ', probas)

# Greedy: always the most probable token.
next_token_id = probas.argmax().item()
print(next_token_id)

# Multinomial: sample a token in proportion to its probability.
print("multi nomial")
next_token_id = torch.multinomial(probas, num_samples=1).item()
print(next_token_id)

In [None]:
# Top K sampling

print("logits = ", next_token_logits)
top_k = 3
top_logits, top_pos = next_token_logits.topk(top_k)
print(top_logits)
print(top_pos)

# topk returns its values in descending order, so the last one is the
# smallest logit that is kept. Every logit below it is replaced with -inf;
# the rest are left unchanged.
new_logits = torch.where(
    next_token_logits < top_logits[-1],
    torch.tensor(float('-inf')),
    next_token_logits,
)

print(new_logits)

In [None]:
def temperature_scaling(logits, temperature, dim = 0):
    """Divide ``logits`` by ``temperature`` and return the softmax probabilities.

    Temperatures greater than 1 result in more uniformly distributed token
    probabilities, and temperatures smaller than 1 result in more confident
    (sharper, more peaked) distributions. A temperature of 1 is the same as
    plain softmax.

    Args:
        logits: Tensor of unnormalized scores.
        temperature: Positive scaling factor.
        dim: Dimension over which softmax is computed. Defaults to 0, which
            suits a 1-D logits vector; pass -1 for batched logits.

    Returns:
        Tensor of the same shape as ``logits`` whose entries sum to 1 along ``dim``.

    Raises:
        ValueError: If ``temperature`` is not greater than 0. Dividing by
            zero would produce inf/NaN probabilities.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")

    scaled = logits / temperature
    return torch.softmax(scaled, dim = dim)

print(temperature_scaling(torch.randn(5), 1))
