### GPT-0

In [1]:
import re
import torch
import torch.nn as nn
import urllib.request
import tiktoken
import numpy as np
from torch.utils.data import Dataset, DataLoader

In [2]:
SEED = 123
URL=("https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt")
filepath='the-verdict.txt'
urllib.request.urlretrieve(URL, filepath)
torch.manual_seed(SEED)

<torch._C.Generator at 0x284fe8a24f0>

In [3]:
class Encoder():
    def __init__(self, href):
        self.href= href
        self.rawtext=""
        self.mapping = []
        self.inv_mapping = []

    def __str__(self):
        return self.rawtext
    
    def tokenize(self):
        if not self.mapping:
            with open(self.href) as file:
                self.rawtext = file.read()
                result=re.split(r'([:;,.?!"()]|--|\s)', self.rawtext)
                rstripped=map(lambda x: x.strip(), result)
                final=[x for x in rstripped if x!='']
                dictionary={word: idx for idx, word in enumerate(set(final))}
                self.mapping = dictionary
                result=[self.mapping[tk] for tk in final if tk in self.mapping]
                self.inv_mapping = {self.mapping[key]: key for key in self.mapping}
                return result

    def encode(self, text):
        result=re.split(r'([:;,.?!"()]|--|\s)', text)
        rstripped=map(lambda x: x.strip(), result)
        final=[x for x in rstripped if x!='']
        result=[self.mapping[tk] for tk in final if tk in self.mapping]
        return result

    def decode(self, ids):
        result = [self.inv_mapping[tk] for tk in ids if tk in self.inv_mapping]
        return result

In [4]:
tokenizer = Encoder(filepath)
tokenizer.tokenize()
print(tokenizer.decode(tokenizer.encode(""""It's the last he painted, you know," Mrs. Gisburn said with pardonable pride.""")))

['"', "It's", 'the', 'last', 'he', 'painted', ',', 'you', 'know', ',', '"', 'Mrs', '.', 'Gisburn', 'said', 'with', 'pardonable', 'pride', '.']


In [5]:
class GPTDataset(Dataset):
    def __init__(self, text, tokenizer, max_length, stride):
        self.inputs=[]
        self.preds=[]
        enc=tokenizer.encode(text)
        print(f'Encoding length {len(enc)}')

        for i in range(0, len(enc)-max_length, stride):
            self.inputs.append(torch.tensor(enc[i: i+max_length]))
            self.preds.append(torch.tensor(enc[i+1: i+1+max_length]))

    def __getitem__(self, index):
        return self.inputs[index], self.preds[index]

    def __len__(self):
        return len(self.inputs)

In [6]:
#Byte pair Encoding
ttokenizer=tiktoken.get_encoding("gpt2")
print(ttokenizer.decode(ttokenizer.encode("Hello World")))

Hello World


In [8]:
def createDataLoader(rawtext, ttokenizer, batch_size, max_length, stride):
    dataset = GPTDataset(rawtext, ttokenizer, max_length, stride)
    trainloader = DataLoader(
        dataset=dataset,
        batch_size= batch_size, 
        shuffle= False, 
        num_workers=0, 
        drop_last=True
    )
    return trainloader

### Embeddings

In [9]:
max_length=4
dataloader = createDataLoader(tokenizer.rawtext, ttokenizer, 8, 4, max_length)
data_iter=iter(dataloader)
batch_one_inputs, batch_one_preds=next(data_iter)
print(batch_one_inputs)

vocab_size = 50527
output_dim = 256

tk_embedding_layer=torch.nn.Embedding(vocab_size, output_dim)
#initial weights
print(tk_embedding_layer.weight)
#token level embeddings
tk_embeddings = tk_embedding_layer(batch_one_inputs)
print(tk_embeddings.shape)

#Positional embeddings
pos_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

#Input embeddings
input_embeddings = pos_embeddings + tk_embeddings
print(input_embeddings.shape)

Encoding length 5145
tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Parameter containing:
tensor([[-0.3035, -0.5880,  0.3486,  ..., -0.0522, -1.0565,  1.1510],
        [-1.3354, -2.9340,  0.1141,  ...,  0.9417, -0.3591,  0.0168],
        [-0.1350, -0.5183,  0.2326,  ...,  0.5226,  0.5430,  1.8613],
        ...,
        [-1.0602,  0.2780, -2.7081,  ...,  2.1562, -0.2877, -0.6318],
        [-1.0330,  0.2692, -0.8864,  ...,  0.5791, -0.6039, -1.0414],
        [-1.0987,  0.2705,  0.2435,  ...,  0.4270,  0.3188, -0.8022]],
       requires_grad=True)
torch.Size([8, 4, 256])
torch.Size([4, 256])
torch.Size([8, 4, 256])


Attention Mechanism

$X \in {(batch\_size, seq\_len, dim\_in)}$  
$M_q, M_k, M_v \in {(dim\_in, dim\_out)}$  
$q, k, v \in {(batch\_size, seq\_len, dim\_out)}$  
  
$attn\_scores, attn\_weights \in {(batch\_size, seq\_len, seq\_len)}$  
$cntxt\_vec \in {(batch\_size, seq\_len, dim\_out)}$  

For MHA  
$q, k, v \in {(batch\_size, seq\_len, num\_heads, dim\_out//num\_heads)}$ 


In [31]:
class SelfAttention(nn.Module):
    def __init__(self, dim_in, dim_out, seq_len, dropout):
        super(SelfAttention, self).__init__()
        shape = (dim_in, dim_out)
        self.M_k = nn.Parameter(torch.rand(shape))
        self.M_q = nn.Parameter(torch.rand(shape))
        self.M_v = nn.Parameter(torch.rand(shape))
        self.dropout = nn.Dropout(dropout)
        self.mask = torch.tril(torch.ones((seq_len, seq_len)))
    
    def forward(self, X):
        queries = X @ self.M_q
        keys = X @ self.M_k
        values = X @ self.M_v

        #compute attention_scores
        attn_scores=torch.softmax((queries @ keys.transpose(1,2))/keys.shape[-1]**0.5, dim=-1)
        
        #mask the attention_scores
        masked_attn_scores = self.mask * attn_scores
        #print(f"MASKED ATTENTION \n{masked_attn_scores}")

        #normalize attention weights
        row_sums = masked_attn_scores.sum(dim=-1, keepdim=True)
        normed_attn_weights = masked_attn_scores / row_sums
        #print(f"NORMED ATTENTION \n{normed_attn_weights}")

        #apply dropout
        normed_attn_weights = self.dropout(normed_attn_weights)

        content_vec = normed_attn_weights @ values
        return content_vec

In [None]:
class NaiveMultiHeadAttention(nn.Module):
    def __init__(self, dim_in, dim_out, seq_len, dropout, num_heads):
        super().__init__()
        assert dim_out % num_heads ==0
        self.heads = nn.ModuleList([SelfAttention(dim_in, dim_out, seq_len, dropout) for _ in range(num_heads)])
    def forward(self, X):
        return torch.cat([head(X) for head in self.heads], dim=-1)

In [35]:
class MultiHeadAttention(nn.Module):
    def __init__(self, dim_in, dim_out, seq_len, dropout, num_heads):
        super().__init__()
        
        assert dim_out % num_heads ==0
        shape=(dim_in, dim_out)
        self.num_heads=num_heads
        self.M_k = nn.Parameter(torch.rand(shape))
        self.M_q = nn.Parameter(torch.rand(shape))
        self.M_v = nn.Parameter(torch.rand(shape))
        self.dropout = nn.Dropout(dropout)
        self.mask = torch.triu(torch.ones(seq_len, seq_len) * float('-inf'), diagonal=1)
        self.out_proj = nn.Linear(dim_out, dim_out)
        
    def forward(self, X):
        queries = X @ self.M_q
        keys = X @ self.M_k
        values = X @ self.M_v

        (batch_size, seq_len, dim_out) = queries.shape
        queries=queries.view(batch_size, seq_len, self.num_heads, dim_out//self.num_heads)
        keys=keys.view(batch_size, seq_len, self.num_heads, dim_out//self.num_heads)
        values=values.view(batch_size, seq_len, self.num_heads, dim_out//self.num_heads)

        queries=queries.transpose(1, 2)
        keys=keys.transpose(1, 2)
        values=values.transpose(1, 2)

        attn_scores=queries @ keys.transpose(2, 3)
        attn_scores = attn_scores + self.mask.unsqueeze(0).unsqueeze(0)
        
        attn_weights=torch.softmax((attn_scores)/keys.shape[-1]**0.5, dim=-1)
        attn_weights=self.dropout(attn_weights)
        
        content_vec = (attn_weights @ values).transpose(1, 2)
        content_vec = content_vec.contiguous().view(batch_size, seq_len, dim_out)
        content_vec = self.out_proj(content_vec)
        return content_vec

In [37]:
inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)
batch = torch.stack((inputs, inputs), dim=0)
#print(batch.shape)

context_length = batch.shape[1] # This is the number of tokens
d_in, d_out = 3, 2

ca = SelfAttention(d_in, d_out, context_length, 0.0)
context_vecs = ca(batch)
#print("context_vecs.shape:", context_vecs.shape)


mha = MultiHeadAttention(
    d_in, d_out, context_length, 0.0, num_heads=2
)
context_vecs = mha(batch)

print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

tensor([[[-0.5624, -1.4754],
         [-0.6086, -1.5550],
         [-0.6216, -1.5771],
         [-0.5111, -1.4504],
         [-0.4654, -1.3927],
         [-0.4317, -1.3593]],

        [[-0.5624, -1.4754],
         [-0.6086, -1.5550],
         [-0.6216, -1.5771],
         [-0.5111, -1.4504],
         [-0.4654, -1.3927],
         [-0.4317, -1.3593]]], grad_fn=<ViewBackward0>)
context_vecs.shape: torch.Size([2, 6, 2])


In [69]:
GPT_CONFIG_124M={
    "vocab_size": 50527,
    "seq_len": 1024, 
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1, 
    "qkv_bias": False
}

In [None]:
class DummyTransformerBlock(nn.Module):
    def __init__(self, cnfg):
        super().__init__()
    def forward(self, x):
        return x

class GeLU(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self, x):
        return 0.5 * x * (1+torch.tanh(torch.sqrt(torch.tensor(2.0/torch.pi)) * (x+0.044715 *torch.pow(x, 3))))

class FeedForward(nn.Module):
    def __init__(self, cnfg, use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut
        self.layers = nn.Sequential(
            nn.Linear(cnfg["emb_dim"], 4*cnfg["emb_dim"]), 
            GeLU(), 
            nn.Linear( 4*cnfg["emb_dim"],  cnfg["emb_dim"])
        )
    def forward(self, x):
        return self.layers(x)

class LayerNorm(nn.Module):
    def __init__(self, normalized_size, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(normalized_size))
        self.shift = nn.Parameter(torch.ones(normalized_size))

    def forward(self, x):
        mean = torch.mean(x, dim=-1, keepdim=True)
        var = torch.var(x, dim=-1, keepdim=True, unbiased=False)
        normed_x = (x-mean)/(var+self.eps)**0.5
        return self.scale * normed_x + self.shift
    
class TransformerBlock(nn.Module):
    def __init__(self, cnfg):
        super().__init__()

        #prelayer norm


class GPTModel(nn.Module):
    def __init__(self, cnfg):
        super().__init__()
        self.tok_embed = torch.nn.Embedding(cnfg["vocab_size"], cnfg["emb_dim"])
        self.pos_embed = torch.nn.Embedding(cnfg["seq_len"], cnfg["emb_dim"])
        self.drop_embed = nn.Dropout(cnfg["drop_rate"])
        self.trf_blocks = nn.Sequential(*[TransformerBlock(cnfg) for i in range(cnfg["n_layers"])])
        self.final_norm = LayerNorm(cnfg["emb_dim"])
        self.out_head = nn.Linear(cnfg["emb_dim"], cnfg['vocab_size'])

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tk_embeddings=self.tok_embed(in_idx)
        pos_embeddings = self.pos_embed(torch.arange(seq_len, device=in_idx.device))
        embedding_sum = tk_embeddings + pos_embeddings
        x = self.drop_embed(embedding_sum)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [74]:
#Byte pair Encoding
ttokenizer=tiktoken.get_encoding("gpt2")
batch = []
txt1 = "Every effort moves you"
txt2 = "Every day holds a"

batch.append(torch.tensor(ttokenizer.encode(txt1)))
batch.append(torch.tensor(ttokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)
print(batch)

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
out = model(batch)
print("Input batch:\n", batch)
print("\nOutput shape:", out.shape)
print(out)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])
Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape: torch.Size([2, 4, 50527])
tensor([[[-0.7788,  1.0670, -0.4077,  ...,  1.0887,  0.5278, -0.4219],
         [-0.1521,  0.3581,  0.0937,  ...,  0.7276,  0.1815, -0.9721],
         [-0.2564,  0.0832,  0.6153,  ...,  0.6698, -0.9098, -0.2385],
         [ 0.7458,  0.0554,  0.8036,  ...,  0.5722, -0.0747,  0.0744]],

        [[-1.0281,  0.7760, -0.2915,  ...,  0.5905,  0.4172, -0.1146],
         [-0.6563, -0.2283,  0.3421,  ...,  1.1885,  0.0478,  0.0974],
         [-0.5600, -1.0569,  0.2629,  ...,  0.7063, -0.1399, -0.7304],
         [-0.0746, -0.5928,  0.6964,  ...,  1.7611,  0.2118, -1.3779]]],
       grad_fn=<ViewBackward0>)
