In [363]:
# import numpy as np
# import re
# class Tokenizer:
    
#     def __init__(self, vocab):
#         self.str_to_int = vocab # vocab is a mapping of tokens to token IDs
#         self.int_to_str = {i:s for i,s in vocab.items()}
    
#     def encode(self, text): # convert test to token Ids
#         processed = re.split(r'([:,.?_=+()-!";\'@#$%&*]|--|\s)', text)
#         processed = [item.strip() for item in processed if item.strip()] # removing the blank spaces
#         processed = [
#             item if item in self.str_to_int
#             else "<unk>" for item in preprocessed
#         ]
#         ids = [self.str__to_int[s] for s in processed]
#         return ids
    
#     def decode(self, ids): # convert token IDs to text
#         text = " ".join([self.int_to_str[i] for i in ids])
#         text = re.sub(r'\s+([,.?();":!])', r'\1',text)
#         return text 
    

In [364]:
!pip install tiktoken



In [365]:
# byte pair tokenization
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

In [366]:
# alice in wonderland book as dataset 
import requests

url = "https://www.gutenberg.org/cache/epub/7256/pg7256-images.html"  # stable Alice txt URL
response = requests.get(url)

# Write to file
with open("alice.txt", "w", encoding="utf-8") as f:
    f.write(response.text)

# Now read from file
with open("alice.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [367]:
import torch
print(torch.__file__)        # Path to actual torch package
print(torch.__version__)     # Should show version string
print(torch.cuda.is_available())


C:\Users\Siddh\anaconda_for_py\Lib\site-packages\torch\__init__.py
2.7.0+cpu
False


In [368]:
import torch
from torch.utils.data import Dataset, DataLoader
class Dataprep(Dataset):
    
    def __init__(self, text,tokenizer, max_length, stride):
        self.input_tensor = []
        self.target_tensor = []
        
        token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
        
        for i in range(0, len(token_ids)-max_length, stride):
            input_window = token_ids[i:i+max_length]
            target_window = token_ids[i+stride:i+max_length+stride]
            self.input_tensor.append(torch.tensor(input_window))
            self.target_tensor.append(torch.tensor(target_window))
            
    def __len__(self):
        return len(self.input_tensor)
    
    def __getitem__(self, idx):
        return self.input_tensor[idx] , self.target_tensor[idx]

In [369]:
def create_dataloader(text, batch_size=4, max_length=256, stride=128, drop_last=True, shuffle=False):
    
        tokenizer = tiktoken.get_encoding("gpt2")
        dataset = Dataprep(text, tokenizer, max_length, stride)
        
        dataloader = DataLoader(   
            dataset,
            batch_size=batch_size,
            drop_last=drop_last,
            shuffle=shuffle
            )
        return dataloader

In [370]:
print(len(tokenizer.encode(text)))
print(len(text))

11071
37345


In [371]:
# creating the inputs and targets tensor of dims 8x4
dataloader = create_dataloader(text, batch_size=8, max_length=4, stride=4, shuffle=False)
data_iter = iter(dataloader)
inputs , targets = next(data_iter)

In [372]:
# vector embeddings
vocab_size = 50257 # gpt2 had 50257 tokens
embed_dim = 256
vec_embed_layer = torch.nn.Embedding(vocab_size, embed_dim)
vector_embeddings = vec_embed_layer(inputs)

# positional encodings
max_length = 4
pos_encoding_layer = torch.nn.Embedding(max_length, embed_dim)
pos_encodings = pos_encoding_layer(torch.arange(max_length))

# input embeddings 
input_embed = vector_embeddings + pos_encodings
input_embed.shape

torch.Size([8, 4, 256])

In [373]:
class self_attention(torch.nn.Module):
    
    def __init__(self, din, dout, qkv_bias=False):
        super().init()
        self.Wq = torch.nn.Linear(din, dout, bias=qkv_bias)
        self.Wk = torch.nn.Linear(din, dout, bias=qkv_bias)
        self.Wv = torch.nn.Linear(din, dout, bias=qkv_bias)

        
    def forward(self, x):
        Q = self.Wq(x)
        K = self.Wk(x)
        V = self.Wv(x)
        
        attention_scores = Q @ K.T
        atten_weights = torch.softmax(attention_scores / K.shape[-1]**0.5, dim = -1)
        context_vector = atten_weights @ V
        return context_vector

In [374]:
# class CausalAttention(torch.nn.Module):
    
#     def __init__(self, din, dout, context_length, dropout_rate, qkv_bias=False):
#         super.init()
#         self.Wq = torch.nn.Linear(din, dout, bias=qkv_bias) # dout is the user defiend but din should strictly be equal to dim = embed_size for matrix mul
#         self.Wk = torch.nn.Linear(din, dout, bias=qkv_bias)
#         self.Wv = torch.nn.Linear(din, dout, bias=qkv_bias)
#         self.dropout = torch.nn.Dropout(dropout_rate)
#         self.register_buffer(torch.triu(torch.ones('mask',context_length, context_length), diagonal=1))
        
#     def forward(self, x):
#         batch, no_tokens, din = x.shape
#         Q = self.Wq(x)
#         K = self.Wk(x)
#         V = self.Wv(x)
        
#         attention_scores = Q @ K.transpose(1,2)
#         masked = attention_scores.masked_fill_(mask.bool()[:no_tokens, no_tokens], -torch.inf)
#         atten_weights = torch.softmax(masked / K.shape[-1] ** 0.5, dim=-1)
#         atten_weights = self.dropout(atten_weights)
#         context_vector = atten_weights @ V
        
#         return context_vector

In [375]:
# class MultiHeadAttention(torch.nn.Module):
    
#     def __init__(self, din, dout, context_length, num_heads, dropout_rate):
#         super().__init__()
#         self.head = torch.Modulelist(
#             [CausalAttention(din, dout, context_length, dropout_rate, qkv_bias) for _ in range(num_heads)]
#         )
        
#     def forward(self, x):
#         return torch.cat( [head[x] for head in self.head] , dim=-1)

In [376]:
class MaskedMultiheadAttention(nn.Module):
    
    def __init__(self, din, dout, context_length, num_heads, dropout_rate, qkv_bias=False):
        super().__init__()
        self.num_heads = num_heads
        self.dout = dout
        self.head_dim  = dout // num_heads
        self.Wq = torch.nn.Linear(din, dout, bias=qkv_bias) # dout is the user defiend but din should strictly be equal to dim = embed_size for matrix mul
        self.Wk = torch.nn.Linear(din, dout, bias=qkv_bias)
        self.Wv = torch.nn.Linear(din, dout, bias=qkv_bias)
        self.out_proj = torch.nn.Linear(dout, dout)
        self.dropout = torch.nn.Dropout(dropout_rate)
        
        mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer("mask", mask)
        
    def forward(self, x):
        batch, n_tokens, din = x.shape
        Q = self.Wq(x)
        K = self.Wk(x)
        V = self.Wv(x)
        
        Q = Q.view(batch, n_tokens, self.num_heads, self.head_dim)
        V = V.view(batch, n_tokens, self.num_heads, self.head_dim)
        K = K.view(batch, n_tokens, self.num_heads, self.head_dim)
        
        Q = Q.transpose(1,2)
        V = V.transpose(1,2)
        K = K.transpose(1,2)
        
        attention_scores = Q @ K.transpose(2,3)
        masked = attention_scores.masked_fill_(self.mask.bool()[:n_tokens, :n_tokens], -torch.inf)
        atten_weights = torch.softmax(masked/self.head_dim ** 0.5, dim=-1)
        atten_weights = self.dropout(atten_weights)
        context_vector = (atten_weights @ V).transpose(1,2)
        context_vector = context_vector.contiguous().view(batch, n_tokens, self.dout)
        context_vector =  self.out_proj(context_vector)
        
        return context_vector

In [377]:
cfg = {
    "vocab_size":50257,
    "context_length":256,
    "emb_dim":768,
    "n_layer":12,
    "n_head":12,
    "dropout":0.1,
    "qkv_bias":False
}

In [378]:
import torch.nn as nn
import numpy as np
class LayerNorm(nn.Module):
    
    def __init__(self, embed_dim):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(embed_dim))
        self.shift = nn.Parameter(torch.zeros(embed_dim))
        self.espilon = 1e-7
        
    def forward(self, x):
        mean = x.mean(axis=-1, keepdims=True)
        var = x.var(axis=-1, keepdims=True, unbiased=True)
        norm_x = (x - mean)/torch.sqrt(var + self.espilon )
        return self.scale * norm_x + self.shift
    
class GELU(nn.Module):
    
    def __init__(self):
        super().__init__()
        
    def forward(self, x):
        gelu = 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2/torch.pi))) * (x + 0.0447 * torch.pow(x,3)))
        return gelu
    
class FeedForward(nn.Module):
    
    def __init__(self, cfg):
        super().__init__()
        self.layer = nn.Sequential(
            nn.Linear(cfg['emb_dim'], 4*cfg['emb_dim']),
            GELU(),
            nn.Linear(4*cfg['emb_dim'], cfg['emb_dim'])
        )
        
    def forward(self, x):
        return self.layer(x)

In [379]:
class Transformer(nn.Module):
    
    def __init__(self, cfg):
        super().__init__()
        self.attention = MaskedMultiheadAttention(
            din = cfg["emb_dim"], 
            dout = cfg["emb_dim"], 
            context_length = cfg["context_length"], 
            num_heads = cfg["n_head"], 
            dropout_rate = cfg["dropout"],
            qkv_bias = cfg["qkv_bias"]
        )
        self.norm1 = LayerNorm(embed_dim = cfg["emb_dim"])
        self.norm2 = LayerNorm(embed_dim = cfg["emb_dim"])
        self.ff = FeedForward(cfg)
        self.drop = nn.Dropout(cfg["dropout"])
        
    def forward(self, x):
        shortcut = x
        x = self.norm1(x) # pre normalization results in better reults than post normalization
        x = self.attention(x)
        x = self.drop(x)
        x = x + shortcut
        
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop(x)
        x = x + shortcut
        
        return x

In [380]:
class GPT(nn.Module):
    
    def __init__(self, cfg):
        super().__init__()
        self.embed_layer = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_layer = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.dropout = nn.Dropout(cfg["dropout"])
        
        self.tf_blocks = nn.Sequential(
            *[Transformer(cfg) for _ in range(cfg["n_layer"])])
        
        self.norm = LayerNorm(cfg["emb_dim"])
        
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)
        
    def forward(self, in_ids):
        batch, seq_len = in_ids.shape
        token_embed = self.embed_layer(in_ids)
        pos_ids = torch.arange(seq_len, device=in_ids.device).unsqueeze(0).expand(batch, -1)
        pos_encoding = self.pos_layer(pos_ids)
        x = token_embed + pos_encoding
        x = self.dropout(x)
        x = self.tf_blocks(x)
        x = self.norm(x)
        logits = self.out_head(x)
        return logits     

In [381]:
def generate_next_tokens(idx, model, max_new_tokens, context_length, temperature=1.0, top_k=50, top_p=0.9):
    idx = idx.to(dtype=torch.long, device=model.pos_layer.weight.device)

    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_length:]
        with torch.no_grad():
            logits = model(idx_cond)

        logits = logits[:, -1, :] / temperature
        probs = torch.softmax(logits, dim=-1)

        # Top-k filtering
        if top_k is not None:
            top_k = min(top_k, probs.size(-1))  # Safety
            top_k_vals, top_k_indices = torch.topk(probs, top_k)
            probs_filtered = torch.zeros_like(probs).scatter_(1, top_k_indices, top_k_vals)
            probs = probs_filtered / probs_filtered.sum(dim=-1, keepdim=True)

        # Nucleus (top-p) filtering
        if top_p is not None:
            sorted_probs, sorted_indices = torch.sort(probs, descending=True)
            cum_probs = sorted_probs.cumsum(dim=-1)

            # Create mask for tokens to keep
            keep_mask = cum_probs <= top_p
            # Always keep at least 1 token
            keep_mask[..., 0] = 1

            filtered_probs = torch.zeros_like(probs)
            filtered_probs.scatter_(1, sorted_indices, keep_mask.float() * sorted_probs)
            probs = filtered_probs / filtered_probs.sum(dim=-1, keepdim=True)

        next_token = torch.multinomial(probs, num_samples=1)
        idx = torch.cat((idx, next_token), dim=1)

    return idx


In [382]:
model = GPT(cfg)
model_params = sum(p.numel() for p in model.parameters())
print(f"total model parameters: {model_params:,}")

total model parameters: 162,419,712


In [383]:
def crossentropyloss(input_batch, target_batch, model, device):
    input_batch , target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = nn.functional.cross_entropy(logits.flatten(0,1), target_batch.flatten())
    return loss

def calc_loss(dataloader, device, model, num_batches=None):
    total_loss = 0
    if len(dataloader) == 0:
        return float('nan')
    elif num_batches is None:
        num_batches = len(dataloader)
    else :
        num_batches = min(num_batches, len(dataloader))
    for i, (input_batch, target_batch) in enumerate(dataloader):
        if i < num_batches:
            total_loss += crossentropyloss(input_batch, target_batch, model, device).item()
        else :
            break
    return  total_loss / num_batches

In [384]:
train_split = 0.85
data_split  = int(train_split * len(text))
train_data = text[:data_split]
val_data = text[data_split:]

torch.manual_seed(123)
train_loader = create_dataloader(train_data, batch_size=4, max_length=cfg["context_length"], stride=cfg["context_length"], drop_last=True, shuffle=False)
val_loader = create_dataloader(val_data, batch_size=4, max_length=cfg["context_length"], stride=cfg["context_length"], drop_last=True, shuffle=False)

for x,y in train_loader:
    print(f"train_loader:{x.shape, y.shape}")
    
for x,y in val_loader:
    print(f"val_loader:{x.shape, y.shape}")


train_loader:(torch.Size([4, 256]), torch.Size([4, 256]))
train_loader:(torch.Size([4, 256]), torch.Size([4, 256]))
train_loader:(torch.Size([4, 256]), torch.Size([4, 256]))
train_loader:(torch.Size([4, 256]), torch.Size([4, 256]))
train_loader:(torch.Size([4, 256]), torch.Size([4, 256]))
train_loader:(torch.Size([4, 256]), torch.Size([4, 256]))
train_loader:(torch.Size([4, 256]), torch.Size([4, 256]))
train_loader:(torch.Size([4, 256]), torch.Size([4, 256]))
train_loader:(torch.Size([4, 256]), torch.Size([4, 256]))
val_loader:(torch.Size([4, 256]), torch.Size([4, 256]))


In [385]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device) # no assignment model = model.to(device) necessary for nn.Module classes


torch.manual_seed(123) # For reproducibility due to the shuffling in the data loader

with torch.no_grad(): # Disable gradient tracking for efficiency because we are not training, yet
    train_loss = calc_loss(train_loader, device, model)
    val_loss = calc_loss(val_loader, device,  model)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 10.992869483100044
Validation loss: 11.012042045593262


In [386]:
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())

In [387]:
def evaluate_model(model, device, eval_iter, train_loader, val_loader):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss(train_loader, device, model, num_batches=eval_iter)
        val_loss = calc_loss(val_loader, device, model, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss

In [391]:
def generate_print_sample(model, device, tokenizer, start_context):
    model.eval()
    context_size = model.pos_layer.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_next_tokens(
    model=model, 
    idx=encoded, 
    max_new_tokens=20, 
    context_length=context_size, 
    temperature=1.0, 
    top_k=50, 
    top_p=0.9
)
    decoded = token_ids_to_text(token_ids, tokenizer)
    print(decoded.replace("\n", " "))
    model.train()

In [389]:
def train_model(model, train_loader, val_loader, device, eval_iter, eval_freq, num_epochs, start_context, optimizer,tokenizer):
    
    train_loss, val_loss, tokens_seen = [], [], []
    tokens_seen1, global_step = 0, -1
    
    for epoch in range(num_epochs):
        model.train()
        
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            loss = crossentropyloss(input_batch, target_batch, model, device)
            loss.backward()
            optimizer.step()
            tokens_seen1 += input_batch.numel()
            global_step += 1
            
            if global_step % eval_freq == 0 :
                train_loss1, val_loss1 = evaluate_model(model, device, eval_iter, train_loader, val_loader)
                train_loss.append(train_loss1)
                val_loss.append(val_loss1)
                tokens_seen.append(tokens_seen1)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                    f"Train loss {train_loss[-1]:.3f}, Val loss {val_loss[-1]:.3f}")

                
        generate_print_sample(model, device, tokenizer, start_context)
    return train_loss, val_loss, tokens_seen

In [392]:
# Note:
# Uncomment the following code to calculate the execution time
import time
start_time = time.time()

torch.manual_seed(123)
model = GPT(cfg)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

num_epochs = 5
train_loss, val_loss, tokens_seen = train_model(
    model, train_loader, val_loader, device, eval_iter=5, eval_freq=5,
    num_epochs=num_epochs, start_context="There was a ", optimizer=optimizer, tokenizer=tokenizer
)

# Note:
# Uncomment the following code to show the execution time
end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 10.099, Val loss 10.863
Ep 1 (Step 000005): Train loss 7.941, Val loss 8.550
There was a        . >  >  the  .  
Ep 2 (Step 000010): Train loss 6.721, Val loss 7.750
Ep 2 (Step 000015): Train loss 5.974, Val loss 7.185
There was a  with ofp., of  the of: p,-:.   and the
Ep 3 (Step 000020): Train loss 5.836, Val loss 7.194
Ep 3 (Step 000025): Train loss 5.740, Val loss 7.269
There was a  on  � � the or� Gutenberg p  Gutenberg�   thep. 
Ep 4 (Step 000030): Train loss 5.710, Val loss 7.389
Ep 4 (Step 000035): Train loss 5.720, Val loss 7.453
There was a  the the.  a�   . p, with   >  you
Ep 5 (Step 000040): Train loss 5.682, Val loss 7.527
There was a .p. to>™<     for; of    the   of
Training completed in 16.63 minutes.
