In [188]:
GPT_CONFIG_124M = {
    "vocab_size" : 50257 , #vocabulary size
    "context_length" : 256, #context length{orignal = 1024}
    "emb_dim"  : 768, #output embeddings dimension
    "n_heads" : 12, #number of attention heads
    "n_layers" : 12, # number of layers
    "drop_rate" : 0.1, #dropout rate 
    "qkv_bias" : False #key - Query -value bias
}

starting with tokenizing

In [189]:
import tiktoken

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())

start_context = "Every effort moves you"



token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"]
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves youEvery feminists Sadfellipes Being accumulation Burn famousMine


In [190]:
from torch.utils.data import DataLoader,  Dataset
class TextDataset(Dataset):
    def __init__(self, text, tokenizer,max_len,stride):
        self.text = text
        self.tokenizer = tokenizer
        idx=tokenizer.encode(text)
        self.input_ids=[]
        self.target_ids=[]
        for i in range(0,len(idx)-max_len,stride):
          self.input_ids.append(idx[i:i+max_len])
          self.target_ids.append(idx[i+1:i+max_len+1])
    def __getitem__(self, index):
        return torch.tensor(self.input_ids[index]),torch.tensor(self.target_ids[index])
    def __len__(self):
      return len(self.input_ids)
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):

    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset =   TextDataset(text=txt, tokenizer=tokenizer, max_len=max_length, stride=stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader


In [191]:
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        """
        Multi-Head Attention mechanism
        
        Args:
        - d_in: Input dimension (embedding size of each token)
        - d_out: Output dimension (total features after attention)
        - context_length: Number of tokens in a sequence
        - dropout: Dropout rate
        - num_heads: Number of attention heads
        - qkv_bias: Whether to use bias in Q, K, V projections
        """
        super().__init__()
        self.d_out = d_out  # Total output dimension
        self.num_heads = num_heads  # Number of heads
        self.head_dim = d_out // num_heads  # Dimension per head (each head processes this much info)
        
        # Linear layers for query, key, and value projections
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        
        # Final projection layer after multi-head attention
        self.out_proj = nn.Linear(d_out, d_out)
        
        # Dropout for regularization
        self.dropout = nn.Dropout(dropout)
        
        # Upper triangular mask for causal attention (prevents attending to future tokens)
        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))
        
    def forward(self, x):
        """
        Forward pass of multi-head attention.
        
        Args:
        - x: Input tensor of shape (batch_size, num_tokens, d_in)
        
        Returns:
        - context_vec: Output tensor after attention (batch_size, num_tokens, d_out)
        """
        b, num_tokens, d_in = x.shape  # Batch size, number of tokens, input feature size
        
        # Compute queries, keys, and values
        keys = self.W_key(x)  # (b, num_tokens, d_out)
        queries = self.W_query(x)  # (b, num_tokens, d_out)
        values = self.W_value(x)  # (b, num_tokens, d_out)
        
        # Reshape into multiple heads: (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        
        # Rearrange dimensions: (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)
        
        # Compute attention scores using dot product: (b, num_heads, num_tokens, num_tokens)
        attn_scores = queries @ keys.transpose(2, 3)
        
        # Convert the mask to boolean and match size to (num_tokens, num_tokens)
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        
        # Apply mask (set masked positions to -inf so softmax gives zero probability)
        attn_scores.masked_fill_(mask_bool, -torch.inf)
        
        # Compute attention weights using softmax (scaled by sqrt(head_dim) for stability)
        attn_weights = torch.softmax(attn_scores / self.head_dim ** 0.5, dim=-1)
        
        # Apply dropout to attention weights
        attn_weights = self.dropout(attn_weights)
        
        # Compute the context vector (weighted sum of values): (b, num_heads, num_tokens, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)  # Swap back num_heads and num_tokens
        
        # Reshape context vector back to (b, num_tokens, d_out)
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        
        # Apply final output projection layer
        context_vec = self.out_proj(context_vec)
        
        return context_vec


In [192]:
import torch
import torch.nn as nn

# Layer Normalization: Normalizes inputs to have zero mean and unit variance per feature
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):  # emb_dim: Dimensionality of input embeddings
        super().__init__()  # Initialize the parent class (nn.Module)
        self.eps = 1e-5  # Small constant to prevent division by zero during normalization
        self.scale = nn.Parameter(torch.ones(emb_dim))  # Learnable scaling factor for normalization
        self.shift = nn.Parameter(torch.zeros(emb_dim))  # Learnable shifting factor for normalization
        
    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)  # Compute mean along the last dimension (per feature)
        var = x.var(dim=-1, keepdim=True)  # Compute variance along the last dimension (per feature)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)  # Normalize input (zero mean, unit variance)
        return self.scale * norm_x + self.shift  # Apply learnable scale and shift

# Gaussian Error Linear Unit (GELU): A smoother alternative to ReLU activation function
class GELU(nn.Module):
    def __init__(self):
        super().__init__()  # Initialize parent class (nn.Module)
        
    def forward(self, x):
        # GELU formula approximated using tanh function
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) * (x + 0.044715 * torch.pow(x, 3))
        ))

# Feed Forward Network: Used in Transformer models to project embeddings to higher dimensions and back
class FeedForward(nn.Module):
    def __init__(self, cfg):  # cfg: Dictionary containing model configuration
        super().__init__()  # Initialize parent class (nn.Module)
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),  # Expand embedding dimension by 4x
            GELU(),  # Apply GELU activation function for non-linearity
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"])  # Reduce back to original embedding dimension
        )
    
    def forward(self, x):
        return self.layers(x)  # Pass input through the sequential feed-forward layers


In [193]:
'''we will make a transformer block to align things which we defined and
studied earlier this part of code requires care and attention because if you do not
understand things from theory it will be very hard to place things at right position for this 
i suggest to use a diagram to code'''
class TransformerBlock(nn.Module):
    """
    A single Transformer block that consists of:
    - Multi-Head Self-Attention
    - Feed-Forward Network
    - Layer Normalization
    - Residual Connections
    - Dropout
    """
    def __init__(self, cfg):
        super().__init__()
        
        # Multi-Head Self-Attention Layer
        # It allows the model to focus on different parts of the input sequence
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],           # Input embedding dimension
            d_out=cfg["emb_dim"],          # Output dimension remains the same
            context_length=cfg["context_length"],  # Maximum sequence length
            num_heads=cfg["n_heads"],      # Number of attention heads
            dropout=cfg["drop_rate"],      # Dropout for regularization
            qkv_bias=cfg["qkv_bias"]       # Whether to include bias in QKV projections
        )
        
        # Feed-Forward Network
        # A two-layer MLP to process each token independently after attention
        self.ff = FeedForward(cfg)
        
        # Layer Normalization to stabilize training and normalize activations
        self.norm1 = LayerNorm(cfg["emb_dim"])  # Applied before attention
        self.norm2 = LayerNorm(cfg["emb_dim"])  # Applied before feed-forward network
        
        # Dropout for preventing overfitting and improving generalization
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"]) 
        
    def forward(self, x):
        """
        Forward pass of the Transformer block with residual connections.
        """
        
        # Save the original input for the residual (shortcut) connection
        shortcut = x  
        
        # Apply Layer Normalization before Attention (Pre-Norm Transformer)
        x = self.norm1(x)  
        
        # Apply Multi-Head Self-Attention to capture dependencies between tokens
        x = self.att(x)  
        
        # Apply dropout to the attention output for regularization
        x = self.drop_shortcut(x)  
        
        # Add the residual connection (original input + attention output)
        x = x + shortcut  
        
        # Save the current output as a new shortcut for the next residual connection
        shortcut = x  
        
        # Apply Layer Normalization before the Feed-Forward Network
        x = self.norm2(x)  
        
        # Apply the Feed-Forward Network (MLP) to process each token independently
        x = self.ff(x)  
        
        # Apply dropout to the feed-forward output
        x = self.drop_shortcut(x)  
        
        # Add the residual connection (original input + FFN output)
        x = x + shortcut  
        
        return x  # Return the final transformed output

        

In [194]:
import torch
import torch.nn as nn

class GPTModel(nn.Module):
    """ 
    GPT Model: Implements a simplified GPT-style transformer.
    """

    def __init__(self, cfg):
        """
        Initializes the GPT model.
        
        Args:
            cfg (dict): Configuration dictionary containing:
                - vocab_size (int): Number of unique tokens in vocabulary.
                - emb_dim (int): Dimensionality of token embeddings.
                - context_length (int): Maximum sequence length.
                - drop_rate (float): Dropout probability.
                - n_layers (int): Number of transformer blocks.
                - n_heads (int): Number of attention heads per transformer block.
        """
        super().__init__()

        # Token Embedding: Converts token indices to dense vectors of size `emb_dim`
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])

        # Positional Embedding: Provides position information to each token
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])

        # Dropout layer for embeddings to prevent overfitting
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        # Transformer Blocks: Stack of multiple Transformer layers
        self.trf_blocks = nn.Sequential(*[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        # Final Layer Normalization to stabilize outputs before prediction
        self.final_norm = LayerNorm(cfg["emb_dim"])

        # Output Linear Layer: Maps final embeddings to vocabulary logits for next-token prediction
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        """
        Forward pass through the GPT model.
        
        Args:
            in_idx (Tensor): Input tensor of shape (batch_size, seq_length) with token indices.

        Returns:
            logits (Tensor): Output tensor of shape (batch_size, seq_length, vocab_size).
        """
        # Extract batch size and sequence length from input shape
        batch_size, seq_len = in_idx.shape

        # Token embeddings: Converts token indices into dense vector representations
        tok_embeddings = self.tok_emb(in_idx)

        # Positional embeddings: Create a sequence of position indices and get embeddings
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))

        # Sum token embeddings and positional embeddings (element-wise)
        x = tok_embeddings + pos_embeds

        # Apply dropout to embeddings
        x = self.drop_emb(x)

        # Pass through stacked Transformer blocks
        x = self.trf_blocks(x)

        # Apply final layer normalization
        x = self.final_norm(x)

        # Pass through output head to get logits for each token position
        logits = self.out_head(x)

        return logits


In [195]:

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)

In [196]:
'''now coding a function to predict next token based on propability '''

def generate_text_simple(model , idx , max_new_tokens , context_size):
    for _ in range(max_new_tokens):
        #if LLM supports only 5 tokens and the context size is 10
        #then only the last 5 tokens are used as context
        idx_cond = idx[: , -context_size:]
        
        with torch.no_grad():
            logits = model(idx_cond)
        
        #focus only on last time step
        logits = logits[: , -1 , :]
        
        #apply softmax to get probabilities
        probas = torch.softmax(logits , dim = -1) #(batch,vocab_size)
        
        #get the idx of the vocab entry with highest probability value
        idx_next = torch.argmax(probas , dim = -1 , keepdim = True)
        
        idx = torch.cat((idx , idx_next), dim = 1) #(batch , n_tokens+1)
        
    return idx
        

In [None]:
def load_text_from_local_file(file_path):
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as file:
            text_data = file.read()
        return text_data
    else:
        print(f"File not found at: {file_path}")
        return None


file_path = "D:/ChromeDownloads/the-verdict.txt"

text_data = load_text_from_local_file(file_path)



In [198]:
# Train/validation ratio
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]


torch.manual_seed(123)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=1,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=1,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [199]:
def cacl_loss_batch(input_batch , target_batch , model , device):
     input_batch,target_batch = input_batch.to(device) , target_batch.to(device)
     logits = model(input_batch)
     loss = torch.nn.functional.cross_entropy(logits.flatten(0,1) , target_batch.flatten())
     return loss


def calc_loss_loader(data_loader , model , device , num_batches = None):
    total_loss = 0
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None :
        num_batches = len(data_loader)
        
    else:
        #reduce the number of batches to match the total number of batches in the data loader
        #if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches , len(data_loader))
    for i , (input_batch , target_batch) in enumerate(data_loader):
        if i <num_batches:
            loss = cacl_loss_batch(input_batch,target_batch,model,device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

In [200]:
def evaluate_model(model , train_loader , val_loader , device ,eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader , model ,device , num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader , model ,device , num_batches=eval_iter)
    model.train()
    return train_loss,val_loss

In [201]:
def generate_and_print_sample(model,tokenizer,device , start_context):
  model.eval()
  context_size = model.pos_emb.weight.shape[0]
  encoded = text_to_token_ids(start_context,tokenizer).to(device)
  with torch.no_grad():
    token_ids = generate_text_simple(model=model , idx = encoded , max_new_tokens=50 , context_size= context_size
                                     )
    decoded_text = token_ids_to_text(token_ids , tokenizer)
    print(decoded_text.replace("\n"," ")) #compact print format
    model.train()

In [206]:
def train_model_simple(model, train_loader , optimizer , device , num_epochs , 
                       eval_freq , eval_iter , start_context , tokenizer):
    #initialize lists to track losses and tokens seen
    train_losses , val_losses , track_tokens_seen = [],[],[]
    tokens_seen , global_step = 0,-1
    
    #main training loop 
    for epoch in range (num_epochs):
        model.train() #Set model to Training mode
        
        for input_batch , target_batch in train_loader:
            optimizer.zero_grad() # Reset loss gradient from previous batch iteration
            loss = cacl_loss_batch(input_batch , target_batch , model ,device)
            loss.backward() #Calculate loss gradients
            optimizer.step() # update model weights using loss gradients
            tokens_seen+= input_batch.numel() #returns the total number of elements (or tokens)
            global_step+= 1
            
            #evaluation step
            if global_step % eval_freq ==0:
                train_loss , val_loss = evaluate_model(model , train_loader , val_loader , device , eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Ep{epoch+1}(Step{global_step:06d}):"
                      f"Train loss {train_loss:.3f} , val loss {val_loss:.3f}")
                
        #print a Sample text after each epoch
        generate_and_print_sample(model , tokenizer , device , start_context
        )
        
        
    return train_losses , val_losses , track_tokens_seen

In [208]:
import time
start_time = time.time()

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, optimizer, device, num_epochs,
    5, 5, "Every effort moves you", tokenizer
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")


Ep1(Step000000):Train loss 10.033 , val loss 10.127
Ep1(Step000005):Train loss 8.244 , val loss 8.528
Ep1(Step000010):Train loss 7.199 , val loss 7.505
Ep1(Step000015):Train loss 6.697 , val loss 6.868
Every effort moves you,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Ep2(Step000020):Train loss 6.315 , val loss 6.645
Ep2(Step000025):Train loss 6.191 , val loss 6.555
Ep2(Step000030):Train loss 5.854 , val loss 6.547
Ep2(Step000035):Train loss 5.763 , val loss 6.564
Every effort moves youHe laughed""                                              
Ep3(Step000040):Train loss 5.381 , val loss 6.443
Ep3(Step000045):Train loss 5.574 , val loss 6.674
Ep3(Step000050):Train loss 5.342 , val loss 6.501
Every effort moves you. the picture.      the, I had the of the of the picture. the his the of the of the picture. the of the his the picture. the picture. the of the of the man of the picture. the
Ep4(Step000055):Train loss 5.327 , val loss 6.405
Ep4(Step000060):Train loss 5.199 , val loss 6.