<a href="https://colab.research.google.com/github/abdulrehman898998/gpt2/blob/main/Untitled66.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [92]:
config = {
    # Number of token ids; sizes the token-embedding table and the output head.
    # 50257 is the vocabulary size of the GPT-2 BPE encoding used later in this notebook.
    "vocab_size": 50257,
    # Width of every token/position embedding and of the residual stream.
    "token_embedding_dim": 768,
    # Longest sequence the model accepts (rows in the positional table / causal mask).
    "context_len": 500,
    # Dropout probability shared by the embedding and transformer-block dropouts.
    "dropout": 0.1,
    # How many transformer blocks are stacked.
    "num_of_transformer": 8,
    # Attention heads per block; must divide token_embedding_dim.
    "num_heads": 8,
    # Whether the query/key/value projections carry a bias term.
    "qkv_bias": True,
}


In [93]:
import torch
import torch.nn as nn

# Layer Normalization class: Normalizes the input tensor
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Each vector along the last axis is normalized to zero mean and unit variance
    using the biased (population) variance, with `eps` added inside the square
    root. This is the same formula `torch.nn.LayerNorm` uses, so it agrees with
    the `nn.LayerNorm` applied at the end of the model.

    Parameters:
    - embed_dim: Size of the last dimension of the inputs; also the length of
                 the learnable `scale` and `shift` vectors.
    """

    def __init__(self, embed_dim):
        super().__init__()
        self.eps = 1e-5  # Added to the variance so the division is safe for constant inputs
        self.scale = nn.Parameter(torch.ones(embed_dim))  # Learnable per-feature gain
        self.shift = nn.Parameter(torch.zeros(embed_dim))  # Learnable per-feature offset

    def forward(self, x):
        """Normalize `x` along its last dimension.

        Parameters:
        - x: Tensor whose last dimension has size `embed_dim`
             (e.g. (b, n_tokens, embed_dim)).

        Returns:
        - Tensor of the same shape as `x`.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False divides by N rather than N-1; the N-1 estimator would
        # also produce NaN when the last dimension has a single element.
        var = x.var(dim=-1, keepdim=True, unbiased=False)

        # eps goes inside the square root: (x - mean) / sqrt(var + eps)
        normalized_x = (x - mean) / torch.sqrt(var + self.eps)

        # Broadcast the per-feature gain and offset over all leading dimensions
        return self.scale * normalized_x + self.shift


# GELU activation function class: Applies GELU activation to the input tensor
class Gelu(nn.Module):
    """Tanh approximation of the GELU activation, applied element-wise.

    Computes 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))).
    The module holds no parameters or state.
    """

    def forward(self, x):
        # sqrt(2/pi), built as a 0-dim tensor on every call
        coeff = torch.sqrt(torch.tensor(2 / torch.pi))
        cubic_term = 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(coeff * (x + cubic_term))
        return 0.5 * x * gate


# Feed-Forward neural network class: Implements a simple FFNN with a hidden layer
class FeedForward(nn.Module):
    """Two-layer MLP applied to the last dimension: expand 4x, GELU, project back.

    Parameters:
    - embed_dim: Size of the last dimension of both the input and the output.
    """

    def __init__(self, embed_dim):
        super().__init__()

        hidden_dim = 4 * embed_dim
        expand = nn.Linear(embed_dim, hidden_dim)    # embed_dim -> 4 * embed_dim
        contract = nn.Linear(hidden_dim, embed_dim)  # 4 * embed_dim -> embed_dim

        # Kept as `self.net` so parameter names stay net.0.* and net.2.*
        self.net = nn.Sequential(expand, Gelu(), contract)

    def forward(self, x):
        """Run `x` through the MLP; the output has the same shape as `x`."""
        return self.net(x)


In [94]:
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    """Scaled dot-product self-attention split across several heads.

    The module projects its input to queries, keys and values, attends within
    each head, and concatenates the heads again. There is no output projection
    after the concatenation.

    Parameters:
    - d_in: Size of the last dimension of the input.
    - d_out: Size of the last dimension of the output; must be divisible by n_heads.
    - n_heads: Number of attention heads.
    - qkv_bias: Whether the three projections have a bias term.
    - context_len: If given, a (context_len, context_len) causal mask is stored
                   as the buffer `mask`; if None, attention is unmasked.

    Raises:
    - ValueError: If d_out is not divisible by n_heads.
    """

    def __init__(self, d_in, d_out, n_heads, qkv_bias=False, context_len=None):
        super().__init__()
        self.d_in = d_in
        self.d_out = d_out
        self.n_heads = n_heads

        if d_out % n_heads != 0:
            raise ValueError("d_out must be divisible by n_heads")

        self.head_dim = d_out // n_heads

        # Projections are created in q, k, v order
        self.w_q = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.w_k = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.w_v = nn.Linear(d_in, d_out, bias=qkv_bias)

        if context_len is None:
            self.mask = None
        else:
            # True strictly above the diagonal, i.e. at positions a token must not see
            future = torch.triu(torch.ones(context_len, context_len), diagonal=1).bool()
            self.register_buffer("mask", future)

    def _split_heads(self, projected, batch, length):
        """(batch, length, d_out) -> (batch, n_heads, length, head_dim)."""
        return projected.view(batch, length, self.n_heads, self.head_dim).transpose(1, 2)

    def forward(self, x):
        """Attend over the sequence dimension of `x`.

        Parameters:
        - x: Tensor of shape (batch, n_tokens, d_in).

        Returns:
        - Tensor of shape (batch, n_tokens, d_out).
        """
        batch, length, _ = x.shape

        queries = self._split_heads(self.w_q(x), batch, length)
        keys = self._split_heads(self.w_k(x), batch, length)
        values = self._split_heads(self.w_v(x), batch, length)

        # (batch, n_heads, n_tokens, n_tokens), scaled by sqrt(head_dim)
        scale = torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32))
        scores = (queries @ keys.transpose(-2, -1)) / scale

        if self.mask is not None:
            # Crop the stored mask to the current length; -inf becomes weight 0 after softmax
            visible_mask = self.mask[:length, :length]
            scores = scores.masked_fill(visible_mask, float('-inf'))

        weights = scores.softmax(dim=-1)
        per_head = weights @ values  # (batch, n_heads, n_tokens, head_dim)

        # Put heads next to each other again: (batch, n_tokens, n_heads * head_dim)
        merged = per_head.transpose(1, 2).contiguous()
        return merged.view(batch, length, self.n_heads * self.head_dim)


In [95]:
import torch
import torch.nn as nn

# Assuming you have defined the following modules:
# MultiHeadAttention, FeedForward, and LayerNorm

class TransformerBlock(nn.Module):
    """One pre-norm transformer block: self-attention then an MLP, each in a residual branch.

    `config` keys read here: 'token_embedding_dim', 'num_heads', 'context_len',
    'dropout', and optionally 'qkv_bias' (treated as False when absent).
    """

    def __init__(self, config):
        super().__init__()

        width = config['token_embedding_dim']

        # Input and output width are the same so the residual additions line up
        self.attention = MultiHeadAttention(
            d_in=width,
            d_out=width,
            n_heads=config['num_heads'],
            qkv_bias=config.get('qkv_bias', False),
            context_len=config['context_len'],
        )
        self.feed_forward = FeedForward(width)

        # One normalization in front of each sub-layer
        self.norm1 = LayerNorm(width)
        self.norm2 = LayerNorm(width)

        # A single dropout module shared by both residual branches
        self.dropout = nn.Dropout(config['dropout'])

    def forward(self, x):
        """Apply both residual sub-layers; the output has the same shape as `x`."""
        # Normalize first, attend, drop out, then add back onto the input
        attended = self.attention(self.norm1(x))
        x = x + self.dropout(attended)

        # Same pattern for the feed-forward branch
        transformed = self.feed_forward(self.norm2(x))
        return x + self.dropout(transformed)


In [96]:
import torch
import torch.nn as nn

# Assuming TransformerBlock and any other necessary classes (like LayerNorm) are defined elsewhere

class MyGPT(nn.Module):
    """Decoder-only language model: embeddings, a stack of transformer blocks, and a vocab head.

    `config` keys read here: 'vocab_size', 'token_embedding_dim', 'context_len',
    'dropout' and 'num_of_transformer'. The same dict is passed on to every
    `TransformerBlock`.
    """

    def __init__(self, config):
        super().__init__()
        # Token embeddings: map each token index to a vector representation
        self.token_embed_lookup = nn.Embedding(config['vocab_size'], config['token_embedding_dim'])

        # Learned positional embeddings: one row per position, up to context_len
        self.positional_embed = nn.Embedding(config['context_len'], config['token_embedding_dim'])

        # Dropout applied to the summed embeddings
        self.dropout = nn.Dropout(config['dropout'])

        # Sequence of transformer blocks
        self.transformers = nn.Sequential(
            *[TransformerBlock(config) for _ in range(config['num_of_transformer'])]
        )

        # Final normalization of the transformer output
        self.final_norm = nn.LayerNorm(config['token_embedding_dim'])

        # Linear layer projecting each position to vocabulary logits
        self.out_head = nn.Linear(config['token_embedding_dim'], config['vocab_size'])

    def forward(self, idx):
        """Compute next-token logits for every position.

        Parameters:
        - idx: Integer tensor of token ids with shape (batch_size, seq_len).

        Returns:
        - logits: Tensor of shape (batch_size, seq_len, vocab_size).

        Raises:
        - ValueError: If seq_len exceeds the number of learned positions.
        """
        _, seq_len = idx.shape

        # Fail early with a readable message: indexing past the positional table
        # otherwise surfaces as an index error inside the embedding lookup.
        max_positions = self.positional_embed.num_embeddings
        if seq_len > max_positions:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the model context length {max_positions}"
            )

        # (batch_size, seq_len, token_embedding_dim)
        token_embed = self.token_embed_lookup(idx)

        # Positions 0..seq_len-1 on the same device as the input, shape (1, seq_len)
        positional_indices = torch.arange(seq_len, device=idx.device).unsqueeze(0)

        # (1, seq_len, token_embedding_dim); broadcasts over the batch when added
        positional_embed = self.positional_embed(positional_indices)

        x = self.dropout(token_embed + positional_embed)

        # Shape is preserved through the blocks and the final normalization
        x = self.transformers(x)
        x = self.final_norm(x)

        # (batch_size, seq_len, vocab_size)
        return self.out_head(x)


In [97]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [98]:
# Build the model from `config`.
# NOTE(review): no torch.manual_seed call appears before this cell in the visible
# notebook, so the initial weights presumably differ from run to run.
model=MyGPT(config)
# Move parameters and buffers to `device`; as the cell's last expression this
# also echoes the module structure in the output.
model.to(device)



MyGPT(
  (token_embed_lookup): Embedding(50257, 768)
  (positional_embed): Embedding(500, 768)
  (dropout): Dropout(p=0.1, inplace=False)
  (transformers): Sequential(
    (0): TransformerBlock(
      (attention): MultiHeadAttention(
        (w_q): Linear(in_features=768, out_features=768, bias=True)
        (w_k): Linear(in_features=768, out_features=768, bias=True)
        (w_v): Linear(in_features=768, out_features=768, bias=True)
      )
      (feed_forward): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): Gelu()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attention): MultiHeadAttention(
        (w_q): Linear(in_features=768, out_features=768, bias=True)
        (w_k): Linear(in_features=768, out_features=768, bias=True)
     

In [99]:
def generate_text_simple(model, max_len, idx, context_len):
    """Greedy decoding: repeatedly append the most likely next token.

    Parameters:
    - model: Callable module mapping (batch_size, n_tokens) ids to
             (batch_size, n_tokens, vocab_size) logits. It is switched to
             evaluation mode and left there.
    - max_len: Number of new tokens to append.
    - idx: Starting token ids, shape (batch_size, n_tokens).
    - context_len: Only the last `context_len` tokens are fed to the model.

    Returns:
    - Tensor of shape (batch_size, n_tokens + max_len).
    """
    model.eval()  # Evaluation mode (disables dropout); gradients are turned off below

    sequence = idx
    for _step in range(max_len):
        with torch.no_grad():
            # Feed at most the last `context_len` tokens
            window = sequence[:, -context_len:]

            # Logits of the final position only: (batch_size, vocab_size)
            last_logits = model(window)[:, -1, :]

            # Most probable token per row, kept as a column: (batch_size, 1)
            next_token = torch.softmax(last_logits, dim=-1).argmax(dim=-1, keepdim=True)

            # Grow the running sequence by one column
            sequence = torch.cat((sequence, next_token), dim=1)

    return sequence


In [100]:
!pip install tiktoken



In [101]:
import tiktoken
import torch

# Module-level GPT-2 BPE encoding from tiktoken. It is passed to the helper
# functions below and is also bound as the default `tokenizer` argument of `training`.
tokenizer = tiktoken.get_encoding("gpt2")

def text_to_token_ids(start_seq, tokenizer, device):
    """
    Converts a text sequence into a batch of token IDs.

    Parameters:
    - start_seq: The input text string to tokenize.
    - tokenizer: Object with an `encode(text, allowed_special=...)` method
                 returning a list of ints (e.g. the tiktoken GPT-2 encoding).
    - device: The device (CPU/GPU) to which the tensor will be moved.

    Returns:
    - encoded_tensor: An int64 tensor of shape (1, number_of_tokens) on `device`.
                      An empty string yields shape (1, 0).
    """
    # '<|endoftext|>' appearing in the text is encoded as the special token
    # instead of being rejected by the tokenizer.
    encoded = tokenizer.encode(start_seq, allowed_special={'<|endoftext|>'})

    # dtype is pinned to long: torch.tensor([]) would otherwise default to
    # float32, which cannot be used as embedding indices.
    encoded_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0).to(device)  # Shape: (1, len(encoded))

    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    """
    Decodes a tensor of token IDs into a string.

    Parameters:
    - token_ids: Tensor of token IDs; a leading dimension of size 1 (the batch
                 dimension of a single sequence) is dropped before decoding.
    - tokenizer: Object with a `decode(list_of_ints)` method.

    Returns:
    - Whatever `tokenizer.decode` returns for the flattened ID list.
    """
    # squeeze(0) only removes the first dimension when its size is 1
    id_list = token_ids.squeeze(0).tolist()
    text = tokenizer.decode(id_list)
    return text

# Example input text to generate from
start_context = "Every effort moves you"

# Tokenize the prompt into a (1, n_tokens) batch on the model's device.
prompt_ids = text_to_token_ids(start_seq=start_context, tokenizer=tokenizer, device=device)

# Greedily append 10 new tokens, feeding the model at most `context_len` tokens at a time.
token_ids = generate_text_simple(
    model=model,
    idx=prompt_ids,
    max_len=10,
    context_len=config["context_len"],
)

# Decode prompt + continuation back into text for readability.
generated_text = token_ids_to_text(token_ids, tokenizer)
print("Output text:\n", generated_text)


Output text:
 Every effort moves you:\ ScrollThat hengetgow unarmed flame Dresdenodied


In [102]:
import os
import urllib.request

file_path = "the-verdict.txt"
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

# Reuse the local copy when it exists; otherwise download it once and cache it on disk.
if os.path.exists(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        text_data = file.read()
else:
    with urllib.request.urlopen(url) as response:
        text_data = response.read().decode('utf-8')
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(text_data)

In [103]:
from torch.utils.data import DataLoader,  Dataset
class TextDataset(Dataset):
    """Sliding-window next-token dataset over a single tokenized text.

    Each item is a pair (inputs, targets) of length `max_len`, where `targets`
    is `inputs` shifted one token to the right. Windows start every `stride`
    tokens; only windows with a complete target are kept.

    Parameters:
    - text: The raw text to tokenize.
    - tokenizer: Object with an `encode(text)` method returning a list of ints.
    - max_len: Number of tokens per window.
    - stride: Step between the starts of consecutive windows.
    """

    def __init__(self, text, tokenizer, max_len, stride):
        self.text = text
        self.tokenizer = tokenizer

        token_ids = tokenizer.encode(text)
        # A start s is valid while s + max_len + 1 <= len(token_ids)
        window_starts = range(0, len(token_ids) - max_len, stride)

        self.input_ids = [token_ids[s:s + max_len] for s in window_starts]
        self.target_ids = [token_ids[s + 1:s + max_len + 1] for s in window_starts]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, index):
        # Tensors are built on access; the stored windows stay plain lists
        inputs = torch.tensor(self.input_ids[index])
        targets = torch.tensor(self.target_ids[index])
        return inputs, targets
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader of sliding-window (inputs, targets) pairs from raw text.

    Parameters:
    - txt: Raw text; tokenized with the tiktoken "gpt2" encoding.
    - batch_size: Samples per batch.
    - max_length: Tokens per window.
    - stride: Step between window starts.
    - shuffle: Whether the loader shuffles the windows.
    - drop_last: Whether a final incomplete batch is dropped.
    - num_workers: Worker processes used by the DataLoader.

    Returns:
    - A `DataLoader` over a `TextDataset` built from `txt`.
    """
    windows = TextDataset(
        text=txt,
        tokenizer=tiktoken.get_encoding("gpt2"),
        max_len=max_length,
        stride=stride,
    )

    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )



In [104]:
# Train/validation ratio
train_ratio = 0.90
# The split is made on characters of the raw text, before tokenization.
split_idx = int(train_ratio * len(text_data))
train_data, val_data = text_data[:split_idx], text_data[split_idx:]

torch.manual_seed(123)

# Training: shuffled, incomplete final batch dropped. Windows do not overlap
# because the stride equals the window length.
train_loader = create_dataloader_v1(
    train_data,
    batch_size=1,
    max_length=config["context_len"],
    stride=config["context_len"],
    shuffle=True,
    drop_last=True,
    num_workers=0,
)

# Validation: fixed order, every window kept.
val_loader = create_dataloader_v1(
    val_data,
    batch_size=1,
    max_length=config["context_len"],
    stride=config["context_len"],
    shuffle=False,
    drop_last=False,
    num_workers=0,
)

In [105]:
def calculate_loss_batch(model, inputs, targets, device):
    """
    Mean cross-entropy between the model's logits and the targets for one batch.

    Parameters:
    - model: Callable mapping `inputs` to logits of shape (B, T, C).
    - inputs: Batch of model inputs; moved to `device` before the forward pass.
    - targets: Class indices of shape (B, T); moved to `device` as well.
    - device: Device on which the forward pass and loss are computed.

    Returns:
    - loss: 0-dim tensor holding the loss averaged over all B*T positions.
    """
    inputs = inputs.to(device)
    targets = targets.to(device)

    logits = model(inputs)  # (B, T, C)

    # Cross-entropy expects (N, C) scores and (N,) labels, so merge batch and time.
    flat_logits = logits.flatten(0, 1)    # (B*T, C)
    flat_targets = targets.flatten(0, 1)  # (B*T,)
    return nn.functional.cross_entropy(flat_logits, flat_targets)


In [106]:
def generate_and_print_sample(model, tokenizer, start_seq, max_len=50, device=device):
    """Print a greedy continuation of `start_seq` on one line, then resume training mode.

    Parameters:
    - model: The language model to sample from.
    - tokenizer: Tokenizer used to encode the prompt and decode the result.
    - start_seq: Prompt text.
    - max_len: Number of new tokens to generate.
    - device: Device for the prompt tensor; defaults to the module-level
              `device` as it was when this function was defined.

    The context length is read from the module-level `config`. The model is
    always left in training mode on return.
    """
    model.eval()
    prompt_ids = text_to_token_ids(start_seq, tokenizer, device)
    window = config["context_len"]

    with torch.no_grad():
        sample_ids = generate_text_simple(model, max_len, idx=prompt_ids, context_len=window)

    sample_text = token_ids_to_text(sample_ids, tokenizer)
    # Newlines are flattened so each sample occupies a single output line
    print(sample_text.replace("\n", " "))
    model.train()


In [107]:
def calculate_loss_loader(model, data_loader, device, batch_size=None):
    """Average the per-batch loss over the first few batches of a loader.

    Parameters:
    - model: Model passed through to `calculate_loss_batch`.
    - data_loader: Sized iterable yielding (inputs, targets) pairs.
    - device: Device passed through to `calculate_loss_batch`.
    - batch_size: Maximum number of BATCHES to evaluate (the name is kept for
                  compatibility). None means the whole loader; larger values
                  are capped at `len(data_loader)`.

    Returns:
    - The mean loss as a Python float, or NaN when there is nothing to evaluate.
    """
    available = len(data_loader)
    num_batches = available if batch_size is None else min(batch_size, available)

    # An empty loader (or a zero budget) has no defined mean; avoid dividing by zero.
    if num_batches <= 0:
        return float("nan")

    total_loss = 0.0
    for i, (inputs, targets) in enumerate(data_loader):
        if i >= num_batches:
            break
        # .item() keeps only the number; summing the tensors themselves would
        # hold every batch's autograd graph in memory when gradients are enabled.
        total_loss += calculate_loss_batch(model, inputs, targets, device).item()

    return total_loss / num_batches

In [108]:
def evaluation(model, train_loader, val_loader, device, eval_iter):
    """Estimate the training and validation loss without affecting training.

    Parameters:
    - model: The model to evaluate.
    - train_loader: Loader for the training split.
    - val_loader: Loader for the validation split.
    - device: Device used for the forward passes.
    - eval_iter: Maximum number of batches taken from each loader.

    Returns:
    - (train_loss, val_loss) as returned by `calculate_loss_loader`.

    The model is put in evaluation mode for the duration of the call and then
    restored to whichever mode it was in before.
    """
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed here; disabling them avoids building and
        # retaining autograd graphs for every evaluation batch.
        with torch.no_grad():
            train_loss = calculate_loss_loader(model, train_loader, device, batch_size=eval_iter)
            val_loss = calculate_loss_loader(model, val_loader, device, batch_size=eval_iter)
    finally:
        # Restore the caller's mode so a training loop keeps dropout enabled.
        model.train(was_training)
    return train_loss, val_loss


In [109]:
def training(model, epoch, train_loader, start_seq, optimizer, val_loader, device, eval_freq, eval_iter, tokenizer=tokenizer):
    """Train `model` and periodically record train/validation losses.

    Parameters:
    - model: The model to train (updated in place).
    - epoch: Total number of passes over `train_loader`.
    - train_loader: Loader yielding (inputs, targets) training batches.
    - start_seq: Prompt used for the text sample printed after every epoch.
    - optimizer: Optimizer already bound to the model's parameters.
    - val_loader: Loader for the validation split.
    - device: Device used for the forward and backward passes.
    - eval_freq: Losses are evaluated every `eval_freq` optimizer steps.
    - eval_iter: Maximum number of batches per loader used in each evaluation.
    - tokenizer: Tokenizer for the printed sample; defaults to the module-level
                 `tokenizer` as it was when this function was defined.

    Returns:
    - (train_losses, val_losses, track_tokens_seen): three lists with one entry
      per evaluation; losses are stored as Python floats.
    """
    token_seen, global_step = 0, 0
    train_losses, val_losses, track_tokens_seen = [], [], []

    for epoch_idx in range(epoch):
        model.train()
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            loss = calculate_loss_batch(model, inputs, targets, device)
            loss.backward()
            optimizer.step()
            token_seen += inputs.numel()
            global_step += 1

            # Evaluate only on multiples of eval_freq (not on every other step).
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluation(model, train_loader, val_loader, device, eval_iter)
                # Make sure dropout is active again for the next training step.
                model.train()
                # float() detaches the values so no tensors or graphs are kept in the history.
                train_loss, val_loss = float(train_loss), float(val_loss)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(token_seen)
                print(f"Ep {epoch_idx+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Print a sample continuation of the caller's prompt after each epoch
        generate_and_print_sample(
            model, tokenizer, start_seq
        )

    return train_losses, val_losses, track_tokens_seen



In [112]:
import time
start_time = time.time()
torch.manual_seed(123)

# The model built earlier is trained in place; it is not re-initialized here.
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

num_epochs = 5
train_losses, val_losses, tokens_seen = training(
    model=model,
    epoch=num_epochs,
    train_loader=train_loader,
    start_seq="Every effort moves you",
    optimizer=optimizer,
    val_loader=val_loader,
    device=device,
    eval_freq=5,
    eval_iter=5,
)

# Report the wall-clock duration of the whole training call.
end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 3.06 MiB is free. Process 22276 has 14.74 GiB memory in use. Of the allocated memory 14.55 GiB is allocated by PyTorch, and 63.67 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [113]:
def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    """Autoregressive generation with optional top-k filtering and temperature sampling.

    Parameters:
    - model: Callable mapping (batch_size, n_tokens) ids to
             (batch_size, n_tokens, vocab_size) logits. Its train/eval mode is
             not changed here.
    - idx: Starting token ids, shape (batch_size, n_tokens).
    - max_new_tokens: Upper bound on the number of tokens appended.
    - context_size: Only the last `context_size` tokens are fed to the model.
    - temperature: If > 0, logits are divided by it and the next token is
                   sampled; otherwise the highest-scoring token is taken.
    - top_k: If given, only the `top_k` highest logits per row stay eligible
             (capped at the vocabulary size).
    - eos_id: If given, generation stops (without appending) as soon as every
              row in the batch produces this id in the same step.

    Returns:
    - Tensor of shape (batch_size, n_tokens + number_of_appended_tokens).
    """
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]  # Last position only: (batch_size, vocab_size)

        if top_k is not None:
            k = min(top_k, logits.size(-1))
            top_logits, _ = torch.topk(logits, k)
            # Keep the threshold as a column, (batch_size, 1), so that each row
            # is compared against its own k-th largest value.
            min_val = top_logits[:, -1:]
            logits = logits.masked_fill(logits < min_val, float("-inf"))

        if temperature > 0.0:
            # Softmax over the (scaled) vocabulary scores: (batch_size, vocab_size)
            probs = torch.softmax(logits / temperature, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        # Reduce with .all() so the check is well defined for batches larger than one.
        if eos_id is not None and bool((idx_next == eos_id).all()):
            break

        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx