# Building an LLM from Scratch with Weights, Biases, and Hyperparameter Tuning

This notebook demonstrates:
1. Manual initialization of weights and biases
2. Building a small transformer-based language model
3. Hyperparameter tuning
4. Training and text generation

In [None]:
# Install required packages
!pip install torch numpy matplotlib -q

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
import math
import time

# Seed NumPy's and PyTorch's global generators so a fresh-kernel run draws
# the same random numbers every time
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## 1. Hyperparameters Configuration

In [None]:
class HyperParams:
    """Hyperparameters for the LLM model

    Calling ``HyperParams()`` yields the notebook defaults. Any default can be
    replaced at construction time by keyword, e.g.
    ``HyperParams(learning_rate=1e-3, n_layers=4)``.

    Raises:
        TypeError: If a keyword does not name an existing hyperparameter
            (catches typos such as ``lr=`` instead of ``learning_rate=``).
    """
    def __init__(self, **overrides):
        # Model architecture
        self.vocab_size = 100  # Small vocabulary for demo
        self.d_model = 128  # Embedding dimension
        self.n_heads = 4  # Number of attention heads
        self.n_layers = 2  # Number of transformer layers
        self.d_ff = 512  # Feed-forward dimension
        self.max_seq_len = 32  # Maximum sequence length
        self.dropout = 0.1

        # Training hyperparameters
        self.batch_size = 16
        self.learning_rate = 3e-4
        self.epochs = 10
        self.weight_decay = 0.01
        self.grad_clip = 1.0

        # Weight initialization
        self.init_std = 0.02  # Standard deviation for weight initialization
        self.init_method = 'xavier'  # 'xavier', 'kaiming', or 'normal'

        # Apply overrides last: the attribute order reported by vars() stays
        # the same, and only names that already exist are accepted.
        for name, value in overrides.items():
            if not hasattr(self, name):
                raise TypeError(f"Unknown hyperparameter: {name!r}")
            setattr(self, name, value)

# Create hyperparameters instance
hp = HyperParams()
print("Hyperparameters initialized:")
for key, value in vars(hp).items():
    print(f"  {key}: {value}")

## 2. Custom Weight Initialization Functions

In [None]:
def initialize_weights(module, method='xavier', std=0.02):
    """
    Custom weight initialization for a single module (applied in place).

    Only nn.Linear, nn.Embedding and nn.LayerNorm modules are modified; any
    other module type is left untouched, which makes the function suitable
    for ``model.apply(...)``.

    Args:
        module: PyTorch module to initialize
        method: Initialization method for nn.Linear weights
            ('xavier', 'kaiming', 'normal')
        std: Standard deviation for the 'normal' method and for embeddings

    Raises:
        ValueError: If ``method`` is not one of the supported names. The check
            runs for every module type so a typo is reported immediately
            instead of silently leaving the default initialization in place.
    """
    if method not in ('xavier', 'kaiming', 'normal'):
        raise ValueError(
            f"Unknown init method {method!r}; expected 'xavier', 'kaiming' or 'normal'"
        )

    if isinstance(module, nn.Linear):
        if method == 'xavier':
            nn.init.xavier_uniform_(module.weight)
        elif method == 'kaiming':
            # NOTE(review): a=sqrt(5) looks like it mirrors nn.Linear's own
            # default init rather than a ReLU gain -- confirm this is intended.
            nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
        else:  # 'normal'
            nn.init.normal_(module.weight, mean=0.0, std=std)

        # Initialize bias to zeros
        if module.bias is not None:
            nn.init.zeros_(module.bias)

    elif isinstance(module, nn.Embedding):
        nn.init.normal_(module.weight, mean=0.0, std=std)

    elif isinstance(module, nn.LayerNorm):
        # A LayerNorm built without affine parameters has weight/bias set to
        # None, so only touch what exists.
        if module.weight is not None:
            nn.init.ones_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)

print("Weight initialization functions defined")

## 3. Building the Transformer Model Components

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention mechanism with explicit weights and biases

    Args:
        d_model: Width of the input and output features; must be divisible
            by ``n_heads``.
        n_heads: Number of attention heads (each of width d_model // n_heads).
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, n_heads, dropout=0.1):
        super().__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"

        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads

        # Q, K, V projection weights and biases
        self.W_q = nn.Linear(d_model, d_model, bias=True)
        self.W_k = nn.Linear(d_model, d_model, bias=True)
        self.W_v = nn.Linear(d_model, d_model, bias=True)

        # Output projection
        self.W_o = nn.Linear(d_model, d_model, bias=True)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Self-attend over ``x``.

        Args:
            x: Tensor of shape (batch, seq_len, d_model).
            mask: Optional tensor broadcastable to
                (batch, n_heads, seq_len, seq_len); positions where it equals
                0 are excluded from attention.

        Returns:
            Tensor of shape (batch, seq_len, d_model).
        """
        batch_size, seq_len, d_model = x.size()

        # Project to Q, K, V and split into heads: (B, n_heads, T, d_k)
        Q = self.W_q(x).view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)
        K = self.W_k(x).view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)
        V = self.W_v(x).view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)

        # Scaled dot-product attention
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            # Fill with the most negative finite value of the score dtype.
            # A hard-coded -1e9 is not representable in float16, which breaks
            # half-precision / autocast runs; after softmax the masked
            # positions still get zero weight.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        attention = F.softmax(scores, dim=-1)
        attention = self.dropout(attention)

        # Apply attention to values
        out = torch.matmul(attention, V)

        # Concatenate heads and project
        out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, d_model)
        out = self.W_o(out)

        return out

print("MultiHeadAttention module defined")

In [None]:
class FeedForward(nn.Module):
    """Position-wise two-layer MLP: expand to d_ff, ReLU, dropout, project back to d_model"""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        # Expansion and contraction layers, each with its own bias vector
        self.linear1 = nn.Linear(in_features=d_model, out_features=d_ff, bias=True)
        self.linear2 = nn.Linear(in_features=d_ff, out_features=d_model, bias=True)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        # FFN(x) = W2 * dropout(ReLU(W1 * x + b1)) + b2
        hidden = self.linear1(x)
        hidden = F.relu(hidden)
        hidden = self.dropout(hidden)
        return self.linear2(hidden)

print("FeedForward module defined")

In [None]:
class TransformerBlock(nn.Module):
    """One transformer layer: self-attention then feed-forward, each wrapped
    in dropout + residual add and followed by LayerNorm (post-norm layout)"""

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        # Sub-layers
        self.attention = MultiHeadAttention(d_model, n_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)

        # One LayerNorm (learnable scale and shift) after each residual sum
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Shared dropout applied to each sub-layer output before the residual add
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Attention sub-layer: residual add, then normalize
        x = self.norm1(x + self.dropout(self.attention(x, mask)))

        # Feed-forward sub-layer: residual add, then normalize
        x = self.norm2(x + self.dropout(self.feed_forward(x)))

        return x

print("TransformerBlock module defined")

In [None]:
class SimpleLLM(nn.Module):
    """Complete Language Model with explicit weights, biases, and embeddings

    Token and learned positional embeddings are summed, passed through
    ``hp.n_layers`` TransformerBlocks under a causal mask, normalized, and
    projected to vocabulary logits.

    Args:
        hp: Object exposing vocab_size, d_model, n_heads, n_layers, d_ff,
            max_seq_len, dropout, init_method and init_std.
    """

    def __init__(self, hp):
        super().__init__()
        self.hp = hp

        # Token embeddings (learnable weights)
        self.token_embedding = nn.Embedding(hp.vocab_size, hp.d_model)

        # Positional embeddings (learnable weights)
        self.positional_embedding = nn.Embedding(hp.max_seq_len, hp.d_model)

        # Transformer blocks
        self.blocks = nn.ModuleList([
            TransformerBlock(hp.d_model, hp.n_heads, hp.d_ff, hp.dropout)
            for _ in range(hp.n_layers)
        ])

        # Final layer norm
        self.norm = nn.LayerNorm(hp.d_model)

        # Output projection to vocabulary (weights and bias)
        self.lm_head = nn.Linear(hp.d_model, hp.vocab_size, bias=True)

        self.dropout = nn.Dropout(hp.dropout)

        # Initialize all weights
        self.apply(lambda m: initialize_weights(m, hp.init_method, hp.init_std))

        # Count parameters
        self.n_params = sum(p.numel() for p in self.parameters())

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the training loss.

        Args:
            idx: Long tensor of token ids with shape (B, T), T <= hp.max_seq_len.
            targets: Optional long tensor of shape (B, T) with the token
                expected at each position.

        Returns:
            (logits, loss): logits has shape (B, T, vocab_size); loss is the
            mean cross-entropy over all B*T positions, or None when
            ``targets`` is not given.

        Raises:
            ValueError: If T exceeds hp.max_seq_len (the positional embedding
                table has no entry for those positions).
        """
        _, seq_len = idx.shape
        if seq_len > self.hp.max_seq_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_seq_len {self.hp.max_seq_len}"
            )

        # Get token embeddings
        tok_emb = self.token_embedding(idx)  # (B, T, C)

        # Get positional embeddings
        pos = torch.arange(0, seq_len, dtype=torch.long, device=idx.device)
        pos_emb = self.positional_embedding(pos)  # (T, C)

        # Combine embeddings (pos_emb broadcasts over the batch dimension)
        x = self.dropout(tok_emb + pos_emb)

        # Create causal mask: position t may attend to positions <= t only
        mask = torch.tril(torch.ones(seq_len, seq_len, device=idx.device)).view(1, 1, seq_len, seq_len)

        # Pass through transformer blocks
        for block in self.blocks:
            x = block(x, mask)

        # Final normalization
        x = self.norm(x)

        # Project to vocabulary
        logits = self.lm_head(x)  # (B, T, vocab_size)

        # Calculate loss if targets provided
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; avoids building autograd graphs
    def generate(self, idx, max_new_tokens, temperature=1.0):
        """Generate text autoregressively

        The module's train/eval mode is left untouched; call ``model.eval()``
        beforehand to sample without dropout.

        Args:
            idx: Long tensor of shape (B, T) holding the prompt token ids.
            max_new_tokens: Number of tokens to append.
            temperature: Positive divisor applied to the logits before softmax.

        Returns:
            Long tensor of shape (B, T + max_new_tokens).

        Raises:
            ValueError: If ``temperature`` is not positive.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")

        for _ in range(max_new_tokens):
            # Crop context if too long
            idx_cond = idx if idx.size(1) <= self.hp.max_seq_len else idx[:, -self.hp.max_seq_len:]

            # Get predictions for the last position only
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature

            # Sample from distribution
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)

            # Append to sequence
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

# Create model
model = SimpleLLM(hp).to(device)
print(f"\nModel created with {model.n_params:,} parameters")
print(f"\nModel architecture:\n{model}")

## 4. Inspect Weights and Biases

In [None]:
def inspect_parameters(model):
    """Print shape, element count and summary statistics of every parameter,
    followed by the total parameter count"""
    banner = "=" * 80
    print("\n" + banner)
    print("WEIGHT AND BIAS INSPECTION")
    print(banner)

    counts = []

    for name, param in model.named_parameters():
        values = param.data
        counts.append(values.numel())

        print(f"\n{name}:")
        print(f"  Shape: {list(values.shape)}")
        print(f"  Parameters: {counts[-1]:,}")

        # Each statistic is computed only when its line is printed
        for label, reduce in (
            ("Mean", values.mean),
            ("Std", values.std),
            ("Min", values.min),
            ("Max", values.max),
        ):
            print(f"  {label}: {reduce().item():.6f}")

        # Show a few raw values for tensors with at most two dimensions
        if values.dim() <= 2:
            print(f"  Sample values: {values.flatten()[:5].tolist()}")

    print(f"\n{banner}")
    print(f"TOTAL PARAMETERS: {sum(counts):,}")
    print(f"{banner}")

# Print the parameter statistics of `model` as it stands at this point in the
# notebook (the training cells come later)
inspect_parameters(model)

## 5. Create Synthetic Training and Validation Data

In [None]:
class TextDataset(Dataset):
    """Synthetic language-modeling dataset of periodic token sequences

    Every sequence repeats its own random motif of 3 to 7 token ids until
    ``seq_len`` tokens are filled. Items are (input, target) pairs where the
    target is the input shifted one position to the left.
    """

    def __init__(self, num_sequences=1000, seq_len=32, vocab_size=100):
        self.seq_len = seq_len
        self.vocab_size = vocab_size

        rows = []
        for _ in range(num_sequences):
            # Draw the motif length first, then the motif itself
            motif_len = np.random.randint(3, 8)
            motif = np.random.randint(0, vocab_size, motif_len)

            # Cycle through the motif until seq_len tokens are produced
            rows.append([motif[pos % motif_len] for pos in range(seq_len)])

        self.data = torch.tensor(rows, dtype=torch.long)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data[idx]
        # Drop the last token for the input and the first token for the target
        return row[:-1], row[1:]

# Build the splits (training first, then validation, so NumPy's generator is
# consumed in a fixed order) and wrap each one in a DataLoader
_split_kwargs = {"seq_len": hp.max_seq_len, "vocab_size": hp.vocab_size}
train_dataset = TextDataset(num_sequences=800, **_split_kwargs)
val_dataset = TextDataset(num_sequences=200, **_split_kwargs)

# Only the training batches are reshuffled every epoch
train_loader = DataLoader(train_dataset, hp.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, hp.batch_size, shuffle=False)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"\nSample sequence: {train_dataset[0][0][:10].tolist()}...")

## 6. Training Loop with Hyperparameter Tracking

In [None]:
# AdamW over every model parameter, with the learning rate and weight decay
# taken from the hyperparameter object
optimizer = torch.optim.AdamW(params=model.parameters(), lr=hp.learning_rate, weight_decay=hp.weight_decay)

# Cosine annealing of the learning rate, one scheduler step per epoch,
# spanning hp.epochs steps
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer, T_max=hp.epochs)

def train_epoch(model, train_loader, optimizer, device, grad_clip):
    """Train for one epoch

    Args:
        model: Module called as ``model(x, y)`` and returning ``(logits, loss)``.
            The loss is assumed to be a mean over the elements of ``y``.
        train_loader: Iterable of ``(x, y)`` tensor batches.
        optimizer: Optimizer holding the model's parameters.
        device: Device the batches are moved to.
        grad_clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Mean training loss per target element over the epoch. Batches are
        weighted by their number of targets, so a smaller final batch does
        not count as much as a full one.

    Raises:
        ValueError: If the loader yields no targets at all.
    """
    model.train()
    loss_sum = 0.0
    n_targets = 0

    for x, y in train_loader:
        x, y = x.to(device), y.to(device)

        # Forward pass
        _, loss = model(x, y)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

        optimizer.step()

        # Undo the per-batch mean so the epoch average is over targets
        count = y.numel()
        loss_sum += loss.item() * count
        n_targets += count

    if n_targets == 0:
        raise ValueError("train_loader produced no training targets")

    return loss_sum / n_targets

@torch.no_grad()
def evaluate(model, val_loader, device):
    """Evaluate model

    Puts the model in eval mode (and leaves it there) and runs it over the
    loader without tracking gradients.

    Args:
        model: Module called as ``model(x, y)`` and returning ``(logits, loss)``.
            The loss is assumed to be a mean over the elements of ``y``.
        val_loader: Iterable of ``(x, y)`` tensor batches.
        device: Device the batches are moved to.

    Returns:
        Mean loss per target element. Batches are weighted by their number of
        targets, so a smaller final batch does not skew the result.

    Raises:
        ValueError: If the loader yields no targets at all.
    """
    model.eval()
    loss_sum = 0.0
    n_targets = 0

    for x, y in val_loader:
        x, y = x.to(device), y.to(device)
        _, loss = model(x, y)

        # Undo the per-batch mean so the final average is over targets
        count = y.numel()
        loss_sum += loss.item() * count
        n_targets += count

    if n_targets == 0:
        raise ValueError("val_loader produced no evaluation targets")

    return loss_sum / n_targets

print("Training functions defined")

In [None]:
# Training loop
train_losses = []
val_losses = []
learning_rates = []

print("\nStarting training...\n")
print(f"{'Epoch':<6} {'Train Loss':<12} {'Val Loss':<12} {'LR':<12} {'Time (s)':<10}")
print("-" * 60)

for epoch in range(hp.epochs):
    start_time = time.time()

    # Read the learning rate BEFORE scheduler.step(): this is the value the
    # optimizer uses during this epoch. Reading it after the step would log
    # the next epoch's rate and end the history on a value never trained with.
    current_lr = optimizer.param_groups[0]['lr']

    # Train
    train_loss = train_epoch(model, train_loader, optimizer, device, hp.grad_clip)

    # Evaluate
    val_loss = evaluate(model, val_loader, device)

    # Step scheduler (once per epoch, after the optimizer updates)
    scheduler.step()

    # Record metrics
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    learning_rates.append(current_lr)

    epoch_time = time.time() - start_time

    print(f"{epoch+1:<6} {train_loss:<12.4f} {val_loss:<12.4f} {current_lr:<12.6f} {epoch_time:<10.2f}")

print("\nTraining complete!")

## 7. Visualize Training Metrics

In [None]:
# Two side-by-side panels: loss curves on the left, learning rate on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
loss_ax, lr_ax = axes

# Left panel: training vs. validation loss per epoch
loss_ax.plot(train_losses, label='Train Loss', marker='o')
loss_ax.plot(val_losses, label='Val Loss', marker='s')
loss_ax.set(xlabel='Epoch', ylabel='Loss', title='Training and Validation Loss')
loss_ax.legend()
loss_ax.grid(True, alpha=0.3)

# Right panel: learning rate recorded for each epoch
lr_ax.plot(learning_rates, marker='o', color='green')
lr_ax.set(xlabel='Epoch', ylabel='Learning Rate', title='Learning Rate Schedule')
lr_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nFinal Train Loss: {train_losses[-1]:.4f}")
print(f"Final Val Loss: {val_losses[-1]:.4f}")

## 8. Analyze Weights After Training

In [None]:
def analyze_weight_changes(model):
    """Report per-parameter statistics and plot weight histograms

    Prints the ten trainable parameters with the largest gradient norm
    (taken from whatever gradients are currently stored on the model) and
    draws histograms for four weight matrices: token embedding, first-block
    query projection, first-block feed-forward input layer and output head.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("WEIGHT ANALYSIS AFTER TRAINING")
    print(banner)

    # One record per trainable parameter; a missing gradient counts as norm 0
    weight_stats = [
        {
            'name': name,
            'mean': param.data.mean().item(),
            'std': param.data.std().item(),
            'grad_norm': param.grad.norm().item() if param.grad is not None else 0.0,
        }
        for name, param in model.named_parameters()
        if param.requires_grad
    ]

    # Largest gradient norm first
    ranked = sorted(weight_stats, key=lambda rec: rec['grad_norm'], reverse=True)

    print("\nTop 10 layers by gradient norm:")
    print(f"{'Layer':<50} {'Mean':<12} {'Std':<12} {'Grad Norm':<12}")
    print("-" * 86)

    for rec in ranked[:10]:
        print(f"{rec['name']:<50} {rec['mean']:<12.6f} {rec['std']:<12.6f} {rec['grad_norm']:<12.6f}")

    # Visualize weight distributions: one histogram per panel, row by row
    first_block = model.blocks[0]
    panels = (
        ('Token Embedding Weights', model.token_embedding.weight, 'blue'),
        ('First Layer Attention Weights', first_block.attention.W_q.weight, 'green'),
        ('First Layer Feed-Forward Weights', first_block.feed_forward.linear1.weight, 'red'),
        ('Output Head Weights', model.lm_head.weight, 'purple'),
    )

    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    for ax, (title, weight, color) in zip(axes.flat, panels):
        flat_values = weight.data.cpu().numpy().flatten()
        ax.hist(flat_values, bins=50, alpha=0.7, color=color)
        ax.set_title(title)
        ax.set_xlabel('Weight Value')
        ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

analyze_weight_changes(model)

## 9. Generate Text Samples

In [None]:
# Generate some sequences
print("\n" + "="*80)
print("GENERATED SEQUENCES")
print("="*80)

# Eval mode switches dropout off while sampling
model.eval()

# Sampling needs no gradients; disabling autograd avoids building a
# computation graph for every generated token
with torch.no_grad():
    for i in range(5):
        # Start with a random token
        start_token = torch.randint(0, hp.vocab_size, (1, 1), device=device)

        # Generate sequence
        generated = model.generate(start_token, max_new_tokens=20, temperature=0.8)

        print(f"\nSample {i+1}:")
        print(f"Tokens: {generated[0].tolist()}")

print("\n" + "="*80)

## 10. Hyperparameter Tuning Experiment

In [None]:
def train_with_hyperparams(learning_rate, weight_decay, dropout, n_layers, epochs=5, seed=None):
    """Train model with specific hyperparameters

    Builds a fresh SimpleLLM from the default HyperParams with the given
    values substituted, trains it on the notebook's ``train_loader`` and
    scores it on ``val_loader``. No learning-rate scheduler is used.

    Args:
        learning_rate: AdamW learning rate.
        weight_decay: AdamW weight decay.
        dropout: Dropout probability used throughout the model.
        n_layers: Number of transformer blocks.
        epochs: Number of training epochs (default 5, kept small for quick
            comparisons).
        seed: Optional integer. When given, PyTorch's global RNG is reseeded
            before the model is built so that different configurations can
            be run from the same starting RNG state. ``None`` leaves the RNG
            untouched.

    Returns:
        Validation loss after the last epoch.
    """
    if seed is not None:
        torch.manual_seed(seed)

    # Create new hyperparameters
    hp_exp = HyperParams()
    hp_exp.learning_rate = learning_rate
    hp_exp.weight_decay = weight_decay
    hp_exp.dropout = dropout
    hp_exp.n_layers = n_layers
    hp_exp.epochs = epochs

    # Create and train model
    model_exp = SimpleLLM(hp_exp).to(device)
    optimizer_exp = torch.optim.AdamW(
        model_exp.parameters(),
        lr=hp_exp.learning_rate,
        weight_decay=hp_exp.weight_decay
    )

    # Train (per-epoch training loss is not needed for the comparison)
    for _ in range(hp_exp.epochs):
        train_epoch(model_exp, train_loader, optimizer_exp, device, hp_exp.grad_clip)

    # Evaluate
    return evaluate(model_exp, val_loader, device)

# Hyperparameter grid search
print("\n" + "="*80)
print("HYPERPARAMETER TUNING EXPERIMENT")
print("="*80)
print("\nTesting different hyperparameter combinations...\n")

results = []

# Every trial restarts PyTorch's global RNG from the same seed so that the
# configurations are compared from the same starting RNG state rather than
# from whatever state the previous trial left behind.
# Note: this resets the global RNG for any cell that runs afterwards.
TUNING_SEED = 42

# (lr, wd, dropout, layers) -> val_loss. The baseline (3e-4, 0.01) appears in
# both sweeps and is trained only once.
_trial_cache = {}


def run_trial(lr, wd, dropout=0.1, layers=2):
    """Train one configuration (or reuse an identical earlier trial), append
    its record to ``results`` and return the validation loss"""
    key = (lr, wd, dropout, layers)
    if key not in _trial_cache:
        torch.manual_seed(TUNING_SEED)
        _trial_cache[key] = train_with_hyperparams(
            learning_rate=lr,
            weight_decay=wd,
            dropout=dropout,
            n_layers=layers
        )
    val_loss = _trial_cache[key]
    results.append({'lr': lr, 'wd': wd, 'dropout': dropout, 'layers': layers, 'val_loss': val_loss})
    return val_loss


# Test different learning rates
learning_rates_test = [1e-4, 3e-4, 5e-4]
for lr in learning_rates_test:
    val_loss = run_trial(lr=lr, wd=0.01)
    print(f"LR={lr:.1e}, Val Loss={val_loss:.4f}")

# Test different weight decay values
weight_decays_test = [0.0, 0.01, 0.05]
for wd in weight_decays_test:
    val_loss = run_trial(lr=3e-4, wd=wd)
    print(f"Weight Decay={wd:.2f}, Val Loss={val_loss:.4f}")

# Find best configuration
best_result = min(results, key=lambda x: x['val_loss'])
print(f"\nBest configuration:")
print(f"  Learning Rate: {best_result['lr']:.1e}")
print(f"  Weight Decay: {best_result['wd']:.2f}")
print(f"  Dropout: {best_result['dropout']:.2f}")
print(f"  Layers: {best_result['layers']}")
print(f"  Validation Loss: {best_result['val_loss']:.4f}")

print("\n" + "="*80)

## 11. Summary and Key Takeaways

In [None]:
# Recap of what the notebook built and configured
print("\n" + "="*80)
print("SUMMARY")
print("="*80)

print("\n✓ Built a transformer-based LLM from scratch")
print("✓ Explicitly defined all weights and biases:")
print("  - Token embeddings")
print("  - Positional embeddings")
print("  - Multi-head attention (Q, K, V, O projections)")
print("  - Feed-forward networks")
print("  - Layer normalization")
print("  - Output projection head")

print("\n✓ Implemented custom weight initialization:")
print("  - Xavier initialization")
print("  - Kaiming initialization")
print("  - Normal initialization")

print("\n✓ Configured hyperparameters:")
print(f"  - Model size: {model.n_params:,} parameters")
print(f"  - Learning rate: {hp.learning_rate}")
print(f"  - Weight decay: {hp.weight_decay}")
print(f"  - Dropout: {hp.dropout}")
print(f"  - Gradient clipping: {hp.grad_clip}")

print("\n✓ Training features:")
# The optimizer used throughout the notebook is torch.optim.AdamW, not Adam
print("  - AdamW optimizer (decoupled weight decay)")
print("  - Cosine annealing learning rate schedule")
print("  - Gradient clipping")
print("  - Cross-entropy loss")

print("\n✓ Performed hyperparameter tuning")
print("✓ Visualized training dynamics")
print("✓ Generated text samples")

print("\n" + "="*80)
print("\nThis notebook demonstrates the complete pipeline of building an LLM")
print("with explicit control over weights, biases, and hyperparameters!")
print("\n" + "="*80)