# TinyLlama Deep Dive: Weights, Biases, and Hyperparameters

This notebook demonstrates:
1. Loading TinyLlama (1.1B parameters)
2. Inspecting model architecture, weights, and biases
3. Understanding hyperparameters
4. Fine-tuning with custom hyperparameters
5. Analyzing weight changes during training

## 1. Installation and Setup

In [None]:
# Install required packages
!pip install torch transformers accelerate datasets bitsandbytes peft trl -q
!pip install matplotlib seaborn pandas numpy -q

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# Plot defaults shared by every figure in this notebook
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Pick the compute device once; report GPU details when one is present
has_cuda = torch.cuda.is_available()
device = torch.device('cuda' if has_cuda else 'cpu')
print(f"Using device: {device}")
if has_cuda:
    gpu_properties = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {gpu_properties.total_memory / 1e9:.2f} GB")

## 2. Load TinyLlama Model and Tokenizer

In [None]:
# Hub identifier of the checkpoint used throughout the notebook
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

print(f"Loading {MODEL_NAME}...")

# Tokenizer: reuse EOS as the padding token and pad on the right
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Model: weights are loaded in float32 and placed automatically on the
# available device(s).
# NOTE(review): the training cell later enables fp16 mixed precision on GPU;
# confirm that combination still works before switching the weights to float16.
model_load_options = dict(
    torch_dtype=torch.float32,
    device_map="auto",
    low_cpu_mem_usage=True,
)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_load_options)

print(f"\n✓ Model loaded successfully!")
print(f"✓ Tokenizer vocabulary size: {len(tokenizer)}")

## 3. Model Architecture Overview

In [None]:
# The architecture description comes straight from the loaded model's config
config = model.config
banner = "=" * 80

print(banner)
print("TINYLLAMA MODEL ARCHITECTURE")
print(banner)
print(f"\nModel Type: {config.model_type}")
print(f"\nKey Hyperparameters:")
print(f"  Vocabulary Size: {config.vocab_size:,}")
print(f"  Hidden Size (d_model): {config.hidden_size}")
print(f"  Number of Layers: {config.num_hidden_layers}")
print(f"  Number of Attention Heads: {config.num_attention_heads}")
print(f"  Number of Key-Value Heads: {config.num_key_value_heads}")
print(f"  Intermediate Size (FFN): {config.intermediate_size}")
print(f"  Max Position Embeddings: {config.max_position_embeddings:,}")
print(f"  RMS Norm Epsilon: {config.rms_norm_eps}")
print(f"  Rope Theta: {config.rope_theta}")
print(f"  Attention Dropout: {config.attention_dropout}")
print(f"  Hidden Activation: {config.hidden_act}")

# Walk the parameters once, recording (element count, trainable?) per tensor
parameter_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(count for count, _ in parameter_sizes)
trainable_params = sum(count for count, is_trainable in parameter_sizes if is_trainable)

print(f"\nParameter Count:")
print(f"  Total Parameters: {total_params:,} ({total_params/1e9:.2f}B)")
print(f"  Trainable Parameters: {trainable_params:,} ({trainable_params/1e9:.2f}B)")
# 4 bytes per parameter, i.e. the footprint if every weight is stored as float32
print(f"  Model Size (float32): {total_params * 4 / 1e9:.2f} GB")
print(banner)

## 4. Detailed Weight and Bias Inspection

In [None]:
def inspect_model_weights(model, num_layers_to_show=2):
    """
    Print and return summary statistics for the model's parameters.

    Parameters whose name contains a numbered ``layers.<i>`` segment are only
    reported when ``i < num_layers_to_show``; every other parameter is always
    reported.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, tensor)`` pairs (e.g. a ``torch.nn.Module``).
        num_layers_to_show: Number of leading numbered layers to include.

    Returns:
        A list with one dict per reported parameter, holding the keys
        ``name``, ``shape``, ``numel``, ``dtype``, ``requires_grad``,
        ``mean``, ``std``, ``min`` and ``max``.
    """
    import re  # local import: only this function needs it

    # Matches "layers.<digits>" followed by "." or end of name. Names that
    # merely contain "layers." without a numeric index (e.g.
    # "extra_layers.weight") do not match and are therefore never filtered,
    # instead of crashing on int() conversion.
    layer_index_pattern = re.compile(r'layers\.(\d+)(?:\.|$)')

    print("\n" + "="*80)
    print("WEIGHT AND BIAS INSPECTION")
    print("="*80)

    weight_info = []

    for name, param in model.named_parameters():
        # Only show the first few numbered layers to avoid clutter
        match = layer_index_pattern.search(name)
        if match is not None and int(match.group(1)) >= num_layers_to_show:
            continue

        # Convert to float32 once and reuse it for all four statistics
        values = param.detach().float()

        weight_info.append({
            'name': name,
            'shape': list(param.shape),
            'numel': param.numel(),
            'dtype': str(param.dtype),
            'requires_grad': param.requires_grad,
            'mean': values.mean().item(),
            'std': values.std().item(),
            'min': values.min().item(),
            'max': values.max().item()
        })

    # Display information
    for info in weight_info:
        print(f"\n{info['name']}:")
        print(f"  Shape: {info['shape']}")
        print(f"  Parameters: {info['numel']:,}")
        print(f"  Data Type: {info['dtype']}")
        print(f"  Trainable: {info['requires_grad']}")
        print(f"  Statistics:")
        print(f"    Mean: {info['mean']:.6f}")
        print(f"    Std:  {info['std']:.6f}")
        print(f"    Min:  {info['min']:.6f}")
        print(f"    Max:  {info['max']:.6f}")

    return weight_info

# Inspect weights from the first 2 numbered layers (plus every parameter that
# is not inside a numbered layer); the returned list of dicts is kept in
# `weight_info`.
weight_info = inspect_model_weights(model, num_layers_to_show=2)

## 5. Visualize Weight Distributions

In [None]:
def visualize_weight_distributions(model, max_samples=100000, seed=0):
    """
    Plot histograms of selected weight tensors in a 2x4 grid.

    The tensors shown are the token embeddings, the layer-0 attention
    Q/K/V projections, the layer-0 MLP gate/up/down projections and the
    LM head.

    Args:
        model: Model exposing ``model.embed_tokens``, ``model.layers[0]``
            (with ``self_attn`` and ``mlp`` sub-modules) and ``lm_head``.
        max_samples: Tensors with more elements than this are randomly
            subsampled (without replacement) for the histogram only.
        seed: Seed for the subsampling generator, so re-running the cell
            draws the same histogram.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    first_layer = model.model.layers[0]

    # Keep references to the tensors and convert them one at a time inside the
    # loop, so only a single flattened array is alive at any moment.
    tensors_to_plot = {
        'Embeddings': model.model.embed_tokens.weight,
        'Layer 0 Self-Attn Q': first_layer.self_attn.q_proj.weight,
        'Layer 0 Self-Attn K': first_layer.self_attn.k_proj.weight,
        'Layer 0 Self-Attn V': first_layer.self_attn.v_proj.weight,
        'Layer 0 MLP Gate': first_layer.mlp.gate_proj.weight,
        'Layer 0 MLP Up': first_layer.mlp.up_proj.weight,
        'Layer 0 MLP Down': first_layer.mlp.down_proj.weight,
        'Output Head (LM Head)': model.lm_head.weight
    }

    rng = np.random.default_rng(seed)

    # Create subplots
    fig, axes = plt.subplots(2, 4, figsize=(20, 10))
    axes = axes.flatten()

    for idx, (name, tensor) in enumerate(tensors_to_plot.items()):
        weights = tensor.detach().cpu().float().numpy().ravel()

        # Statistics describe the full tensor, so compute them before any
        # subsampling takes place.
        mean = weights.mean()
        std = weights.std()

        # Subsample large tensors for plotting performance only
        if len(weights) > max_samples:
            weights = rng.choice(weights, max_samples, replace=False)

        axes[idx].hist(weights, bins=100, alpha=0.7, color=f'C{idx}', edgecolor='black')
        axes[idx].set_title(name, fontsize=10, fontweight='bold')
        axes[idx].set_xlabel('Weight Value')
        axes[idx].set_ylabel('Frequency')
        axes[idx].axvline(x=0, color='red', linestyle='--', linewidth=1, alpha=0.5)

        # Annotate with the full-tensor statistics
        axes[idx].text(0.02, 0.98, f'μ={mean:.4f}\nσ={std:.4f}',
                      transform=axes[idx].transAxes,
                      verticalalignment='top',
                      bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
                      fontsize=8)

    plt.tight_layout()
    plt.suptitle('Weight Distributions Across TinyLlama Layers', y=1.02, fontsize=14, fontweight='bold')
    plt.show()

# Draw the histogram grid for the loaded model
visualize_weight_distributions(model)

## 6. Layer-by-Layer Parameter Analysis

In [None]:
def _collect_layer_stats(model):
    """Return one dict of parameter counts and weight statistics per transformer layer."""
    layer_stats = []

    for layer_idx, layer in enumerate(model.model.layers):
        # All parameters owned by this layer
        layer_params = sum(p.numel() for p in layer.parameters())

        # Attention projection weights (Q, K, V, output)
        attn = layer.self_attn
        attn_params = sum(
            proj.weight.numel()
            for proj in (attn.q_proj, attn.k_proj, attn.v_proj, attn.o_proj)
        )

        # MLP projection weights (gate, up, down)
        mlp = layer.mlp
        mlp_params = sum(
            proj.weight.numel()
            for proj in (mlp.gate_proj, mlp.up_proj, mlp.down_proj)
        )

        # Concatenate the flattened tensors as one float32 array. Extending a
        # Python list element by element would create one Python object per
        # weight, which is extremely slow and memory hungry at this scale.
        all_weights = np.concatenate([
            p.detach().cpu().float().numpy().ravel() for p in layer.parameters()
        ])

        layer_stats.append({
            'layer': layer_idx,
            'total_params': layer_params,
            'attn_params': attn_params,
            'mlp_params': mlp_params,
            'mean': all_weights.mean(),
            'std': all_weights.std(),
            'abs_mean': np.abs(all_weights).mean()
        })

    return layer_stats


def _plot_layer_stats(df):
    """Plot per-layer parameter counts and mean absolute weight from the stats frame."""
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    # Total parameters per layer
    axes[0].bar(df['layer'], df['total_params'] / 1e6, color='steelblue', edgecolor='black')
    axes[0].set_xlabel('Layer Index')
    axes[0].set_ylabel('Parameters (Millions)')
    axes[0].set_title('Total Parameters per Layer')
    axes[0].grid(axis='y', alpha=0.3)

    # Attention vs MLP parameters, drawn as side-by-side bars
    width = 0.35
    x = np.arange(len(df))
    axes[1].bar(x - width/2, df['attn_params'] / 1e6, width, label='Attention', color='coral', edgecolor='black')
    axes[1].bar(x + width/2, df['mlp_params'] / 1e6, width, label='MLP', color='lightgreen', edgecolor='black')
    axes[1].set_xlabel('Layer Index')
    axes[1].set_ylabel('Parameters (Millions)')
    axes[1].set_title('Attention vs MLP Parameters')
    axes[1].legend()
    axes[1].grid(axis='y', alpha=0.3)

    # Weight magnitude across layers
    axes[2].plot(df['layer'], df['abs_mean'], marker='o', linewidth=2, markersize=8, color='purple')
    axes[2].set_xlabel('Layer Index')
    axes[2].set_ylabel('Mean Absolute Weight Value')
    axes[2].set_title('Weight Magnitude Across Layers')
    axes[2].grid(alpha=0.3)

    plt.tight_layout()
    plt.show()


def analyze_layer_parameters(model):
    """
    Analyze parameters across all transformer layers.

    For every entry of ``model.model.layers`` this collects the total
    parameter count, the attention and MLP projection weight counts, and the
    mean / std / mean-absolute value over all of the layer's parameters. The
    table is printed and three summary plots are shown.

    Args:
        model: Model exposing ``model.layers``, where each layer has
            ``self_attn`` (q/k/v/o projections) and ``mlp`` (gate/up/down
            projections) sub-modules.

    Returns:
        pandas.DataFrame with one row per layer and the columns ``layer``,
        ``total_params``, ``attn_params``, ``mlp_params``, ``mean``, ``std``
        and ``abs_mean``.
    """
    df = pd.DataFrame(_collect_layer_stats(model))

    # Display table
    print("\n" + "="*80)
    print("LAYER-BY-LAYER PARAMETER ANALYSIS")
    print("="*80)
    print(df.to_string(index=False))

    _plot_layer_stats(df)

    return df

# Run the per-layer analysis and keep the resulting DataFrame in `layer_df`
layer_df = analyze_layer_parameters(model)

## 7. Test Model Inference (Before Training)

In [None]:
def generate_text(model, tokenizer, prompt, max_length=100, temperature=0.7):
    """
    Generate a continuation of ``prompt`` and return it as decoded text.

    Args:
        model: Causal language model exposing ``eval()``, ``device`` and
            ``generate(**kwargs)``.
        tokenizer: Tokenizer used to encode the prompt and decode the output;
            its ``eos_token_id`` is used as the padding id during generation.
        prompt: Text to continue.
        max_length: Forwarded to ``generate`` as ``max_length``.
        temperature: Sampling temperature. Positive values sample with
            nucleus sampling (``top_p=0.95``). A value of 0 or below selects
            deterministic greedy decoding, because sampling requires a
            strictly positive temperature.

    Returns:
        The decoded first generated sequence, with special tokens removed.

    Side effects:
        Puts ``model`` into evaluation mode.
    """
    model.eval()

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    generation_kwargs = {
        'max_length': max_length,
        'pad_token_id': tokenizer.eos_token_id,
    }
    use_greedy = temperature is not None and temperature <= 0
    if use_greedy:
        # Non-positive temperature is invalid for sampling; decode greedily
        generation_kwargs['do_sample'] = False
    else:
        generation_kwargs.update(
            temperature=temperature,
            do_sample=True,
            top_p=0.95,
        )

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Test prompts
test_prompts = [
    "Once upon a time",
    "The key to machine learning is",
    "In a world where AI"
]

# Generation samples tokens at random; seeding before every prompt makes the
# printed text reproducible across kernel restarts.
GENERATION_SEED = 42

print("\n" + "="*80)
print("BASELINE TEXT GENERATION (Before Fine-tuning)")
print("="*80)

for prompt in test_prompts:
    print(f"\nPrompt: {prompt}")
    print("-" * 80)
    torch.manual_seed(GENERATION_SEED)
    generated = generate_text(model, tokenizer, prompt, max_length=80)
    print(generated)
    print("-" * 80)

## 8. Create Training Dataset

In [None]:
# Create a simple training dataset: 20 one-sentence statements about machine
# learning, repeated 5 times (100 samples in total, each sentence appearing
# 5 times).
training_texts = [
    "Machine learning is a subset of artificial intelligence that enables systems to learn from data.",
    "Deep learning uses neural networks with multiple layers to extract features from raw data.",
    "Natural language processing helps computers understand and generate human language.",
    "Computer vision enables machines to interpret and understand visual information from the world.",
    "Reinforcement learning trains agents to make decisions by rewarding desired behaviors.",
    "Transfer learning allows models to leverage knowledge from one task to improve performance on another.",
    "Neural networks are composed of layers of interconnected nodes called neurons.",
    "Backpropagation is an algorithm used to train neural networks by computing gradients.",
    "Convolutional neural networks are particularly effective for image recognition tasks.",
    "Recurrent neural networks are designed to process sequential data like text and time series.",
    "Transformer models use self-attention mechanisms to process input sequences in parallel.",
    "Large language models are trained on massive amounts of text data to understand language.",
    "Fine-tuning adapts pre-trained models to specific tasks with relatively small datasets.",
    "Overfitting occurs when a model learns training data too well and fails to generalize.",
    "Regularization techniques help prevent overfitting by constraining model complexity.",
    "Batch normalization improves training stability by normalizing layer inputs.",
    "Dropout is a regularization technique that randomly deactivates neurons during training.",
    "Learning rate determines how quickly a model updates its weights during training.",
    "Gradient descent is an optimization algorithm that minimizes the loss function.",
    "The attention mechanism allows models to focus on relevant parts of the input.",
] * 5  # Repeat to have more training samples

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch, truncating and padding to 128 tokens."""
    tokenizer_options = {
        "truncation": True,
        "max_length": 128,
        "padding": "max_length",
    }
    return tokenizer(examples["text"], **tokenizer_options)

# Wrap the raw sentences in a Dataset, then tokenize in batches; the original
# "text" column is dropped so only the tokenizer outputs remain.
train_dataset = Dataset.from_dict({"text": training_texts})
tokenized_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

sample_count = len(tokenized_dataset)
print(f"Training dataset size: {sample_count} samples")
print(f"Sample: {training_texts[0]}")

## 9. Configure Training Hyperparameters

In [None]:
# All tunable training settings live in one place
class TrainingHyperparameters:
    """Plain namespace of class-level attributes holding the training hyperparameters."""

    # --- Schedule: how long and in what batch shape we train ---
    num_train_epochs = 3
    per_device_train_batch_size = 2
    gradient_accumulation_steps = 4
    lr_scheduler_type = "cosine"
    warmup_ratio = 0.1

    # --- Optimizer ---
    learning_rate = 2e-5
    weight_decay = 0.01
    adam_beta1 = 0.9
    adam_beta2 = 0.999
    adam_epsilon = 1e-8
    max_grad_norm = 1.0  # gradient clipping threshold

    # --- Logging / checkpoint cadence (in steps) ---
    logging_steps = 10
    save_steps = 50
    eval_steps = 50
    save_total_limit = 2

    # --- Precision / memory ---
    fp16 = torch.cuda.is_available()  # mixed precision only when a GPU is present
    gradient_checkpointing = False  # memory optimization, off by default
    
hp_train = TrainingHyperparameters()

# Print every public attribute (dir() yields them in alphabetical order)
print("\n" + "="*80)
print("TRAINING HYPERPARAMETERS")
print("="*80)
public_names = [name for name in dir(hp_train) if not name.startswith('_')]
for name in public_names:
    print(f"  {name}: {getattr(hp_train, name)}")
print("="*80)

## 10. Capture Initial Weight Statistics

In [None]:
def capture_weight_statistics(model, stage="initial", layers_to_track=None):
    """
    Capture summary statistics for a selection of the model's parameters.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, tensor)`` pairs.
        stage: Free-form label for the snapshot. It is accepted for caller
            readability only and does not influence the result.
        layers_to_track: Optional iterable of exact parameter names to
            summarize. When omitted, a fixed sample is used: the token
            embeddings, the q_proj and gate_proj weights of layers 0 and 5,
            and the LM head.

    Returns:
        Dict mapping each tracked parameter name that exists in the model to
        a dict with ``mean``, ``std``, ``min``, ``max``, ``abs_mean`` and
        ``l2_norm`` (all Python floats). Tracked names that the model does
        not have are silently absent from the result.
    """
    if layers_to_track is None:
        layers_to_track = [
            'model.embed_tokens.weight',
            'model.layers.0.self_attn.q_proj.weight',
            'model.layers.0.mlp.gate_proj.weight',
            'model.layers.5.self_attn.q_proj.weight',
            'model.layers.5.mlp.gate_proj.weight',
            'lm_head.weight'
        ]
    tracked_names = set(layers_to_track)

    stats = {}

    for name, param in model.named_parameters():
        if name not in tracked_names:
            continue

        # ravel() avoids the extra copy flatten() would make; the array is
        # only read, never modified.
        weight_data = param.detach().cpu().float().numpy().ravel()
        stats[name] = {
            'mean': float(weight_data.mean()),
            'std': float(weight_data.std()),
            'min': float(weight_data.min()),
            'max': float(weight_data.max()),
            'abs_mean': float(np.abs(weight_data).mean()),
            'l2_norm': float(np.linalg.norm(weight_data))
        }

    return stats

# Snapshot the tracked tensors before any fine-tuning happens
initial_weights = capture_weight_statistics(model, "initial")

print("\n" + "="*80)
print("INITIAL WEIGHT STATISTICS (Before Training)")
print("="*80)
for layer_name in initial_weights:
    print(f"\n{layer_name}:")
    layer_summary = initial_weights[layer_name]
    for stat_name in layer_summary:
        print(f"  {stat_name}: {layer_summary[stat_name]:.6f}")

## 11. Fine-tune the Model

In [None]:
# Configure training arguments from the hyperparameter container
training_args = TrainingArguments(
    output_dir="./tinyllama-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=hp_train.num_train_epochs,
    per_device_train_batch_size=hp_train.per_device_train_batch_size,
    gradient_accumulation_steps=hp_train.gradient_accumulation_steps,
    learning_rate=hp_train.learning_rate,
    weight_decay=hp_train.weight_decay,
    adam_beta1=hp_train.adam_beta1,
    adam_beta2=hp_train.adam_beta2,
    adam_epsilon=hp_train.adam_epsilon,
    max_grad_norm=hp_train.max_grad_norm,
    warmup_ratio=hp_train.warmup_ratio,
    lr_scheduler_type=hp_train.lr_scheduler_type,
    logging_steps=hp_train.logging_steps,
    save_steps=hp_train.save_steps,
    save_total_limit=hp_train.save_total_limit,
    fp16=hp_train.fp16,
    # Wire the declared hyperparameter through so changing it has an effect
    gradient_checkpointing=hp_train.gradient_checkpointing,
    logging_dir="./logs",
    report_to="none",  # Disable wandb/tensorboard for simplicity
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Causal language modeling
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# trainer.state.max_steps is still 0 before train() runs, so estimate the step
# count from the dataset size instead. -(-a // b) is integer ceiling division.
# NOTE(review): assumes a single data-parallel worker; the exact count is set
# by the Trainer and is reported after training below.
effective_batch_size = hp_train.per_device_train_batch_size * hp_train.gradient_accumulation_steps
batches_per_epoch = -(-len(tokenized_dataset) // hp_train.per_device_train_batch_size)
updates_per_epoch = max(-(-batches_per_epoch // hp_train.gradient_accumulation_steps), 1)
estimated_total_steps = updates_per_epoch * hp_train.num_train_epochs

print("\n" + "="*80)
print("STARTING FINE-TUNING")
print("="*80)
print(f"Total optimization steps: ~{estimated_total_steps} (estimate)")
print(f"Effective batch size: {effective_batch_size}")
print("="*80)

# Train the model
train_result = trainer.train()

print("\n" + "="*80)
print("TRAINING COMPLETED")
print("="*80)
print(f"Optimization steps run: {trainer.state.global_step}")
print(f"Training loss: {train_result.training_loss:.4f}")
print(f"Training time: {train_result.metrics['train_runtime']:.2f} seconds")
print(f"Samples per second: {train_result.metrics['train_samples_per_second']:.2f}")
print("="*80)

## 12. Analyze Weight Changes After Training

In [None]:
# Capture final weights
final_weights = capture_weight_statistics(model, "final")

# Compare weight changes. These are differences between summary statistics of
# each tensor before and after training, not element-wise weight deltas.
print("\n" + "="*80)
print("WEIGHT CHANGES AFTER TRAINING")
print("="*80)

weight_changes = []

for layer_name in initial_weights.keys():
    initial = initial_weights[layer_name]
    final = final_weights[layer_name]

    mean_change = final['mean'] - initial['mean']
    std_change = final['std'] - initial['std']
    l2_change = final['l2_norm'] - initial['l2_norm']
    # Guard against an all-zero tensor, whose relative change is undefined
    if initial['l2_norm']:
        l2_pct_change = (l2_change / initial['l2_norm']) * 100
    else:
        l2_pct_change = 0.0

    print(f"\n{layer_name}:")
    print(f"  Mean change: {mean_change:+.6f} ({initial['mean']:.6f} → {final['mean']:.6f})")
    print(f"  Std change: {std_change:+.6f} ({initial['std']:.6f} → {final['std']:.6f})")
    print(f"  L2 norm change: {l2_change:+.2f} ({l2_pct_change:+.4f}%)")

    # Build a short but unique tick label: drop the leading "model." and the
    # trailing ".weight" while keeping the layer index. Using only the
    # second-to-last name component would label layer 0 and layer 5
    # projections identically.
    short_label = layer_name
    if short_label.startswith('model.'):
        short_label = short_label[len('model.'):]
    if short_label.endswith('.weight'):
        short_label = short_label[:-len('.weight')]

    weight_changes.append({
        'layer': short_label,
        'l2_pct_change': abs(l2_pct_change),
        'mean_change': abs(mean_change)
    })

# Visualize weight changes
df_changes = pd.DataFrame(weight_changes)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# L2 norm change
axes[0].bar(range(len(df_changes)), df_changes['l2_pct_change'], color='steelblue', edgecolor='black')
axes[0].set_xticks(range(len(df_changes)))
axes[0].set_xticklabels(df_changes['layer'], rotation=45, ha='right')
axes[0].set_ylabel('L2 Norm Change (%)')
axes[0].set_title('Weight Magnitude Changes After Training')
axes[0].grid(axis='y', alpha=0.3)

# Mean change
axes[1].bar(range(len(df_changes)), df_changes['mean_change'], color='coral', edgecolor='black')
axes[1].set_xticks(range(len(df_changes)))
axes[1].set_xticklabels(df_changes['layer'], rotation=45, ha='right')
axes[1].set_ylabel('Absolute Mean Change')
axes[1].set_title('Mean Weight Value Changes')
axes[1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 13. Compare Text Generation (Before vs After)

In [None]:
print("\n" + "="*80)
print("TEXT GENERATION COMPARISON (After Fine-tuning)")
print("="*80)

# Generation samples tokens at random. Seeding before every prompt keeps the
# printed text reproducible, so differences between runs reflect the model
# weights rather than sampling noise.
comparison_seed = 42

for prompt in test_prompts:
    print(f"\nPrompt: {prompt}")
    print("-" * 80)
    torch.manual_seed(comparison_seed)
    generated = generate_text(model, tokenizer, prompt, max_length=80)
    print(generated)
    print("-" * 80)

## 14. Visualize Training Metrics

In [None]:
# Extract training logs (a list of dicts recorded by the Trainer)
logs = getattr(trainer.state, 'log_history', None)

if logs:
    # Keep the step alongside each value so both curves share a true
    # "training step" x-axis rather than the index of the log entry.
    loss_steps = []
    losses = []
    lr_steps = []
    learning_rates = []

    for entry_index, log in enumerate(logs):
        step = log.get('step', entry_index)
        if 'loss' in log:
            loss_steps.append(step)
            losses.append(log['loss'])
        if 'learning_rate' in log:
            lr_steps.append(step)
            learning_rates.append(log['learning_rate'])

    # Plot training curves
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Training loss
    if losses:
        axes[0].plot(loss_steps, losses, marker='o', linewidth=2, markersize=6, color='darkred')
        axes[0].set_xlabel('Training Steps')
        axes[0].set_ylabel('Loss')
        axes[0].set_title('Training Loss Curve')
        axes[0].grid(alpha=0.3)

    # Learning rate schedule
    if learning_rates:
        axes[1].plot(lr_steps, learning_rates, marker='o', linewidth=2, markersize=6, color='darkgreen')
        axes[1].set_xlabel('Training Steps')
        axes[1].set_ylabel('Learning Rate')
        axes[1].set_title('Learning Rate Schedule')
        axes[1].ticklabel_format(style='scientific', axis='y', scilimits=(0,0))
        axes[1].grid(alpha=0.3)

    plt.tight_layout()
    plt.show()
else:
    # Covers both a missing attribute and an empty history
    print("Training logs not available")

## 15. Gradient Analysis (if available)

In [None]:
# Perform a forward and backward pass to inspect gradients
model.train()

# Get a sample batch (one padded example)
sample_input = tokenized_dataset[0]
input_ids = torch.tensor([sample_input['input_ids']]).to(model.device)
attention_mask = torch.tensor([sample_input['attention_mask']]).to(model.device)

# The example was padded to a fixed length, so most positions are padding.
# Mark them with -100 so the loss ignores them; otherwise the loss and its
# gradients would mostly measure how well the model predicts padding. The
# attention mask is used because the pad token is the EOS token, so masking
# by token id would be ambiguous.
labels = input_ids.clone()
labels[attention_mask == 0] = -100

# Gradients accumulate across backward passes; start from a clean slate so the
# statistics below reflect this batch only.
model.zero_grad()

# Forward pass
outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
loss = outputs.loss

# Backward pass
loss.backward()

# Collect gradient statistics
print("\n" + "="*80)
print("GRADIENT ANALYSIS (Sample Batch)")
print("="*80)

gradient_stats = []

for name, param in model.named_parameters():
    if param.grad is not None and 'layers.0' in name:  # Show only first layer
        grad_data = param.grad.detach().cpu().float().numpy().ravel()

        gradient_stats.append({
            'name': name,
            'grad_mean': float(grad_data.mean()),
            'grad_std': float(grad_data.std()),
            'grad_norm': float(np.linalg.norm(grad_data)),
            'grad_max': float(grad_data.max()),
            'grad_min': float(grad_data.min())
        })

# Display gradient statistics
for grad_stat in gradient_stats:
    print(f"\n{grad_stat['name']}:")
    print(f"  Gradient mean: {grad_stat['grad_mean']:.8f}")
    print(f"  Gradient std: {grad_stat['grad_std']:.8f}")
    print(f"  Gradient norm: {grad_stat['grad_norm']:.4f}")
    print(f"  Gradient range: [{grad_stat['grad_min']:.8f}, {grad_stat['grad_max']:.8f}]")

# Clear gradients
model.zero_grad()

## 16. Hyperparameter Impact Summary

In [None]:
print("\n" + "="*80)
print("HYPERPARAMETER IMPACT SUMMARY")
print("="*80)

# One row per hyperparameter: (name, configured value, effect, usual range)
impact_columns = ['Parameter', 'Current Value', 'Impact', 'Typical Range']
impact_rows = [
    (
        'Learning Rate',
        hp_train.learning_rate,
        'Controls step size in weight updates. Too high → instability, too low → slow convergence',
        '1e-5 to 1e-3',
    ),
    (
        'Weight Decay',
        hp_train.weight_decay,
        'L2 regularization to prevent overfitting. Higher values → simpler models',
        '0.0 to 0.1',
    ),
    (
        'Batch Size',
        hp_train.per_device_train_batch_size,
        'Number of samples per update. Larger batches → more stable gradients, more memory',
        '1 to 64',
    ),
    (
        'Gradient Accumulation',
        hp_train.gradient_accumulation_steps,
        'Simulates larger batches by accumulating gradients. Saves memory',
        '1 to 16',
    ),
    (
        'Max Grad Norm',
        hp_train.max_grad_norm,
        'Clips gradient magnitude to prevent exploding gradients',
        '0.5 to 5.0',
    ),
    (
        'Warmup Ratio',
        hp_train.warmup_ratio,
        'Fraction of training with LR warmup. Stabilizes initial training',
        '0.0 to 0.2',
    ),
]

# Expand each tuple into a dict keyed by column name
hyperparameter_impacts = [dict(zip(impact_columns, row)) for row in impact_rows]

df_hp = pd.DataFrame(hyperparameter_impacts)
print(df_hp.to_string(index=False))
print("\n" + "="*80)

## 17. Summary and Key Takeaways

In [None]:
# Final recap: values are read from variables defined in earlier cells
# (total_params, config, hp_train, tokenized_dataset).
print("\n" + "="*80)
print("SUMMARY: TINYLLAMA WEIGHTS, BIASES & HYPERPARAMETERS")
print("="*80)

print("\n✓ MODEL ARCHITECTURE:")
print(f"  - Total Parameters: {total_params:,} ({total_params/1e9:.2f}B)")
print(f"  - Hidden Size: {config.hidden_size}")
print(f"  - Number of Layers: {config.num_hidden_layers}")
print(f"  - Attention Heads: {config.num_attention_heads}")
print(f"  - Vocabulary Size: {config.vocab_size:,}")

print("\n✓ WEIGHTS & BIASES EXPLORED:")
print("  - Token embeddings: Maps vocabulary to continuous vectors")
print("  - Q, K, V projections: Self-attention mechanism weights")
print("  - MLP layers: Feed-forward transformation weights")
# LLaMA-style RMSNorm has a learned scale only, so "weights & biases" was wrong
print("  - Layer norms: RMSNorm scale parameters (weights only, no biases)")
print("  - Output head: Final projection to vocabulary")

print("\n✓ HYPERPARAMETERS CONFIGURED:")
print(f"  - Learning Rate: {hp_train.learning_rate}")
print(f"  - Weight Decay: {hp_train.weight_decay}")
print(f"  - Batch Size: {hp_train.per_device_train_batch_size}")
print(f"  - Gradient Accumulation: {hp_train.gradient_accumulation_steps}")
print(f"  - Epochs: {hp_train.num_train_epochs}")
print(f"  - Warmup Ratio: {hp_train.warmup_ratio}")
print(f"  - LR Schedule: {hp_train.lr_scheduler_type}")
print(f"  - Gradient Clipping: {hp_train.max_grad_norm}")

print("\n✓ TRAINING COMPLETED:")
print(f"  - Fine-tuning dataset: {len(tokenized_dataset)} samples")
# Only a handful of tensors were tracked, so do not claim "all layers"
print(f"  - Weight changes measured on a sample of tracked tensors")
print(f"  - Model adapted to domain-specific data")

print("\n✓ KEY INSIGHTS:")
print("  - All layers contain explicit weights (no biases in most LLaMA components)")
print("  - Weights are initialized and updated through backpropagation")
print("  - Hyperparameters control training dynamics and model behavior")
print("  - Fine-tuning modifies weights to adapt to new data")
print("  - Weight distributions change subtly but measurably during training")

print("\n" + "="*80)
print("This notebook demonstrates the complete lifecycle of working with")
print("a production LLM: architecture, weights, biases, hyperparameters,")
print("and training dynamics!")
print("="*80)