In [None]:
# Import required modules
from rms_norm import create_rmsnorm_analyzer, print_flop_analysis
from plotting_utils import create_complete_analysis_report

# RMSNorm Analysis Notebook

This notebook provides a comprehensive performance analysis of the RMSNorm module, including:
- FLOP calculations and memory bandwidth analysis
- Scaling behavior across batch sizes, sequence lengths, and embedding dimensions  
- Performance visualizations and optimization insights

The computational code has been moved to `rms_norm.py` for better modularity and reuse.

In [None]:
# FLOP Analysis for Common Configurations

# Typical transformer shapes, smallest to largest. Each row is
# (name, B, S, D) and is expanded into a dict with exactly those keys;
# B/S/D are presumably batch size, sequence length and embedding dimension.
configurations = [
    dict(zip(("name", "B", "S", "D"), shape))
    for shape in (
        ("Small", 8, 512, 768),
        ("Base", 16, 1024, 1024),
        ("Large", 32, 2048, 4096),
        ("XL", 64, 4096, 8192),
    )
]

# Print the FLOP analysis for the configurations above
print_flop_analysis(configurations)

In [None]:
# Quick Memory Bandwidth Test
#
# Benchmarks a few large tensor shapes through `analyzer.safe_benchmark` and
# prints the reported bandwidth next to a reference peak. Skipped without CUDA.

import torch

# Reference peak for the utilization figure, in GB/s. 1008 is the RTX 4090
# peak; adjust it when running on a different GPU.
PEAK_BANDWIDTH_GBPS = 1008

# Bytes per element used for the tensor size estimate (FP16).
BYTES_PER_ELEMENT = 2

if torch.cuda.is_available():
    # Imported at the top of the branch: only the GPU path needs it.
    from rms_norm import BenchmarkConfig

    print("Running quick memory bandwidth test on a few configurations...")

    # Create RMSNorm analyzer
    analyzer = create_rmsnorm_analyzer()

    # Quick test configurations (cache-busting sizes), as (B, S, D)
    quick_configs = [
        (32, 2048, 4096),   # ~0.5GB
        (64, 2048, 4096),   # ~1.0GB
        (32, 4096, 4096),   # ~1.0GB
    ]

    print(f"\nTesting {len(quick_configs)} cache-busting configurations:")
    print("-" * 60)

    for i, (B, S, D) in enumerate(quick_configs, 1):
        # Size estimate for one B x S x D tensor; assumes the benchmark input
        # is FP16 — TODO confirm against `safe_benchmark`.
        tensor_size_gb = B * S * D * BYTES_PER_ELEMENT / 1e9
        print(f"\nConfig {i}: B={B}, S={S}, D={D}")
        print(f"Tensor size: {tensor_size_gb:.2f} GB")

        try:
            # Test with fast config for demonstration
            config = BenchmarkConfig(warmup_runs=3, benchmark_runs=10)
            bandwidth, actual_size = analyzer.safe_benchmark(B, S, D, config)

            # A non-positive bandwidth is reported as a failed run.
            if bandwidth > 0:
                utilization = (bandwidth / PEAK_BANDWIDTH_GBPS) * 100
                print(f"✅ Bandwidth: {bandwidth:.1f} GB/s ({utilization:.1f}% utilization)")
            else:
                print("❌ OOM or benchmark failed")

        except Exception as e:
            # Best-effort demo: report the failure and move on to the next shape.
            print(f"❌ Error: {e}")

        finally:
            # Clear memory between configurations, even if the run is interrupted.
            torch.cuda.empty_cache()

else:
    print("CUDA not available - skipping bandwidth test")

In [None]:
# Comprehensive Analysis with Visualizations
#
# This cell does not run the sweep. It prints a snippet that can be pasted into
# a new cell and executed on a machine with a CUDA GPU.

import torch

# Snippet for the full sweep, kept as text so executing this cell never starts
# the benchmark itself. `\\n` is escaped so the printed code shows `\n`.
analysis_code = '''
# Create RMSNorm analyzer
analyzer = create_rmsnorm_analyzer()

# Run comprehensive analysis with expanded parameter ranges
from rms_norm import BenchmarkConfig

config = BenchmarkConfig(
    warmup_runs=1,           # Fast for large-scale testing
    benchmark_runs=3,        # Fast for large-scale testing  
    memory_factor=6,         # RMSNorm memory access pattern
    percentile_filter=(0.1, 0.9)  # Filter outliers
)

# Run the analysis
df_results = analyzer.run_comprehensive_analysis(
    config=config,
    expanded_ranges=True,    # Use expanded parameter ranges (powers of 2 + multiples)
    peak_bandwidth_gbps=1008 # RTX 4090 theoretical peak
)

# Create complete analysis report with plots
create_complete_analysis_report(
    df_results,
    module_name="RMSNorm",
    peak_bandwidth_gbps=1008,
    show_plots=True,
    top_n=15
)

# Store results for further analysis
globals()["bandwidth_results"] = df_results
print(f"\\nResults stored in 'bandwidth_results' variable with {len(df_results):,} configurations")
'''

if torch.cuda.is_available():
    # The messages describe what the cell really does: it prints the snippet,
    # it does not execute it.
    print("Comprehensive RMSNorm bandwidth analysis (not run automatically).")
    print("This will test 1000+ parameter combinations and generate detailed visualizations.")
    print("⚠️  This may take 30-90 minutes depending on GPU memory and configuration count.")
    print("\nTo run the analysis, copy the code below into a new cell and execute it:")

    print("Code to run comprehensive analysis:")
    print("-" * 50)
    print(analysis_code)

else:
    print("CUDA not available - cannot run comprehensive GPU analysis")
    print("The analysis requires a CUDA-capable GPU for memory bandwidth measurements.")