# AdaAttn Research Quickstart

This notebook provides a quick introduction to using AdaAttn for research.

## Setup

In [None]:
import sys
import os
sys.path.append('/workspace/AdaAttn/src')

import torch
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML

# AdaAttn imports
from adaattn.attention import AdaAttention, AdaptiveRankAttention, AdaptivePrecisionAttention, DenseAttention
from adaattn.utils.research_logger import get_research_logger, log_metrics
from adaattn.utils.research_monitor import create_monitor

# Pick the compute device first, then report the runtime environment.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
# The GPU name is only queried when a CUDA device was actually selected.
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name()}")
print(f"Using device: {device}")

## 1. Basic Attention Comparison

Let's compare different attention mechanisms on the same batch of random input tensors.

In [None]:
# Experiment dimensions shared by every cell below.
batch_size = 4
seq_len = 512
hidden_size = 512
num_heads = 8

# Three independent random inputs, drawn in query -> key -> value order.
query, key, value = (
    torch.randn(batch_size, seq_len, hidden_size, device=device)
    for _ in range(3)
)

print(f"Input shape: {query.shape}")

# Build every mechanism first (the tuple is fully evaluated before the
# comprehension runs), then move each one to the selected device.
attention_configs = {
    label: module.to(device)
    for label, module in (
        ('Dense', DenseAttention(hidden_size, num_heads)),
        ('AdaptiveRank', AdaptiveRankAttention(hidden_size, num_heads, rank_ratio=0.5)),
        ('AdaptivePrecision', AdaptivePrecisionAttention(hidden_size, num_heads)),
        ('AdaAttention', AdaAttention(
            hidden_size,
            num_heads,
            enable_gpu_optimization=torch.cuda.is_available(),
        )),
    )
}

print("Attention mechanisms initialized successfully!")

## 2. Performance Benchmarking

In [None]:
import time
from collections import defaultdict

# Benchmark each attention mechanism on the shared (query, key, value) inputs.
# `results` is keyed by mechanism name; each entry is replaced below with a
# summary dict (mean/std wall-clock time, mean peak memory, output shape).
results = defaultdict(list)
num_runs = 10
use_cuda = torch.cuda.is_available()

for name, attention in attention_configs.items():
    attention.eval()

    print(f"Benchmarking {name}...")

    # Warm up so one-time costs are not included in the timed runs.
    with torch.no_grad():
        for _ in range(3):
            _ = attention(query, key, value)

    # Per-run wall-clock seconds and peak allocated memory (GB).
    times = []
    memories = []

    for _ in range(num_runs):
        if use_cuda:
            torch.cuda.empty_cache()
            # Replaces the deprecated torch.cuda.reset_max_memory_allocated().
            torch.cuda.reset_peak_memory_stats()
            # CUDA work is queued asynchronously: drain anything still pending
            # (warm-up or the previous run) so it is not billed to this run.
            torch.cuda.synchronize()

        # perf_counter is monotonic and higher resolution than time.time().
        start_time = time.perf_counter()

        with torch.no_grad():
            output = attention(query, key, value)

        if use_cuda:
            # Wait for the forward pass to actually finish before stopping the clock.
            torch.cuda.synchronize()

        end_time = time.perf_counter()

        times.append(end_time - start_time)

        if use_cuda:
            memories.append(torch.cuda.max_memory_allocated() / 1024**3)  # GB
        else:
            # Peak memory is only tracked for CUDA; record 0 on CPU.
            memories.append(0)

    mean_time = np.mean(times)
    std_time = np.std(times)
    mean_memory = np.mean(memories)

    results[name] = {
        'mean_time': mean_time,
        'std_time': std_time,
        'mean_memory': mean_memory,
        'output_shape': list(output.shape)
    }

    print(f"  Mean time: {mean_time*1000:.2f} ± {std_time*1000:.2f} ms")
    print(f"  Mean memory: {mean_memory:.3f} GB")

print("\nBenchmarking complete!")

## 3. Visualization of Results

In [None]:
# Side-by-side bar charts: mean latency (with std error bars) and mean peak memory.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

names = list(results.keys())

# Time comparison. Distinct names are used so the benchmark cell's raw
# `times` / `memories` lists are not silently overwritten by this cell.
mean_times_ms = [results[name]['mean_time'] * 1000 for name in names]  # seconds -> ms
std_times_ms = [results[name]['std_time'] * 1000 for name in names]

ax1.bar(names, mean_times_ms, yerr=std_times_ms, capsize=5, alpha=0.7)
ax1.set_ylabel('Time (ms)')
ax1.set_title('Attention Mechanism Performance')
ax1.tick_params(axis='x', rotation=45)

# Memory comparison
mean_memories_gb = [results[name]['mean_memory'] for name in names]
ax2.bar(names, mean_memories_gb, alpha=0.7, color='orange')
ax2.set_ylabel('Memory (GB)')
ax2.set_title('Memory Usage Comparison')
ax2.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Print summary table
print("\n=== Performance Summary ===")
print(f"{'Attention Type':<20} {'Time (ms)':<12} {'Memory (GB)':<12} {'Speedup':<10}")
print("-" * 60)

# Speedup is reported relative to the dense baseline (values > 1 are faster).
baseline_time = results['Dense']['mean_time']
for name in names:
    time_ms = results[name]['mean_time'] * 1000
    memory_gb = results[name]['mean_memory']
    speedup = baseline_time / results[name]['mean_time']

    # No width padding on the last column, so the "x" suffix stays attached
    # to the number ("1.25x" rather than "1.25      x").
    print(f"{name:<20} {time_ms:<12.2f} {memory_gb:<12.3f} {speedup:.2f}x")

## 4. Adaptive Behavior Analysis

Let's analyze the adaptive behavior of AdaAttention.

In [None]:
# Research logger used for this analysis and by the cells that follow.
logger = get_research_logger(
    experiment_name="adaptive_behavior_analysis",
    log_dir="/workspace/logs",
    enable_tensorboard=True,
)

print("Starting adaptive behavior analysis...")

# A fresh AdaAttention instance, in eval mode, dedicated to this analysis.
ada_attention = AdaAttention(
    hidden_size,
    num_heads,
    enable_gpu_optimization=torch.cuda.is_available(),
).to(device)
ada_attention.eval()

# Input patterns to probe; 'structured' starts as zeros and is filled in below.
patterns = {
    'random': torch.randn(batch_size, seq_len, hidden_size, device=device),
    'structured': torch.zeros(batch_size, seq_len, hidden_size, device=device),
    'sparse': torch.randn(batch_size, seq_len, hidden_size, device=device) * 0.1,
}

# First quarter of the sequence is +1, third quarter is -1, the rest stays 0.
structured = patterns['structured']
structured[:, :seq_len // 4] = 1.0
structured[:, seq_len // 2:3 * seq_len // 4] = -1.0

adaptation_stats = {}

for label, batch in patterns.items():
    print(f"\nAnalyzing {label} pattern...")

    # Self-attention: the same tensor serves as query, key and value.
    with torch.no_grad():
        output = ada_attention(batch, batch, batch)

    # Statistics are optional; skip the reporting when the module lacks them.
    if not hasattr(ada_attention, 'get_statistics'):
        continue

    stats = ada_attention.get_statistics()
    adaptation_stats[label] = stats

    low_rank_usage = stats.get('low_rank_usage', 0)
    print(f"  Low-rank usage: {low_rank_usage:.1%}")
    print(f"  Precision distribution: {stats.get('precision_distribution', {})}")

    # Mirror the headline numbers into the research logger.
    logger.log_metrics(
        custom_metrics={
            f"pattern/{label}/low_rank_usage": low_rank_usage,
            f"pattern/{label}/mean_entropy": stats.get('mean_entropy', 0),
        }
    )

print("\nAdaptive behavior analysis complete!")
print("Check TensorBoard for detailed metrics: http://localhost:6006")

## 5. Real-time Monitoring Example

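This cell simulates a training loop and logs per-step metrics through the research logger. The real-time monitor itself is meant to be started in a separate process, so the `create_monitor` call is left commented out.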

In [None]:
# Fake training loop: runs forward passes and logs synthetic loss/accuracy curves.
print("Simulating training with real-time monitoring...")

# Note: In a real scenario, you'd start the monitor in a separate process
# monitor = create_monitor("training_simulation")

num_epochs = 5
steps_per_epoch = 10

for epoch in range(num_epochs):
    epoch_loss = 0

    for step in range(steps_per_epoch):
        # Step index counted across all epochs; drives the synthetic trends.
        global_step = epoch * steps_per_epoch + step

        # Forward pass only - no gradients, nothing is actually trained.
        with torch.no_grad():
            output = ada_attention(query, key, value)

        # Linear trend plus Gaussian noise (loss drawn first, then accuracy).
        fake_loss = 1.0 - global_step * 0.01 + np.random.normal(0, 0.05)
        fake_accuracy = 0.5 + global_step * 0.01 + np.random.normal(0, 0.02)

        epoch_loss += fake_loss

        logger.log_metrics(
            epoch=epoch,
            step=global_step,
            loss=fake_loss,
            accuracy=fake_accuracy,
            learning_rate=0.001 * (0.9 ** epoch),
        )

        # Print every third step to keep the cell output short.
        if not step % 3:
            print(f"Epoch {epoch+1}/{num_epochs}, Step {step+1}/{steps_per_epoch}: "
                  f"Loss = {fake_loss:.4f}, Acc = {fake_accuracy:.3f}")

    print(f"Epoch {epoch+1} complete. Average loss: {epoch_loss/steps_per_epoch:.4f}")

print("\nTraining simulation complete!")
print("Metrics logged to TensorBoard and research logger.")

## 6. Research Configuration Management

Learn how to manage experiment configurations.

In [None]:
import yaml
from pathlib import Path

# Experiment configuration assembled from the parameters used in this notebook.
config = {
    'experiment': {
        'name': 'notebook_experiment_demo',
        'description': 'Demonstration experiment from Jupyter notebook',
        'tags': ['demo', 'notebook', 'research']
    },
    'model': {
        'hidden_size': hidden_size,
        'num_heads': num_heads,
        'max_seq_len': seq_len,
        'dropout': 0.1
    },
    'attention': {
        'type': 'adaattn',
        'enable_gpu_optimization': torch.cuda.is_available(),
        'rank_adaptation': {
            'enabled': True,
            'rank_ratio': 0.5,
            'entropy_threshold': 0.5
        },
        'precision_adaptation': {
            'enabled': True,
            'policy': 'balanced'
        }
    },
    'logging': {
        'level': 'INFO',
        'log_attention_stats': True,
        'log_gpu_memory': True
    }
}

# Save configuration.
# parents=True: also create missing parent directories instead of raising
# FileNotFoundError when /workspace does not exist yet.
config_dir = Path('/workspace/configs')
config_dir.mkdir(parents=True, exist_ok=True)

# Serialize once so the saved file and the printed preview cannot diverge.
config_yaml = yaml.dump(config, indent=2)

config_file = config_dir / 'notebook_demo.yaml'
with open(config_file, 'w', encoding='utf-8') as f:
    f.write(config_yaml)

print(f"Configuration saved to: {config_file}")
print("\nConfiguration content:")
print(config_yaml)

## 7. Next Steps

This notebook showed you the basics of AdaAttn research. Here's what you can do next:

1. **Use the CLI tool**: Run `python scripts/research_cli.py` for interactive research
2. **Docker Environment**: Use `docker-compose up` for full research setup
3. **TensorBoard**: Visit http://localhost:6006 to see logged metrics
4. **Custom Experiments**: Create your own configurations and experiments
5. **Real Training**: Apply AdaAttn to your own models and datasets

### Research Tips:

- Use GPU when available for best performance
- Monitor adaptive behavior across different input patterns
- Compare against baseline attention for your specific tasks
- Log everything - TensorBoard integration makes analysis easy
- Use the real-time monitoring for long training runs

### Documentation:

- Check `docs/` for detailed documentation
- See `examples/` for more advanced usage patterns
- Use `configs/` to manage experiment configurations

In [None]:
# Cleanup: release the research logger created in the adaptive-behavior cell.
if logger:
    logger.cleanup()

# The original message began with mis-encoded bytes (rendered as replacement
# characters); they are dropped so the output is clean text.
print("Research quickstart complete!")
print("Ready to start your adaptive attention research!")