In [1]:
# !pip install torch transformers numpy psutil

In [2]:
# !pip install 'accelerate>=0.26.0'

In [3]:
!nvidia-smi

Sun Dec 15 20:14:40 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro RTX 6000     Off  | 00000000:AF:00.0 Off |                    0 |
| N/A   26C    P8    25W / 250W |      0MiB / 22698MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [8]:
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
[33mDEPRECATION: celery 4.1.0 has a non-standard dependency specifier pytz>dev. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of celery or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mInstalling collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [9]:
import os
from dotenv import load_dotenv

# Load HF token from .env
load_dotenv()
hf_token = os.getenv('HUGGINGFACE_TOKEN')

# Login to HuggingFace
from huggingface_hub import login
login(token=hf_token)

In [10]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import time
import json
import psutil
import numpy as np
from dataclasses import dataclass, asdict, field
import logging
from pathlib import Path
from typing import List, Dict, Optional
import gc
from contextlib import contextmanager
import os
import statistics
import threading

os.environ["TOKENIZERS_PARALLELISM"] = "false"

@dataclass
class HardwareConfig:
    gpu_mem_total: int = 22698  # RTX 6000 memory in MB
    cpu_count: int = 8
    cpu_mem_total: int = 1024 * 1024  # Keep as is unless you want to adjust

@dataclass
class BenchmarkConfig:
    model_name: str = "meta-llama/Llama-3.2-1B"
    context_lengths: List[int] = field(default_factory=lambda: [512, 1024, 2048])
    output_lengths: List[int] = field(default_factory=lambda: [32, 64])
    batch_size: int = 1
    num_runs: int = 3
    warmup_runs: int = 1
    output_dir: str = "benchmark_results"
    hardware: HardwareConfig = field(default_factory=HardwareConfig)
    decode_strategy: str = "gpu"

# Add memory optimization to model loading
def load_model(self):
    """Load model to GPU with memory optimizations"""
    self.logger.info(f"Loading model: {self.config.model_name}")
    
    try:
        # Use mixed precision
        self.model = AutoModelForCausalLM.from_pretrained(
            self.config.model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True
        )
        
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config.model_name,
        )
        
        model_size = sum(p.numel() for p in self.model.parameters()) * 2 / (1024**3)
        self.logger.info(f"Model size: {model_size:.2f} GB")
        
    except Exception as e:
        self.logger.error(f"Failed to load model: {str(e)}")
        raise

class ResourceMonitor:
    def __init__(self, sampling_rate: float = 0.1):
        self.sampling_rate = sampling_rate
        self.cpu_percentages = []
        self._stop_monitoring = False
    
    @staticmethod
    def get_gpu_memory_usage():
        if torch.cuda.is_available():
            return {
                'allocated': torch.cuda.memory_allocated() / 1024**2,
                'reserved': torch.cuda.memory_reserved() / 1024**2,
                'max_allocated': torch.cuda.max_memory_allocated() / 1024**2
            }
        return {}

    @staticmethod
    def get_cpu_memory_usage():
        vm = psutil.virtual_memory()
        return {
            'total': vm.total / 1024**2,
            'available': vm.available / 1024**2,
            'used': vm.used / 1024**2,
            'cached': getattr(vm, 'cached', 0) / 1024**2
        }
    
    def _monitor_cpu(self):
        while not self._stop_monitoring:
            self.cpu_percentages.append(psutil.cpu_percent(percpu=True))
            time.sleep(self.sampling_rate)
    
    @contextmanager
    def track_resources(self):
        """Enhanced resource tracking with CPU utilization"""
        torch.cuda.reset_peak_memory_stats()
        self.cpu_percentages = []
        self._stop_monitoring = False
        resources = {}
        
        monitor_thread = threading.Thread(target=self._monitor_cpu)
        monitor_thread.daemon = True
        monitor_thread.start()
        
        try:
            yield resources
        finally:
            self._stop_monitoring = True
            monitor_thread.join(timeout=1.0)
            
            gpu_mem = self.get_gpu_memory_usage()
            cpu_mem = self.get_cpu_memory_usage()
            
            if self.cpu_percentages:
                cpu_stats = {
                    'mean_per_core': [statistics.mean(core_vals) for core_vals in zip(*self.cpu_percentages)],
                    'max_per_core': [max(core_vals) for core_vals in zip(*self.cpu_percentages)],
                    'overall_mean': statistics.mean([sum(vals)/len(vals) for vals in self.cpu_percentages])
                }
            else:
                cpu_stats = {
                    'mean_per_core': [0],
                    'max_per_core': [0],
                    'overall_mean': 0
                }
            
            resources.update({
                'gpu': gpu_mem,
                'cpu': cpu_mem,
                'cpu_utilization': cpu_stats
            })

class ModelBenchmark:
    def __init__(self, config: BenchmarkConfig):
        self.config = config
        self.logger = self._setup_logging()
        self._setup_directories()
        self.monitor = ResourceMonitor()
        self.kv_cache = None        
        self.current_sequence_length = 0
        self.initial_sequence_length = 0
    
    def _setup_logging(self):
        """Setup logging configuration"""
        logger = logging.getLogger(__name__)
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )
        return logger

    def _setup_directories(self):
        """Create necessary directories"""
        Path(self.config.output_dir).mkdir(exist_ok=True)
        
    def load_model(self):
        """Load model to GPU initially"""
        self.logger.info(f"Loading model: {self.config.model_name}")
        
        try:
            # Always load to GPU first
            self.model = AutoModelForCausalLM.from_pretrained(
                self.config.model_name,
                torch_dtype=torch.bfloat16,
                device_map="auto"  # Start with everything on GPU
            )
            
            # Debug: Log model architecture
            self.logger.info("Model architecture:")
            for name, module in self.model.named_children():
                self.logger.info(f"- {name}: {type(module)}")
            
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.config.model_name,
            )
            
            model_size = sum(p.numel() for p in self.model.parameters()) * 2 / (1024**3)
            self.logger.info(f"Model size: {model_size:.2f} GB")
            
        except Exception as e:
            self.logger.error(f"Failed to load model: {str(e)}")
            raise

    def _log_state(self, phase: str):
        """Log current state for debugging"""
        self.logger.info(f"\n=== State during {phase} ===")
        if self.kv_cache is not None:
            # Log KV cache details
            kv_first_layer = self.kv_cache[0]
            k, v = kv_first_layer
        #     self.logger.info(f"KV cache first layer shapes: K={k.shape}, V={v.shape}")
        #     self.logger.info(f"KV cache device: {k.device}")
        # self.logger.info(f"Current sequence length: {self.current_sequence_length}")
        
    def prefill_phase(self, input_ids):
        """Run prefill phase on GPU"""
        # self.logger.info("Running prefill phase on GPU")
        
        with torch.no_grad():
            # Store initial sequence length
            self.initial_sequence_length = input_ids.shape[1]
            self.current_sequence_length = self.initial_sequence_length
            
            self._log_state("before prefill")
            
            outputs = self.model(
                input_ids,
                use_cache=True,
                return_dict=True
            )
            
            self.kv_cache = outputs.past_key_values
            next_token_logits = outputs.logits[:, -1, :]
            
            self._log_state("after prefill")
            
        return next_token_logits

    def prepare_decode_strategy(self):
        """Prepare model for decode phase based on strategy"""
        if self.config.decode_strategy == "cpu":
            # self.logger.info("Moving decoder to CPU for decode phase")
            
            # For OPT models, the decoder is at model.model.decoder
            if hasattr(self.model, 'model') and hasattr(self.model.model, 'decoder'):
                self.model.model.decoder.to("cpu")
                
                if self.kv_cache is not None:
                    # Move KV cache to CPU and log shapes before/after
                    self._log_state("before moving KV cache")
                    self.kv_cache = tuple(
                        tuple(t.to("cpu") for t in layer)
                        for layer in self.kv_cache
                    )
                    self._log_state("after moving KV cache")
            else:
                self.logger.warning("Could not find decoder in expected location")

    def decode_step(self, input_ids, attention_mask=None):
        """Single decode step using stored KV cache"""
        with torch.no_grad():
            device = "cpu" if self.config.decode_strategy == "cpu" else "cuda"
            
            # Ensure input_ids has the right shape
            if len(input_ids.shape) == 1:
                input_ids = input_ids.unsqueeze(0)
            if len(input_ids.shape) == 2 and input_ids.shape[1] != 1:
                input_ids = input_ids[:, -1:]
                
            input_ids = input_ids.to(device)
            
            # Get sequence length from KV cache
            k, v = self.kv_cache[0]  # First layer
            past_seq_len = k.shape[2]  # seq_len dimension in KV cache
            
            # Create attention mask including both past and current token
            batch_size = input_ids.shape[0]
            total_seq_len = past_seq_len + 1  # Include current token
            
            full_attention_mask = torch.ones(
                (batch_size, total_seq_len),
                dtype=torch.long,
                device=device
            )
            
            # self.logger.info(f"Attention mask shape: {full_attention_mask.shape}")
            # self.logger.info(f"Past sequence length: {past_seq_len}")
            # self.logger.info(f"Total sequence length: {total_seq_len}")
            
            self._log_state("before decode step")
            
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=full_attention_mask,
                use_cache=True,
                past_key_values=self.kv_cache,
                return_dict=True
            )
            
            # Update KV cache
            self.kv_cache = outputs.past_key_values
            self.current_sequence_length = total_seq_len
            
            self._log_state("after decode step")
            
            return outputs.logits[:, -1, :]

    def run_single_generation(self, tokens, output_length: int):
        """Run full generation with separate prefill and decode phases"""
        with self.monitor.track_resources() as resources:
            torch.cuda.synchronize()
            start_time = time.time()
            
            with torch.no_grad():
                # Reset sequence tracking
                self.current_sequence_length = 0
                
                # Prefill phase
                input_ids = tokens.input_ids
                next_token_logits = self.prefill_phase(input_ids)
                
                # Prepare decode strategy
                self.prepare_decode_strategy()
                
                # Generate tokens
                generated_tokens = []
                for i in range(output_length):
                    self.logger.info(f"Generating token {i+1}/{output_length}")
                    next_token = torch.argmax(next_token_logits, dim=-1)
                    generated_tokens.append(next_token)
                    
                    # Prepare input for next decode step
                    current_input = next_token.unsqueeze(0)
                    
                    # Decode step
                    next_token_logits = self.decode_step(current_input)
            
            torch.cuda.synchronize()
            end_time = time.time()
            
            # Move back to GPU for next run if needed
            if self.config.decode_strategy == "cpu":
                if hasattr(self.model, 'model') and hasattr(self.model.model, 'decoder'):
                    self.model.model.decoder.to("cuda")
        
        return {
            'time': end_time - start_time,
            'resources': resources,
            'output_length': len(generated_tokens)
        }
        

    def clean_memory(self):
        """Clean up GPU memory between runs"""
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()
        gc.collect()

    def run_benchmark(self):
        """Run the complete benchmark suite"""
        self.load_model()
        results = []
        
        # Create sample input text if no document provided
        sample_text = "This is a sample text for benchmarking. " * 100
        
        for context_length in self.config.context_lengths:
            for output_length in self.config.output_lengths:
                self.logger.info(f"\nTesting context length: {context_length}, output length: {output_length}")
                self.clean_memory()
                
                # Prepare input
                tokens = self.tokenizer(
                    sample_text,
                    truncation=True,
                    max_length=context_length,
                    return_tensors="pt"
                ).to("cuda")
                
                # Warmup
                self.logger.info("Performing warmup runs...")
                for _ in range(self.config.warmup_runs):
                    _ = self.run_single_generation(tokens, output_length)
                
                # Benchmark runs
                run_results = []
                for run in range(self.config.num_runs):
                    self.logger.info(f"Run {run + 1}/{self.config.num_runs}")
                    try:
                        result = self.run_single_generation(tokens, output_length)
                        run_results.append(result)
                        
                        self.logger.info(f"Generation time: {result['time']:.2f}s")
                        self.logger.info(f"Tokens per second: {output_length/result['time']:.2f}")
                        
                        cpu_util = result['resources']['cpu_utilization']['overall_mean']
                        self.logger.info(f"CPU utilization: {cpu_util:.1f}%")
                    except Exception as e:
                        self.logger.error(f"Error in run {run + 1}: {str(e)}")
                        continue
                
                if not run_results:
                    continue
                    
                # Aggregate results
                result = {
                    'context_length': context_length,
                    'output_length': output_length,
                    'avg_time': statistics.mean([r['time'] for r in run_results]),
                    'std_time': statistics.stdev([r['time'] for r in run_results]) if len(run_results) > 1 else 0,
                    'tokens_per_second': output_length / statistics.mean([r['time'] for r in run_results]),
                    'gpu_memory_peak': max([r['resources']['gpu']['max_allocated'] for r in run_results]),
                    'cpu_utilization': {
                        'mean': statistics.mean([r['resources']['cpu_utilization']['overall_mean'] for r in run_results]),
                        'peak': max([max(r['resources']['cpu_utilization']['max_per_core']) for r in run_results])
                    }
                }
                
                results.append(result)
        
        return results

In [11]:
# Initial test configuration
test_config = BenchmarkConfig(
    context_lengths=[256],  # Start small
    output_lengths=[16],    # Start small
    num_runs=1,
    warmup_runs=1
)

# Initialize and run benchmark
benchmark = ModelBenchmark(test_config)
results = benchmark.run_benchmark()

2024-12-15 20:17:31,074 - __main__ - INFO - Loading model: meta-llama/Llama-3.2-1B


config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

2024-12-15 20:18:41,435 - accelerate.utils.modeling - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

2024-12-15 20:18:43,089 - __main__ - INFO - Model architecture:
2024-12-15 20:18:43,090 - __main__ - INFO - - model: <class 'transformers.models.llama.modeling_llama.LlamaModel'>
2024-12-15 20:18:43,090 - __main__ - INFO - - lm_head: <class 'torch.nn.modules.linear.Linear'>


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

2024-12-15 20:18:44,447 - __main__ - INFO - Model size: 2.30 GB
2024-12-15 20:18:44,448 - __main__ - INFO - 
Testing context length: 256, output length: 16
2024-12-15 20:18:44,581 - __main__ - INFO - Performing warmup runs...
2024-12-15 20:18:44,583 - __main__ - INFO - 
=== State during before prefill ===
2024-12-15 20:18:45,118 - __main__ - INFO - 
=== State during after prefill ===
2024-12-15 20:18:45,119 - __main__ - INFO - Generating token 1/16
2024-12-15 20:18:45,120 - __main__ - INFO - 
=== State during before decode step ===
We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)
2024-12-15 20:18:45,237 - __main__ - INFO - 
=== State during after decode step ===
2024-12-15 20:18:45,238 - __main__ - INFO - Generating token 2/16
2024-12-15 20:18:45,239 - __main__ - INFO - 
=== State d

In [12]:
def analyze_results(results):
    """Analyze and print benchmark results"""
    print("\n=== Benchmark Results ===")
    
    # Group results by context length
    for result in results:
        print(f"\nContext Length: {result['context_length']}")
        print(f"Output Length: {result['output_length']}")
        print(f"Average Generation Time: {result['avg_time']:.2f}s ± {result['std_time']:.2f}s")
        print(f"Tokens per Second: {result['tokens_per_second']:.2f}")
        print("\nResource Usage:")
        print(f"Peak GPU Memory: {result['gpu_memory_peak']:.0f}MB")
        print(f"CPU Utilization:")
        print(f"  - Average: {result['cpu_utilization']['mean']:.1f}%")
        print(f"  - Peak: {result['cpu_utilization']['peak']:.1f}%")
        
    # Save results to file
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    results_file = f"benchmark_results_{timestamp}.json"
    with open(results_file, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"\nDetailed results saved to: {results_file}")

In [13]:
analyze_results(results)


=== Benchmark Results ===

Context Length: 256
Output Length: 16
Average Generation Time: 0.39s ± 0.00s
Tokens per Second: 40.91

Resource Usage:
Peak GPU Memory: 2448MB
CPU Utilization:
  - Average: 1.1%
  - Peak: 100.0%

Detailed results saved to: benchmark_results_20241215-201908.json


In [None]:
config_gpu = BenchmarkConfig(
    model_name="facebook/opt-1.3b",
    context_lengths=[1024],
    output_lengths=[50],
    num_runs=2,
    warmup_runs=1,
    decode_strategy="gpu"  # or "gpu"
)

benchmark_gpu = ModelBenchmark(config_gpu)
results_gpu = benchmark.run_benchmark()

In [8]:
analyze_results(results_gpu)


=== Benchmark Results ===

Context Length: 1024
Output Length: 50
Average Generation Time: 14.15s ± 0.14s
Tokens per Second: 3.53

Resource Usage:
Peak GPU Memory: 2798MB
CPU Utilization:
  - Average: 54.8%
  - Peak: 100.0%

Detailed results saved to: benchmark_results_20241113-182100.json


In [None]:
configs = [
    BenchmarkConfig(
        context_lengths=[512], 
        output_lengths=[32],
        decode_strategy="cpu"
    ),
    BenchmarkConfig(
        context_lengths=[512], 
        output_lengths=[32],
        decode_strategy="gpu"
    ),
    
    BenchmarkConfig(
        context_lengths=[2048], 
        output_lengths=[128],
        decode_strategy="cpu"
    ),
    BenchmarkConfig(
        context_lengths=[2048], 
        output_lengths=[128],
        decode_strategy="gpu"
    ),
]

# Run all configurations
all_results = []
for config in configs:
    benchmark = ModelBenchmark(config)
    results = benchmark.run_benchmark()
    all_results.append({
        'config': asdict(config),
        'results': results
    })

In [10]:
def compare_strategies(all_results):
    """Compare CPU vs GPU decode strategies"""
    print("\n=== Strategy Comparison ===")
    
    # Group by context and output lengths
    grouped_results = {}
    for run in all_results:
        config = run['config']
        results = run['results'][0]  # Take first result for each config
        
        key = (config['context_lengths'][0], config['output_lengths'][0])
        if key not in grouped_results:
            grouped_results[key] = {}
            
        grouped_results[key][config['decode_strategy']] = {
            'tokens_per_second': results['tokens_per_second'],
            'gpu_memory': results['gpu_memory_peak'],
            'cpu_util': results['cpu_utilization']['mean']
        }
    
    # Print comparisons
    for (ctx_len, out_len), strategies in grouped_results.items():
        print(f"\nContext Length: {ctx_len}, Output Length: {out_len}")
        cpu_stats = strategies.get('cpu', {})
        gpu_stats = strategies.get('gpu', {})
        
        if cpu_stats and gpu_stats:
            speedup = cpu_stats['tokens_per_second'] / gpu_stats['tokens_per_second']
            memory_saving = 1 - (cpu_stats['gpu_memory'] / gpu_stats['gpu_memory'])
            
            print(f"Speed Comparison (tokens/sec):")
            print(f"  CPU: {cpu_stats['tokens_per_second']:.2f}")
            print(f"  GPU: {gpu_stats['tokens_per_second']:.2f}")
            print(f"  Relative Speed: {speedup:.2f}x (>1 means CPU is faster)")
            
            print(f"\nMemory Usage (MB):")
            print(f"  CPU: {cpu_stats['gpu_memory']:.0f}")
            print(f"  GPU: {gpu_stats['gpu_memory']:.0f}")
            print(f"  Memory Savings: {memory_saving*100:.1f}%")
            
            print(f"\nCPU Utilization:")
            print(f"  CPU decode: {cpu_stats['cpu_util']:.1f}%")
            print(f"  GPU decode: {gpu_stats['cpu_util']:.1f}%")

compare_strategies(all_results)


=== Strategy Comparison ===

Context Length: 512, Output Length: 32
Speed Comparison (tokens/sec):
  CPU: 5.07
  GPU: 64.19
  Relative Speed: 0.08x (>1 means CPU is faster)

Memory Usage (MB):
  CPU: 2666
  GPU: 2770
  Memory Savings: 3.8%

CPU Utilization:
  CPU decode: 38.6%
  GPU decode: 35.3%

Context Length: 2048, Output Length: 128
Speed Comparison (tokens/sec):
  CPU: 3.66
  GPU: 65.15
  Relative Speed: 0.06x (>1 means CPU is faster)

Memory Usage (MB):
  CPU: 2797
  GPU: 2993
  Memory Savings: 6.5%

CPU Utilization:
  CPU decode: 40.3%
  GPU decode: 35.3%
