# Energy Measurement Harness - Kaggle Edition
## For Krishna: Complete Energy + Latency Measurement System

**Purpose:** Measure GPU power, energy, and latency for FP32, FP16, INT8 models

**Architecture:** 10-layer design from specifications
- Zero I/O during measurement
- Multi-trial support (5 trials for statistical confidence)
- PowerLogger for nvidia-smi
- CSV/JSON output

**Integration:** Uses Taara's pre-tokenized dataset, merges with accuracy results

## Setup and Imports

In [None]:
# Install dependencies if needed
# !pip install -q transformers datasets

import torch
import numpy as np
import pandas as pd
import json
import time
import subprocess
import threading
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass, asdict
from datetime import datetime
from transformers import AutoModelForSequenceClassification

# Report the environment this session will measure on: the PyTorch build and
# whether a CUDA device is visible.
print("‚úì Imports complete")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Only query the device name when a CUDA device is actually present.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## Layer 0: Config & Experiment Description

In [None]:
@dataclass
class ExperimentConfig:
    """Configuration for energy measurement experiments.

    One instance describes a single measurement setup: which model to load,
    how batches are formed, how many warmup and timed iterations to run, and
    how often GPU power is polled.
    """
    # Hugging Face model identifier handed to from_pretrained() by load_model().
    model_name: str = "distilbert-base-uncased-finetuned-sst-2-english"
    # Precision label. NOTE: run_experiments_for_precision() receives the
    # precision as its own argument and does not read this field.
    precision: str = "fp32"
    
    # ADJUSTED PARAMETERS for better sample usage
    batch_size: int = 16  # Reduced from 32 to extend data coverage
    # Copied into each trial's metrics; the loaders in this file do not use it
    # to reshape or truncate the data.
    seq_len: int = 128
    num_loops: int = 500  # Reduced from 1000 to 500 (timed iterations per trial)
    warmup_loops: int = 50  # Reduced from 100 to 50 (untimed iterations per trial)
    
    # LOCAL PATH (for running locally)
    # Current dataset has 100 samples
    dataset_path: str = r"c:\Users\taara\UPENN JR FALL\ESE 5390\energy_aware_quantization\datasets\tokenized_data_large"
    
    # KAGGLE PATH (uncomment when running on Kaggle)
    # dataset_path: str = "/kaggle/working/tokenized_data"
    
    # Device string passed to the dataset and model loaders.
    device: str = "cuda"
    # NOTE: run_experiments_for_precision() takes the trial count from its
    # own num_trials argument, not from this field.
    num_trials: int = 5
    # Polling period handed to PowerLogger, in milliseconds.
    poll_interval_ms: int = 100

# Create default config
config = ExperimentConfig()

print("Configuration:")
print("-" * 60)
for key, value in asdict(config).items():
    print(f"  {key:20s}: {value}")
print("-" * 60)

# Calculate sample statistics
total_samples_needed = config.num_loops * config.batch_size
print(f"\nüìä Sample Usage Analysis:")
print(f"  Total inferences per trial: {total_samples_needed:,}")
print(f"  Dataset has ~100 samples")
print(f"  Each sample reused: ~{total_samples_needed // 100} times")
print(f"\n  ‚ö†Ô∏è  NOTE: For more unique samples, run create_large_dataset.py")
print(f"  ‚úì  This generates 500-872 samples to reduce reuse to ~16-32x")

## Layer 1: Dataset & Model Loading (Zero I/O)

In [None]:
def load_pre_tokenized_dataset(dataset_path: str, device: str):
    """Load the pre-tokenized dataset and move every tensor to ``device``.

    All data is placed on the target device up front so that the timed
    measurement loops perform no disk or host-to-device transfers.

    Args:
        dataset_path: Directory containing ``input_ids.pt``,
            ``attention_mask.pt`` and ``labels.pt``.
        device: Torch device string the tensors are moved to.

    Returns:
        Tuple ``(input_ids, attention_mask, labels)`` of tensors on ``device``.

    Raises:
        FileNotFoundError: If any of the three tensor files is missing.
        ValueError: If the three tensors do not hold the same number of
            samples along their first dimension.
    """
    data_path = Path(dataset_path)

    print(f"\nLoading dataset from {dataset_path}...")

    # Check every required file, not just input_ids.pt, so a partially
    # copied dataset fails here with an actionable message.
    required = ('input_ids.pt', 'attention_mask.pt', 'labels.pt')
    missing = [name for name in required if not (data_path / name).exists()]
    if missing:
        looking_for = ', '.join(str(data_path / name) for name in missing)
        raise FileNotFoundError(
            f"Dataset not found at {dataset_path}\n"
            f"Looking for: {looking_for}\n"
            f"Please ensure:\n"
            f"  1. You've created the tokenized dataset\n"
            f"  2. The path is correct (use local path for local runs, Kaggle path for Kaggle)\n"
            f"  3. Files exist: input_ids.pt, attention_mask.pt, labels.pt"
        )

    # Deserialize onto the CPU first so files saved from a CUDA session can
    # still be opened on a host without that device, then move to `device`.
    input_ids, attention_mask, labels = (
        torch.load(data_path / name, map_location="cpu").to(device)
        for name in required
    )

    sample_counts = {input_ids.size(0), attention_mask.size(0), labels.size(0)}
    if len(sample_counts) != 1:
        raise ValueError(
            "Dataset tensors disagree on sample count: "
            f"input_ids={input_ids.size(0)}, "
            f"attention_mask={attention_mask.size(0)}, "
            f"labels={labels.size(0)}"
        )

    print(f"‚úì Loaded {len(labels)} samples to {device}")
    print(f"  - Input shape: {input_ids.shape}")
    print(f"  - Zero I/O during measurement ‚úì")

    return input_ids, attention_mask, labels


def batched_iterator(input_ids, attention_mask, batch_size: int):
    """Return an endless iterator of ``(input_ids, attention_mask)`` batches.

    Batches are contiguous slices along the first dimension. When fewer
    than ``batch_size`` samples remain, the leftover samples are skipped and
    iteration restarts from the first sample, so every batch has exactly
    ``batch_size`` rows and no I/O ever happens during iteration.

    Args:
        input_ids: Tensor whose first dimension indexes samples.
        attention_mask: Tensor sliced with the same row ranges as
            ``input_ids``.
        batch_size: Number of samples per batch.

    Returns:
        A generator that never terminates.

    Raises:
        ValueError: If ``batch_size`` is not positive or exceeds the number
            of samples (no full batch could ever be produced, which would
            otherwise make the iterator spin forever without yielding).
    """
    num_samples = input_ids.size(0)

    # Validate eagerly (before the generator is created) so a bad setting
    # fails at the call site instead of hanging on the first next().
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if batch_size > num_samples:
        raise ValueError(
            f"batch_size ({batch_size}) exceeds the number of samples "
            f"({num_samples}); cannot form a single full batch"
        )

    def _cycle():
        start = 0
        while True:
            stop = start + batch_size
            if stop > num_samples:
                # Wraparound: reuse samples from beginning
                start, stop = 0, batch_size
            yield input_ids[start:stop], attention_mask[start:stop]
            start = stop

    return _cycle()


def load_model(precision: str, model_name: str, device: str):
    """Load the sequence-classification model in the requested precision.

    Args:
        precision: One of ``"fp32"``, ``"fp16"`` or ``"int8"``
            (case-insensitive). ``"fp16"`` converts the weights with
            ``model.half()``. ``"int8"`` is not implemented yet and returns
            the unmodified FP32 model as a placeholder.
        model_name: Identifier passed to ``from_pretrained``.
        device: Torch device string the model is moved to.

    Returns:
        The model in eval mode on ``device``.

    Raises:
        ValueError: If ``precision`` is not one of the supported labels.
            Without this check a typo would silently produce FP32 numbers
            recorded under the wrong precision label.
    """
    supported = ("fp32", "fp16", "int8")
    normalized = precision.lower()
    if normalized not in supported:
        raise ValueError(
            f"Unsupported precision {precision!r}; expected one of {supported}"
        )

    print(f"\nLoading {precision.upper()} model...")

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    model.to(device)
    model.eval()

    if normalized == "fp16":
        model = model.half()
        print("‚úì Converted to FP16")
    elif normalized == "int8":
        # Deliberate placeholder: measurements taken here are FP32 numbers.
        print("‚ö† INT8 quantization not implemented yet")
        print("  Using FP32 as placeholder (Thomas will implement)")

    print(f"‚úì Model loaded on {device}")
    param_count = sum(p.numel() for p in model.parameters())
    print(f"  - Parameters: {param_count:,} ({param_count/1e6:.1f}M)")

    return model


print("‚úì Data/model loading functions defined")

## Layer 2: Warmup Phase

In [None]:
def warmup(model, batch_iter, num_iters: int):
    """Run untimed forward passes so GPU clocks settle before measurement.

    Args:
        model: Callable invoked as ``model(input_ids, attention_mask=...)``.
        batch_iter: Iterator yielding ``(input_ids, attention_mask)`` pairs.
        num_iters: Number of forward passes to execute.
    """
    print(f"\nWarming up with {num_iters} iterations...")

    with torch.no_grad():
        for step in range(1, num_iters + 1):
            input_ids, attention_mask = next(batch_iter)
            model(input_ids, attention_mask=attention_mask)

            # Progress line every 10 iterations.
            if step % 10 == 0:
                print(f"  Warmup: {step}/{num_iters}")

    # Wait for queued kernels to finish. Skipped when no CUDA device exists
    # so the harness can be dry-run on a CPU-only machine.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    print("‚úì Warmup complete - GPU stabilized")


print("‚úì Warmup function defined")

## Layer 3: Timed Inference Loop

In [None]:
def run_inference_loop(model, batch_iter, num_loops: int) -> float:
    """Time ``num_loops`` forward passes and return the elapsed seconds.

    The device is synchronized before the clock starts and again before it
    stops, so asynchronously queued GPU work is fully included in the
    measured interval.

    Args:
        model: Callable invoked as ``model(input_ids, attention_mask=...)``.
        batch_iter: Iterator yielding ``(input_ids, attention_mask)`` pairs.
        num_loops: Number of forward passes to time.

    Returns:
        Wall-clock duration of the loop in seconds.
    """
    # Synchronization is skipped when no CUDA device exists so the harness
    # can be dry-run on a CPU-only machine.
    use_cuda = torch.cuda.is_available()

    if use_cuda:
        torch.cuda.synchronize()
    start = time.perf_counter()

    with torch.no_grad():
        for _ in range(num_loops):
            input_ids, attention_mask = next(batch_iter)
            model(input_ids, attention_mask=attention_mask)

    if use_cuda:
        torch.cuda.synchronize()
    return time.perf_counter() - start


print("‚úì Inference loop function defined")

## Layer 4: Power Logger (CRITICAL - Krishna Must Test This)

In [None]:
class PowerLogger:
    """GPU power monitoring using nvidia-smi.

    ``start()`` launches ``nvidia-smi`` in looping query mode and a daemon
    thread that parses each output line as a power reading in watts.
    ``stop()`` terminates the process, releases its pipes and returns the
    readings collected since the matching ``start()``.
    """

    def __init__(self, gpu_id: int = 0, poll_interval_ms: int = 100):
        self.gpu_id = gpu_id
        self.poll_interval_ms = poll_interval_ms
        self.proc = None
        self.samples = []
        self.thread = None
        self.stop_flag = False

    def _build_command(self) -> List[str]:
        """Return the nvidia-smi command line used for polling."""
        # CRITICAL: Test which command works on Kaggle
        # Option 1: Millisecond interval (if supported)
        return [
            'nvidia-smi',
            '--query-gpu=power.draw',
            '--format=csv,noheader,nounits',
            f'--id={self.gpu_id}',
            '-lms', str(self.poll_interval_ms),
        ]
        # Option 2: Second interval (if -lms doesn't work): replace the last
        # entry pair with '-l', '1' (1 second interval).

    def start(self):
        """Start power monitoring.

        Any run still in progress is stopped first, and the sample buffer is
        cleared so a reused logger never mixes readings from separate runs.

        Raises:
            Exception: Whatever ``subprocess.Popen`` raises (for example
                ``FileNotFoundError`` when nvidia-smi is not installed) is
                reported and re-raised.
        """
        if self.proc is not None:
            self.stop()

        print(f"\n[PowerLogger] Starting (poll: {self.poll_interval_ms}ms)...")

        self.samples = []
        self.stop_flag = False

        try:
            self.proc = subprocess.Popen(
                self._build_command(),
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,
                bufsize=1,
            )

            self.thread = threading.Thread(target=self._collect_samples)
            self.thread.daemon = True
            self.thread.start()

            print("[PowerLogger] ‚úì Started")

        except Exception as e:
            print(f"[PowerLogger] ‚úó Failed: {e}")
            raise

    def _collect_samples(self):
        """Collect power samples in background until EOF or stop is requested."""
        proc = self.proc
        if proc is None or proc.stdout is None:
            return
        # Bind this run's buffer locally so a late line can never land in the
        # buffer of a subsequent run.
        samples = self.samples
        try:
            # Read until EOF rather than until the process exits, so lines
            # still buffered in the pipe after exit are not lost.
            for line in iter(proc.stdout.readline, ''):
                if self.stop_flag:
                    break
                try:
                    samples.append(float(line.strip()))
                except ValueError:
                    # Non-numeric output is skipped on purpose (best effort).
                    continue
        except (ValueError, OSError):
            # The pipe was closed underneath us by stop(); nothing left to read.
            return

    @staticmethod
    def _release_pipes(proc) -> str:
        """Close the process pipes and return whatever it wrote to stderr."""
        stderr_text = ""
        if proc.stderr is not None:
            try:
                stderr_text = proc.stderr.read().strip()
            except (ValueError, OSError):
                stderr_text = ""
        for pipe in (proc.stdout, proc.stderr):
            if pipe is not None:
                try:
                    pipe.close()
                except OSError:
                    pass
        return stderr_text

    def stop(self) -> List[float]:
        """Stop monitoring and return a copy of the collected samples.

        Safe to call when the logger was never started; in that case the
        returned list is empty.
        """
        print(f"[PowerLogger] Stopping...")

        self.stop_flag = True
        proc, self.proc = self.proc, None
        thread, self.thread = self.thread, None
        stderr_text = ""

        if proc:
            if proc.poll() is None:
                proc.terminate()
            try:
                proc.wait(timeout=2)
            except subprocess.TimeoutExpired:
                proc.kill()
                proc.wait()

        if thread:
            thread.join(timeout=2)

        if proc:
            # Close the pipes only after the reader thread had its chance to
            # drain them; otherwise each run would leak two descriptors.
            stderr_text = self._release_pipes(proc)

        print(f"[PowerLogger] ‚úì Stopped - {len(self.samples)} samples")

        if len(self.samples) == 0:
            print("[PowerLogger] ‚ö† WARNING: No samples collected!")
            print("  Krishna: Test nvidia-smi command manually")
            if stderr_text:
                print(f"  nvidia-smi stderr: {stderr_text}")

        return self.samples.copy()


print("‚úì PowerLogger class defined")
print("‚ö† Krishna: TEST THIS FIRST before running experiments!")

## TEST POWERLOGGER FIRST! (Run this cell before any experiment, right after the imports and PowerLogger cells)

In [None]:
# CRITICAL TEST - Run this first!
print("Testing PowerLogger...")
print("This will run for 5 seconds")
print("="*60)

logger = PowerLogger(gpu_id=0, poll_interval_ms=100)
logger.start()
time.sleep(5)
samples = logger.stop()

print("="*60)
print(f"\nTest Results:")
print(f"  Samples collected: {len(samples)}")

if len(samples) > 0:
    print(f"  Sample values: {samples[:5]}")
    print(f"  Mean power: {np.mean(samples):.2f} W")
    print("\n‚úì PowerLogger WORKS! You can proceed.")
else:
    print("\n‚úó PowerLogger returned 0 samples!")
    print("\nDebugging steps:")
    print("1. Test nvidia-smi manually:")
    print("   !timeout 10 nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits -l 1")
    print("2. Try different interval formats:")
    print("   -lms 100  (milliseconds)")
    print("   -l 1      (seconds)")
    print("3. Update PowerLogger.start() command based on what works")
    print("\n‚ö† DO NOT PROCEED until this works!")

## Layer 5: Energy & Latency Computation

In [None]:
def compute_energy_metrics(power_samples: List[float], total_time: float, num_inferences: int) -> Dict:
    """Compute energy and latency metrics.

    Energy is estimated as mean sampled power multiplied by the wall-clock
    duration of the timed loop.

    Args:
        power_samples: Power readings in watts taken during the run.
        total_time: Duration of the timed loop in seconds.
        num_inferences: Number of samples processed during the timed loop.

    Returns:
        Dict with average/std power, total and per-inference energy,
        per-sample latency, throughput, and the raw inputs.

    Raises:
        ValueError: If there are no power samples, or if ``total_time`` or
            ``num_inferences`` is not positive (the ratios would be
            undefined).
    """
    if len(power_samples) == 0:
        raise ValueError("No power samples - cannot compute energy")
    if total_time <= 0:
        raise ValueError(f"total_time must be positive, got {total_time}")
    if num_inferences <= 0:
        raise ValueError(f"num_inferences must be positive, got {num_inferences}")

    avg_power = float(np.mean(power_samples))
    std_power = float(np.std(power_samples))
    energy_total = avg_power * total_time  # W * s = J
    energy_per_inference = energy_total / num_inferences
    latency_per_sample = total_time / num_inferences
    throughput = num_inferences / total_time

    return {
        "avg_power_w": avg_power,
        "std_power_w": std_power,
        "energy_total_j": energy_total,
        "energy_per_inference_j": energy_per_inference,
        "energy_per_inference_mj": energy_per_inference * 1000,
        "latency_per_sample_s": latency_per_sample,
        "latency_per_sample_ms": latency_per_sample * 1000,
        "throughput_samples_s": throughput,
        "total_time_s": total_time,
        "num_inferences": num_inferences,
        "num_power_samples": len(power_samples),
    }


print("‚úì Energy computation function defined")

## Layer 6: Measure with Power

In [None]:
def measure_with_power(model, batch_iter, num_loops: int, logger: PowerLogger):
    """Run the timed inference loop while the power logger is sampling.

    Args:
        model: Model passed through to ``run_inference_loop``.
        batch_iter: Iterator yielding ``(input_ids, attention_mask)`` pairs.
        num_loops: Number of timed forward passes.
        logger: Power logger; it is started here and always stopped before
            this function returns or raises.

    Returns:
        Tuple ``(total_time, power_samples)``: the loop duration in seconds
        and the power readings taken while the loop was running. If no
        reading arrived during the loop (very short runs), all readings
        since the logger started are returned instead.
    """
    logger.start()
    try:
        time.sleep(0.5)  # Let logger stabilize

        # Readings taken during the stabilization sleep reflect an idle GPU;
        # remember how many there are so they can be excluded below.
        idle_count = len(getattr(logger, 'samples', ()))

        total_time = run_inference_loop(model, batch_iter, num_loops)
    finally:
        # Always stop the logger, otherwise a failed inference run would
        # leave the nvidia-smi process running in the background.
        all_samples = logger.stop()

    power_samples = all_samples[idle_count:] or all_samples

    return total_time, power_samples


print("‚úì Measurement function defined")

## Layer 6.5: Per-Layer Energy Measurement

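This section enables measuring energy consumption for each layer of DistilBERT individually. Note: the first cell below defines the multi-trial runner (`run_experiments_for_precision`); the per-layer profiler (`LayerEnergyProfiler`) is defined in the cell after it.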

In [None]:
def run_experiments_for_precision(config, precision: str, num_trials: int = 5):
    """Execute repeated measurement trials for a single precision.

    The dataset is loaded once. Every trial then loads a fresh model, warms
    it up, measures a timed loop under power logging, and records the
    resulting metrics. A trial that raises is reported and skipped.

    Args:
        config: Experiment settings (dataset path, device, batch size, loop
            counts, polling interval, model name, sequence length).
        precision: Precision label forwarded to ``load_model`` and stored
            with each result.
        num_trials: How many trials to attempt.

    Returns:
        List with one metrics dict per successful trial.
    """
    banner = "="*70
    rule = '‚îÄ'*70

    print("\n" + banner)
    print(f"RUNNING EXPERIMENTS: {precision.upper()}")
    print(banner)

    # The tensors stay resident on the device for all trials.
    input_ids, attention_mask, _labels = load_pre_tokenized_dataset(
        config.dataset_path, config.device
    )

    collected = []

    for trial_idx in range(num_trials):
        trial_no = trial_idx + 1

        print(f"\n{rule}")
        print(f"Trial {trial_no}/{num_trials}")
        print(f"{rule}")

        # Warmup consumes its own iterator so the timed loop below starts
        # from the first batch again.
        warm_batches = batched_iterator(input_ids, attention_mask, config.batch_size)
        model = load_model(precision, config.model_name, config.device)
        warmup(model, warm_batches, config.warmup_loops)

        timed_batches = batched_iterator(input_ids, attention_mask, config.batch_size)

        print(f"\nRunning {config.num_loops} measurement iterations...")
        logger = PowerLogger(gpu_id=0, poll_interval_ms=config.poll_interval_ms)

        try:
            total_time, power_samples = measure_with_power(
                model, timed_batches, config.num_loops, logger
            )

            # One inference == one sample, so a loop contributes batch_size.
            metrics = compute_energy_metrics(
                power_samples, total_time, config.num_loops * config.batch_size
            )
            metrics.update(
                precision=precision,
                trial=trial_idx,
                batch_size=config.batch_size,
                seq_len=config.seq_len,
            )
            collected.append(metrics)

            print(f"\n{rule}")
            print(f"Trial {trial_no} Results:")
            print(f"{rule}")
            print(f"  Latency:    {metrics['latency_per_sample_ms']:.3f} ms")
            print(f"  Throughput: {metrics['throughput_samples_s']:.2f} samples/s")
            print(f"  Avg Power:  {metrics['avg_power_w']:.2f} W")
            print(f"  Energy:     {metrics['energy_per_inference_mj']:.3f} mJ")

        except Exception as e:
            print(f"\n‚úó Trial {trial_no} failed: {e}")
            continue

        finally:
            # Release the model before the next trial loads a new one.
            del model
            torch.cuda.empty_cache()

    return collected


print("‚úì Multi-trial runner defined")

In [None]:
class LayerEnergyProfiler:
    """
    Estimates energy and latency per layer of a DistilBERT classifier.

    DistilBERT Architecture:
    - Embeddings (word + position)
    - 6 Transformer blocks (each with attention + FFN)
    - Pre-classifier
    - Classifier head

    A layer cannot be fed without running everything in front of it, so each
    individual measurement executes the model from the raw inputs up to and
    including the target layer (a "prefix" run). ``profile_all_layers``
    turns consecutive prefix measurements into per-layer values by
    subtracting the previous prefix from the current one.
    """

    def __init__(self, model, device='cuda'):
        self.model = model
        self.device = device
        self.layer_names = []
        self.layer_modules = []

        # Register DistilBERT layers
        self._register_layers()

    def _register_layers(self):
        """Register all major layers for profiling."""
        # Embedding layer
        if hasattr(self.model, 'distilbert'):
            self.layer_names.append('embeddings')
            self.layer_modules.append(self.model.distilbert.embeddings)

            # Transformer blocks (however many the model actually has)
            for i, layer in enumerate(self.model.distilbert.transformer.layer):
                self.layer_names.append(f'transformer_block_{i}')
                self.layer_modules.append(layer)

        # Classification head
        if hasattr(self.model, 'pre_classifier'):
            self.layer_names.append('pre_classifier')
            self.layer_modules.append(self.model.pre_classifier)

        if hasattr(self.model, 'classifier'):
            self.layer_names.append('classifier')
            self.layer_modules.append(self.model.classifier)

        print(f"‚úì Registered {len(self.layer_names)} layers for profiling")
        for name in self.layer_names:
            print(f"  - {name}")

    def _sync(self):
        """Wait for queued GPU work; a no-op when not running on CUDA."""
        if torch.cuda.is_available() and str(self.device).startswith('cuda'):
            torch.cuda.synchronize()

    def _forward_through(self, layer_idx: int, input_ids, attention_mask):
        """Run the model from the raw inputs up to and including one layer.

        The target is selected by its registered name, so the logic does not
        depend on the model having a particular number of transformer blocks.
        """
        name = self.layer_names[layer_idx]
        backbone = self.model.distilbert

        if name == 'embeddings':
            return backbone.embeddings(input_ids)

        if name.startswith('transformer_block_'):
            last_block = int(name.rsplit('_', 1)[1])
            hidden_states = backbone.embeddings(input_ids)
            # NOTE(review): the mask is handed to each block positionally,
            # as in the original code; confirm the expected mask format for
            # the installed transformers version.
            for i in range(last_block + 1):
                hidden_states = backbone.transformer.layer[i](
                    hidden_states, attention_mask
                )[0]
            return hidden_states

        # Classification head: full backbone, first-token pooling, then the
        # head layers up to the requested one.
        hidden_state = backbone(input_ids, attention_mask=attention_mask)[0]
        pooled_output = self.model.pre_classifier(hidden_state[:, 0])
        if name == 'pre_classifier':
            return pooled_output
        return self.model.classifier(pooled_output)

    def run_layer_inference(self, input_ids, attention_mask, layer_idx: int) -> float:
        """
        Run one forward pass up to and including a layer and time it.

        Uses the same prefix computation as ``measure_layer_energy`` so both
        methods time identical work for a given ``layer_idx``.

        Returns: Elapsed time in seconds
        """
        self._sync()
        start = time.perf_counter()

        with torch.no_grad():
            self._forward_through(layer_idx, input_ids, attention_mask)

        self._sync()
        return time.perf_counter() - start

    def measure_layer_energy(
        self,
        batch_iter,
        layer_idx: int,
        num_loops: int = 100,
        logger: Optional["PowerLogger"] = None
    ) -> Dict:
        """
        Measure energy and latency of the prefix ending at one layer.

        The timed work is everything from the raw inputs up to and including
        the layer at ``layer_idx``; the figures are therefore cumulative.
        "Per inference" here means per loop iteration, i.e. per batch.

        Args:
            batch_iter: Iterator providing batches
            layer_idx: Index of the last layer to execute
            num_loops: Number of inference loops
            logger: PowerLogger instance (a new one is created when omitted)

        Returns:
            Dict with energy metrics for this prefix. When no power sample
            was collected the power and energy fields are 0.0.

        Raises:
            ValueError: If ``num_loops`` is not positive.
        """
        if num_loops <= 0:
            raise ValueError(f"num_loops must be positive, got {num_loops}")

        layer_name = self.layer_names[layer_idx]
        print(f"\n  Measuring layer: {layer_name}")

        if logger is None:
            logger = PowerLogger(gpu_id=0, poll_interval_ms=100)

        # Start power logging
        logger.start()
        try:
            time.sleep(0.3)  # Brief stabilization

            # Readings gathered so far were taken on an idle GPU; remember
            # how many there are so they can be excluded afterwards.
            idle_count = len(getattr(logger, 'samples', ()))

            self._sync()
            start_time = time.perf_counter()

            with torch.no_grad():
                for _ in range(num_loops):
                    input_ids, attention_mask = next(batch_iter)
                    self._forward_through(layer_idx, input_ids, attention_mask)

            self._sync()
            total_time = time.perf_counter() - start_time
        finally:
            # Always stop the logger so a failed forward pass cannot leave
            # the nvidia-smi process running.
            all_samples = logger.stop()

        # Fall back to every reading when the loop was too short to be sampled.
        power_samples = all_samples[idle_count:] or all_samples

        # Compute metrics
        if len(power_samples) > 0:
            avg_power = float(np.mean(power_samples))
            energy_total = avg_power * total_time
            energy_per_inference = energy_total / num_loops
        else:
            avg_power = 0.0
            energy_total = 0.0
            energy_per_inference = 0.0

        return {
            'layer_name': layer_name,
            'layer_idx': layer_idx,
            'avg_power_w': avg_power,
            'total_time_s': total_time,
            'energy_total_j': energy_total,
            'energy_per_inference_mj': energy_per_inference * 1000,
            'num_loops': num_loops,
            'latency_per_loop_ms': (total_time / num_loops) * 1000,
            'num_power_samples': len(power_samples)
        }

    def profile_all_layers(
        self,
        input_ids,
        attention_mask,
        batch_size: int = 16,
        num_loops: int = 100
    ) -> List[Dict]:
        """
        Profile energy consumption for all layers.

        Each layer is measured as a prefix run (see ``measure_layer_energy``)
        and the previous prefix is subtracted, so ``energy_per_inference_mj``
        and ``latency_per_loop_ms`` in the returned dicts describe the layer
        itself. The raw prefix values are kept under
        ``cumulative_energy_per_inference_mj`` and
        ``cumulative_latency_per_loop_ms``. Differences are clamped at 0.0
        because measurement noise can make a longer prefix appear cheaper.
        All other fields (power, total time, total energy) still refer to
        the prefix run.

        Returns:
            List of dicts with energy metrics per layer
        """
        results = []

        print("\n" + "="*70)
        print("PER-LAYER ENERGY PROFILING")
        print("="*70)
        print(f"Configuration:")
        print(f"  Batch size: {batch_size}")
        print(f"  Loops per layer: {num_loops}")
        print(f"  Total layers: {len(self.layer_names)}")

        prev_energy_mj = 0.0
        prev_latency_ms = 0.0

        for layer_idx in range(len(self.layer_names)):
            # Create iterator for this layer
            batch_iter = batched_iterator(input_ids, attention_mask, batch_size)

            # Measure the prefix ending at this layer
            layer_result = self.measure_layer_energy(
                batch_iter,
                layer_idx,
                num_loops=num_loops
            )

            prefix_energy_mj = layer_result['energy_per_inference_mj']
            prefix_latency_ms = layer_result['latency_per_loop_ms']
            layer_result['cumulative_energy_per_inference_mj'] = prefix_energy_mj
            layer_result['cumulative_latency_per_loop_ms'] = prefix_latency_ms

            layer_result['latency_per_loop_ms'] = max(
                0.0, prefix_latency_ms - prev_latency_ms
            )
            prev_latency_ms = prefix_latency_ms

            # Without power samples the prefix energy is a placeholder 0.0;
            # keep the previous baseline so the next layer is not inflated
            # by a bogus zero.
            if layer_result['num_power_samples'] > 0:
                layer_result['energy_per_inference_mj'] = max(
                    0.0, prefix_energy_mj - prev_energy_mj
                )
                prev_energy_mj = prefix_energy_mj

            results.append(layer_result)

            # Print result
            print(f"    ‚úì {layer_result['layer_name']:25s}: "
                  f"{layer_result['energy_per_inference_mj']:.3f} mJ, "
                  f"{layer_result['latency_per_loop_ms']:.3f} ms, "
                  f"{layer_result['avg_power_w']:.2f} W")

            # Small delay between layers
            time.sleep(0.5)

        print("="*70)

        return results


def visualize_layer_energy(layer_results: List[Dict]):
    """Draw side-by-side horizontal bar charts of per-layer energy and latency.

    Args:
        layer_results: Dicts providing ``layer_name``,
            ``energy_per_inference_mj`` and ``latency_per_loop_ms``.

    Returns:
        The matplotlib figure holding both panels.
    """
    import matplotlib.pyplot as plt

    # (result key, bar colour, x-axis label, panel title), left to right.
    panels = (
        ('energy_per_inference_mj', 'steelblue',
         'Energy per Inference (mJ)', 'Per-Layer Energy Consumption'),
        ('latency_per_loop_ms', 'coral',
         'Latency (ms)', 'Per-Layer Latency'),
    )

    # Extract all series before any figure is created.
    names = [entry['layer_name'] for entry in layer_results]
    series = [[entry[key] for entry in layer_results] for key, *_ in panels]

    figure, axes = plt.subplots(1, 2, figsize=(16, 6))

    for axis, values, (_, colour, xlabel, title) in zip(axes, series, panels):
        axis.barh(names, values, color=colour)
        axis.set_xlabel(xlabel)
        axis.set_title(title)
        axis.grid(axis='x', alpha=0.3)

    plt.tight_layout()
    return figure


print("‚úì LayerEnergyProfiler class defined")
print("‚úì Per-layer profiling functions ready")

## EXPERIMENT: FP32 Per-Layer Energy Profiling

Run this to measure energy consumption for each layer individually.

In [None]:
# Per-layer energy profiling for FP32
print("="*70)
print("FP32 PER-LAYER ENERGY PROFILING")
print("="*70)

# The results directory is otherwise only created by the later "Save Results"
# cell, so make sure it exists before this cell writes into it.
layer_output_dir = Path('/kaggle/working/energy_results')
layer_output_dir.mkdir(parents=True, exist_ok=True)

# Load dataset
input_ids, attention_mask, labels = load_pre_tokenized_dataset(
    config.dataset_path, config.device
)

# Load model
fp32_model = load_model("fp32", config.model_name, config.device)

# Create profiler
profiler = LayerEnergyProfiler(fp32_model, device=config.device)

# Warmup
print("\nWarming up...")
batch_iter = batched_iterator(input_ids, attention_mask, config.batch_size)
with torch.no_grad():
    for i in range(10):
        input_ids_batch, attention_mask_batch = next(batch_iter)
        _ = fp32_model(input_ids_batch, attention_mask=attention_mask_batch)
torch.cuda.synchronize()
print("‚úì Warmup complete")

# Profile all layers
layer_results_fp32 = profiler.profile_all_layers(
    input_ids,
    attention_mask,
    batch_size=config.batch_size,
    num_loops=100  # 100 loops per layer for good statistics
)

# Display results
print("\n" + "="*70)
print("PER-LAYER RESULTS SUMMARY")
print("="*70)

df_layers = pd.DataFrame(layer_results_fp32)
print(df_layers[['layer_name', 'energy_per_inference_mj', 'latency_per_loop_ms', 'avg_power_w']])

# Calculate percentages
# NOTE(review): the shares are only meaningful if each row holds that
# layer's own energy rather than a cumulative figure - confirm against
# LayerEnergyProfiler.profile_all_layers().
total_energy = df_layers['energy_per_inference_mj'].sum()
if total_energy > 0:
    df_layers['energy_pct'] = (df_layers['energy_per_inference_mj'] / total_energy * 100).round(2)
else:
    # No energy recorded at all (e.g. no power samples); avoid NaN shares.
    df_layers['energy_pct'] = 0.0

print("\n" + "="*70)
print("ENERGY BREAKDOWN BY LAYER")
print("="*70)
for _, row in df_layers.iterrows():
    print(f"  {row['layer_name']:25s}: {row['energy_per_inference_mj']:6.3f} mJ ({row['energy_pct']:5.2f}%)")

print(f"\n  {'TOTAL':25s}: {total_energy:6.3f} mJ (100.00%)")
print("="*70)

# Visualize
fig = visualize_layer_energy(layer_results_fp32)
# matplotlib is imported only inside visualize_layer_energy(), so `plt` is
# not a notebook-level name; save through the returned figure instead.
fig.savefig(layer_output_dir / 'fp32_per_layer_energy.png', dpi=150, bbox_inches='tight')
print("\n‚úì Visualization saved to: /kaggle/working/energy_results/fp32_per_layer_energy.png")

# Save results
df_layers.to_csv(layer_output_dir / 'fp32_per_layer_energy.csv', index=False)
print("‚úì Results saved to: /kaggle/working/energy_results/fp32_per_layer_energy.csv")

# Cleanup
del fp32_model
torch.cuda.empty_cache()

## Layer 8: Aggregate Trials

In [None]:
def aggregate_trials(trial_results: List[Dict]) -> Dict:
    """Summarize per-trial metrics into mean/std/min/max statistics.

    Args:
        trial_results: Non-empty list of per-trial metric dicts that all
            carry the same precision, batch size and sequence length.

    Returns:
        Dict with the shared run descriptors taken from the first trial,
        the number of trials, and ``<metric>_mean``, ``<metric>_std``,
        ``<metric>_min`` and ``<metric>_max`` for every aggregated metric.

    Raises:
        ValueError: If ``trial_results`` is empty.
    """
    if not trial_results:
        raise ValueError("No trial results")

    first = trial_results[0]
    summary = {field: first[field] for field in ('precision', 'batch_size', 'seq_len')}
    summary['num_trials'] = len(trial_results)

    # Suffix -> reducer, applied in this order for every metric.
    reducers = (('mean', np.mean), ('std', np.std), ('min', np.min), ('max', np.max))
    aggregated_metrics = (
        'avg_power_w', 'energy_per_inference_j', 'energy_per_inference_mj',
        'latency_per_sample_s', 'latency_per_sample_ms', 'throughput_samples_s',
    )

    for metric in aggregated_metrics:
        column = [trial[metric] for trial in trial_results]
        for suffix, reduce_fn in reducers:
            summary[f'{metric}_{suffix}'] = float(reduce_fn(column))

    return summary


print("‚úì Aggregation function defined")

## EXPERIMENT 1: Test with FP32 (Small Scale)

In [None]:
# Quick test - 1 trial, reduced loops to match available data
# Use this to debug PowerLogger

test_config = ExperimentConfig(
    batch_size=16,
    num_loops=50,  # 50 loops √ó 16 batch = 800 samples (8x reuse with 100 samples)
    warmup_loops=10,
    num_trials=1
)

print("Running QUICK TEST (1 trial, 50 loops, batch_size=16)...")
print(f"Total samples: {test_config.num_loops * test_config.batch_size}")
test_results = run_experiments_for_precision(test_config, "fp32", num_trials=1)

if test_results:
    print("\n" + "="*70)
    print("‚úì TEST SUCCESSFUL")
    print("="*70)
    print("PowerLogger is working! Proceed to full experiments.")
else:
    print("\n" + "="*70)
    print("‚úó TEST FAILED")
    print("="*70)
    print("Fix PowerLogger before proceeding.")

## EXPERIMENT 2: Full FP32 (5 Trials)

In [None]:
# Full FP32 experiment
# Run this after test succeeds

full_config = ExperimentConfig(
    batch_size=32,  # Larger than the ExperimentConfig default of 16
    num_loops=1000,  # Larger than the ExperimentConfig default of 500
    warmup_loops=100,  # Larger than the ExperimentConfig default of 50
    num_trials=5
)

# Derive the banner from the config so the messages cannot drift from the
# settings actually used.
print(
    f"Running FULL FP32 EXPERIMENT ({full_config.num_trials} trials, "
    f"{full_config.num_loops} loops, batch_size={full_config.batch_size})..."
)
print(
    f"Total samples per trial: {full_config.num_loops} √ó {full_config.batch_size} = "
    f"{full_config.num_loops * full_config.batch_size:,} samples"
)
print("This will take ~10-15 minutes")
print("="*70)

fp32_results = run_experiments_for_precision(
    full_config, "fp32", num_trials=full_config.num_trials
)

# Aggregate
if fp32_results:
    fp32_agg = aggregate_trials(fp32_results)

    print("\n" + "="*70)
    # Failed trials are skipped by the runner, so report how many trials
    # actually contributed to the statistics.
    print(
        f"FP32 AGGREGATED RESULTS ({fp32_agg['num_trials']} of "
        f"{full_config.num_trials} trials)"
    )
    print("="*70)
    print(f"Latency:    {fp32_agg['latency_per_sample_ms_mean']:.3f} ¬± {fp32_agg['latency_per_sample_ms_std']:.3f} ms")
    print(f"Throughput: {fp32_agg['throughput_samples_s_mean']:.2f} ¬± {fp32_agg['throughput_samples_s_std']:.2f} samples/s")
    print(f"Avg Power:  {fp32_agg['avg_power_w_mean']:.2f} ¬± {fp32_agg['avg_power_w_std']:.2f} W")
    print(f"Energy:     {fp32_agg['energy_per_inference_mj_mean']:.3f} ¬± {fp32_agg['energy_per_inference_mj_std']:.3f} mJ")
    print("="*70)

## EXPERIMENT 3: FP16 (When Thomas Provides Model)

In [None]:
# Uncomment when Thomas provides FP16 model
# Staged experiment: mirrors the FP32 cell but requests "fp16", for which
# load_model() converts the weights with model.half(). Requires full_config
# from the full FP32 cell.

# fp16_results = run_experiments_for_precision(full_config, "fp16", num_trials=5)

# if fp16_results:
#     fp16_agg = aggregate_trials(fp16_results)
#     
#     print("\n" + "="*70)
#     print("FP16 AGGREGATED RESULTS")
#     print("="*70)
#     print(f"Latency:    {fp16_agg['latency_per_sample_ms_mean']:.3f} ¬± {fp16_agg['latency_per_sample_ms_std']:.3f} ms")
#     print(f"Throughput: {fp16_agg['throughput_samples_s_mean']:.2f} ¬± {fp16_agg['throughput_samples_s_std']:.2f} samples/s")
#     print(f"Avg Power:  {fp16_agg['avg_power_w_mean']:.2f} ¬± {fp16_agg['avg_power_w_std']:.2f} W")
#     print(f"Energy:     {fp16_agg['energy_per_inference_mj_mean']:.3f} ¬± {fp16_agg['energy_per_inference_mj_std']:.3f} mJ")

print("FP16 experiment ready (currently commented out)")

## EXPERIMENT 4: INT8 (When Thomas Provides Model)

In [None]:
# Uncomment when Thomas provides INT8 model
# Staged experiment: mirrors the FP32 cell but requests "int8". NOTE:
# load_model() currently has no INT8 path and returns the FP32 model as a
# placeholder, so running this as-is would record FP32 numbers under the
# "int8" label.

# int8_results = run_experiments_for_precision(full_config, "int8", num_trials=5)

# if int8_results:
#     int8_agg = aggregate_trials(int8_results)
#     
#     print("\n" + "="*70)
#     print("INT8 AGGREGATED RESULTS")
#     print("="*70)
#     print(f"Latency:    {int8_agg['latency_per_sample_ms_mean']:.3f} ¬± {int8_agg['latency_per_sample_ms_std']:.3f} ms")
#     print(f"Throughput: {int8_agg['throughput_samples_s_mean']:.2f} ¬± {int8_agg['throughput_samples_s_std']:.2f} samples/s")
#     print(f"Avg Power:  {int8_agg['avg_power_w_mean']:.2f} ¬± {int8_agg['avg_power_w_std']:.2f} W")
#     print(f"Energy:     {int8_agg['energy_per_inference_mj_mean']:.3f} ¬± {int8_agg['energy_per_inference_mj_std']:.3f} mJ")

print("INT8 experiment ready (currently commented out)")

## Save Results

In [None]:
# Save aggregated results
output_dir = Path('/kaggle/working/energy_results')
# parents=True so the cell also works where /kaggle/working does not exist
# yet (for example on a local run).
output_dir.mkdir(parents=True, exist_ok=True)

# Collect all aggregated results
all_agg_results = []

if 'fp32_agg' in locals():
    all_agg_results.append(fp32_agg)

# if 'fp16_agg' in locals():
#     all_agg_results.append(fp16_agg)

# if 'int8_agg' in locals():
#     all_agg_results.append(int8_agg)

if all_agg_results:
    # Save as CSV
    df_agg = pd.DataFrame(all_agg_results)
    df_agg.to_csv(output_dir / 'energy_results_aggregated.csv', index=False)
    print(f"‚úì Saved: {output_dir / 'energy_results_aggregated.csv'}")

    # Save as JSON
    with open(output_dir / 'energy_results_aggregated.json', 'w') as f:
        json.dump(all_agg_results, f, indent=2)
    print(f"‚úì Saved: {output_dir / 'energy_results_aggregated.json'}")

    # Display
    print("\nResults Table:")
    summary_table = df_agg[['precision', 'latency_per_sample_ms_mean', 'avg_power_w_mean', 'energy_per_inference_mj_mean']]
    try:
        display(summary_table)
    except NameError:
        # `display` only exists inside IPython/Jupyter; fall back to print
        # when this code is run as a plain script.
        print(summary_table)
else:
    print("No results to save yet")

## Compare with Taara's Accuracy Results

In [None]:
# Validate latency matches Taara's results
# The reference value 2.32 ms is Taara's reported 0.116 s over 50 samples.
# The check is skipped when the full FP32 experiment has not produced fp32_agg.
if 'fp32_agg' in locals():
    print("Validation Check:")
    print("="*60)
    print(f"Your FP32 latency:  {fp32_agg['latency_per_sample_ms_mean']:.3f} ms")
    print(f"Taara's FP32 latency: 2.32 ms (from 0.116s / 50 samples)")
    
    # Relative deviation from the reference, in percent.
    diff_pct = abs(fp32_agg['latency_per_sample_ms_mean'] - 2.32) / 2.32 * 100
    print(f"Difference: {diff_pct:.1f}%")
    
    # A deviation under 10% counts as agreement.
    if diff_pct < 10:
        print("‚úì Latencies match within 10% - validation passed!")
    else:
        print("‚ö† Latencies differ by >10% - check configuration")

## Utility Functions (Save Config, Merge with Accuracy)

In [None]:
def save_config_to_file(config: "ExperimentConfig", output_path: str):
    """Save configuration to JSON file.

    Args:
        config: Dataclass instance whose fields are serialized.
        output_path: Destination file. Missing parent directories are
            created so the call does not depend on another cell having
            prepared the results directory first.
    """
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, 'w') as f:
        json.dump(asdict(config), f, indent=2)
    print(f"‚úì Config saved to {output_path}")


def merge_with_accuracy_results(energy_results: Dict, accuracy_results_path: str) -> Dict:
    """Combine energy metrics with the accuracy metrics stored in a JSON file.

    Args:
        energy_results: Energy/latency metrics to start from.
        accuracy_results_path: Path to a JSON file holding a dict of
            accuracy metrics.

    Returns:
        A new dict with all keys of both inputs. On a key collision the
        value from the accuracy file wins.
    """
    accuracy = json.loads(Path(accuracy_results_path).read_text())
    # Later unpacking takes precedence, so accuracy values override.
    return {**energy_results, **accuracy}


# Save current config
# Writes the default `config` next to the other result files.
# NOTE(review): the target directory is created by the "Save Results" cell -
# confirm that cell has run before this one.
save_config_to_file(config, '/kaggle/working/energy_results/experiment_config.json')

print("\n‚úì Utility functions defined")
print("  - save_config_to_file(): Save experiment config")
print("  - merge_with_accuracy_results(): Merge with Taara's data")

## Merge Energy + Accuracy (After Both Complete)

In [None]:
# Use this after energy experiments complete and Taara has accuracy results
# Staged cell: merges the aggregated FP32 energy metrics with the accuracy
# JSON. Keys present in both inputs take the value from the accuracy file
# (see merge_with_accuracy_results()).

# Example: Merge FP32 energy with FP32 accuracy
# if 'fp32_agg' in locals():
#     merged_fp32 = merge_with_accuracy_results(
#         energy_results=fp32_agg,
#         accuracy_results_path='/kaggle/working/results/fp32_baseline.json'
#     )
#     
#     print("\nMerged FP32 Results:")
#     print("="*60)
#     print(f"  Accuracy:   {merged_fp32['accuracy']*100:.2f}%")
#     print(f"  Latency:    {merged_fp32['latency_per_sample_ms_mean']:.3f} ms")
#     print(f"  Power:      {merged_fp32['avg_power_w_mean']:.2f} W")
#     print(f"  Energy:     {merged_fp32['energy_per_inference_mj_mean']:.3f} mJ")
#     print("="*60)

print("Merge function ready (currently commented out)")

## Summary

In [None]:
print("\n" + "="*70)
print("ENERGY MEASUREMENT HARNESS - SUMMARY")
print("="*70)

print("\nCompleted Experiments:")
if 'fp32_results' in locals() and fp32_results:
    # Failed trials are skipped by the runner, so report the real count
    # instead of assuming all five succeeded.
    print(f"  ‚úì FP32: {len(fp32_results)} trials complete")
else:
    print("  ‚óã FP32: Not run yet")

# if 'fp16_results' in locals() and fp16_results:
#     print("  ‚úì FP16: 5 trials complete")
# else:
print("  ‚óã FP16: Waiting for Thomas")

# if 'int8_results' in locals() and int8_results:
#     print("  ‚úì INT8: 5 trials complete")
# else:
print("  ‚óã INT8: Waiting for Thomas")

print("\nOutput Files:")
# output_dir only exists if the "Save Results" cell has run; list each file
# only when it is really on disk.
if 'output_dir' in locals():
    for result_name in ('energy_results_aggregated.csv', 'energy_results_aggregated.json'):
        if (output_dir / result_name).exists():
            print(f"  ‚úì {output_dir / result_name}")

print("\nNext Steps:")
print("  1. Share energy results with Taara")
print("  2. Taara will merge with accuracy results")
print("  3. Generate comparison plots")
print("  4. Write report")

print("\n" + "="*70)
print("Krishna: Great work!")
print("="*70)