# Complete Accuracy + Energy Evaluation Pipeline - Kaggle Edition
## Day 3: Combined Accuracy and Energy Measurement System

**Purpose:** Evaluate model accuracy AND energy consumption for FP32, FP16, INT8

**Works NOW for:** FP32 baseline  
**Ready for:** FP16/INT8 when Thomas provides models  

---

**This notebook measures:**
1. ✅ Model accuracy
2. ✅ GPU energy consumption
3. ✅ Inference latency
4. ✅ GPU power draw
5. ✅ Throughput
6. ✅ Per-class statistics

## Part 1: Setup and Dependencies

In [None]:
!pip install -q transformers datasets scikit-learn seaborn

print("Dependencies installed")

In [None]:
import torch
import numpy as np
import pandas as pd
from pathlib import Path
import json
import time
import subprocess
import threading
from datetime import datetime
from dataclasses import dataclass, asdict
from typing import Dict, List, Optional

from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from datasets import load_dataset
from sklearn.metrics import confusion_matrix, classification_report

import matplotlib.pyplot as plt
import seaborn as sns

print("All imports successful")

In [None]:
# Pick the compute device once; when a GPU is present also report its
# name and total memory (in GB, using 1e9 bytes per GB).
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"Using device: {device}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    device = torch.device('cpu')
    print(f"Using device: {device}")

## Part 2: PowerLogger Class (Energy Measurement)

In [None]:
class PowerLogger:
    """Sample GPU power draw in the background via ``nvidia-smi``.

    ``start()`` launches ``nvidia-smi`` in loop mode (``-lms``) querying
    ``power.draw`` and spawns a daemon thread that parses every stdout line
    as a float. ``stop()`` terminates the process, joins the thread and
    returns a copy of the readings collected since the last ``start()``.

    Attributes:
        gpu_id: GPU index passed to ``nvidia-smi --id``.
        poll_interval_ms: Sampling period passed to ``-lms``.
        proc: The ``nvidia-smi`` process handle, or ``None`` before ``start()``.
        samples: Parsed power readings for the current measurement window.
        thread: The background reader thread, or ``None`` before ``start()``.
        stop_flag: ``True`` once the reader thread has been asked to exit.
    """

    def __init__(self, gpu_id: int = 0, poll_interval_ms: int = 100):
        self.gpu_id = gpu_id
        self.poll_interval_ms = poll_interval_ms
        self.proc = None
        self.samples = []
        self.thread = None
        self.stop_flag = False

    def start(self):
        """Start power monitoring.

        Any monitor still running from a previous ``start()`` is shut down
        first, and the sample list is reset so each window starts empty.

        Raises:
            Exception: Whatever ``subprocess.Popen`` raises (e.g.
                ``FileNotFoundError`` when ``nvidia-smi`` is not installed);
                the error is printed and re-raised.
        """
        print(f"[PowerLogger] Starting (poll: {self.poll_interval_ms}ms)...")

        cmd = [
            'nvidia-smi',
            '--query-gpu=power.draw',
            '--format=csv,noheader,nounits',
            f'--id={self.gpu_id}',
            '-lms', str(self.poll_interval_ms)
        ]

        # Do not leak a still-running monitor if start() is called twice.
        if self.proc is not None and self.proc.poll() is None:
            self._shutdown()

        # Fresh measurement window: old readings must not leak into it.
        self.samples = []
        self.stop_flag = False

        try:
            self.proc = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                # stderr is never read by this class; an undrained PIPE could
                # fill up and block nvidia-smi, so discard it instead.
                stderr=subprocess.DEVNULL,
                universal_newlines=True,
                bufsize=1
            )

            self.thread = threading.Thread(target=self._collect_samples)
            self.thread.daemon = True
            self.thread.start()

            print("[PowerLogger] ‚úì Started")

        except Exception as e:
            print(f"[PowerLogger] ‚úó Failed: {e}")
            raise

    def _collect_samples(self):
        """Read stdout lines until EOF or a stop request, storing parsed floats."""
        proc = self.proc
        if proc is None or proc.stdout is None:
            return

        # Iterating the stream ends cleanly at EOF (process exited), so there
        # is no busy loop between EOF and the process being reaped.
        for line in proc.stdout:
            if self.stop_flag:
                break
            try:
                self.samples.append(float(line.strip()))
            except ValueError:
                # Non-numeric output (blank line, "[N/A]", ...) is skipped.
                continue

    def _shutdown(self):
        """Terminate the process, reap it, join the reader and close the pipe."""
        self.stop_flag = True

        proc = self.proc
        reader = self.thread

        if proc is not None and proc.poll() is None:
            proc.terminate()
            try:
                proc.wait(timeout=2)
            except subprocess.TimeoutExpired:
                proc.kill()
                proc.wait()  # reap the killed process so it is not left a zombie

        if reader is not None:
            reader.join(timeout=2)

        # Close the pipe only once the reader is no longer using it.
        reader_done = reader is None or not reader.is_alive()
        if proc is not None and proc.stdout is not None and reader_done:
            proc.stdout.close()

    def stop(self) -> List[float]:
        """Stop monitoring and return a copy of the collected samples.

        Safe to call without a prior ``start()``; it then returns ``[]``.
        """
        print("[PowerLogger] Stopping...")

        self._shutdown()

        print(f"[PowerLogger] ‚úì Stopped - {len(self.samples)} samples")

        if not self.samples:
            print("[PowerLogger] ‚ö† WARNING: No samples collected!")

        return self.samples.copy()


print("‚úì PowerLogger class defined")

## Part 3: Test PowerLogger (CRITICAL - Run First!)

In [None]:
# CRITICAL TEST - Run this first!
# Samples GPU power for five seconds and reports whether any readings
# came back; later cells rely on PowerLogger producing samples.
print("\n".join([
    "Testing PowerLogger...",
    "This will run for 5 seconds",
    "=" * 60,
]))

logger = PowerLogger(gpu_id=0, poll_interval_ms=100)
logger.start()
time.sleep(5)
samples = logger.stop()

print("=" * 60)
print("\nTest Results:")
print(f"  Samples collected: {len(samples)}")

if not samples:
    # Nothing came back from nvidia-smi: print the troubleshooting hints.
    for hint in (
        "\n‚úó PowerLogger returned 0 samples!",
        "\nDebugging steps:",
        "1. Test nvidia-smi manually",
        "2. Try different interval formats",
        "\n‚ö† DO NOT PROCEED until this works!",
    ):
        print(hint)
else:
    print(f"  Sample values: {samples[:5]}")
    print(f"  Mean power: {np.mean(samples):.2f} W")
    print("\n‚úì PowerLogger WORKS! You can proceed.")

## Part 4: Combined Results Class (Accuracy + Energy)

In [None]:
@dataclass
class CombinedResults:
    """Accuracy, energy and latency figures for one evaluated model variant."""
    precision_type: str

    # --- accuracy ---
    accuracy: float
    num_correct: int
    num_total: int
    per_class_accuracy: Dict[int, float]
    confusion_matrix: np.ndarray

    # --- energy ---
    avg_power_w: float
    std_power_w: float
    energy_total_j: float
    energy_per_inference_j: float
    energy_per_inference_mj: float

    # --- latency ---
    inference_time_s: float
    latency_per_sample_ms: float
    throughput_samples_s: float

    # --- run configuration ---
    batch_size: int
    num_batches: int
    num_power_samples: int

    def to_dict(self):
        """Return the fields as a plain dict, with the confusion matrix as nested lists."""
        return {
            key: (value.tolist() if key == 'confusion_matrix' else value)
            for key, value in asdict(self).items()
        }

    def summary(self) -> str:
        """Render a multi-line, human-readable report of every metric group."""
        rule = '=' * 70

        class_rows = [
            f"    Class {label}: {acc*100:.2f}%"
            for label, acc in self.per_class_accuracy.items()
        ]

        header = [rule, f"Combined Evaluation: {self.precision_type}", rule, ""]

        accuracy_part = [
            "üìä ACCURACY METRICS:",
            f"  Overall Accuracy:    {self.accuracy*100:.2f}% ({self.num_correct}/{self.num_total})",
            "  Per-Class Accuracy:",
            *class_rows,
        ]

        energy_part = [
            "",
            "‚ö° ENERGY METRICS:",
            f"  Average Power:       {self.avg_power_w:.2f} W (¬±{self.std_power_w:.2f})",
            f"  Total Energy:        {self.energy_total_j:.3f} J",
            f"  Energy/Inference:    {self.energy_per_inference_mj:.3f} mJ",
            f"  Power Samples:       {self.num_power_samples}",
        ]

        latency_part = [
            "",
            "‚è±Ô∏è  LATENCY METRICS:",
            f"  Total Time:          {self.inference_time_s:.3f} s",
            f"  Latency/Sample:      {self.latency_per_sample_ms:.3f} ms",
            f"  Throughput:          {self.throughput_samples_s:.2f} samples/s",
            f"  Batches:             {self.num_batches} √ó {self.batch_size}",
            rule,
        ]

        return "\n".join(header + accuracy_part + energy_part + latency_part)


print("‚úì CombinedResults class defined")

## Part 5: Combined Evaluator (Accuracy + Energy)

In [None]:
class CombinedEvaluator:
    """Evaluate a classifier for accuracy, latency and GPU energy in one pass.

    The evaluator runs a warmup, optionally starts a ``PowerLogger``, times
    one full pass over the dataset and packages everything into a
    ``CombinedResults``. It also offers helpers to plot the confusion matrix
    and to write the results to JSON.
    """

    def __init__(self, device='cuda'):
        """Select the device; falls back to CPU when CUDA is unavailable."""
        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
        print(f"CombinedEvaluator initialized on {self.device}")

    def _synchronize(self):
        """Wait for queued CUDA work so wall-clock timings cover it."""
        if self.device.type == 'cuda':
            torch.cuda.synchronize()

    def evaluate(
        self,
        model: torch.nn.Module,
        dataset,
        batch_size: int = 8,
        precision_type: str = 'FP32',
        warmup_batches: int = 2,
        measure_energy: bool = True
    ) -> CombinedResults:
        """Evaluate model accuracy AND energy on dataset.

        Args:
            model: Called as ``model(input_ids=..., attention_mask=...)``; the
                result is either the logits or an object with ``.logits``.
            dataset: Object supporting ``len()`` and ``get_batch(batch_size)``
                yielding dicts with ``'input_ids'``, ``'attention_mask'`` and
                ``'labels'`` tensors. Batches are passed to the model as-is;
                this method does not move them to ``self.device``.
            batch_size: Forwarded to ``dataset.get_batch``.
            precision_type: Free-form label stored in the results.
            warmup_batches: Number of untimed batches run before measuring.
            measure_energy: When true, sample GPU power during the run. If
                the power monitor cannot be launched, a warning is printed
                and the energy metrics are reported as 0.0.

        Returns:
            A ``CombinedResults`` with accuracy, energy and latency metrics.

        Raises:
            ValueError: If the dataset yields no samples.
        """
        model.eval()

        # Warmup
        print(f"\nWarming up with {warmup_batches} batches...")
        with torch.no_grad():
            for i, batch in enumerate(dataset.get_batch(batch_size)):
                if i >= warmup_batches:
                    break
                model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask']
                )

        self._synchronize()

        print("‚úì Warmup complete")

        # Start power monitoring. A missing/unlaunchable nvidia-smi should not
        # abort the accuracy run; the zero-sample path below reports 0.0.
        power_logger = None
        if measure_energy:
            power_logger = PowerLogger(gpu_id=0, poll_interval_ms=100)
            try:
                power_logger.start()
            except OSError as exc:
                print(f"Power monitoring unavailable ({exc}); energy metrics will be 0.0")
                power_logger = None

        power_samples = []
        all_predictions = []
        all_labels = []
        num_batches = 0

        # try/finally guarantees the nvidia-smi subprocess is stopped even if
        # inference raises part-way through.
        try:
            if power_logger is not None:
                time.sleep(0.5)  # Let logger stabilize

            # Actual evaluation
            print(f"\nRunning evaluation on {len(dataset)} samples...")

            start_time = time.perf_counter()

            with torch.no_grad():
                for batch in dataset.get_batch(batch_size):
                    outputs = model(
                        input_ids=batch['input_ids'],
                        attention_mask=batch['attention_mask']
                    )

                    logits = outputs.logits if hasattr(outputs, 'logits') else outputs
                    predictions = logits.argmax(dim=-1).cpu().numpy()
                    labels = batch['labels'].cpu().numpy()

                    all_predictions.extend(predictions)
                    all_labels.extend(labels)
                    num_batches += 1

            self._synchronize()

            inference_time = time.perf_counter() - start_time
        finally:
            # NOTE(review): the power samples span the 0.5 s stabilization
            # sleep as well as the timed loop, so avg_power_w is an average
            # over a slightly wider window than inference_time.
            if power_logger is not None:
                power_samples = power_logger.stop()

        # Compute accuracy metrics
        all_predictions = np.array(all_predictions)
        all_labels = np.array(all_labels)

        num_total = len(all_labels)
        if num_total == 0:
            raise ValueError("dataset yielded no samples; nothing to evaluate")

        num_correct = int((all_predictions == all_labels).sum())
        accuracy = num_correct / num_total  # plain Python float

        per_class_acc = self._compute_per_class_accuracy(all_predictions, all_labels)
        conf_matrix = confusion_matrix(all_labels, all_predictions)

        # Compute energy metrics (energy = mean power x elapsed time)
        if len(power_samples) > 0:
            avg_power = float(np.mean(power_samples))
            std_power = float(np.std(power_samples))
            energy_total = avg_power * inference_time
            energy_per_inference = energy_total / num_total
        else:
            avg_power = 0.0
            std_power = 0.0
            energy_total = 0.0
            energy_per_inference = 0.0

        # Compute latency metrics
        latency_per_sample = (inference_time / num_total) * 1000  # ms
        throughput = num_total / inference_time if inference_time > 0 else 0

        return CombinedResults(
            precision_type=precision_type,
            accuracy=accuracy,
            num_correct=num_correct,
            num_total=num_total,
            per_class_accuracy=per_class_acc,
            confusion_matrix=conf_matrix,
            avg_power_w=avg_power,
            std_power_w=std_power,
            energy_total_j=energy_total,
            energy_per_inference_j=energy_per_inference,
            energy_per_inference_mj=energy_per_inference * 1000,
            inference_time_s=inference_time,
            latency_per_sample_ms=latency_per_sample,
            throughput_samples_s=throughput,
            batch_size=batch_size,
            num_batches=num_batches,
            num_power_samples=len(power_samples)
        )

    def _compute_per_class_accuracy(self, predictions: np.ndarray, labels: np.ndarray) -> Dict[int, float]:
        """Return ``{label: fraction of that label's samples predicted correctly}``.

        Only labels that occur in ``labels`` appear in the result.
        """
        per_class = {}

        for label in np.unique(labels):
            mask = labels == label
            class_total = mask.sum()
            class_correct = (predictions[mask] == labels[mask]).sum()
            per_class[int(label)] = float(class_correct / class_total) if class_total > 0 else 0.0

        return per_class

    @staticmethod
    def _normalize_rows(cm) -> np.ndarray:
        """Divide each row by its sum; rows that sum to zero stay all-zero.

        A zero row occurs when a class is predicted but never appears as a
        true label; plain division would produce NaN there.
        """
        cm = np.asarray(cm, dtype=float)
        row_sums = cm.sum(axis=1, keepdims=True)
        return np.divide(cm, row_sums, out=np.zeros_like(cm), where=row_sums != 0)

    def plot_confusion_matrix(
        self,
        results: CombinedResults,
        save_path: Optional[str] = None,
        class_names: Optional[List[str]] = None
    ):
        """Plot the row-normalized confusion matrix as a heatmap.

        Args:
            results: Results whose ``confusion_matrix`` is plotted.
            save_path: If given, the figure is also written there (parent
                directories are created as needed).
            class_names: Tick labels; defaults to the class indices.
        """
        fig, ax = plt.subplots(figsize=(8, 6))

        cm = results.confusion_matrix
        cm_normalized = self._normalize_rows(cm)

        sns.heatmap(
            cm_normalized,
            annot=True,
            fmt='.2f',
            cmap='Blues',
            xticklabels=class_names or range(len(cm)),
            yticklabels=class_names or range(len(cm)),
            ax=ax
        )

        ax.set_title(f'Confusion Matrix: {results.precision_type}', fontsize=14, fontweight='bold')
        ax.set_ylabel('True Label', fontsize=12)
        ax.set_xlabel('Predicted Label', fontsize=12)

        plt.tight_layout()

        if save_path:
            # Do not depend on another call having created the directory.
            Path(save_path).parent.mkdir(parents=True, exist_ok=True)
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"‚úì Confusion matrix saved to {save_path}")

        plt.show()

    def save_results(self, results: CombinedResults, output_path: str):
        """Save results to a JSON file, creating parent directories as needed."""
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w') as f:
            json.dump(results.to_dict(), f, indent=2)

        print(f"‚úì Results saved to {output_path}")


print("‚úì CombinedEvaluator class defined")

## Part 6: Load Pre-tokenized Dataset

In [None]:
# First, create dataset if it doesn't exist (inline from Day 1)
data_path = Path('/kaggle/working/tokenized_data')

if not data_path.exists():
    print("Creating pre-tokenized dataset...")
    data_path.mkdir(parents=True, exist_ok=True)
    
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    dataset_raw = load_dataset("glue", "sst2", split="validation")
    dataset_raw = dataset_raw.shuffle(seed=42).select(range(50))
    
    texts = [example['sentence'] for example in dataset_raw]
    labels = [example['label'] for example in dataset_raw]
    
    encodings = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )
    
    labels_tensor = torch.tensor(labels, dtype=torch.long)
    
    torch.save(encodings['input_ids'], data_path / 'input_ids.pt')
    torch.save(encodings['attention_mask'], data_path / 'attention_mask.pt')
    torch.save(labels_tensor, data_path / 'labels.pt')
    
    metadata = {
        'num_samples': 50,
        'max_length': 128,
        'dataset_name': 'sst2',
        'num_labels': 2,
        'seed': 42,
        'tokenizer': 'distilbert-base-uncased',
    }
    
    with open(data_path / 'metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2)
    
    print("‚úì Dataset created")
else:
    print("‚úì Dataset already exists")

In [None]:
class PreTokenizedDataset:
    """Hold pre-tokenized tensors in memory and serve them as batches.

    The constructor reads ``input_ids.pt``, ``attention_mask.pt``,
    ``labels.pt`` and ``metadata.json`` from ``data_dir``. After that,
    indexing and batching only slice the in-memory tensors.
    """

    def __init__(self, data_dir: str = '/kaggle/working/tokenized_data'):
        """Load the tensors and metadata from ``data_dir``.

        Raises:
            ValueError: If the three tensors do not have the same number of
                rows (which would otherwise yield misaligned batches).
        """
        data_path = Path(data_dir)

        print(f"Loading dataset from {data_dir}...")
        self.input_ids = torch.load(data_path / 'input_ids.pt')
        self.attention_mask = torch.load(data_path / 'attention_mask.pt')
        self.labels = torch.load(data_path / 'labels.pt')

        with open(data_path / 'metadata.json', 'r') as f:
            self.metadata = json.load(f)

        self.num_samples = len(self.labels)
        if not (len(self.input_ids) == len(self.attention_mask) == self.num_samples):
            raise ValueError(
                "input_ids, attention_mask and labels must have the same number of rows "
                f"(got {len(self.input_ids)}, {len(self.attention_mask)}, {self.num_samples})"
            )
        print(f"‚úì Loaded {self.num_samples} samples")

    def __len__(self):
        """Return the number of samples."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return one sample (or slice) as a dict of the three tensors."""
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

    def get_batch(self, batch_size: int = 8):
        """Yield consecutive batches as dicts; the last one may be smaller.

        Raises:
            ValueError: If ``batch_size`` is not positive. Because this is a
                generator, the error surfaces on the first iteration.
        """
        if batch_size <= 0:
            # A negative step would silently yield nothing.
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        for start in range(0, self.num_samples, batch_size):
            # Slicing already clamps at the end of the tensor.
            stop = start + batch_size
            yield {
                'input_ids': self.input_ids[start:stop],
                'attention_mask': self.attention_mask[start:stop],
                'labels': self.labels[start:stop]
            }

    def to_device(self, device):
        """Move all tensors to ``device`` in one go and return ``self``."""
        self.input_ids = self.input_ids.to(device)
        self.attention_mask = self.attention_mask.to(device)
        self.labels = self.labels.to(device)
        print(f"‚úì Dataset moved to {device}")
        return self


# Load dataset from the pre-tokenized cache; when a GPU is present, move
# every tensor there up front so batching needs no host-to-device copies.
dataset = PreTokenizedDataset(data_dir='/kaggle/working/tokenized_data')

if torch.cuda.is_available():
    dataset = dataset.to_device(device)  # to_device returns the same object
    print("‚úì Zero I/O setup complete")

## Part 7: Evaluate FP32 Baseline (Accuracy + Energy)

In [None]:
# Banner, then load the SST-2 fine-tuned DistilBERT checkpoint onto `device`.
print("\n".join(["=" * 70, "LOADING FP32 MODEL", "=" * 70]))

model_fp32 = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased-finetuned-sst-2-english',
    num_labels=2
)
model_fp32 = model_fp32.to(device)

print(f"‚úì Model loaded on {device}")

# Total number of scalar parameters across all tensors.
param_count = 0
for p in model_fp32.parameters():
    param_count += p.numel()
print(f"Parameters: {param_count:,} ({param_count/1e6:.1f}M)")

In [None]:
# Run the combined accuracy + energy evaluation for the FP32 baseline.
print(f"\n{'=' * 70}")
print("EVALUATING FP32 (Accuracy + Energy)")
print("=" * 70)

evaluator = CombinedEvaluator(device=device)

fp32_eval_options = {
    'batch_size': 8,
    'precision_type': 'FP32',
    'warmup_batches': 2,
    'measure_energy': True,
}
results_fp32 = evaluator.evaluate(model=model_fp32, dataset=dataset, **fp32_eval_options)

print(f"\n{results_fp32.summary()}")

In [None]:
# Persist the FP32 metrics as JSON first (this call creates the results
# directory), then render and save the confusion-matrix figure.
evaluator.save_results(
    results_fp32,
    '/kaggle/working/results/fp32_combined.json',
)

evaluator.plot_confusion_matrix(
    results_fp32,
    save_path='/kaggle/working/results/confusion_matrix_fp32.png',
    class_names=['Negative', 'Positive'],
)

## Part 8: Save to CSV

In [None]:
# Create comprehensive CSV with all metrics
results_data = {
    'timestamp': datetime.now().isoformat(),
    'precision': results_fp32.precision_type,
    'accuracy_%': round(results_fp32.accuracy * 100, 2),
    'num_correct': results_fp32.num_correct,
    'num_total': results_fp32.num_total,
    'avg_power_w': round(results_fp32.avg_power_w, 2),
    'std_power_w': round(results_fp32.std_power_w, 2),
    'energy_total_j': round(results_fp32.energy_total_j, 4),
    'energy_per_inference_mj': round(results_fp32.energy_per_inference_mj, 4),
    'latency_total_s': round(results_fp32.inference_time_s, 4),
    'latency_per_sample_ms': round(results_fp32.latency_per_sample_ms, 3),
    'throughput_samples_s': round(results_fp32.throughput_samples_s, 2),
    'batch_size': results_fp32.batch_size,
    'num_batches': results_fp32.num_batches,
    'num_power_samples': results_fp32.num_power_samples
}

df_results = pd.DataFrame([results_data])

output_path = Path('/kaggle/working/results')
output_path.mkdir(exist_ok=True, parents=True)

df_results.to_csv(output_path / 'combined_results.csv', index=False)
print(f"‚úì Results saved to {output_path / 'combined_results.csv'}")

print("\nResults Table:")
display(df_results)

## Part 9: Summary

In [None]:
# Closing recap: headline metrics, the files produced, and next steps.
print(f"\n{'=' * 70}")
print("DAY 3 COMPLETE: COMBINED ACCURACY + ENERGY EVALUATION")
print("=" * 70)

print("\n‚úì What You Measured:")
for headline in (
    f"  üìä Accuracy:           {results_fp32.accuracy*100:.2f}%",
    f"  ‚ö° Average Power:       {results_fp32.avg_power_w:.2f} W",
    f"  ‚ö° Energy/Inference:    {results_fp32.energy_per_inference_mj:.3f} mJ",
    f"  ‚è±Ô∏è  Latency/Sample:      {results_fp32.latency_per_sample_ms:.3f} ms",
    f"  üöÄ Throughput:          {results_fp32.throughput_samples_s:.2f} samples/s",
):
    print(headline)

print("\n‚úì Files Generated:")
for file in sorted(output_path.glob('*')):
    # File size reported in KB (bytes / 1024).
    size = file.stat().st_size / 1024
    print(f"  - {file.name:35s} {size:8.2f} KB")

print("\n‚úì Ready For:")
print("\n".join([
    "  - FP16 evaluation (when Thomas provides model)",
    "  - INT8 evaluation (when Thomas provides model)",
    "  - Per-layer energy profiling",
]))

print(f"\n{'=' * 70}")
print("All measurements complete! üéâ")
print("=" * 70)