In [None]:
import time
import torch
from accelerate import Accelerator
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from codecarbon import EmissionsTracker

In [None]:
# Load tokenizer and model

def load_model_tokenizer(model_name):
    """Load a causal language model and its tokenizer for inference.

    Args:
        model_name: Checkpoint identifier forwarded unchanged to both
            ``AutoTokenizer.from_pretrained`` and
            ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        tuple: ``(model, tokenizer)``. The model has had ``eval()`` called on it.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    # Benchmarking only runs generation, so put the module in evaluation mode.
    model.eval()
    # Hand both objects back; without this the caller has nothing to run.
    return model, tokenizer

pile_helm.py:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

arxiv%2Ftest.jsonl:   0%|          | 0.00/13.0M [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# prep distributed settings

def prep_distrib_inf(model, tokenizer):
    """Create an ``Accelerator`` and prepare the model with it.

    Args:
        model: The model to hand to ``Accelerator.prepare``.
        tokenizer: The tokenizer that accompanies ``model``. It is returned
            unchanged so callers can keep unpacking model and tokenizer together.

    Returns:
        tuple: ``(accelerator, model, tokenizer)``. The accelerator is returned
        because callers need ``accelerator.device`` to place input tensors.
    """
    accelerator = Accelerator()
    # Only the model is passed to prepare(); the tokenizer is not a torch
    # module/optimizer/dataloader, so there is nothing for Accelerate to wrap.
    model = accelerator.prepare(model)
    return accelerator, model, tokenizer



In [None]:
def run_inference(model_name, prompts, max_new_tokens=50):
    """Benchmark greedy generation for every prompt and summarise the timings.

    For each prompt two generations are timed: a single-token generation
    (time to first token) and a full generation of up to ``max_new_tokens``.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        prompts: Iterable of prompt strings. It is materialised into a list.
        max_new_tokens: Upper bound on generated tokens for the full run.

    Returns:
        dict with the keys:
          - ``avg_latency_ms``: mean full-generation latency in milliseconds
          - ``avg_ttft_ms``: mean single-token generation latency in milliseconds
          - ``throughput_qps``: prompts per second of full-generation time
          - ``total_generated_tokens``: sum of new tokens over all prompts
          - ``num_runs``: number of prompts processed
          - ``total_time``: summed full-generation time in seconds
          - ``tokens_per_sec``: generated tokens per second of that time
        All values are zero when ``prompts`` is empty.
    """
    prompts = list(prompts)
    if not prompts:
        # Nothing to measure: skip the (expensive) model load and avoid
        # dividing by zero when averaging.
        return {
            "avg_latency_ms": 0.0,
            "avg_ttft_ms": 0.0,
            "throughput_qps": 0,
            "total_generated_tokens": 0,
            "num_runs": 0,
            "total_time": 0.0,
            "tokens_per_sec": 0.0,
        }

    # Set up Accelerate, then load the tokenizer and model explicitly so every
    # name used below is defined inside this function.
    accelerator = Accelerator()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Prepare the model with Accelerate and switch it to evaluation mode.
    model = accelerator.prepare(model)
    model.eval()
    # prepare() may return a wrapper around the model; generate() lives on the
    # underlying model, so unwrap before calling it.
    generator = accelerator.unwrap_model(model)

    # Inputs must live on the same device Accelerate assigned to the model.
    device = accelerator.device

    def _sync():
        # CUDA work can be queued asynchronously; wait for it so that
        # perf_counter() brackets the actual compute.
        if device.type == "cuda":
            torch.cuda.synchronize(device)

    latencies = []
    ttft_values = []
    total_tokens = 0

    with torch.no_grad():
        for prompt in prompts:
            # Calling the tokenizer (rather than .encode) also yields the
            # attention mask, which generate() expects alongside input_ids.
            inputs = tokenizer(prompt, return_tensors="pt").to(device)
            prompt_len = inputs["input_ids"].shape[1]

            # Time to first token: generate exactly one new token.
            _sync()
            start_ttft = time.perf_counter()
            _ = generator.generate(**inputs, max_new_tokens=1, do_sample=False)
            _sync()
            ttft_values.append((time.perf_counter() - start_ttft) * 1000.0)

            # Full generation latency.
            _sync()
            start_time = time.perf_counter()
            output = generator.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
            )
            _sync()
            latencies.append((time.perf_counter() - start_time) * 1000.0)

            # The output holds prompt + continuation; count only the new part.
            total_tokens += output.shape[1] - prompt_len

    num_runs = len(latencies)
    # Latencies are in milliseconds; total_time is in seconds.
    total_time = sum(latencies) / 1000.0

    return {
        "avg_latency_ms": sum(latencies) / num_runs,
        "avg_ttft_ms": sum(ttft_values) / len(ttft_values),
        "throughput_qps": num_runs / total_time if total_time > 0 else 0,
        "total_generated_tokens": total_tokens,
        "num_runs": num_runs,
        "total_time": total_time,
        "tokens_per_sec": total_tokens / total_time if total_time > 0 else 0.0,
    }

# ---------------------------------------------------------------------------------
# 4) Main Benchmark Cell
# ---------------------------------------------------------------------------------
def main():
    """Run the inference benchmark under CodeCarbon and print a report.

    Starts an ``EmissionsTracker``, runs ``run_inference`` on a small list of
    prompts, stops the tracker, converts the measured energy from kWh to
    joules, and prints latency, throughput and energy-efficiency figures.
    """
    # Replace with a real subset of the HELM Pile or any other set of 100+ prompts
    helm_subset_prompts = [
        "Tell me a story about AI and humanity.",
        "What is the capital of France?",
        # ... (add up to 100 prompts from the HELM Pile subset) ...
    ]

    # Example model name (adjust for your local or HF hub path)
    llama2_model_name = "meta-llama/Llama-2-7b-hf"

    # Initialize CodeCarbon tracker
    tracker = EmissionsTracker(
        output_dir="codecarbon_logs",
        measure_power_secs=1  # how often to measure power
    )
    tracker.start()
    try:
        # Run the inference function
        metrics = run_inference(llama2_model_name, helm_subset_prompts)
    finally:
        # Always stop the tracker, even if inference raised.
        tracker.stop()

    # stop() returns the emissions figure, not the energy; the energy in kWh is
    # exposed on the tracker's final emissions record.
    # NOTE(review): falls back to 0 kWh if the installed CodeCarbon version
    # does not expose `final_emissions_data` — confirm against your version.
    emissions_data = getattr(tracker, "final_emissions_data", None)
    energy_kwh = getattr(emissions_data, "energy_consumed", None) or 0.0
    # Convert kWh to Joules: 1 kWh = 3.6e6 J
    energy_joules = energy_kwh * 3.6e6

    total_tokens = metrics["total_generated_tokens"]

    # Derive the run count / total time / token throughput locally when the
    # metrics dict does not carry them, so the report never hits a KeyError.
    num_runs = metrics.get("num_runs", len(helm_subset_prompts))
    derived_total_time = metrics["avg_latency_ms"] * num_runs / 1000.0
    total_time = metrics.get("total_time", derived_total_time)
    derived_tokens_per_sec = total_tokens / total_time if total_time > 0 else 0.0
    tokens_per_sec = metrics.get("tokens_per_sec", derived_tokens_per_sec)

    # Calculate tokens per joule
    tokens_per_joule = 0
    if energy_joules > 0:
        tokens_per_joule = total_tokens / energy_joules

    # Print results
    print("=== Benchmarking Results ===")
    print(f"Total Runs: {num_runs}")
    print(f"Total Inference Time (sec): {total_time:.2f}")
    print(f"Average Latency (ms/sample): {metrics['avg_latency_ms']:.2f}")
    print(f"Average TTFT (ms): {metrics['avg_ttft_ms']:.2f}")
    print(f"Throughput (queries/sec): {metrics['throughput_qps']:.2f}")
    print(f"Throughput (tokens/sec): {tokens_per_sec:.2f}")
    print(f"Total Tokens Generated: {total_tokens}")
    print(f"Energy Consumed (kWh): {energy_kwh:.6f}")
    print(f"Energy Consumed (Joules): {energy_joules:.2f}")
    print(f"Energy Efficiency (tokens/joule): {tokens_per_joule:.6f}")

# Implement

In [None]:
# NOTE(review): "LLaMa3" does not look like a Hub repo id or a local path —
# confirm the exact checkpoint name before passing it to from_pretrained.
model_name = "LLaMa3"

# Load the arXiv configuration of the HELM Pile dataset.
# Without a `split` argument this returns a mapping of split name -> dataset.
ds = load_dataset("lighteval/pile_helm", "arxiv")

# For this experiment, select the first 100 samples of the "test" split
# (the split generated during download), or fewer if the split is smaller.
test_split = ds["test"]
dataset = test_split.select(range(min(100, len(test_split))))