In [9]:
import os
import time
import torch
from accelerate import Accelerator
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from codecarbon import EmissionsTracker

In [10]:
# helper functions
def load_model_tokenizer(model_name):
    """
    Loads and returns the tokenizer and model for the given model name.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") # NB USING LESS PRECISION AS TOO BIG!
    model.eval()
    return model, tokenizer
    
def prep_distributed(model, tokenizer):
    """
    Prepares model and tokenizer for distributed inference using Accelerate.
    Returns the prepared model, tokenizer, and the accelerator instance.
    """
    accelerator = Accelerator()
    model, tokenizer = accelerator.prepare(model, tokenizer)
    return model, tokenizer, accelerator

def run_inference(model, tokenizer, accelerator, prompts, max_new_tokens=50):
    """
    Runs inference for each prompt in `prompts` and returns metrics:
      - avg_latency_ms: Average full-generation latency per sample (ms)
      - avg_ttft_ms: Average time-to-first-token (ms)
      - throughput_qps: Queries per second
      - tokens_per_sec: Tokens generated per second
      - total_generated_tokens: Total tokens generated (excluding prompt tokens)
      - num_runs: Total number of samples processed
      - total_time: Total inference time in seconds
    """
    latencies = []
    ttft_values = []
    total_tokens = 0

    device = accelerator.device

    # Loop over prompts 
    with torch.no_grad():
        for prompt in prompts:
            input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

            # Measure Time-To-First-Token (TTFT)
            start_ttft = time.perf_counter()
            _ = model.generate(input_ids, max_new_tokens=1)
            end_ttft = time.perf_counter()
            ttft_ms = (end_ttft - start_ttft) * 1000.0
            ttft_values.append(ttft_ms)

            # Measure full generation latency
            start_time = time.perf_counter()
            output = model.generate(
                input_ids,
                max_new_tokens=max_new_tokens,
                do_sample=False
            )
            end_time = time.perf_counter()
            latency_ms = (end_time - start_time) * 1000.0
            latencies.append(latency_ms)

            # Compute tokens generated (exclude input prompt tokens)
            generated_tokens = output.shape[1] - input_ids.shape[1]
            total_tokens += generated_tokens

    avg_latency_ms = sum(latencies) / len(latencies)
    avg_ttft_ms = sum(ttft_values) / len(ttft_values)
    total_time_sec = sum(latencies) / 1000.0  # convert total latency to seconds
    throughput_qps = len(prompts) / total_time_sec if total_time_sec > 0 else 0.0
    tokens_per_sec = total_tokens / total_time_sec if total_time_sec > 0 else 0.0

    return {
        "avg_latency_ms": avg_latency_ms,
        "avg_ttft_ms": avg_ttft_ms,
        "throughput_qps": throughput_qps,
        "tokens_per_sec": tokens_per_sec,
        "total_generated_tokens": total_tokens,
        "num_runs": len(prompts),
        "total_time": total_time_sec
    }

In [11]:
def main(model_name, prompts):
    
    model, tokenizer = load_model_tokenizer(model_name)
    
    model, tokenizer, accelerator = prep_distributed(model, tokenizer)
    
    os.makedirs("codecarbon_logs", exist_ok=True)
    
    tracker = EmissionsTracker(
        output_dir="codecarbon_logs",
        measure_power_secs=1  # measure power every second
    )
    tracker.start()

    metrics = run_inference(model, tokenizer, accelerator, prompts, max_new_tokens=50)
    
    emissions_data = tracker.stop()
    
    # CodeCarbon returns energy_consumed in kWh
    energy_kwh = emissions_data.energy_consumed if hasattr(emissions_data, "energy_consumed") else 0
    # Convert kWh to Joules: 1 kWh = 3.6e6 J
    energy_joules = energy_kwh * 3.6e6
    tokens_per_joule = (metrics["total_generated_tokens"] / energy_joules) if energy_joules > 0 else 0

    print("=== Benchmarking Results ===")
    print(f"Total Runs: {metrics['num_runs']}")
    print(f"Total Inference Time (sec): {metrics['total_time']:.2f}")
    print(f"Average Latency (ms/sample): {metrics['avg_latency_ms']:.2f}")
    print(f"Average TTFT (ms): {metrics['avg_ttft_ms']:.2f}")
    print(f"Throughput (queries/sec): {metrics['throughput_qps']:.2f}")
    print(f"Throughput (tokens/sec): {metrics['tokens_per_sec']:.2f}")
    print(f"Total Tokens Generated: {metrics['total_generated_tokens']}")
    print(f"Energy Consumed (kWh): {energy_kwh:.6f}")
    print(f"Energy Consumed (Joules): {energy_joules:.2f}")
    print(f"Energy Efficiency (tokens/joule): {tokens_per_joule:.6f}")

# Implement

In [12]:
if __name__ == "__main__":
    torch.cuda.empty_cache()
    
    model_name = "meta-llama/Llama-3.2-1B"

    # "lighteval/pile_helm" subset with "arxiv" configuration.
    ds = load_dataset("lighteval/pile_helm", "arxiv")["test"]
    # For the purpose of this experiment, select 100 samples.
    ds = ds.select(range(100))
    prompts = [sample["text"] for sample in ds]

    main(model_name, prompts)

[codecarbon ERROR @ 19:11:25] Error: Another instance of codecarbon is probably running as we find `/tmp/.codecarbon.lock`. Turn off the other instance to be able to run this one or use `allow_multiple_runs` or delete the file. Exiting.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad

=== Benchmarking Results ===
Total Runs: 100
Total Inference Time (sec): 124.35
Average Latency (ms/sample): 1243.49
Average TTFT (ms): 1062.21
Throughput (queries/sec): 0.80
Throughput (tokens/sec): 6.03
Total Tokens Generated: 750
Energy Consumed (kWh): 0.000000
Energy Consumed (Joules): 0.00
Energy Efficiency (tokens/joule): 0.000000
