In [None]:
import os
from datetime import datetime
import tempfile
import time
import json
import torch
from accelerate import Accelerator
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from codecarbon import EmissionsTracker
from vllm import LLM, SamplingParams
import pandas as pd

# Using HF Accelerate (not vLLM)
NB: this path works for causal (decoder-only) language models.

In [None]:
# helper functions 
 
def load_model_tokenizer(model_name):
    """
    Loads the causal model and its tokenizer, configured for batched generation.

    The tokenizer is adjusted for decoder-only generation:
      * the padding side is set to "left", so that in a padded batch the
        generated tokens directly follow the real prompt tokens;
      * if the checkpoint defines no pad token, the EOS token is reused,
        because tokenising a batch with ``padding=True`` requires one.

    Parameters:
        model_name: Hugging Face model id or local path.

    Returns:
        Tuple ``(model, tokenizer)``. The model is loaded in float16 with
        ``device_map="auto"`` and switched to eval mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Right padding would put pad tokens between the prompt and its continuation.
    tokenizer.padding_side = "left"
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float16, device_map="auto"
    )
    model.eval()

    return model, tokenizer

    
def prep_distributed(model, tokenizer):
    """
    Hands the model and tokenizer to a freshly created Accelerate ``Accelerator``.

    Parameters:
        model: Model object to pass through ``Accelerator.prepare``.
        tokenizer: Tokenizer object to pass through ``Accelerator.prepare``.

    Returns:
        Tuple ``(model, tokenizer, accelerator)`` where the first two entries
        are the objects returned by ``accelerator.prepare``.
    """
    accelerator = Accelerator()
    prepared_model, prepared_tokenizer = accelerator.prepare(model, tokenizer)
    return prepared_model, prepared_tokenizer, accelerator

def _count_generated_tokens(generated_ids, eos_token_id):
    """
    Sums the number of tokens actually generated over all rows of a batch.

    ``generated_ids`` holds only the newly generated part of each sequence
    (prompt columns already removed). A row that stopped early is filled up to
    the batch width after its EOS token, so such a row is counted up to and
    including its first EOS; rows without an EOS count in full.
    """
    total = 0
    for row in generated_ids.tolist():
        if eos_token_id is not None and eos_token_id in row:
            total += row.index(eos_token_id) + 1
        else:
            total += len(row)
    return total


def run_text_gen_inference(model, tokenizer,accelerator,prompts,max_input_tokens=512, max_output_tokens=50, batch_size=8):
    """
    Runs batched greedy generation over the prompts and returns timing metrics.

    Prompts are truncated to ``max_input_tokens`` first, sorted by their
    truncated token count so that batches contain prompts of similar length,
    and then generated batch by batch on ``accelerator.device``.

    Time-to-first-token is approximated by timing one single-token
    ``generate`` call on the shortest prompt before the batches run. Because it
    is the first call, it may also include one-off warm-up cost.

    Parameters:
        model: Object exposing ``generate(input_ids, attention_mask=..., ...)``.
        tokenizer: Callable tokenizer with ``decode`` (and optionally ``eos_token_id``).
        accelerator: Object whose ``device`` attribute is the target device.
        prompts: List of prompt strings.
        max_input_tokens: Maximum number of prompt tokens kept per prompt.
        max_output_tokens: Maximum number of new tokens generated per prompt.
        batch_size: Number of prompts per ``generate`` call (must be >= 1).

    Returns:
        Dictionary with ``avg_latency_ms`` (per batch), ``avg_ttft_ms``,
        ``throughput_qps``, ``tokens_per_sec``, ``total_generated_tokens``,
        ``num_runs`` and ``total_time`` (seconds, generation calls only).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Truncate each prompt once and remember its token count, so sorting does
    # not need another tokenisation pass.
    truncated = []
    for prompt in prompts:
        token_ids = tokenizer(
            prompt, truncation=True, max_length=max_input_tokens, return_tensors="pt"
        ).input_ids[0]
        text = tokenizer.decode(token_ids, skip_special_tokens=True)
        truncated.append((int(token_ids.shape[0]), text))

    # Stable sort by token length (for efficient batching)
    truncated.sort(key=lambda pair: pair[0])
    sorted_prompts = [text for _, text in truncated]

    latencies = []
    ttft_values = []
    total_tokens = 0

    device = accelerator.device
    eos_token_id = getattr(tokenizer, "eos_token_id", None)

    # Measure TTFT (Time-To-First-Token) once, on the first (shortest) prompt
    if sorted_prompts:
        first = tokenizer(
            sorted_prompts[0], return_tensors="pt", truncation=True, max_length=max_input_tokens
        )
        first_ids = first.input_ids.to(device)
        first_mask = first.attention_mask.to(device)
        start_ttft = time.perf_counter()
        _ = model.generate(
            first_ids, attention_mask=first_mask, max_new_tokens=1, do_sample=False
        )
        end_ttft = time.perf_counter()
        ttft_values.append((end_ttft - start_ttft) * 1000.0)

    for offset in range(0, len(sorted_prompts), batch_size):
        batch = sorted_prompts[offset: offset + batch_size]

        # Tokenize batch with truncation
        encoded = tokenizer(
            batch, return_tensors="pt", padding=True, truncation=True, max_length=max_input_tokens
        )
        input_ids = encoded.input_ids.to(device)
        # Without the mask, padded positions are attended to like real tokens.
        attention_mask = encoded.attention_mask.to(device)

        # Generate outputs
        start_time = time.perf_counter()
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_output_tokens,
            do_sample=False,
        )
        end_time = time.perf_counter()
        latencies.append((end_time - start_time) * 1000.0)

        # Count only the real new tokens; rows that hit EOS early are padded
        # to the width of the longest row and must not count that filler.
        prompt_width = input_ids.shape[1]
        total_tokens += _count_generated_tokens(outputs[:, prompt_width:], eos_token_id)

    avg_latency_ms = sum(latencies) / len(latencies) if latencies else 0.0
    avg_ttft_ms = sum(ttft_values) / len(ttft_values) if ttft_values else 0.0
    total_time_sec = sum(latencies) / 1000.0
    throughput_qps = len(sorted_prompts) / total_time_sec if total_time_sec > 0 else 0.0
    tokens_per_sec = total_tokens / total_time_sec if total_time_sec > 0 else 0.0

    return {
        "avg_latency_ms": avg_latency_ms,
        "avg_ttft_ms": avg_ttft_ms,
        "throughput_qps": throughput_qps,
        "tokens_per_sec": tokens_per_sec,
        "total_generated_tokens": total_tokens,
        "num_runs": len(sorted_prompts),
        "total_time": total_time_sec
    }

    
def main(model_name, prompts, max_input_tokens=512, max_output_tokens=50, batch_size=8, task_type="Text Generation"):
    """
    Benchmarks Hugging Face text generation and records energy use with CodeCarbon.

    Loads the model, runs batched inference over ``prompts`` while an
    ``EmissionsTracker`` is active, then writes the combined hardware, timing
    and energy figures to
    ``benchmark_results/<task_type>/<model_name>_<YYYY-MM-DD>.json`` and prints them.

    Parameters:
        model_name: Hugging Face model id or local path.
        prompts: List of prompt strings.
        max_input_tokens: Maximum number of prompt tokens kept per prompt.
        max_output_tokens: Maximum number of new tokens generated per prompt.
        batch_size: Number of prompts per generation call.
        task_type: Name of the results sub-directory for this benchmark.
    """
    model, tokenizer = load_model_tokenizer(model_name)
    model, tokenizer, accelerator = prep_distributed(model, tokenizer)

    os.makedirs("codecarbon_logs", exist_ok=True)

    tracker = EmissionsTracker(
        output_dir="codecarbon_logs",
        measure_power_secs=1  # Measure power every second
    )

    tracker.start()
    try:
        metrics = run_text_gen_inference(model,
                                tokenizer,
                                accelerator,
                                prompts,
                                max_input_tokens=max_input_tokens,
                                max_output_tokens=max_output_tokens,
                                batch_size=batch_size)
    finally:
        # Stop the tracker even if inference fails, so it does not keep sampling.
        tracker.stop()

    _data = tracker.final_emissions_data

    energy_kwh = _data.energy_consumed
    energy_joules = energy_kwh * 3.6e6  # 1 kWh = 3.6e6 Joules
    tokens_per_joule = (metrics["total_generated_tokens"] / energy_joules) if energy_joules > 0 else 0

    cpu_count = _data.cpu_count
    cpu_model = _data.cpu_model
    gpu_count = _data.gpu_count
    gpu_model = _data.gpu_model

    benchmark_results = {
        "experiment_setup": {
            "model": model_name,
            "cpu_count": cpu_count,
            "cpu_model": cpu_model,
            "gpu_count": gpu_count,
            "gpu_model": gpu_model,
            "total_runs": metrics["num_runs"],
        },
        "experiment_results": {
            "total_inference_time_sec": round(metrics["total_time"], 2),
            "average_latency_ms_per_batch": round(metrics["avg_latency_ms"], 2),
            "average_ttft_ms": round(metrics["avg_ttft_ms"], 2),
            "throughput_queries_per_sec": round(metrics["throughput_qps"], 2),
            "throughput_tokens_per_sec": round(metrics["tokens_per_sec"], 2),
            "total_tokens_generated": metrics["total_generated_tokens"],
            "energy_consumed_kwh": round(energy_kwh, 10),
            "energy_consumed_joules": round(energy_joules, 10),
            "energy_efficiency_tokens_per_joule": round(tokens_per_joule, 10),
        }
    }

    today_date = datetime.today().strftime("%Y-%m-%d")
    output_dir = f"benchmark_results/{task_type}"
    output_json_path = f"{output_dir}/{model_name}_{today_date}.json"
    # Model ids such as "org/name" contain a slash, so the file ends up in a
    # sub-directory of output_dir; create the file's actual parent directory.
    os.makedirs(os.path.dirname(output_json_path), exist_ok=True)

    with open(output_json_path, "w") as json_file:
        json.dump(benchmark_results, json_file, indent=4)

    # Print confirmation and the JSON structure
    print("\n=== BENCHMARKING RESULTS ===")
    print(json.dumps(benchmark_results, indent=4))
    print(f"\nResults saved to: {output_json_path}")

    print("\n=== BENCHMARKING RESULTS===")
    print("--- Experiment Set up ---")
    print(f"Model: {model_name}")
    print(f"CPU Count: {cpu_count}")
    print(f"CPU Model: {cpu_model}")
    print(f"GPU Count: {gpu_count}")
    print(f"GPU Model: {gpu_model}")
    print(f"Total Runs: {metrics['num_runs']}")
    print("\n--- Experiment results ---")
    print(f"Total Inference Time (sec): {metrics['total_time']:.2f}")
    print(f"Average Latency (ms/batch): {metrics['avg_latency_ms']:.2f}")
    print(f"Average TTFT (ms): {metrics['avg_ttft_ms']:.2f}")
    print(f"Throughput (queries/sec): {metrics['throughput_qps']:.2f}")
    print(f"Throughput (tokens/sec): {metrics['tokens_per_sec']:.2f}")
    print(f"Total Tokens Generated: {metrics['total_generated_tokens']}")
    print(f"Energy Consumed (kWh): {energy_kwh:.10f}")
    print(f"Energy Consumed (Joules): {energy_joules:.10f}")
    print(f"Energy Efficiency (tokens/joule): {tokens_per_joule:.10f}")


In [33]:
if __name__ == "__main__":
    # Number of test documents to benchmark.
    # CHANGE BACK TO 1024 FOR EXPERIMENT
    sample_count = 5
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

    # Use the first `sample_count` documents of the arXiv test split as prompts
    arxiv_test = load_dataset("lighteval/pile_helm", "arxiv")["test"]
    ds = arxiv_test.select(range(sample_count))
    prompts = [row["text"] for row in ds]

    main(model_name, prompts, max_input_tokens=512, max_output_tokens=50, batch_size=8)
    

[codecarbon INFO @ 20:10:07] [setup] RAM Tracking...
[codecarbon INFO @ 20:10:07] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at \sys\class\powercap\intel-rapl to measure CPU

[codecarbon INFO @ 20:10:08] CPU Model on constant consumption mode: AMD EPYC 7742 64-Core Processor
[codecarbon INFO @ 20:10:08] [setup] GPU Tracking...
[codecarbon INFO @ 20:10:08] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 20:10:08] >>> Tracker's metadata:
[codecarbon INFO @ 20:10:08]   Platform system: Linux-5.15.0-113-generic-x86_64-with-glibc2.31
[codecarbon INFO @ 20:10:08]   Python version: 3.10.14
[codecarbon INFO @ 20:10:08]   CodeCarbon version: 2.8.3
[codecarbon INFO @ 20:10:08]   Available RAM : 503.532 GB
[codecarbon INFO @ 20:10:08]   CPU count: 128
[codecarbon INFO @ 20:10:08]   CPU model: AMD EPYC 7742 64-Core Processor
[codecarbon INFO @ 20:10:08]   GPU count: 4
[codecarbon INFO @ 20:10:08]   GPU model: 4 x NVIDIA A100-PCIE-40GB
[codecarbon INFO @ 20:10:12] 


=== BENCHMARKING RESULTS===
--- Experiment Set up ---
Model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
CPU Count: 128
CPU Model: AMD EPYC 7742 64-Core Processor
GPU Count: 4
GPU Model: 4 x NVIDIA A100-PCIE-40GB
Total Runs: 5

--- Experiment results ---
Total Inference Time (sec): 4.77
Average Latency (ms/batch): 4767.30
Average TTFT (ms): 118.70
Throughput (queries/sec): 1.05
Throughput (tokens/sec): 52.44
Total Tokens Generated: 250
Energy Consumed (kWh): 0.0011436906
Energy Consumed (Joules): 4117.2862626621
Energy Efficiency (tokens/joule): 0.0607196061


# vLLM 
NB: vLLM is used here for text generation with causal (decoder-only) models, e.g. gpt2 in the cell below.

In [16]:
# helper functions 

def load_vllm_model(model_name, tensor_parallel_size=None):
    """
    Loads the vLLM model with optional multi-GPU (tensor-parallel) support.

    No attention backend is configured here; only the model name, the
    tensor-parallel size and the dtype are passed to ``LLM``.

    Parameters:
        model_name: Name of the model.
        tensor_parallel_size: Number of GPUs to use. If None, the number of
            visible CUDA devices is used, with a minimum of 1.

    Returns:
        vLLM model instance.

    Raises:
        ValueError: If an explicit ``tensor_parallel_size`` is smaller than 1.
    """
    if tensor_parallel_size is None:
        # device_count() is 0 when no CUDA device is visible, which is not a
        # valid tensor-parallel size.
        tensor_parallel_size = max(1, torch.cuda.device_count())
    elif tensor_parallel_size < 1:
        raise ValueError("tensor_parallel_size must be a positive integer")

    print(f"Loading vLLM model on {tensor_parallel_size} GPUs with FlashAttention...")

    return LLM(
        model=model_name,
        tensor_parallel_size=tensor_parallel_size,
        dtype="bfloat16",      # Use bfloat16 for better memory efficiency
    )

def run_vllm_inference(model, prompts, max_new_tokens=50, batch_size=8):
    """
    Runs inference using vLLM in batches and returns performance metrics.
    Both tokenisation and output generation are performed within model.generate().
    Prompts are sorted by character length so that each batch holds prompts of
    similar size.

    Time-to-first-token is approximated by timing one single-token
    ``generate`` call on the shortest prompt before the batches run.

    Parameters:
        model: vLLM model instance.
        prompts: List of prompt strings.
        max_new_tokens: Maximum number of new tokens to generate.
        batch_size: Number of samples per batch (must be >= 1).

    Returns:
        Dictionary with ``avg_latency_ms`` (per batch), ``avg_ttft_ms``,
        ``throughput_qps``, ``tokens_per_sec``, ``total_generated_tokens``,
        ``num_runs`` and ``total_time`` (seconds, generation calls only).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Sort prompts by length (for optimal batching)
    sorted_prompts = sorted(prompts, key=len)

    latencies = []
    ttft_values = []
    total_tokens = 0

    sampling_params = SamplingParams(max_tokens=max_new_tokens, temperature=0.7, top_p=0.9)

    # Measure Time-To-First-Token (TTFT) once, on the first (shortest) prompt
    if sorted_prompts:
        start_ttft = time.perf_counter()
        # Generate only one token for TTFT measurement
        _ = model.generate(sorted_prompts[0], SamplingParams(max_tokens=1, temperature=0.7, top_p=0.9))
        end_ttft = time.perf_counter()
        ttft_values.append((end_ttft - start_ttft) * 1000.0)

    # Process prompts in batches
    for offset in range(0, len(sorted_prompts), batch_size):
        batch = sorted_prompts[offset: offset + batch_size]

        start_time = time.perf_counter()
        # model.generate() handles tokenisation internally before generating output tokens
        outputs = model.generate(batch, sampling_params)
        end_time = time.perf_counter()

        latencies.append((end_time - start_time) * 1000.0)  # in milliseconds

        # Count model tokens, not whitespace-separated words: a word is
        # usually split into several tokens, so splitting text under-counts.
        total_tokens += sum(
            len(completion.token_ids)
            for output in outputs
            for completion in output.outputs
        )

    avg_latency_ms = sum(latencies) / len(latencies) if latencies else 0.0
    avg_ttft_ms = sum(ttft_values) / len(ttft_values) if ttft_values else 0.0
    total_time_sec = sum(latencies) / 1000.0  # Convert total latency to seconds
    throughput_qps = len(sorted_prompts) / total_time_sec if total_time_sec > 0 else 0.0
    tokens_per_sec = total_tokens / total_time_sec if total_time_sec > 0 else 0.0

    return {
        "avg_latency_ms": avg_latency_ms,
        "avg_ttft_ms": avg_ttft_ms,
        "throughput_qps": throughput_qps,
        "tokens_per_sec": tokens_per_sec,
        "total_generated_tokens": total_tokens,
        "num_runs": len(sorted_prompts),
        "total_time": total_time_sec
    }
    
    
def main_vllm(model_name, prompts, tensor_parallel_size=None, batch_size=8, max_new_tokens=50):
    """
    Main function to run vLLM inference with energy consumption tracking using CodeCarbon.
    Processes the given prompts in batches (after sorting by length) and prints
    timing, throughput and energy figures for the whole run.

    Parameters:
        model_name: Name of the model.
        prompts: List of prompt strings.
        tensor_parallel_size: Number of GPUs to use.
        batch_size: Batch size for inference.
        max_new_tokens: Maximum number of new tokens generated per prompt.
    """
    model = load_vllm_model(model_name, tensor_parallel_size=tensor_parallel_size)

    os.makedirs("codecarbon_logs", exist_ok=True)

    tracker = EmissionsTracker(
        output_dir="codecarbon_logs",
        measure_power_secs=1  # Measure power every second
    )
    tracker.start()
    try:
        metrics = run_vllm_inference(
            model, prompts, max_new_tokens=max_new_tokens, batch_size=batch_size
        )
    finally:
        # Stop the tracker even if inference fails, so it does not keep sampling.
        tracker.stop()

    # stop() returns the emissions figure itself; the energy reading lives on
    # the tracker's final data record. Fall back to 0 if it is unavailable.
    emissions_data = getattr(tracker, "final_emissions_data", None)
    energy_kwh = getattr(emissions_data, "energy_consumed", 0) or 0
    # Convert kWh to Joules: 1 kWh = 3.6e6 J
    energy_joules = energy_kwh * 3.6e6
    tokens_per_joule = (metrics["total_generated_tokens"] / energy_joules) if energy_joules > 0 else 0

    print("=== Benchmarking Results ===")
    print(f"Total Runs: {metrics['num_runs']}")
    print(f"Total Inference Time (sec): {metrics['total_time']:.2f}")
    print(f"Average Latency (ms/batch): {metrics['avg_latency_ms']:.2f}")
    print(f"Average TTFT (ms): {metrics['avg_ttft_ms']:.2f}")
    print(f"Throughput (queries/sec): {metrics['throughput_qps']:.2f}")
    print(f"Throughput (tokens/sec): {metrics['tokens_per_sec']:.2f}")
    print(f"Total Tokens Generated: {metrics['total_generated_tokens']}")
    print(f"Energy Consumed (kWh): {energy_kwh:.6f}")
    print(f"Energy Consumed (Joules): {energy_joules:.2f}")
    print(f"Energy Efficiency (tokens/joule): {tokens_per_joule:.6f}")

In [18]:
if __name__ == "__main__":
    # Number of test documents to benchmark.
    # LATER CHANGE: use 1024 samples for the experiment
    sample_count = 5
    model_name = "gpt2"

    # Prompts come from the "arxiv" configuration of "lighteval/pile_helm" (test split).
    arxiv_test = load_dataset("lighteval/pile_helm", "arxiv")["test"]
    ds = arxiv_test.select(range(sample_count))
    prompts = [row["text"] for row in ds]

    # Benchmark on two GPUs with a batch size of 8.
    main_vllm(model_name, prompts, tensor_parallel_size=2, batch_size=8)

Loading vLLM model on 2 GPUs with FlashAttention...


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

INFO 03-03 18:14:45 config.py:2444] Downcasting torch.float32 to torch.bfloat16.
INFO 03-03 18:14:52 config.py:549] This model supports multiple tasks: {'generate', 'score', 'reward', 'classify', 'embed'}. Defaulting to 'generate'.
INFO 03-03 18:14:52 config.py:1382] Defaulting to use mp for distributed inference
INFO 03-03 18:14:52 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='gpt2', speculative_config=None, tokenizer='gpt2', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=1024, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forw

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

INFO 03-03 18:14:59 __init__.py:207] Automatically detected platform cuda.
[1;36m(VllmWorkerProcess pid=2551505)[0;0m INFO 03-03 18:14:59 multiproc_worker_utils.py:229] Worker ready; awaiting tasks
[1;36m(VllmWorkerProcess pid=2551505)[0;0m INFO 03-03 18:15:00 cuda.py:229] Using Flash Attention backend.


[W303 18:18:33.252198404 TCPStore.cpp:358] [c10d] TCP client failed to connect/validate to host 10.1.23.20:48853 - retrying (try=0, timeout=600000ms, delay=73411ms): Interrupted system call
Exception raised from delay at ../torch/csrc/distributed/c10d/socket.cpp:117 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x7f0c9a3f3446 in /home/228755@hertie-school.lan/thesis/thesis/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x15e1205 (0x7f0cd0a2a205 in /home/228755@hertie-school.lan/thesis/thesis/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #2: <unknown function> + 0x6029f36 (0x7f0cd5472f36 in /home/228755@hertie-school.lan/thesis/thesis/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #3: <unknown function> + 0x602a3a4 (0x7f0cd54733a4 in /home/228755@hertie-school.lan/thesis/thesis/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so)
frame #4: <unknown function> + 0x5fe8016 

KeyboardInterrupt: 

INFO 03-03 18:18:34 multiproc_worker_utils.py:128] Killing local vLLM worker processes
