## Prometheus Metrics
- Reference: https://nvidia.github.io/TensorRT-LLM/examples/prometheus_metrics.html

- Script adapted from the referenced TensorRT-LLM example to run smoke-test measurements against a vLLM server via its Prometheus metrics endpoint

```
podman ps
CONTAINER ID  IMAGE                              COMMAND               CREATED        STATUS        PORTS       NAMES
2c0e74b44b1e  docker.io/vllm/vllm-openai:latest  -lc vllm serve /m...  7 minutes ago  Up 7 minutes              brave_wozniak
```

In [8]:
%%bash
# Query the /v1/models endpoint of the server on port 8002 and pretty-print the JSON reply with jq.
curl -s http://localhost:8002/v1/models | jq

{
  "object": "list",
  "data": [
    {
      "id": "/mnt/elita/soundwave/models/llama3-70b-awq",
      "object": "model",
      "created": 1772321624,
      "owned_by": "vllm",
      "root": "/mnt/elita/soundwave/models/llama3-70b-awq",
      "parent": null,
      "max_model_len": 8192,
      "permission": [
        {
          "id": "modelperm-b988452af40a0099",
          "object": "model_permission",
          "created": 1772321624,
          "allow_create_engine": false,
          "allow_sampling": true,
          "allow_logprobs": true,
          "allow_search_indices": false,
          "allow_view": true,
          "allow_fine_tuning": false,
          "organization": "*",
          "group": null,
          "is_blocking": false
        }
      ]
    }
  ]
}


In [9]:
# !curl -s http://localhost:8002/metrics | grep -E '^(vllm|trtllm|# HELP|# TYPE)' | head -n 60

In [10]:
from urllib.request import urlopen

from openai import OpenAI

# OpenAI client pointed at the local server's /v1 API on port 8002.
# NOTE(review): the api_key value looks like a placeholder — presumably the
# local server does not validate it; confirm if authentication is enabled.
client = OpenAI(
    base_url="http://localhost:8002/v1",
    api_key="tensorrt_llm",
)

# Prefix prepended to every metric name looked up in the scraped text.
# The recorded output in this notebook shows metrics exposed as "vllm:...".
METRIC_PREFIX = "vllm:"

# Full URL of the metrics endpoint scraped by fetch_metrics()
# (same host and port as the API above).
METRICS_URL = "http://localhost:8002/metrics"



In [11]:
def fetch_metrics() -> dict | None:
    """Fetch metrics from the Prometheus endpoint."""
    try:
        response = urlopen(METRICS_URL)
        if response.status == 200:
            return response.read().decode("utf-8")
        else:
            print(f"Error fetching metrics: HTTP {response.status}")
            return None
    except Exception as e:
        print(f"Error fetching metrics: {e}")
        return None

In [12]:
def parse_and_display_metrics(metrics_data: str) -> None:
    """Print the samples of selected metrics found in a metrics scrape.

    Metric names are built from ``METRIC_PREFIX``. A metric counts as
    available only when at least one sample line (a non-comment line that
    starts with the metric name) is present; a metric mentioned solely in
    ``# HELP`` / ``# TYPE`` comment lines is reported as not yet available.
    Lines are matched by name prefix, so suffixed series such as
    ``<name>_bucket`` or ``<name>_sum`` are listed under ``<name>``.

    Args:
        metrics_data: Metrics endpoint response body as text. Empty input
            prints nothing.
    """
    if not metrics_data:
        return

    print("\n" + "=" * 80)
    print("TensorRT-LLM Prometheus Metrics")
    print("=" * 80)

    # Define metrics to display with descriptions
    metrics_of_interest = {
        f"{METRIC_PREFIX}request_success_total": "Total successful requests",
        f"{METRIC_PREFIX}e2e_request_latency_seconds": "End-to-end request latency",
        f"{METRIC_PREFIX}time_to_first_token_seconds": "Time to first token",
        f"{METRIC_PREFIX}request_queue_time_seconds": "Request queue time",
        f"{METRIC_PREFIX}kv_cache_hit_rate": "KV cache hit rate",
        f"{METRIC_PREFIX}kv_cache_utilization": "KV cache utilization",
        f"{METRIC_PREFIX}prefix_cache_queries_total": "Prefix cache queries (tokens)",
        f"{METRIC_PREFIX}prefix_cache_hits_total": "Prefix cache hits (cached tokens)",
        f"{METRIC_PREFIX}external_prefix_cache_queries_total": "External prefix cache queries",
        f"{METRIC_PREFIX}external_prefix_cache_hits_total": "External prefix cache hits",
    }

    # Split the scrape once and drop comment lines up front, instead of
    # re-splitting the whole text for every metric of interest.
    sample_lines = [
        line for line in metrics_data.split("\n") if line and not line.startswith("#")
    ]

    found_metrics = []
    missing_metrics = []

    for metric_name, description in metrics_of_interest.items():
        samples = [line for line in sample_lines if line.startswith(metric_name)]
        # Availability is decided by real samples, not by a substring match
        # anywhere in the text (which would also hit HELP/TYPE comments).
        if samples:
            found_metrics.append((metric_name, description, samples))
        else:
            missing_metrics.append((metric_name, description))

    # Display found metrics
    if found_metrics:
        print("\n✓ Available Metrics:")
        print("-" * 80)
        for metric_name, description, samples in found_metrics:
            print(f"\n{description} ({metric_name}):")
            for line in samples:
                print(f"  {line}")

    # Display missing metrics
    if missing_metrics:
        print("\n✗ Not Yet Available:")
        print("-" * 80)
        for metric_name, description in missing_metrics:
            print(f"  {description} ({metric_name})")

    print("\n" + "=" * 80)

In [13]:
def main(num_requests: int = 10, max_tokens: int = 1000) -> None:
    """Send completion requests and print the server metrics after each one.

    The first model returned by ``client.models.list()`` is used. After every
    completed request the metrics are fetched with ``fetch_metrics()`` and
    shown with ``parse_and_display_metrics()``. A failure on one request is
    printed and the loop continues with the next request.

    Args:
        num_requests: Number of completion requests to send.
        max_tokens: ``max_tokens`` value passed to each completion request.
    """
    print("Prometheus Metrics Example")
    print("=" * 80)
    print("This script will:")
    print("1. Send several completion requests to a running TensorRT-LLM server")
    # Report the endpoint that is actually scraped rather than a fixed path.
    print(
        f"2. After each response, fetch and display Prometheus metrics from {METRICS_URL}"
    )
    print()

    # Auto-detect first available model from the OpenAI-compatible /v1/models endpoint
    available_models = client.models.list().data
    if not available_models:
        # Without this guard an empty model list surfaces as a bare IndexError.
        print("No models are available on the server; nothing to do.")
        return
    model_id = available_models[0].id

    # Make several completion requests to generate metrics
    print("Sending completion requests...")
    for i in range(num_requests):
        try:
            response = client.completions.create(
                model=model_id,
                prompt=(
                    f"Hello, this is request {i + 1}. "
                    "Use your greatest imagination in this request. Tell me a lot about"
                ),
                max_tokens=max_tokens,
                stream=False,
            )
            print(
                f"  Request {i + 1}/{num_requests} completed. Response: {response.choices[0].text[:50]}..."
            )

            # Fetch and display metrics after each response
            print(f"\n  Fetching metrics after request {i + 1}...")
            metrics_data = fetch_metrics()
            if metrics_data:
                parse_and_display_metrics(metrics_data)
            else:
                print("  ✗ Failed to fetch metrics")
            print()
        except Exception as e:
            # Keep going: one failed request should not end the smoke test.
            print(f"  Error on request {i + 1}: {e}")
    print("All requests completed.")

In [7]:
# Entry point: run the smoke test when this cell/script executes as __main__.
if __name__ == "__main__":
    main()

Prometheus Metrics Example
This script will:
1. Send several completion requests to a running TensorRT-LLM server
2. After each response, fetch and display Prometheus metrics from the /prometheus/metrics endpoint

Sending completion requests...
  Request 1/10 completed. Response:  the town and its people.
The town I want you to d...

  Fetching metrics after request 1...

TensorRT-LLM Prometheus Metrics

✓ Available Metrics:
--------------------------------------------------------------------------------

Total successful requests (vllm:request_success_total):
  vllm:request_success_total{engine="0",finished_reason="stop",model_name="/mnt/elita/soundwave/models/llama3-70b-awq"} 38.0
  vllm:request_success_total{engine="0",finished_reason="length",model_name="/mnt/elita/soundwave/models/llama3-70b-awq"} 3.0
  vllm:request_success_total{engine="0",finished_reason="abort",model_name="/mnt/elita/soundwave/models/llama3-70b-awq"} 0.0
  vllm:request_success_total{engine="0",finished_reason="

#### TP2

```
rteixeira@elita:/mnt/elita/soundwave/playground$ nvidia-smi
Sun Mar  1 00:21:17 2026
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.119.02             Driver Version: 580.119.02     CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA L40S                    On  |   00000000:4A:00.0 Off |                    0 |
| N/A   40C    P0             82W /  350W |   43147MiB /  46068MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA L40S                    On  |   00000000:61:00.0 Off |                    0 |
| N/A   40C    P0             86W /  350W |   43147MiB /  46068MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   2  NVIDIA L40S                    On  |   00000000:CA:00.0 Off |                    0 |
| N/A   31C    P8             32W /  350W |       0MiB /  46068MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   3  NVIDIA L40S                    On  |   00000000:E1:00.0 Off |                    0 |
| N/A   30C    P8             32W /  350W |       0MiB /  46068MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|    0   N/A  N/A          420305      C   VLLM::Worker_TP0                      43138MiB |
|    1   N/A  N/A          420306      C   VLLM::Worker_TP1                      43138MiB |
+-----------------------------------------------------------------------------------------+
```

Output of `nvidia-smi topo -m`:
```bash
	GPU0	GPU1	GPU2	GPU3	NIC0	NIC1	CPU Affinity	NUMA Affinity	GPU NUMA ID
GPU0	 X 	NODE	SYS	SYS	SYS	SYS	0,2,4,6,8,10	0		N/A
GPU1	NODE	 X 	SYS	SYS	SYS	SYS	0,2,4,6,8,10	0		N/A
GPU2	SYS	SYS	 X 	NODE	NODE	NODE	1,3,5,7,9,11	1		N/A
GPU3	SYS	SYS	NODE	 X 	NODE	NODE	1,3,5,7,9,11	1		N/A
NIC0	SYS	SYS	NODE	NODE	 X 	PIX
NIC1	SYS	SYS	NODE	NODE	PIX	 X

Legend:

  X    = Self
  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
  PIX  = Connection traversing at most a single PCIe bridge
  NV#  = Connection traversing a bonded set of # NVLinks

NIC Legend:

  NIC0: mlx5_0
  NIC1: mlx5_1
```

### On container:

```bash
 podman run --rm -it --net=host   --hooks-dir=/usr/share/containers/oci/hooks.d   --security-opt=label=disable   --device nvidia.com/gpu=0   --device nvidia.com/gpu=1   -e HF_HOME=/mnt/elita/soundwave/hf_cache   -e VLLM_CACHE_DIR=/mnt/elita/soundwave/vllm_cache   -v /mnt/elita/soundwave:/mnt/elita/soundwave   --entrypoint /bin/bash   docker.io/vllm/vllm-openai:latest -lc   'vllm serve /mnt/elita/soundwave/models/llama3-70b-awq \
     --port 8002 \
     --tensor-parallel-size 2 \
     2>&1 | tee /mnt/elita/soundwave/logs/vllm_tp2_default_DEBUG_8002.log'
```

```
<...>

(APIServer pid=53) INFO:     127.0.0.1:48568 - "GET /v1/models HTTP/1.1" 200 OK
(APIServer pid=53) INFO:     127.0.0.1:58256 - "GET /v1/models HTTP/1.1" 200 OK
(APIServer pid=53) INFO 02-28 23:30:16 [loggers.py:259] Engine 000: Avg prompt throughput: 0.5 tokens/s, Avg generation throughput: 16.7 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.1%, Prefix cache hit rate: 57.6%
(APIServer pid=53) INFO:     127.0.0.1:58256 - "POST /v1/completions HTTP/1.1" 200 OK
(APIServer pid=53) INFO:     127.0.0.1:43354 - "GET /metrics HTTP/1.1" 200 OK
(APIServer pid=53) INFO 02-28 23:30:26 [loggers.py:259] Engine 000: Avg prompt throughput: 0.5 tokens/s, Avg generation throughput: 34.0 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.2%, Prefix cache hit rate: 58.0%
(APIServer pid=53) INFO 02-28 23:30:36 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 34.2 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.5%, Prefix cache hit rate: 58.0%
(APIServer pid=53) INFO 02-28 23:30:46 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 34.2 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.7%, Prefix cache hit rate: 58.0%
(APIServer pid=53) INFO:     127.0.0.1:58256 - "POST /v1/completions HTTP/1.1" 200 OK
(APIServer pid=53) INFO:     127.0.0.1:43962 - "GET /metrics HTTP/1.1" 200 OK
(APIServer pid=53) INFO 02-28 23:30:56 [loggers.py:259] Engine 000: Avg prompt throughput: 0.5 tokens/s, Avg generation throughput: 34.1 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.2%, Prefix cache hit rate: 58.5%
(APIServer pid=53) INFO 02-28 23:31:06 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 34.2 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.5%, Prefix cache hit rate: 58.5%
(APIServer pid=53) INFO:     127.0.0.1:58256 - "POST /v1/completions HTTP/1.1" 200 OK
(APIServer pid=53) INFO:     127.0.0.1:41892 - "GET /metrics HTTP/1.1" 200 OK
(APIServer pid=53) INFO 02-28 23:31:16 [loggers.py:259] Engine 000: Avg prompt throughput: 0.5 tokens/s, Avg generation throughput: 34.1 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.1%, Prefix cache hit rate: 58.9%
(APIServer pid=53) INFO 02-28 23:31:26 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 34.2 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.4%, Prefix cache hit rate: 58.9%
(APIServer pid=53) INFO:     127.0.0.1:58256 - "POST /v1/completions HTTP/1.1" 200 OK
(APIServer pid=53) INFO:     127.0.0.1:37676 - "GET /metrics HTTP/1.1" 200 OK
(APIServer pid=53) INFO 02-28 23:31:36 [loggers.py:259] Engine 000: Avg prompt throughput: 0.5 tokens/s, Avg generation throughput: 34.1 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.1%, Prefix cache hit rate: 59.3%
(APIServer pid=53) INFO:     127.0.0.1:58256 - "POST /v1/completions HTTP/1.1" 200 OK
(APIServer pid=53) INFO:     127.0.0.1:37950 - "GET /metrics HTTP/1.1" 200 OK
(APIServer pid=53) INFO 02-28 23:31:46 [loggers.py:259] Engine 000: Avg prompt throughput: 0.5 tokens/s, Avg generation throughput: 34.1 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.1%, Prefix cache hit rate: 59.6%
(APIServer pid=53) INFO 02-28 23:31:56 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 34.2 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.4%, Prefix cache hit rate: 59.6%
(APIServer pid=53) INFO 02-28 23:32:06 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 34.2 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.7%, Prefix cache hit rate: 59.6%
(APIServer pid=53) INFO:     127.0.0.1:58256 - "POST /v1/completions HTTP/1.1" 200 OK
(APIServer pid=53) INFO:     127.0.0.1:47830 - "GET /metrics HTTP/1.1" 200 OK
(APIServer pid=53) INFO 02-28 23:32:16 [loggers.py:259] Engine 000: Avg prompt throughput: 0.5 tokens/s, Avg generation throughput: 34.1 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.2%, Prefix cache hit rate: 60.0%
(APIServer pid=53) INFO 02-28 23:32:26 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 34.2 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.4%, Prefix cache hit rate: 60.0%
(APIServer pid=53) INFO 02-28 23:32:36 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 34.2 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.7%, Prefix cache hit rate: 60.0%
(APIServer pid=53) INFO:     127.0.0.1:58256 - "POST /v1/completions HTTP/1.1" 200 OK
(APIServer pid=53) INFO:     127.0.0.1:37100 - "GET /metrics HTTP/1.1" 200 OK
(APIServer pid=53) INFO 02-28 23:32:46 [loggers.py:259] Engine 000: Avg prompt throughput: 0.5 tokens/s, Avg generation throughput: 34.1 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.2%, Prefix cache hit rate: 60.3%
(APIServer pid=53) INFO 02-28 23:32:56 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 34.2 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.5%, Prefix cache hit rate: 60.3%
(APIServer pid=53) INFO:     127.0.0.1:58256 - "POST /v1/completions HTTP/1.1" 200 OK
(APIServer pid=53) INFO:     127.0.0.1:40262 - "GET /metrics HTTP/1.1" 200 OK
(APIServer pid=53) INFO 02-28 23:33:06 [loggers.py:259] Engine 000: Avg prompt throughput: 0.5 tokens/s, Avg generation throughput: 34.1 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.2%, Prefix cache hit rate: 60.6%
(APIServer pid=53) INFO 02-28 23:33:16 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 34.2 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.5%, Prefix cache hit rate: 60.6%
(APIServer pid=53) INFO:     127.0.0.1:58256 - "POST /v1/completions HTTP/1.1" 200 OK
(APIServer pid=53) INFO:     127.0.0.1:47200 - "GET /metrics HTTP/1.1" 200 OK
(APIServer pid=53) INFO 02-28 23:33:26 [loggers.py:259] Engine 000: Avg prompt throughput: 0.5 tokens/s, Avg generation throughput: 34.1 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.1%, Prefix cache hit rate: 61.0%
(APIServer pid=53) INFO 02-28 23:33:36 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 34.2 tokens/s, Running: 1 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.4%, Prefix cache hit rate: 61.0%
(APIServer pid=53) INFO:     127.0.0.1:58256 - "POST /v1/completions HTTP/1.1" 200 OK
(APIServer pid=53) INFO:     127.0.0.1:42468 - "GET /metrics HTTP/1.1" 200 OK
(APIServer pid=53) INFO:     127.0.0.1:42476 - "GET /v1/models HTTP/1.1" 200 OK
(APIServer pid=53) INFO 02-28 23:33:46 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 26.0 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 61.0%
(APIServer pid=53) INFO 02-28 23:33:56 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 61.0%
```