From 047091b7b373de63fd344c62d6ad7d339047f344 Mon Sep 17 00:00:00 2001
From: Major Hayden
Date: Fri, 8 May 2026 07:34:31 -0500
Subject: [PATCH] RSPEED-3017: use custom buckets for response duration histogram

The response_duration_seconds histogram used prometheus_client default
buckets which max out at 10s, causing histogram_quantile in Grafana to
appear capped for requests exceeding 10 seconds. Reuse the existing
LLM_INFERENCE_DURATION_BUCKETS (0.1-120s) to cover the full expected
response time range.

Signed-off-by: Major Hayden
---
 src/metrics/__init__.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/metrics/__init__.py b/src/metrics/__init__.py
index 25a11a194..63d7b8e29 100644
--- a/src/metrics/__init__.py
+++ b/src/metrics/__init__.py
@@ -32,7 +32,10 @@
 # Histogram to measure response durations
 # This will be used to track how long it takes to handle requests
 response_duration_seconds = Histogram(
-    "ls_response_duration_seconds", "Response durations", ["path"]
+    "ls_response_duration_seconds",
+    "Response durations",
+    ["path"],
+    buckets=LLM_INFERENCE_DURATION_BUCKETS,
 )
 
 # Metric that indicates what provider + model customers are using so we can