From 9207da7bbaf0326302765ee1d51dea76c3b00bf1 Mon Sep 17 00:00:00 2001 From: machadovilaca Date: Tue, 9 Apr 2024 18:18:48 +0100 Subject: [PATCH] Rename rest client metrics to include kubevirt prefix Signed-off-by: machadovilaca --- docs/metrics.md | 18 +++++------ hack/prom-rule-ci/prom-rules-tests.yaml | 32 +++++++++---------- .../metrics/common/client/rest_metrics.go | 6 ++-- pkg/monitoring/rules/alerts/alerts.go | 2 +- tests/monitoring/metrics.go | 6 ---- .../metric-client/metric-client.go | 2 +- .../metrics_collector.go | 5 +-- 7 files changed, 31 insertions(+), 40 deletions(-) diff --git a/docs/metrics.md b/docs/metrics.md index daced8eedcb3..701a3e87de91 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -33,6 +33,15 @@ The number of VMs in the cluster by namespace. Type: Gauge. ### kubevirt_portforward_active_tunnels Amount of active portforward tunnels, broken down by namespace and vmi name. Type: Gauge. +### kubevirt_rest_client_rate_limiter_duration_seconds +Client side rate limiter latency in seconds. Broken down by verb and URL. Type: Histogram. + +### kubevirt_rest_client_request_latency_seconds +Request latency in seconds. Broken down by verb and URL. Type: Histogram. + +### kubevirt_rest_client_requests_total +Number of HTTP requests, partitioned by status code, method, and host. Type: Counter. + ### kubevirt_usbredir_active_connections Amount of active USB redirection connections, broken down by namespace and vmi name. Type: Gauge. @@ -270,15 +279,6 @@ Returns the labels of the persistent volume claims that are used for restoring v ### kubevirt_vnc_active_connections Amount of active VNC connections, broken down by namespace and vmi name. Type: Gauge. -### rest_client_rate_limiter_duration_seconds -Client side rate limiter latency in seconds. Broken down by verb and URL. Type: Histogram. - -### rest_client_request_latency_seconds -Request latency in seconds. Broken down by verb and URL. Type: Histogram. - -### rest_client_requests_total -Number of HTTP requests, partitioned by status code, method, and host. Type: Counter. - ## Developing new metrics After developing new metrics or changing old ones, please run `make generate` to regenerate this document. diff --git a/hack/prom-rule-ci/prom-rules-tests.yaml b/hack/prom-rule-ci/prom-rules-tests.yaml index 31f49fcdd3ca..b282d87c44f8 100644 --- a/hack/prom-rule-ci/prom-rules-tests.yaml +++ b/hack/prom-rule-ci/prom-rules-tests.yaml @@ -317,21 +317,21 @@ tests: # values : `0+100x15 0+100x5` the same way because prometheus counters might reset - interval: 1m input_series: - - series: 'rest_client_requests_total{namespace="ci", pod="virt-controller-1", code="200"}' + - series: 'kubevirt_rest_client_requests_total{namespace="ci", pod="virt-controller-1", code="200"}' values: '0+10x20' - - series: 'rest_client_requests_total{namespace="ci", pod="virt-controller-1", code="400"}' + - series: 'kubevirt_rest_client_requests_total{namespace="ci", pod="virt-controller-1", code="400"}' values: '0+100x15 0+100x5' - - series: 'rest_client_requests_total{namespace="ci", pod="virt-operator-1", code="200"}' + - series: 'kubevirt_rest_client_requests_total{namespace="ci", pod="virt-operator-1", code="200"}' values: '0+10x20' - - series: 'rest_client_requests_total{namespace="ci", pod="virt-operator-1", code="400"}' + - series: 'kubevirt_rest_client_requests_total{namespace="ci", pod="virt-operator-1", code="400"}' values: '0+100x15 0+100x5' - - series: 'rest_client_requests_total{namespace="ci", pod="virt-handler-1", code="200"}' + - series: 'kubevirt_rest_client_requests_total{namespace="ci", pod="virt-handler-1", code="200"}' values: '0+10x20' - - series: 'rest_client_requests_total{namespace="ci", pod="virt-handler-1", code="500"}' + - series: 'kubevirt_rest_client_requests_total{namespace="ci", pod="virt-handler-1", code="500"}' values: '0+100x15 0+100x5' - - series: 'rest_client_requests_total{namespace="ci", pod="virt-api-1", code="200"}' + - series: 'kubevirt_rest_client_requests_total{namespace="ci", pod="virt-api-1", code="200"}' values: '0+10x20' - - series: 'rest_client_requests_total{namespace="ci", pod="virt-api-1", code="500"}' + - series: 'kubevirt_rest_client_requests_total{namespace="ci", pod="virt-api-1", code="500"}' values: '0+100x15 0+100x5' alert_rule_test: @@ -390,21 +390,21 @@ tests: # values : '0+5x90 0+5x10' the same way because prometheus counters might reset - interval: 1m input_series: - - series: 'rest_client_requests_total{namespace="ci", pod="virt-controller-1", code="200"}' + - series: 'kubevirt_rest_client_requests_total{namespace="ci", pod="virt-controller-1", code="200"}' values: '0+10x100' - - series: 'rest_client_requests_total{namespace="ci", pod="virt-controller-1", code="400"}' + - series: 'kubevirt_rest_client_requests_total{namespace="ci", pod="virt-controller-1", code="400"}' values: '0+5x90 0+5x10' - - series: 'rest_client_requests_total{namespace="ci", pod="virt-operator-1", code="200"}' + - series: 'kubevirt_rest_client_requests_total{namespace="ci", pod="virt-operator-1", code="200"}' values: '0+10x100' - - series: 'rest_client_requests_total{namespace="ci", pod="virt-operator-1", code="400"}' + - series: 'kubevirt_rest_client_requests_total{namespace="ci", pod="virt-operator-1", code="400"}' values: '0+5x90 0+5x10' - - series: 'rest_client_requests_total{namespace="ci", pod="virt-handler-1", code="200"}' + - series: 'kubevirt_rest_client_requests_total{namespace="ci", pod="virt-handler-1", code="200"}' values: '0+10x100' - - series: 'rest_client_requests_total{namespace="ci", pod="virt-handler-1", code="500"}' + - series: 'kubevirt_rest_client_requests_total{namespace="ci", pod="virt-handler-1", code="500"}' values: '0+5x90 0+5x10' - - series: 'rest_client_requests_total{namespace="ci", pod="virt-api-1", code="200"}' + - series: 'kubevirt_rest_client_requests_total{namespace="ci", pod="virt-api-1", code="200"}' values: '0+10x100' - - series: 'rest_client_requests_total{namespace="ci", pod="virt-api-1", code="500"}' + - series: 'kubevirt_rest_client_requests_total{namespace="ci", pod="virt-api-1", code="500"}' values: '0+5x90 0+5x10' alert_rule_test: diff --git a/pkg/monitoring/metrics/common/client/rest_metrics.go b/pkg/monitoring/metrics/common/client/rest_metrics.go index 4e7310f11080..a6edb478401f 100644 --- a/pkg/monitoring/metrics/common/client/rest_metrics.go +++ b/pkg/monitoring/metrics/common/client/rest_metrics.go @@ -34,7 +34,7 @@ var ( // "verb" and "url" labels. It is used for the rest client latency metrics. requestLatency = operatormetrics.NewHistogramVec( operatormetrics.MetricOpts{ - Name: "rest_client_request_latency_seconds", + Name: "kubevirt_rest_client_request_latency_seconds", Help: "Request latency in seconds. Broken down by verb and URL.", }, prometheus.HistogramOpts{ @@ -50,7 +50,7 @@ var ( rateLimiterLatency = operatormetrics.NewHistogramVec( operatormetrics.MetricOpts{ - Name: "rest_client_rate_limiter_duration_seconds", + Name: "kubevirt_rest_client_rate_limiter_duration_seconds", Help: "Client side rate limiter latency in seconds. Broken down by verb and URL.", }, prometheus.HistogramOpts{ @@ -61,7 +61,7 @@ var ( requestResult = operatormetrics.NewCounterVec( operatormetrics.MetricOpts{ - Name: "rest_client_requests_total", + Name: "kubevirt_rest_client_requests_total", Help: "Number of HTTP requests, partitioned by status code, method, and host.", }, []string{"code", "method", "host", "resource", "verb"}, diff --git a/pkg/monitoring/rules/alerts/alerts.go b/pkg/monitoring/rules/alerts/alerts.go index bd7ba1e7dabc..a67d45f0c27b 100644 --- a/pkg/monitoring/rules/alerts/alerts.go +++ b/pkg/monitoring/rules/alerts/alerts.go @@ -79,7 +79,7 @@ func getRunbookURLTemplate() string { } func getErrorRatio(ns string, podName string, errorCodeRegex string, durationInMinutes int) string { - errorRatioQuery := "sum ( rate ( rest_client_requests_total{namespace=\"%s\",pod=~\"%s-.*\",code=~\"%s\"} [%dm] ) ) / sum ( rate ( rest_client_requests_total{namespace=\"%s\",pod=~\"%s-.*\"} [%dm] ) )" + errorRatioQuery := "sum ( rate ( kubevirt_rest_client_requests_total{namespace=\"%s\",pod=~\"%s-.*\",code=~\"%s\"} [%dm] ) ) / sum ( rate ( kubevirt_rest_client_requests_total{namespace=\"%s\",pod=~\"%s-.*\"} [%dm] ) )" return fmt.Sprintf(errorRatioQuery, ns, podName, errorCodeRegex, durationInMinutes, ns, podName, durationInMinutes) } diff --git a/tests/monitoring/metrics.go b/tests/monitoring/metrics.go index 47e9bdcbf9ca..75280ea3324c 100644 --- a/tests/monitoring/metrics.go +++ b/tests/monitoring/metrics.go @@ -74,12 +74,6 @@ var _ = Describe("[sig-monitoring]Metrics", decorators.SigMonitoring, func() { "kubevirt_vmi_migrations_in_running_phase": true, "kubevirt_vmi_migration_succeeded": true, "kubevirt_vmi_migration_failed": true, - - // name do not follow the convention to be prefixed with 'kubevirt_' - // TODO: @machadovilaca - refactor the metric names - "rest_client_request_latency_seconds": true, - "rest_client_rate_limiter_duration_seconds": true, - "rest_client_requests_total": true, } It("should contain virt components metrics", func() { diff --git a/tools/perfscale-audit/metric-client/metric-client.go b/tools/perfscale-audit/metric-client/metric-client.go index a6d6abc44c22..3cc54bdbc1a7 100644 --- a/tools/perfscale-audit/metric-client/metric-client.go +++ b/tools/perfscale-audit/metric-client/metric-client.go @@ -42,7 +42,7 @@ const ( vmiCreationTimePercentileQuery = `histogram_quantile(0.%d, rate(kubevirt_vmi_phase_transition_time_from_creation_seconds_bucket{phase="Running"}[%ds] offset %ds))` vmiDeletionToSucceededTimePercentileQuery = `histogram_quantile(0.%d, rate(kubevirt_vmi_phase_transition_time_from_deletion_seconds_bucket{phase="Succeeded"}[%ds] offset %ds))` vmiDeletionToFailedTimePercentileQuery = `histogram_quantile(0.%d, rate(kubevirt_vmi_phase_transition_time_from_deletion_seconds_bucket{phase="Failed"}[%ds] offset %ds))` - resourceRequestCountsByOperation = `increase(rest_client_requests_total{pod=~"virt-controller.*|virt-handler.*|virt-operator.*|virt-api.*"}[%ds] offset %ds)` + resourceRequestCountsByOperation = `increase(kubevirt_rest_client_requests_total{pod=~"virt-controller.*|virt-handler.*|virt-operator.*|virt-api.*"}[%ds] offset %ds)` ) // Gauge - Using a Gauge doesn't require using an offset because it holds the accurate count diff --git a/tools/prom-metrics-collector/metrics_collector.go b/tools/prom-metrics-collector/metrics_collector.go index 7d57021646d1..086c668b30db 100644 --- a/tools/prom-metrics-collector/metrics_collector.go +++ b/tools/prom-metrics-collector/metrics_collector.go @@ -33,10 +33,7 @@ import ( // https://sdk.operatorframework.io/docs/best-practices/observability-best-practices/#metrics-guidelines // should be ignored. var excludedMetrics = map[string]struct{}{ - "kubevirt_vmi_phase_count": {}, - "rest_client_rate_limiter_duration_seconds": {}, - "rest_client_request_latency_seconds": {}, - "rest_client_requests_total": {}, + "kubevirt_vmi_phase_count": {}, } // Extract the name, help, and type from the metrics doc file