diff --git a/docs/metrics.md b/docs/metrics.md index 722bbbb02908..8c6e5c721e68 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -12,6 +12,9 @@ All metrics documented here are auto-generated by the utility tool `tools/doc-ge ### kubevirt_info Version information. +### cnv_abnormal +This rule holds issues with the pods for each container, e.g. memory exceeded. Type: Gauge. + ### kubevirt_allocatable_nodes The number of allocatable nodes in the cluster. Type: Gauge. @@ -33,6 +36,9 @@ The number of VMs in the cluster by namespace. Type: Gauge. ### kubevirt_portforward_active_tunnels Amount of active portforward tunnels, broken down by namespace and vmi name. Type: Gauge. +### kubevirt_rss_memory_exceeded +The pod with the highest exceeded memory for each container based on the rss. Type: Gauge. + ### kubevirt_usbredir_active_connections Amount of active USB redirection connections, broken down by namespace and vmi name. Type: Gauge. @@ -267,6 +273,9 @@ Returns the labels of the persistent volume claims that are used for restoring v ### kubevirt_vnc_active_connections Amount of active VNC connections, broken down by namespace and vmi name. Type: Gauge. +### kubevirt_working_set_memory_exceeded +The pod with the highest exceeded memory for each container based on the working set. Type: Gauge. + ## Developing new metrics After developing new metrics or changing old ones, please run `make generate` to regenerate this document. 
diff --git a/pkg/monitoring/rules/recordingrules/BUILD.bazel b/pkg/monitoring/rules/recordingrules/BUILD.bazel index c043ba966ebe..67aaad1cecbb 100644 --- a/pkg/monitoring/rules/recordingrules/BUILD.bazel +++ b/pkg/monitoring/rules/recordingrules/BUILD.bazel @@ -5,6 +5,7 @@ go_library( srcs = [ "api.go", "nodes.go", + "operator.go", "recordingrules.go", "virt.go", "vm.go", diff --git a/pkg/monitoring/rules/recordingrules/operator.go b/pkg/monitoring/rules/recordingrules/operator.go new file mode 100644 index 000000000000..ea672bae8842 --- /dev/null +++ b/pkg/monitoring/rules/recordingrules/operator.go @@ -0,0 +1,50 @@ +/* +Copyright 2024 The KubeVirt Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package recordingrules + +import ( + "github.com/machadovilaca/operator-observability/pkg/operatormetrics" + "github.com/machadovilaca/operator-observability/pkg/operatorrules" + "k8s.io/apimachinery/pkg/util/intstr" +) + +/* operatorRecordingRules lists three Gauge recording rules: working-set memory minus request, RSS memory minus request, and cnv_abnormal, which combines the two. */ var operatorRecordingRules = []operatorrules.RecordingRule{ + /* Rule 1: for the four virt-* containers named in the selector, subtract the container memory request from container_memory_working_set_bytes (series joined on the pod label only), keep the max per (container, namespace), then attach a constant reason label via label_replace. NOTE(review): the expression has no comparison filter, so the value is presumably negative while usage is below the request; the outer max by also drops pod and node, so the result does not name a pod as the Help text suggests - confirm intent. */ { + MetricsOpts: operatormetrics.MetricOpts{ + Name: "kubevirt_working_set_memory_exceeded", + Help: "The pod with the highest exceeded memory for each container based on the working set.", + }, + MetricType: operatormetrics.GaugeType, + Expr: intstr.FromString("label_replace(max by(container, namespace)(container_memory_working_set_bytes{container=~\"virt-controller|virt-api|virt-handler|virt-operator\"} - on(pod) group_left(node) (kube_pod_container_resource_requests{container=~\"virt-controller|virt-api|virt-handler|virt-operator\",resource=\"memory\"})), \"reason\",\"working set memory request exceeded\", \"\", \"(.*)\")"), + }, + /* Rule 2: same shape as rule 1, using container_memory_rss and its own reason text. */ { + MetricsOpts: operatormetrics.MetricOpts{ + Name: "kubevirt_rss_memory_exceeded", + Help: "The pod with the highest exceeded memory for each container based on the rss.", + }, + MetricType: operatormetrics.GaugeType, + Expr: intstr.FromString("label_replace(max by(container, namespace)(container_memory_rss{container=~\"virt-controller|virt-api|virt-handler|virt-operator\"} - on(pod) group_left(node) (kube_pod_container_resource_requests{container=~\"virt-controller|virt-api|virt-handler|virt-operator\",resource=\"memory\"})), \"reason\",\"rss memory request exceeded\", \"\", \"(.*)\")"), + }, + /* Rule 3: cnv_abnormal is the PromQL or of the two rules above. NOTE(review): presumably every series of both inputs is kept because their reason labels differ - confirm. */ { + MetricsOpts: operatormetrics.MetricOpts{ + Name: "cnv_abnormal", + Help: "This rule holds issues with the pods for each container, e.g. 
memory exceeded.", + }, + MetricType: operatormetrics.GaugeType, + Expr: intstr.FromString("kubevirt_working_set_memory_exceeded or kubevirt_rss_memory_exceeded"), + }, +} diff --git a/pkg/monitoring/rules/recordingrules/recordingrules.go b/pkg/monitoring/rules/recordingrules/recordingrules.go index c3f65e2be1e2..a1b8f4ff3001 100644 --- a/pkg/monitoring/rules/recordingrules/recordingrules.go +++ b/pkg/monitoring/rules/recordingrules/recordingrules.go @@ -6,6 +6,7 @@ func Register(namespace string) error { return operatorrules.RegisterRecordingRules( apiRecordingRules, nodesRecordingRules, + operatorRecordingRules, virtRecordingRules(namespace), vmRecordingRules, vmiRecordingRules, diff --git a/tools/prom-metrics-collector/metrics_collector.go b/tools/prom-metrics-collector/metrics_collector.go index 2987de02b7e6..b348e4e1e415 100644 --- a/tools/prom-metrics-collector/metrics_collector.go +++ b/tools/prom-metrics-collector/metrics_collector.go @@ -34,6 +34,7 @@ import ( // should be ignored. var excludedMetrics = map[string]struct{}{ "kubevirt_vmi_phase_count": struct{}{}, + "cnv_abnormal": struct{}{}, } // Extract the name, help, and type from the metrics doc file