From 07cca3e31b78002880e1f626081a390a41399d00 Mon Sep 17 00:00:00 2001 From: avlitman Date: Sun, 14 Jan 2024 20:39:37 +0200 Subject: [PATCH] refactor recording rules and alerts code Signed-off-by: avlitman --- docs/metrics.md | 19 +- go.mod | 2 +- go.sum | 4 +- internal/operands/metrics/reconcile.go | 8 +- internal/operands/metrics/reconcile_test.go | 5 +- internal/operands/metrics/resources.go | 47 ----- .../metrics/ssp-operator/metrics.go | 15 +- .../metrics/template-validator/metrics.go | 15 +- pkg/monitoring/rules/alerts/operator.go | 89 ++++++++ pkg/monitoring/rules/alerts/prometheus.go | 51 +++++ .../rules/recordingrules/operator.go | 55 +++++ .../rules/recordingrules/recordingrules.go | 9 + pkg/monitoring/rules/rules.go | 196 ++++-------------- tools/metricsdocs/metricsdocs.go | 113 +++------- .../metrics_collector.go | 31 --- .../metrics_json_generator.go | 56 ++++- .../pkg/metrics/parser/metrics_parser.go | 8 +- .../operator-observability/pkg/docs/alerts.go | 96 +++++++++ .../pkg/docs/metrics.go | 106 ++++++++++ .../pkg/operatorrules/prometheusrules.go | 75 +++++++ .../pkg/operatorrules/rbac.go | 45 ++++ .../pkg/operatorrules/recordingrule.go | 24 +++ .../pkg/operatorrules/registry.go | 65 ++++++ .../pkg/operatorrules/schema.go | 22 ++ vendor/modules.txt | 4 +- 25 files changed, 811 insertions(+), 349 deletions(-) create mode 100644 pkg/monitoring/rules/alerts/operator.go create mode 100644 pkg/monitoring/rules/alerts/prometheus.go create mode 100644 pkg/monitoring/rules/recordingrules/operator.go create mode 100644 pkg/monitoring/rules/recordingrules/recordingrules.go delete mode 100644 tools/prom-metrics-collector/metrics_collector.go create mode 100644 vendor/github.com/machadovilaca/operator-observability/pkg/docs/alerts.go create mode 100644 vendor/github.com/machadovilaca/operator-observability/pkg/docs/metrics.go create mode 100644 vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/prometheusrules.go create mode 100644 
vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/rbac.go create mode 100644 vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/recordingrule.go create mode 100644 vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/registry.go create mode 100644 vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/schema.go diff --git a/docs/metrics.md b/docs/metrics.md index df129aa42..8566418ef 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -1,25 +1,34 @@ # SSP Operator metrics -This document aims to help users that are not familiar with metrics exposed by the SSP Operator. -All metrics documented here are auto-generated by the utility tool `tools/metricsdocs` and reflects exactly what is being exposed. -## SSP Operator Metrics List ### kubevirt_ssp_common_templates_restored_increase The increase in the number of common templates restored by the operator back to their original state, over the last hour. Type: Gauge. + ### kubevirt_ssp_common_templates_restored_total The total number of common templates restored by the operator back to their original state. Type: Counter. + ### kubevirt_ssp_operator_reconcile_succeeded Set to 1 if the reconcile process of all operands completes with no errors, and to 0 otherwise. Type: Gauge. + ### kubevirt_ssp_operator_reconcile_succeeded_aggregated The total number of ssp-operator pods reconciling with no errors. Type: Gauge. + ### kubevirt_ssp_operator_up The total number of running ssp-operator pods. Type: Gauge. + ### kubevirt_ssp_template_validator_rejected_increase The increase in the number of rejected template validators, over the last hour. Type: Gauge. + ### kubevirt_ssp_template_validator_rejected_total The total number of rejected template validators. Type: Counter. + ### kubevirt_ssp_template_validator_up The total number of running virt-template-validator pods. Type: Gauge. 
+ ### kubevirt_ssp_vm_rbd_block_volume_without_rxbounce -VM with RBD mounted Block volume (without rxbounce option set). Type: Gauge. +[ALPHA] VM with RBD mounted Block volume (without rxbounce option set). Type: Gauge. + ## Developing new metrics -After developing new metrics or changing old ones, please run `make generate-doc` to regenerate this document. + +All metrics documented here are auto-generated and reflect exactly what is being +exposed. After developing new metrics or changing old ones please regenerate +this document. diff --git a/go.mod b/go.mod index 35b3ddb20..2c3240c5e 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,7 @@ require ( github.com/blang/semver/v4 v4.0.0 github.com/fsnotify/fsnotify v1.7.0 github.com/go-logr/logr v1.4.1 - github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20230706095033-373a95665d5a + github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20240125201600-b689e9c89409 github.com/machadovilaca/operator-observability v0.0.12 github.com/onsi/ginkgo/v2 v2.15.0 github.com/onsi/gomega v1.31.1 diff --git a/go.sum b/go.sum index 7fdbedcd4..856b1bd6e 100644 --- a/go.sum +++ b/go.sum @@ -283,8 +283,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20230706095033-373a95665d5a h1:7YL/LNARjQWuXihwJ4b/nVzddGvoFRI7JqxAKISyJkg= -github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20230706095033-373a95665d5a/go.mod h1:qGj2agzgwQ27nYhP3xhLs+IBzE5+ALNUg8bDfMcwPqo= +github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20240125201600-b689e9c89409 h1:w+MkYRwdxddjNwR7BbNMWP24wVli/G6zna86wfbhiAk= +github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20240125201600-b689e9c89409/go.mod 
h1:qGj2agzgwQ27nYhP3xhLs+IBzE5+ALNUg8bDfMcwPqo= github.com/machadovilaca/operator-observability v0.0.12 h1:rd9iFmvWJiYS8LdW6siAiz8kLigcNLa1+dmCVb7dFxs= github.com/machadovilaca/operator-observability v0.0.12/go.mod h1:NGkaR3HEYLScVQf6kQAyxWOSN1ltHcsEvHU/8iIJ8cE= github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= diff --git a/internal/operands/metrics/reconcile.go b/internal/operands/metrics/reconcile.go index 7b1cef3fa..711ba2646 100644 --- a/internal/operands/metrics/reconcile.go +++ b/internal/operands/metrics/reconcile.go @@ -6,6 +6,7 @@ import ( utilruntime "k8s.io/apimachinery/pkg/util/runtime" "kubevirt.io/ssp-operator/internal/common" "kubevirt.io/ssp-operator/internal/operands" + "kubevirt.io/ssp-operator/pkg/monitoring/rules" ) // Define RBAC rules needed by this operand: @@ -96,7 +97,12 @@ func reconcileMonitoringRbacRoleBinding(request *common.Request) (common.Reconci } func reconcilePrometheusRule(request *common.Request) (common.ReconcileResult, error) { - prometheusRule, err := newPrometheusRule(request.Namespace) + err := rules.SetupRules() + if err != nil { + return common.ReconcileResult{}, err + } + + prometheusRule, err := rules.BuildPrometheusRule(request.Namespace) if err != nil { return common.ReconcileResult{}, err } diff --git a/internal/operands/metrics/reconcile_test.go b/internal/operands/metrics/reconcile_test.go index e51022404..003a35af5 100644 --- a/internal/operands/metrics/reconcile_test.go +++ b/internal/operands/metrics/reconcile_test.go @@ -18,6 +18,7 @@ import ( ssp "kubevirt.io/ssp-operator/api/v1beta2" "kubevirt.io/ssp-operator/internal/common" + "kubevirt.io/ssp-operator/pkg/monitoring/rules" ) var log = logf.Log.WithName("metrics_operand") @@ -67,7 +68,7 @@ var _ = Describe("Metrics operand", func() { _, err := operand.Reconcile(&request) Expect(err).ToNot(HaveOccurred()) - prometheusRule, err := newPrometheusRule(namespace) + prometheusRule, err := 
rules.BuildPrometheusRule(namespace) Expect(err).ToNot(HaveOccurred()) ExpectResourceExists(prometheusRule, request) @@ -82,7 +83,7 @@ var _ = Describe("Metrics operand", func() { os.Setenv(runbookURLTemplateEnv, template) } - prometheusRule, err := newPrometheusRule(namespace) + prometheusRule, err := rules.BuildPrometheusRule(namespace) if strings.Count(template, "%s") != 1 || strings.Count(template, "%") != 1 { Expect(err).To(HaveOccurred()) diff --git a/internal/operands/metrics/resources.go b/internal/operands/metrics/resources.go index 6fb8c1555..65206435c 100644 --- a/internal/operands/metrics/resources.go +++ b/internal/operands/metrics/resources.go @@ -1,15 +1,9 @@ package metrics import ( - "errors" - "os" - "strings" - promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" rbac "k8s.io/api/rbac/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - - "kubevirt.io/ssp-operator/pkg/monitoring/rules" ) const ( @@ -96,44 +90,3 @@ func newServiceMonitorCR(namespace string) *promv1.ServiceMonitor { }, } } - -func newPrometheusRule(namespace string) (*promv1.PrometheusRule, error) { - runbookURLTemplate, err := getRunbookURLTemplate() - if err != nil { - return nil, err - } - - return &promv1.PrometheusRule{ - ObjectMeta: metav1.ObjectMeta{ - Name: PrometheusRuleName, - Namespace: namespace, - Labels: map[string]string{ - "prometheus": "k8s", - "role": "alert-rules", - "kubevirt.io": "prometheus-rules", - PrometheusLabelKey: PrometheusLabelValue, - }, - }, - Spec: promv1.PrometheusRuleSpec{ - Groups: []promv1.RuleGroup{ - { - Name: "cnv.rules", - Rules: append(rules.RecordRules(), rules.AlertRules(runbookURLTemplate)...), - }, - }, - }, - }, nil -} - -func getRunbookURLTemplate() (string, error) { - runbookURLTemplate, exists := os.LookupEnv(runbookURLTemplateEnv) - if !exists { - runbookURLTemplate = defaultRunbookURLTemplate - } - - if strings.Count(runbookURLTemplate, "%s") != 1 || strings.Count(runbookURLTemplate, "%") != 1 { - 
return "", errors.New("runbook URL template must have exactly 1 %s substring") - } - - return runbookURLTemplate, nil -} diff --git a/pkg/monitoring/metrics/ssp-operator/metrics.go b/pkg/monitoring/metrics/ssp-operator/metrics.go index 2d930aaae..32641a9b9 100644 --- a/pkg/monitoring/metrics/ssp-operator/metrics.go +++ b/pkg/monitoring/metrics/ssp-operator/metrics.go @@ -5,14 +5,21 @@ import ( runtimemetrics "sigs.k8s.io/controller-runtime/pkg/metrics" ) -func SetupMetrics() { +func SetupMetrics() error { operatormetrics.Register = runtimemetrics.Registry.Register - if err := operatormetrics.RegisterMetrics( + err := operatormetrics.RegisterMetrics( operatorMetrics, rbdMetrics, templateMetrics, - ); err != nil { - panic(err) + ) + if err != nil { + return err } + return nil +} + +// ListMetrics registered prometheus metrics +func ListMetrics() []operatormetrics.Metric { + return operatormetrics.ListMetrics() } diff --git a/pkg/monitoring/metrics/template-validator/metrics.go b/pkg/monitoring/metrics/template-validator/metrics.go index d91d581a9..e720cd6b7 100644 --- a/pkg/monitoring/metrics/template-validator/metrics.go +++ b/pkg/monitoring/metrics/template-validator/metrics.go @@ -4,10 +4,17 @@ import ( "github.com/machadovilaca/operator-observability/pkg/operatormetrics" ) -func SetupMetrics() { - if err := operatormetrics.RegisterMetrics( +func SetupMetrics() error { + err := operatormetrics.RegisterMetrics( templateMetrics, - ); err != nil { - panic(err) + ) + if err != nil { + return err } + return nil +} + +// ListMetrics registered prometheus metrics +func ListMetrics() []operatormetrics.Metric { + return operatormetrics.ListMetrics() } diff --git a/pkg/monitoring/rules/alerts/operator.go b/pkg/monitoring/rules/alerts/operator.go new file mode 100644 index 000000000..2440fca63 --- /dev/null +++ b/pkg/monitoring/rules/alerts/operator.go @@ -0,0 +1,89 @@ +package alerts + +import ( + promv1 
"github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/utils/ptr" +) + +const ( + severityAlertLabelKey = "severity" + healthImpactAlertLabelKey = "operator_health_impact" +) + +func operatorAlerts() []promv1.Rule { + return []promv1.Rule{ + { + Alert: "SSPDown", + Expr: intstr.FromString("kubevirt_ssp_operator_up == 0"), + For: ptr.To[promv1.Duration]("5m"), + Annotations: map[string]string{ + "summary": "All SSP operator pods are down.", + }, + Labels: map[string]string{ + severityAlertLabelKey: "critical", + healthImpactAlertLabelKey: "critical", + }, + }, + { + Alert: "SSPTemplateValidatorDown", + Expr: intstr.FromString("kubevirt_ssp_template_validator_up == 0"), + For: ptr.To[promv1.Duration]("5m"), + Annotations: map[string]string{ + "summary": "All Template Validator pods are down.", + }, + Labels: map[string]string{ + severityAlertLabelKey: "critical", + healthImpactAlertLabelKey: "critical", + }, + }, + { + Alert: "SSPFailingToReconcile", + Expr: intstr.FromString("(kubevirt_ssp_operator_reconcile_succeeded_aggregated == 0) and (kubevirt_ssp_operator_up > 0)"), + For: ptr.To[promv1.Duration]("5m"), + Annotations: map[string]string{ + "summary": "The ssp-operator pod is up but failing to reconcile.", + }, + Labels: map[string]string{ + severityAlertLabelKey: "critical", + healthImpactAlertLabelKey: "critical", + }, + }, + { + Alert: "SSPHighRateRejectedVms", + Expr: intstr.FromString("kubevirt_ssp_template_validator_rejected_increase > 5"), + For: ptr.To[promv1.Duration]("5m"), + Annotations: map[string]string{ + "summary": "High rate of rejected Vms.", + }, + Labels: map[string]string{ + severityAlertLabelKey: "warning", + healthImpactAlertLabelKey: "warning", + }, + }, + { + Alert: "SSPCommonTemplatesModificationReverted", + Expr: intstr.FromString("kubevirt_ssp_common_templates_restored_increase > 0"), + For: ptr.To[promv1.Duration]("0m"), + Annotations: map[string]string{ + 
"summary": "Common Templates manual modifications were reverted by the operator.", + }, + Labels: map[string]string{ + severityAlertLabelKey: "warning", + healthImpactAlertLabelKey: "none", + }, + }, + { + Alert: "VirtualMachineCRCErrors", + Expr: intstr.FromString("(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0"), + Annotations: map[string]string{ + "description": "{{ $value }} Virtual Machines are in risk of causing CRC errors and major service outages.", + "summary": "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', it will report bad crc/signature errors and cluster performance will be severely degraded if krbd:rxbounce is not set.", + }, + Labels: map[string]string{ + severityAlertLabelKey: "warning", + healthImpactAlertLabelKey: "none", + }, + }, + } +} diff --git a/pkg/monitoring/rules/alerts/prometheus.go b/pkg/monitoring/rules/alerts/prometheus.go new file mode 100644 index 000000000..df7b26e3f --- /dev/null +++ b/pkg/monitoring/rules/alerts/prometheus.go @@ -0,0 +1,51 @@ +package alerts + +import ( + "errors" + "fmt" + "os" + "strings" + + "github.com/machadovilaca/operator-observability/pkg/operatorrules" + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" +) + +const ( + prometheusRunbookAnnotationKey = "runbook_url" + partOfAlertLabelKey = "kubernetes_operator_part_of" + partOfAlertLabelValue = "kubevirt" + componentAlertLabelKey = "kubernetes_operator_component" + componentAlertLabelValue = "ssp-operator" + defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s" + runbookURLTemplateEnv = "RUNBOOK_URL_TEMPLATE" +) + +func Register() error { + alerts := [][]promv1.Rule{ + operatorAlerts(), + } + + runbookURLTemplate := getRunbookURLTemplate() + for _, alertGroup := range alerts { + for _, alert := range alertGroup { + alert.Labels[partOfAlertLabelKey] = partOfAlertLabelValue + alert.Labels[componentAlertLabelKey] = 
componentAlertLabelValue + alert.Annotations[prometheusRunbookAnnotationKey] = fmt.Sprintf(runbookURLTemplate, alert.Alert) + } + } + + return operatorrules.RegisterAlerts(alerts...) +} + +func getRunbookURLTemplate() string { + runbookURLTemplate, exists := os.LookupEnv(runbookURLTemplateEnv) + if !exists { + runbookURLTemplate = defaultRunbookURLTemplate + } + + if strings.Count(runbookURLTemplate, "%s") != 1 || strings.Count(runbookURLTemplate, "%") != 1 { // exactly one verb, and it must be %s + panic(errors.New("runbook URL template must have exactly 1 %s substring")) + } + + return runbookURLTemplate +} diff --git a/pkg/monitoring/rules/recordingrules/operator.go b/pkg/monitoring/rules/recordingrules/operator.go new file mode 100644 index 000000000..8149889ed --- /dev/null +++ b/pkg/monitoring/rules/recordingrules/operator.go @@ -0,0 +1,55 @@ +package recordingrules + +import ( + "github.com/machadovilaca/operator-observability/pkg/operatormetrics" + "github.com/machadovilaca/operator-observability/pkg/operatorrules" + "k8s.io/apimachinery/pkg/util/intstr" +) + +const ( + CommonTemplatesRestoredIncreaseQuery = "sum(increase(kubevirt_ssp_common_templates_restored_total{pod=~'ssp-operator.*'}[1h]))" + TemplateValidatorRejectedIncreaseQuery = "sum(increase(kubevirt_ssp_template_validator_rejected_total{pod=~'virt-template-validator.*'}[1h]))" +) + +var operatorRecordingRules = []operatorrules.RecordingRule{ + { + MetricsOpts: operatormetrics.MetricOpts{ + Name: "kubevirt_ssp_operator_up", + Help: "The total number of running ssp-operator pods", + }, + MetricType: operatormetrics.GaugeType, + Expr: intstr.FromString("sum(up{pod=~'ssp-operator.*'}) OR on() vector(0)"), + }, + { + MetricsOpts: operatormetrics.MetricOpts{ + Name: "kubevirt_ssp_template_validator_up", + Help: "The total number of running virt-template-validator pods", + }, + MetricType: operatormetrics.GaugeType, + Expr: intstr.FromString("sum(up{pod=~'virt-template-validator.*'}) OR on() vector(0)"), + }, + { + MetricsOpts: operatormetrics.MetricOpts{ + Name: 
"kubevirt_ssp_operator_reconcile_succeeded_aggregated", + Help: "The total number of ssp-operator pods reconciling with no errors", + }, + MetricType: operatormetrics.GaugeType, + Expr: intstr.FromString("sum(kubevirt_ssp_operator_reconcile_succeeded)"), + }, + { + MetricsOpts: operatormetrics.MetricOpts{ + Name: "kubevirt_ssp_template_validator_rejected_increase", + Help: "The increase in the number of rejected template validators, over the last hour", + }, + MetricType: operatormetrics.GaugeType, + Expr: intstr.FromString(TemplateValidatorRejectedIncreaseQuery + " OR on() vector(0)"), + }, + { + MetricsOpts: operatormetrics.MetricOpts{ + Name: "kubevirt_ssp_common_templates_restored_increase", + Help: "The increase in the number of common templates restored by the operator back to their original state, over the last hour", + }, + MetricType: operatormetrics.GaugeType, + Expr: intstr.FromString(CommonTemplatesRestoredIncreaseQuery + " OR on() vector(0)"), + }, +} diff --git a/pkg/monitoring/rules/recordingrules/recordingrules.go b/pkg/monitoring/rules/recordingrules/recordingrules.go new file mode 100644 index 000000000..3b95c3ae6 --- /dev/null +++ b/pkg/monitoring/rules/recordingrules/recordingrules.go @@ -0,0 +1,9 @@ +package recordingrules + +import "github.com/machadovilaca/operator-observability/pkg/operatorrules" + +func Register() error { + return operatorrules.RegisterRecordingRules( + operatorRecordingRules, + ) +} diff --git a/pkg/monitoring/rules/rules.go b/pkg/monitoring/rules/rules.go index a08b6b9fd..542978fbf 100644 --- a/pkg/monitoring/rules/rules.go +++ b/pkg/monitoring/rules/rules.go @@ -1,180 +1,54 @@ package rules import ( - "fmt" - + "github.com/machadovilaca/operator-observability/pkg/operatorrules" promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" - "k8s.io/apimachinery/pkg/util/intstr" - "k8s.io/utils/ptr" + "kubevirt.io/ssp-operator/pkg/monitoring/rules/alerts" + 
"kubevirt.io/ssp-operator/pkg/monitoring/rules/recordingrules" ) const ( - severityAlertLabelKey = "severity" - healthImpactAlertLabelKey = "operator_health_impact" - partOfAlertLabelKey = "kubernetes_operator_part_of" - partOfAlertLabelValue = "kubevirt" - componentAlertLabelKey = "kubernetes_operator_component" - componentAlertLabelValue = "ssp-operator" + ruleName = "prometheus-k8s-rules-cnv" + PrometheusLabelKey = "prometheus.ssp.kubevirt.io" + PrometheusLabelValue = "true" ) -const ( - CommonTemplatesRestoredIncreaseQuery = "sum(increase(kubevirt_ssp_common_templates_restored_total{pod=~'ssp-operator.*'}[1h]))" - TemplateValidatorRejectedIncreaseQuery = "sum(increase(kubevirt_ssp_template_validator_rejected_total{pod=~'virt-template-validator.*'}[1h]))" -) +func SetupRules() error { + err := recordingrules.Register() + if err != nil { + return err + } -// RecordRulesDesc represent SSP Operator Prometheus Record Rules -type RecordRulesDesc struct { - Name string - Expr intstr.IntOrString - Description string - Type string -} + err = alerts.Register() + if err != nil { + return err + } -// recordRulesDescList lists all SSP Operator Prometheus Record Rules -var recordRulesDescList = []RecordRulesDesc{ - { - Name: "kubevirt_ssp_operator_up", - Expr: intstr.FromString("sum(up{pod=~'ssp-operator.*'}) OR on() vector(0)"), - Description: "The total number of running ssp-operator pods", - Type: "Gauge", - }, - { - Name: "kubevirt_ssp_template_validator_up", - Expr: intstr.FromString("sum(up{pod=~'virt-template-validator.*'}) OR on() vector(0)"), - Description: "The total number of running virt-template-validator pods", - Type: "Gauge", - }, - { - Name: "kubevirt_ssp_operator_reconcile_succeeded_aggregated", - Expr: intstr.FromString("sum(kubevirt_ssp_operator_reconcile_succeeded)"), - Description: "The total number of ssp-operator pods reconciling with no errors", - Type: "Gauge", - }, - { - Name: "kubevirt_ssp_template_validator_rejected_increase", - Expr: 
intstr.FromString(TemplateValidatorRejectedIncreaseQuery + " OR on() vector(0)"), - Description: "The increase in the number of rejected template validators, over the last hour", - Type: "Gauge", - }, - { - Name: "kubevirt_ssp_common_templates_restored_increase", - Expr: intstr.FromString(CommonTemplatesRestoredIncreaseQuery + " OR on() vector(0)"), - Description: "The increase in the number of common templates restored by the operator back to their original state, over the last hour", - Type: "Gauge", - }, + return nil } -func RecordRules() []promv1.Rule { - result := make([]promv1.Rule, 0, len(recordRulesDescList)) - for _, rrd := range recordRulesDescList { - result = append(result, promv1.Rule{Record: rrd.Name, Expr: rrd.Expr}) +func BuildPrometheusRule(namespace string) (*promv1.PrometheusRule, error) { + rules, err := operatorrules.BuildPrometheusRule( + ruleName, + namespace, + map[string]string{ + "prometheus": "k8s", + "role": "alert-rules", + "kubevirt.io": "prometheus-rules", + PrometheusLabelKey: PrometheusLabelValue, + }, + ) + if err != nil { + return nil, err } - return result + + return rules, nil } -func RecordRulesWithDescriptions() []RecordRulesDesc { - result := make([]RecordRulesDesc, 0, len(recordRulesDescList)) - for _, rrd := range recordRulesDescList { - result = append(result, rrd) - } - return result +func ListRecordingRules() []operatorrules.RecordingRule { + return operatorrules.ListRecordingRules() } -func AlertRules(runbookURLTemplate string) []promv1.Rule { - return []promv1.Rule{ - { - Expr: intstr.FromString("sum(kubevirt_vmi_phase_count{phase=\"running\"}) by (node,os,workload,flavor,instance_type,preference)"), - Record: "cnv:vmi_status_running:count", - }, - { - Alert: "SSPDown", - Expr: intstr.FromString("kubevirt_ssp_operator_up == 0"), - For: ptr.To[promv1.Duration]("5m"), - Annotations: map[string]string{ - "summary": "All SSP operator pods are down.", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPDown"), - }, - 
Labels: map[string]string{ - severityAlertLabelKey: "critical", - healthImpactAlertLabelKey: "critical", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - { - Alert: "SSPTemplateValidatorDown", - Expr: intstr.FromString("kubevirt_ssp_template_validator_up == 0"), - For: ptr.To[promv1.Duration]("5m"), - Annotations: map[string]string{ - "summary": "All Template Validator pods are down.", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPTemplateValidatorDown"), - }, - Labels: map[string]string{ - severityAlertLabelKey: "critical", - healthImpactAlertLabelKey: "critical", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - { - Alert: "SSPFailingToReconcile", - Expr: intstr.FromString("(kubevirt_ssp_operator_reconcile_succeeded_aggregated == 0) and (kubevirt_ssp_operator_up > 0)"), - For: ptr.To[promv1.Duration]("5m"), - Annotations: map[string]string{ - "summary": "The ssp-operator pod is up but failing to reconcile", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPFailingToReconcile"), - }, - Labels: map[string]string{ - severityAlertLabelKey: "critical", - healthImpactAlertLabelKey: "critical", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - { - Alert: "SSPHighRateRejectedVms", - Expr: intstr.FromString("kubevirt_ssp_template_validator_rejected_increase > 5"), - For: ptr.To[promv1.Duration]("5m"), - Annotations: map[string]string{ - "summary": "High rate of rejected Vms", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPHighRateRejectedVms"), - }, - Labels: map[string]string{ - severityAlertLabelKey: "warning", - healthImpactAlertLabelKey: "warning", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - { - Alert: "SSPCommonTemplatesModificationReverted", - Expr: 
intstr.FromString("kubevirt_ssp_common_templates_restored_increase > 0"), - For: ptr.To[promv1.Duration]("0m"), - Annotations: map[string]string{ - "summary": "Common Templates manual modifications were reverted by the operator", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPCommonTemplatesModificationReverted"), - }, - Labels: map[string]string{ - severityAlertLabelKey: "warning", - healthImpactAlertLabelKey: "none", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - { - Alert: "VirtualMachineCRCErrors", - Expr: intstr.FromString("(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0"), - Annotations: map[string]string{ - "description": "{{ $value }} Virtual Machines are in risk of causing CRC errors and major service outages", - "summary": "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', it will report bad crc/signature errors and cluster performance will be severely degraded if krbd:rxbounce is not set.", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "VirtualMachineCRCErrors"), - }, - Labels: map[string]string{ - severityAlertLabelKey: "warning", - healthImpactAlertLabelKey: "none", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - } +func ListAlerts() []promv1.Rule { + return operatorrules.ListAlerts() } diff --git a/tools/metricsdocs/metricsdocs.go b/tools/metricsdocs/metricsdocs.go index 323b72368..9e042b9fa 100644 --- a/tools/metricsdocs/metricsdocs.go +++ b/tools/metricsdocs/metricsdocs.go @@ -2,106 +2,57 @@ package main import ( "fmt" - "sort" - "strings" - "github.com/machadovilaca/operator-observability/pkg/operatormetrics" + "github.com/machadovilaca/operator-observability/pkg/docs" sspMetrics "kubevirt.io/ssp-operator/pkg/monitoring/metrics/ssp-operator" validatorMetrics "kubevirt.io/ssp-operator/pkg/monitoring/metrics/template-validator" 
"kubevirt.io/ssp-operator/pkg/monitoring/rules" ) -const ( - title = "# SSP Operator metrics\n" - background = "This document aims to help users that are not familiar with metrics exposed by the SSP Operator.\n" + - "All metrics documented here are auto-generated by the utility tool `tools/metricsdocs` and reflects exactly what is being exposed.\n\n" +const tpl = `# SSP Operator metrics - KVSpecificMetrics = "## SSP Operator Metrics List\n" +{{- range . }} - opening = title + - background + - KVSpecificMetrics +{{ $deprecatedVersion := "" -}} +{{- with index .ExtraFields "DeprecatedVersion" -}} + {{- $deprecatedVersion = printf " in %s" . -}} +{{- end -}} - footerHeading = "## Developing new metrics\n" - footerContent = "After developing new metrics or changing old ones, please run `make generate-doc` to regenerate this document.\n" +{{- $stabilityLevel := "" -}} +{{- if and (.ExtraFields.StabilityLevel) (ne .ExtraFields.StabilityLevel "STABLE") -}} + {{- $stabilityLevel = printf "[%s%s] " .ExtraFields.StabilityLevel $deprecatedVersion -}} +{{- end -}} - footer = footerHeading + footerContent -) - -func main() { - metricsList := recordRulesDescToMetricList(rules.RecordRulesWithDescriptions()) - - sspMetrics.SetupMetrics() - validatorMetrics.SetupMetrics() - - for _, m := range operatormetrics.ListMetrics() { - metricsList = append(metricsList, metric{ - name: m.GetOpts().Name, - description: m.GetOpts().Help, - mtype: strings.TrimSuffix(string(m.GetType()), "Vec"), - }) - } +### {{ .Name }} +{{ print $stabilityLevel }}{{ .Help }}. Type: {{ .Type -}}. - sort.Sort(metricsList) - printMetrics(metricsList) -} +{{- end }} -func printMetrics(metricsList metricList) { - fmt.Print(opening) - metricsList.writeOut() - fmt.Print(footer) -} +## Developing new metrics -type metric struct { - name string - description string - mtype string -} +All metrics documented here are auto-generated and reflect exactly what is being +exposed. 
After developing new metrics or changing old ones please regenerate +this document. +` -func recordRulesDescToMetricList(mdl []rules.RecordRulesDesc) metricList { - res := make([]metric, len(mdl)) - for i, md := range mdl { - res[i] = metricDescriptionToMetric(md) +func main() { + err := sspMetrics.SetupMetrics() + if err != nil { + panic(err) } - return res -} - -func metricDescriptionToMetric(rrd rules.RecordRulesDesc) metric { - return metric{ - name: rrd.Name, - description: rrd.Description, - mtype: rrd.Type, + err = validatorMetrics.SetupMetrics() + if err != nil { + panic(err) } -} - -func (m metric) writeOut() { - fmt.Println("###", m.name) - fmt.Println(m.description + ". Type: " + m.mtype + ".") -} -type metricList []metric - -var _ sort.Interface = metricList{} - -// Len implements sort.Interface.Len -func (m metricList) Len() int { - return len(m) -} + err = rules.SetupRules() + if err != nil { + panic(err) + } -// Less implements sort.Interface.Less -func (m metricList) Less(i, j int) bool { - return m[i].name < m[j].name -} + docsString := docs.BuildMetricsDocsWithCustomTemplate(sspMetrics.ListMetrics(), rules.ListRecordingRules(), tpl) -// Swap implements sort.Interface.Swap -func (m metricList) Swap(i, j int) { - m[i], m[j] = m[j], m[i] -} - -func (m metricList) writeOut() { - for _, met := range m { - met.writeOut() - } + fmt.Print(docsString) } diff --git a/tools/prom-metrics-collector/metrics_collector.go b/tools/prom-metrics-collector/metrics_collector.go deleted file mode 100644 index a0c852f58..000000000 --- a/tools/prom-metrics-collector/metrics_collector.go +++ /dev/null @@ -1,31 +0,0 @@ -package main - -import ( - parser "github.com/kubevirt/monitoring/pkg/metrics/parser" - dto "github.com/prometheus/client_model/go" - - "kubevirt.io/ssp-operator/pkg/monitoring/rules" -) - -// This should be used only for very rare cases where the naming conventions that are explained in the best practices: -// 
https://sdk.operatorframework.io/docs/best-practices/observability-best-practices/#metrics-guidelines -// should be ignored. -var excludedMetrics = map[string]struct{}{} - -func readMetrics() []*dto.MetricFamily { - var metricFamilies []*dto.MetricFamily - sspMetrics := rules.RecordRulesWithDescriptions() - - for _, metric := range sspMetrics { - if _, isExcludedMetric := excludedMetrics[metric.Name]; !isExcludedMetric { - mf := parser.CreateMetricFamily(parser.Metric{ - Name: metric.Name, - Help: metric.Description, - Type: metric.Type, - }) - metricFamilies = append(metricFamilies, mf) - } - } - - return metricFamilies -} diff --git a/tools/prom-metrics-collector/metrics_json_generator.go b/tools/prom-metrics-collector/metrics_json_generator.go index 6c998f73d..b1e117b88 100644 --- a/tools/prom-metrics-collector/metrics_json_generator.go +++ b/tools/prom-metrics-collector/metrics_json_generator.go @@ -3,17 +3,63 @@ package main import ( "encoding/json" "fmt" - "os" + "strings" + + "github.com/kubevirt/monitoring/pkg/metrics/parser" + + sspMetrics "kubevirt.io/ssp-operator/pkg/monitoring/metrics/ssp-operator" + validatorMetrics "kubevirt.io/ssp-operator/pkg/monitoring/metrics/template-validator" + "kubevirt.io/ssp-operator/pkg/monitoring/rules" ) +// This should be used only for very rare cases where the naming conventions that are explained in the best practices: +// https://sdk.operatorframework.io/docs/best-practices/observability-best-practices/#metrics-guidelines +// should be ignored. 
+var excludedMetrics = map[string]struct{}{} + func main() { - metricFamilies := readMetrics() + err := sspMetrics.SetupMetrics() + if err != nil { + panic(err) + } + + err = validatorMetrics.SetupMetrics() + if err != nil { + panic(err) + } + + if err := rules.SetupRules(); err != nil { + panic(err) + } + + var metricFamilies []parser.Metric + + metricsList := sspMetrics.ListMetrics() + for _, m := range metricsList { + if _, isExcludedMetric := excludedMetrics[m.GetOpts().Name]; !isExcludedMetric { + metricFamilies = append(metricFamilies, parser.Metric{ + Name: m.GetOpts().Name, + Help: m.GetOpts().Help, + Type: strings.ToUpper(string(m.GetBaseType())), + }) + } + } + + rulesList := rules.ListRecordingRules() + for _, r := range rulesList { + if _, isExcludedMetric := excludedMetrics[r.GetOpts().Name]; !isExcludedMetric { + metricFamilies = append(metricFamilies, parser.Metric{ + Name: r.GetOpts().Name, + Help: r.GetOpts().Help, + Type: strings.ToUpper(string(r.GetType())), + }) + } + } jsonBytes, err := json.Marshal(metricFamilies) if err != nil { - fmt.Println(err) - os.Exit(1) + panic(err) } - fmt.Println(string(jsonBytes)) + fmt.Println(string(jsonBytes)) // Write the JSON string to standard output } diff --git a/vendor/github.com/kubevirt/monitoring/pkg/metrics/parser/metrics_parser.go b/vendor/github.com/kubevirt/monitoring/pkg/metrics/parser/metrics_parser.go index fd37bf52f..1f9c117cb 100644 --- a/vendor/github.com/kubevirt/monitoring/pkg/metrics/parser/metrics_parser.go +++ b/vendor/github.com/kubevirt/monitoring/pkg/metrics/parser/metrics_parser.go @@ -34,13 +34,13 @@ func CreateMetricFamily(m Metric) *dto.MetricFamily { metricType := dto.MetricType_UNTYPED switch m.Type { - case "Counter": + case "Counter", "CounterVec": metricType = dto.MetricType_COUNTER - case "Gauge": + case "Gauge", "GaugeVec": metricType = dto.MetricType_GAUGE - case "Histogram": + case "Histogram", "HistogramVec": metricType = dto.MetricType_HISTOGRAM - case "Summary": + case 
"Summary", "SummaryVec":
 		metricType = dto.MetricType_SUMMARY
 	}
 
diff --git a/vendor/github.com/machadovilaca/operator-observability/pkg/docs/alerts.go b/vendor/github.com/machadovilaca/operator-observability/pkg/docs/alerts.go
new file mode 100644
index 000000000..e5d65f922
--- /dev/null
+++ b/vendor/github.com/machadovilaca/operator-observability/pkg/docs/alerts.go
@@ -0,0 +1,109 @@
+package docs
+
+import (
+	"bytes"
+	"log"
+	"sort"
+	"text/template"
+
+	promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+)
+
+// defaultAlertsTemplate is the markdown template used by BuildAlertsDocs.
+const defaultAlertsTemplate = `# Operator Alerts
+
+{{- range . }}
+
+### {{.Name}}
+**Summary:** {{ index .Annotations "summary" }}.
+
+**Description:** {{ index .Annotations "description" }}.
+
+**Severity:** {{ index .Labels "severity" }}.
+{{- if .For }}
+
+**For:** {{ .For }}.
+{{- end -}}
+{{- end }}
+
+## Developing new alerts
+
+All alerts documented here are auto-generated and reflect exactly what is being
+exposed. After developing new alerts or changing old ones please regenerate
+this document.
+`
+
+// alertDocs holds the fields of one alert that are handed to the template.
+type alertDocs struct {
+	Name        string
+	Expr        string
+	For         string
+	Annotations map[string]string
+	Labels      map[string]string
+}
+
+// BuildAlertsDocsWithCustomTemplate returns a string with the documentation
+// for the given alerts, using the given template. It terminates the process
+// through log.Fatalln when the template cannot be parsed or executed.
+func BuildAlertsDocsWithCustomTemplate(
+	alerts []promv1.Rule,
+	tplString string,
+) string {
+
+	tpl, err := template.New("alerts").Parse(tplString)
+	if err != nil {
+		log.Fatalln(err)
+	}
+
+	var allDocs []alertDocs
+
+	if alerts != nil {
+		allDocs = append(allDocs, buildAlertsDocs(alerts)...)
+	}
+
+	buf := bytes.NewBufferString("")
+	err = tpl.Execute(buf, allDocs)
+	if err != nil {
+		log.Fatalln(err)
+	}
+
+	return buf.String()
+}
+
+// BuildAlertsDocs returns a string with the documentation for the given
+// alerts, rendered with the default template.
+func BuildAlertsDocs(alerts []promv1.Rule) string {
+	return BuildAlertsDocsWithCustomTemplate(alerts, defaultAlertsTemplate)
+}
+
+// buildAlertsDocs converts the given alert rules into alertDocs entries
+// sorted by alert name.
+func buildAlertsDocs(alerts []promv1.Rule) []alertDocs {
+	alertsDocs := make([]alertDocs, len(alerts))
+	for i, alert := range alerts {
+		// alert.For is a pointer and may be nil; leave the entry's For empty
+		// in that case (the template skips it) instead of dereferencing nil.
+		var forDuration string
+		if alert.For != nil {
+			forDuration = string(*alert.For)
+		}
+
+		alertsDocs[i] = alertDocs{
+			Name:        alert.Alert,
+			Expr:        alert.Expr.String(),
+			For:         forDuration,
+			Annotations: alert.Annotations,
+			Labels:      alert.Labels,
+		}
+	}
+	sortAlertsDocs(alertsDocs)
+
+	return alertsDocs
+}
+
+// sortAlertsDocs sorts the given docs in place by alert name.
+func sortAlertsDocs(alertsDocs []alertDocs) {
+	sort.Slice(alertsDocs, func(i, j int) bool {
+		return alertsDocs[i].Name < alertsDocs[j].Name
+	})
+}
diff --git a/vendor/github.com/machadovilaca/operator-observability/pkg/docs/metrics.go b/vendor/github.com/machadovilaca/operator-observability/pkg/docs/metrics.go
new file mode 100644
index 000000000..8271230bd
--- /dev/null
+++ b/vendor/github.com/machadovilaca/operator-observability/pkg/docs/metrics.go
@@ -0,0 +1,106 @@
+package docs
+
+import (
+	"bytes"
+	"log"
+	"sort"
+	"strings"
+	"text/template"
+
+	"github.com/machadovilaca/operator-observability/pkg/operatormetrics"
+	"github.com/machadovilaca/operator-observability/pkg/operatorrules"
+)
+
+const defaultMetricsTemplate = `# Operator Metrics
+
+{{- range . }}
+
+### {{.Name}}
+{{.Help}}.
+
+Type: {{.Type}}.
+{{- end }}
+
+## Developing new metrics
+
+All metrics documented here are auto-generated and reflect exactly what is being
+exposed. After developing new metrics or changing old ones please regenerate
+this document.
+`
+
+type metricDocs struct {
+	Name        string
+	Help        string
+	Type        string
+	ExtraFields map[string]string
+}
+
+type docOptions interface {
+	GetOpts() operatormetrics.MetricOpts
+	GetType() operatormetrics.MetricType
+}
+
+// BuildMetricsDocsWithCustomTemplate returns a string with the documentation
+// for the given metrics, using the given template.
+func BuildMetricsDocsWithCustomTemplate(
+	metrics []operatormetrics.Metric,
+	recordingRules []operatorrules.RecordingRule,
+	tplString string,
+) string {
+
+	tpl, err := template.New("metrics").Parse(tplString)
+	if err != nil {
+		log.Fatalln(err)
+	}
+
+	var allDocs []metricDocs
+
+	if metrics != nil {
+		allDocs = append(allDocs, buildMetricsDocs(metrics)...)
+	}
+
+	if recordingRules != nil {
+		allDocs = append(allDocs, buildMetricsDocs(recordingRules)...)
+	}
+
+	sortMetricsDocs(allDocs)
+
+	buf := bytes.NewBufferString("")
+	err = tpl.Execute(buf, allDocs)
+	if err != nil {
+		log.Fatalln(err)
+	}
+
+	return buf.String()
+}
+
+// BuildMetricsDocs returns a string with the documentation for the given
+// metrics.
+func BuildMetricsDocs(metrics []operatormetrics.Metric, recordingRules []operatorrules.RecordingRule) string {
+	return BuildMetricsDocsWithCustomTemplate(metrics, recordingRules, defaultMetricsTemplate)
+}
+
+func buildMetricsDocs[T docOptions](items []T) []metricDocs {
+	metricsDocs := make([]metricDocs, len(items))
+	for i, metric := range items {
+		metricOpts := metric.GetOpts()
+		metricsDocs[i] = metricDocs{
+			Name:        metricOpts.Name,
+			Help:        metricOpts.Help,
+			Type:        getAndConvertMetricType(metric.GetType()),
+			ExtraFields: metricOpts.ExtraFields,
+		}
+	}
+
+	return metricsDocs
+}
+
+func sortMetricsDocs(metricsDocs []metricDocs) {
+	sort.Slice(metricsDocs, func(i, j int) bool {
+		return metricsDocs[i].Name < metricsDocs[j].Name
+	})
+}
+
+func getAndConvertMetricType(metricType operatormetrics.MetricType) string {
+	return strings.ReplaceAll(string(metricType), "Vec", "")
+}
diff --git a/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/prometheusrules.go b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/prometheusrules.go
new file mode 100644
index 000000000..50daee5d0
--- /dev/null
+++ b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/prometheusrules.go
@@ -0,0 +1,93 @@
+package operatorrules
+
+import (
+	"fmt"
+	"sort"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+)
+
+// BuildPrometheusRule builds a PrometheusRule object from the registered recording rules and alerts.
+// It returns an error when neither a recording rule nor an alert is registered.
+func BuildPrometheusRule(name, namespace string, labels map[string]string) (*promv1.PrometheusRule, error) {
+	spec, err := buildPrometheusRuleSpec()
+	if err != nil {
+		return nil, err
+	}
+
+	return &promv1.PrometheusRule{
+		TypeMeta: metav1.TypeMeta{
+			APIVersion: promv1.SchemeGroupVersion.String(),
+			Kind:       promv1.PrometheusRuleKind,
+		},
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      name,
+			Namespace: namespace,
+			Labels:    labels,
+		},
+		Spec: *spec,
+	}, nil
+}
+
+// buildPrometheusRuleSpec assembles the rule groups from the registry: one
+// group for recording rules and one for alerts, each only when non-empty.
+func buildPrometheusRuleSpec() (*promv1.PrometheusRuleSpec, error) {
+	var groups []promv1.RuleGroup
+
+	if len(operatorRegistry.registeredRecordingRules) != 0 {
+		groups = append(groups, promv1.RuleGroup{
+			Name:  "recordingRules.rules",
+			Rules: buildRecordingRulesRules(),
+		})
+	}
+
+	if len(operatorRegistry.registeredAlerts) != 0 {
+		groups = append(groups, promv1.RuleGroup{
+			Name:  "alerts.rules",
+			Rules: buildAlertsRules(),
+		})
+	}
+
+	if len(groups) == 0 {
+		return nil, fmt.Errorf("no registered recording rule or alert")
+	}
+
+	return &promv1.PrometheusRuleSpec{Groups: groups}, nil
+}
+
+// buildRecordingRulesRules converts the registered recording rules into
+// Prometheus rules sorted by record name. The registry is a map, so without
+// the sort the order of the generated rules would differ between calls.
+func buildRecordingRulesRules() []promv1.Rule {
+	var rules []promv1.Rule
+
+	for _, recordingRule := range operatorRegistry.registeredRecordingRules {
+		rules = append(rules, promv1.Rule{
+			Record: recordingRule.MetricsOpts.Name,
+			Expr:   recordingRule.Expr,
+		})
+	}
+
+	sort.Slice(rules, func(i, j int) bool {
+		return rules[i].Record < rules[j].Record
+	})
+
+	return rules
+}
+
+// buildAlertsRules returns the registered alerts sorted by alert name, so the
+// generated rule order does not depend on map iteration order.
+func buildAlertsRules() []promv1.Rule {
+	var rules []promv1.Rule
+	for _, rule := range operatorRegistry.registeredAlerts {
+		rules = append(rules, rule)
+	}
+
+	sort.Slice(rules, func(i, j int) bool {
+		return rules[i].Alert < rules[j].Alert
+	})
+
+	return rules
+}
diff --git a/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/rbac.go 
b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/rbac.go
new file mode 100644
index 000000000..da582ceec
--- /dev/null
+++ b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/rbac.go
@@ -0,0 +1,53 @@
+package operatorrules
+
+import (
+	rbacv1 "k8s.io/api/rbac/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// BuildRoleAndRoleBinding returns a Role named "<namePrefix>-role" that allows
+// "get" and "list" on services, endpoints and pods in the given namespace, and
+// a RoleBinding named "<namePrefix>-rolebinding" that grants that Role to the
+// ServiceAccount promSAName in promSANamespace. Both objects carry the given
+// labels.
+func BuildRoleAndRoleBinding(namePrefix, namespace, promSAName, promSANamespace string, labels map[string]string) (*rbacv1.Role, *rbacv1.RoleBinding) {
+	// Computed once so the Role and the RoleRef pointing at it cannot drift.
+	roleName := namePrefix + "-role"
+
+	r := &rbacv1.Role{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      roleName,
+			Namespace: namespace,
+			Labels:    labels,
+		},
+		Rules: []rbacv1.PolicyRule{
+			{
+				APIGroups: []string{""},
+				Resources: []string{"services", "endpoints", "pods"},
+				Verbs:     []string{"get", "list"},
+			},
+		},
+	}
+
+	rb := &rbacv1.RoleBinding{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      namePrefix + "-rolebinding",
+			Namespace: namespace,
+			Labels:    labels,
+		},
+		RoleRef: rbacv1.RoleRef{
+			Kind:     "Role",
+			Name:     roleName,
+			APIGroup: rbacv1.GroupName,
+		},
+		Subjects: []rbacv1.Subject{
+			{
+				Kind:      "ServiceAccount",
+				Name:      promSAName,
+				Namespace: promSANamespace,
+			},
+		},
+	}
+
+	return r, rb
+}
diff --git a/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/recordingrule.go b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/recordingrule.go
new file mode 100644
index 000000000..8dfc50c3e
--- /dev/null
+++ b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/recordingrule.go
@@ -0,0 +1,24 @@
+package operatorrules
+
+import (
+	"k8s.io/apimachinery/pkg/util/intstr"
+
+	"github.com/machadovilaca/operator-observability/pkg/operatormetrics"
+)
+
+// RecordingRule is a struct that represents a Prometheus recording rule.
+type RecordingRule struct {
+	MetricsOpts operatormetrics.MetricOpts
+	MetricType  operatormetrics.MetricType
+	Expr        intstr.IntOrString
+}
+
+// GetOpts returns the metric options of the recording rule.
+func (c RecordingRule) GetOpts() operatormetrics.MetricOpts {
+	return c.MetricsOpts
+}
+
+// GetType returns the metric type of the recording rule.
+func (c RecordingRule) GetType() operatormetrics.MetricType {
+	return c.MetricType
+}
diff --git a/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/registry.go b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/registry.go
new file mode 100644
index 000000000..77c2b62f2
--- /dev/null
+++ b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/registry.go
@@ -0,0 +1,88 @@
+package operatorrules
+
+import (
+	"sort"
+
+	promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+)
+
+// operatorRegistry is the package-level registry used by the Register*,
+// List* and CleanRegistry functions.
+var operatorRegistry = newRegistry()
+
+// operatorRegisterer stores recording rules keyed by metric name and alerts
+// keyed by alert name; registering the same key again replaces the entry.
+type operatorRegisterer struct {
+	registeredRecordingRules map[string]RecordingRule
+	registeredAlerts         map[string]promv1.Rule
+}
+
+// newRegistry returns an empty registry.
+func newRegistry() operatorRegisterer {
+	return operatorRegisterer{
+		registeredRecordingRules: map[string]RecordingRule{},
+		registeredAlerts:         map[string]promv1.Rule{},
+	}
+}
+
+// RegisterRecordingRules registers the given recording rules. A rule whose
+// metric name is already registered replaces the earlier one. The returned
+// error is always nil.
+func RegisterRecordingRules(recordingRules ...[]RecordingRule) error {
+	for _, recordingRuleList := range recordingRules {
+		for _, recordingRule := range recordingRuleList {
+			operatorRegistry.registeredRecordingRules[recordingRule.MetricsOpts.Name] = recordingRule
+		}
+	}
+
+	return nil
+}
+
+// RegisterAlerts registers the given alerts. An alert whose name is already
+// registered replaces the earlier one. The returned error is always nil.
+func RegisterAlerts(alerts ...[]promv1.Rule) error {
+	for _, alertList := range alerts {
+		for _, alert := range alertList {
+			operatorRegistry.registeredAlerts[alert.Alert] = alert
+		}
+	}
+
+	return nil
+}
+
+// ListRecordingRules returns the registered recording rules, sorted by metric
+// name so that the result does not depend on map iteration order.
+func ListRecordingRules() []RecordingRule {
+	var rules []RecordingRule
+	for _, rule := range operatorRegistry.registeredRecordingRules {
+		rules = append(rules, rule)
+	}
+
+	sort.Slice(rules, func(i, j int) bool {
+		return rules[i].MetricsOpts.Name < rules[j].MetricsOpts.Name
+	})
+
+	return rules
+}
+
+// ListAlerts returns the registered alerts, sorted by alert name so that the
+// result does not depend on map iteration order.
+func ListAlerts() []promv1.Rule {
+	var alerts []promv1.Rule
+	for _, alert := range operatorRegistry.registeredAlerts {
+		alerts = append(alerts, alert)
+	}
+
+	sort.Slice(alerts, func(i, j int) bool {
+		return alerts[i].Alert < alerts[j].Alert
+	})
+
+	return alerts
+}
+
+// CleanRegistry removes all registered rules and alerts by replacing the
+// registry with an empty one. The returned error is always nil.
+func CleanRegistry() error {
+	operatorRegistry = newRegistry()
+	return nil
+}
diff --git a/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/schema.go b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/schema.go
new file mode 100644
index 000000000..c06a032d7
--- /dev/null
+++ b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/schema.go
@@ -0,0 +1,24 @@
+package operatorrules
+
+import (
+	rbacv1 "k8s.io/api/rbac/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+
+	promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+)
+
+// AddToScheme registers the prometheus-operator monitoring/v1 and Kubernetes
+// rbac/v1 types with the given scheme, returning the first error encountered.
+func AddToScheme(scheme *runtime.Scheme) error {
+	err := promv1.AddToScheme(scheme)
+	if err != nil {
+		return err
+	}
+
+	err = rbacv1.AddToScheme(scheme)
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
diff --git a/vendor/modules.txt b/vendor/modules.txt
index ccadf136d..64ff31999 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -177,12 +177,14 @@ github.com/jpillora/backoff
 # github.com/json-iterator/go v1.1.12
 ## explicit; go 1.12
 github.com/json-iterator/go
-# github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20230706095033-373a95665d5a
+# github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20240125201600-b689e9c89409
 ## explicit; go 1.20
 github.com/kubevirt/monitoring/pkg/metrics/parser
 # github.com/machadovilaca/operator-observability v0.0.12
 ## explicit; go 1.20
+github.com/machadovilaca/operator-observability/pkg/docs
github.com/machadovilaca/operator-observability/pkg/operatormetrics +github.com/machadovilaca/operator-observability/pkg/operatorrules # github.com/mailru/easyjson v0.7.7 ## explicit; go 1.12 github.com/mailru/easyjson/buffer