Skip to content

Commit

Permalink
refactor recording rules and alerts code
Browse files Browse the repository at this point in the history
Signed-off-by: avlitman <alitman@redhat.com>
  • Loading branch information
avlitman committed Feb 8, 2024
1 parent 4b42532 commit 1579c18
Show file tree
Hide file tree
Showing 30 changed files with 842 additions and 388 deletions.
19 changes: 14 additions & 5 deletions docs/metrics.md
Original file line number Diff line number Diff line change
@@ -1,25 +1,34 @@
# SSP Operator metrics
This document aims to help users that are not familiar with metrics exposed by the SSP Operator.
All metrics documented here are auto-generated by the utility tool `tools/metricsdocs` and reflect exactly what is being exposed.

## SSP Operator Metrics List
### kubevirt_ssp_common_templates_restored_increase
The increase in the number of common templates restored by the operator back to their original state, over the last hour. Type: Gauge.

### kubevirt_ssp_common_templates_restored_total
The total number of common templates restored by the operator back to their original state. Type: Counter.

### kubevirt_ssp_operator_reconcile_succeeded
Set to 1 if the reconcile process of all operands completes with no errors, and to 0 otherwise. Type: Gauge.

### kubevirt_ssp_operator_reconcile_succeeded_aggregated
The total number of ssp-operator pods reconciling with no errors. Type: Gauge.

### kubevirt_ssp_operator_up
The total number of running ssp-operator pods. Type: Gauge.

### kubevirt_ssp_template_validator_rejected_increase
The increase in the number of rejected template validators, over the last hour. Type: Gauge.

### kubevirt_ssp_template_validator_rejected_total
The total number of rejected template validators. Type: Counter.

### kubevirt_ssp_template_validator_up
The total number of running virt-template-validator pods. Type: Gauge.

### kubevirt_ssp_vm_rbd_block_volume_without_rxbounce
VM with RBD mounted Block volume (without rxbounce option set). Type: Gauge.
[ALPHA] VM with RBD mounted Block volume (without rxbounce option set). Type: Gauge.

## Developing new metrics
After developing new metrics or changing old ones, please run `make generate-doc` to regenerate this document.

All metrics documented here are auto-generated and reflect exactly what is being
exposed. After developing new metrics or changing old ones please regenerate
this document.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ require (
github.com/blang/semver/v4 v4.0.0
github.com/fsnotify/fsnotify v1.7.0
github.com/go-logr/logr v1.4.1
github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20230706095033-373a95665d5a
github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20240125201600-b689e9c89409
github.com/machadovilaca/operator-observability v0.0.12
github.com/onsi/ginkgo/v2 v2.15.0
github.com/onsi/gomega v1.31.1
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -283,8 +283,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20230706095033-373a95665d5a h1:7YL/LNARjQWuXihwJ4b/nVzddGvoFRI7JqxAKISyJkg=
github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20230706095033-373a95665d5a/go.mod h1:qGj2agzgwQ27nYhP3xhLs+IBzE5+ALNUg8bDfMcwPqo=
github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20240125201600-b689e9c89409 h1:w+MkYRwdxddjNwR7BbNMWP24wVli/G6zna86wfbhiAk=
github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20240125201600-b689e9c89409/go.mod h1:qGj2agzgwQ27nYhP3xhLs+IBzE5+ALNUg8bDfMcwPqo=
github.com/machadovilaca/operator-observability v0.0.12 h1:rd9iFmvWJiYS8LdW6siAiz8kLigcNLa1+dmCVb7dFxs=
github.com/machadovilaca/operator-observability v0.0.12/go.mod h1:NGkaR3HEYLScVQf6kQAyxWOSN1ltHcsEvHU/8iIJ8cE=
github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
Expand Down
7 changes: 6 additions & 1 deletion internal/operands/metrics/reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"kubevirt.io/ssp-operator/internal/common"
"kubevirt.io/ssp-operator/internal/operands"
"kubevirt.io/ssp-operator/pkg/monitoring/rules"
)

// Define RBAC rules needed by this operand:
Expand Down Expand Up @@ -96,7 +97,11 @@ func reconcileMonitoringRbacRoleBinding(request *common.Request) (common.Reconci
}

func reconcilePrometheusRule(request *common.Request) (common.ReconcileResult, error) {
prometheusRule, err := newPrometheusRule(request.Namespace)
if err := rules.SetupRules(); err != nil {
return common.ReconcileResult{}, err
}

prometheusRule, err := rules.BuildPrometheusRule(request.Namespace)
if err != nil {
return common.ReconcileResult{}, err
}
Expand Down
8 changes: 6 additions & 2 deletions internal/operands/metrics/reconcile_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (

ssp "kubevirt.io/ssp-operator/api/v1beta2"
"kubevirt.io/ssp-operator/internal/common"
"kubevirt.io/ssp-operator/pkg/monitoring/rules"
)

var log = logf.Log.WithName("metrics_operand")
Expand Down Expand Up @@ -67,7 +68,7 @@ var _ = Describe("Metrics operand", func() {
_, err := operand.Reconcile(&request)
Expect(err).ToNot(HaveOccurred())

prometheusRule, err := newPrometheusRule(namespace)
prometheusRule, err := rules.BuildPrometheusRule(namespace)
Expect(err).ToNot(HaveOccurred())

ExpectResourceExists(prometheusRule, request)
Expand All @@ -82,7 +83,7 @@ var _ = Describe("Metrics operand", func() {
os.Setenv(runbookURLTemplateEnv, template)
}

prometheusRule, err := newPrometheusRule(namespace)
err := rules.SetupRules()

if strings.Count(template, "%s") != 1 || strings.Count(template, "%") != 1 {
Expect(err).To(HaveOccurred())
Expand All @@ -91,6 +92,9 @@ var _ = Describe("Metrics operand", func() {

Expect(err).ToNot(HaveOccurred())

prometheusRule, err := rules.BuildPrometheusRule(namespace)
Expect(err).ToNot(HaveOccurred())

for _, group := range prometheusRule.Spec.Groups {
for _, rule := range group.Rules {
if rule.Alert != "" {
Expand Down
47 changes: 0 additions & 47 deletions internal/operands/metrics/resources.go
Original file line number Diff line number Diff line change
@@ -1,15 +1,9 @@
package metrics

import (
"errors"
"os"
"strings"

promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
rbac "k8s.io/api/rbac/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"kubevirt.io/ssp-operator/pkg/monitoring/rules"
)

const (
Expand Down Expand Up @@ -96,44 +90,3 @@ func newServiceMonitorCR(namespace string) *promv1.ServiceMonitor {
},
}
}

// newPrometheusRule assembles the PrometheusRule object for the given
// namespace. The object carries a single rule group named "cnv.rules" that
// contains the recording rules followed by the alert rules.
//
// It returns an error, and a nil rule, when the runbook URL template
// obtained from getRunbookURLTemplate is rejected.
func newPrometheusRule(namespace string) (*promv1.PrometheusRule, error) {
	runbookURLTemplate, err := getRunbookURLTemplate()
	if err != nil {
		return nil, err
	}

	ruleLabels := map[string]string{
		"prometheus":       "k8s",
		"role":             "alert-rules",
		"kubevirt.io":      "prometheus-rules",
		PrometheusLabelKey: PrometheusLabelValue,
	}

	// Recording rules come first; the alert rules are appended after them.
	groupRules := append(rules.RecordRules(), rules.AlertRules(runbookURLTemplate)...)

	prometheusRule := &promv1.PrometheusRule{
		ObjectMeta: metav1.ObjectMeta{
			Name:      PrometheusRuleName,
			Namespace: namespace,
			Labels:    ruleLabels,
		},
		Spec: promv1.PrometheusRuleSpec{
			Groups: []promv1.RuleGroup{
				{Name: "cnv.rules", Rules: groupRules},
			},
		},
	}
	return prometheusRule, nil
}

func getRunbookURLTemplate() (string, error) {
runbookURLTemplate, exists := os.LookupEnv(runbookURLTemplateEnv)
if !exists {
runbookURLTemplate = defaultRunbookURLTemplate
}

if strings.Count(runbookURLTemplate, "%s") != 1 || strings.Count(runbookURLTemplate, "%") != 1 {
return "", errors.New("runbook URL template must have exactly 1 %s substring")
}

return runbookURLTemplate, nil
}
5 changes: 4 additions & 1 deletion internal/template-validator/validator/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,10 @@ func (app *App) Run() {
registerReadinessProbe()

// setup monitoring
validatorMetrics.SetupMetrics()
err = validatorMetrics.SetupMetrics()
if err != nil {
logger.Log.Error(err, "Error setting up metrics")
}

logger.Log.Info("TLS certs directory", "directory", app.TLSInfo.CertsDirectory)

Expand Down
11 changes: 7 additions & 4 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,15 +181,18 @@ func (s *prometheusServer) getPrometheusTLSConfig(ctx context.Context, certWatch
}
}

func newPrometheusServer(metricsAddr string, cache cache.Cache) *prometheusServer {
sspMetrics.SetupMetrics()
func newPrometheusServer(metricsAddr string, cache cache.Cache) (*prometheusServer, error) {
err := sspMetrics.SetupMetrics()
if err != nil {
return nil, err
}

return &prometheusServer{
certPath: path.Join(sdkTLSDir, sdkTLSCrt),
keyPath: path.Join(sdkTLSDir, sdkTLSKey),
cache: cache,
serverAddress: metricsAddr,
}
}, nil
}

func main() {
Expand Down Expand Up @@ -249,7 +252,7 @@ func main() {
}
}

metricsServer := newPrometheusServer(metricsAddr, mgr.GetCache())
metricsServer, err := newPrometheusServer(metricsAddr, mgr.GetCache())
if err != nil {
setupLog.Error(err, "unable create Prometheus server")
os.Exit(1)
Expand Down
13 changes: 8 additions & 5 deletions pkg/monitoring/metrics/ssp-operator/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,17 @@ import (
runtimemetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)

func SetupMetrics() {
func SetupMetrics() error {
operatormetrics.Register = runtimemetrics.Registry.Register

if err := operatormetrics.RegisterMetrics(
return operatormetrics.RegisterMetrics(
operatorMetrics,
rbdMetrics,
templateMetrics,
); err != nil {
panic(err)
}
)
}

// ListMetrics returns the Prometheus metrics registered with the
// operatormetrics package, as reported by operatormetrics.ListMetrics.
func ListMetrics() []operatormetrics.Metric {
return operatormetrics.ListMetrics()
}
13 changes: 8 additions & 5 deletions pkg/monitoring/metrics/template-validator/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@ import (
"github.com/machadovilaca/operator-observability/pkg/operatormetrics"
)

func SetupMetrics() {
if err := operatormetrics.RegisterMetrics(
func SetupMetrics() error {
return operatormetrics.RegisterMetrics(
templateMetrics,
); err != nil {
panic(err)
}
)
}

// ListMetrics returns the Prometheus metrics registered with the
// operatormetrics package, as reported by operatormetrics.ListMetrics.
func ListMetrics() []operatormetrics.Metric {
return operatormetrics.ListMetrics()
}
89 changes: 89 additions & 0 deletions pkg/monitoring/rules/alerts/operator.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package alerts

import (
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"
)

// Label keys set on every alert rule returned by operatorAlerts.
const (
// severityAlertLabelKey is the label key under which each alert's severity
// is stored (operatorAlerts uses the values "critical" and "warning").
severityAlertLabelKey = "severity"
// healthImpactAlertLabelKey is the label key under which each alert's
// operator health impact is stored (operatorAlerts uses the values
// "critical", "warning" and "none").
healthImpactAlertLabelKey = "operator_health_impact"
)

// operatorAlerts returns the alerting rules declared for the SSP operator, in
// a fixed order: SSPDown, SSPTemplateValidatorDown, SSPFailingToReconcile,
// SSPHighRateRejectedVms, SSPCommonTemplatesModificationReverted and
// VMStorageClassWarning.
//
// Every rule carries a "summary" annotation plus the severity and
// operator-health-impact labels. VMStorageClassWarning additionally carries a
// "description" annotation and has no For duration.
func operatorAlerts() []promv1.Rule {
	// newAlert builds one alerting rule. An empty pending string leaves the
	// rule's For field unset; otherwise a fresh pointer is allocated per rule
	// so that no two rules share the same Duration value.
	newAlert := func(name, expr, pending, summary, severity, healthImpact string) promv1.Rule {
		rule := promv1.Rule{
			Alert: name,
			Expr:  intstr.FromString(expr),
			Annotations: map[string]string{
				"summary": summary,
			},
			Labels: map[string]string{
				severityAlertLabelKey:     severity,
				healthImpactAlertLabelKey: healthImpact,
			},
		}
		if pending != "" {
			rule.For = ptr.To(promv1.Duration(pending))
		}
		return rule
	}

	// The storage class warning is the only rule with an extra annotation, so
	// it is completed separately before being added to the list.
	storageClassWarning := newAlert(
		"VMStorageClassWarning",
		"(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0",
		"",
		"{{ $value }} Virtual Machines may cause reports of bad crc/signature errors due to certain I/O patterns.",
		"warning",
		"none",
	)
	storageClassWarning.Annotations["description"] = "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', VMs may cause reports of bad crc/signature errors due to certain I/O patterns. Cluster performance can be severely degraded if the number of re-transmissions due to crc errors causes network saturation."

	return []promv1.Rule{
		newAlert(
			"SSPDown",
			"kubevirt_ssp_operator_up == 0",
			"5m",
			"All SSP operator pods are down.",
			"critical",
			"critical",
		),
		newAlert(
			"SSPTemplateValidatorDown",
			"kubevirt_ssp_template_validator_up == 0",
			"5m",
			"All Template Validator pods are down.",
			"critical",
			"critical",
		),
		newAlert(
			"SSPFailingToReconcile",
			"(kubevirt_ssp_operator_reconcile_succeeded_aggregated == 0) and (kubevirt_ssp_operator_up > 0)",
			"5m",
			"The ssp-operator pod is up but failing to reconcile.",
			"critical",
			"critical",
		),
		newAlert(
			"SSPHighRateRejectedVms",
			"kubevirt_ssp_template_validator_rejected_increase > 5",
			"5m",
			"High rate of rejected Vms.",
			"warning",
			"warning",
		),
		newAlert(
			"SSPCommonTemplatesModificationReverted",
			"kubevirt_ssp_common_templates_restored_increase > 0",
			"0m",
			"Common Templates manual modifications were reverted by the operator.",
			"warning",
			"none",
		),
		storageClassWarning,
	}
}

0 comments on commit 1579c18

Please sign in to comment.