Skip to content

Commit

Permalink
Fix VirtualMachineCRCErrors not getting resolved
Browse files Browse the repository at this point in the history
The controller keeps reporting old metric values
even after the VM is deleted. This commit updates
the metric name and labels so that we can set up
the metric value to 0, and no longer trigger the
alert on VM deletion

Signed-off-by: João Vilaça <jvilaca@redhat.com>
  • Loading branch information
machadovilaca committed Jan 4, 2024
1 parent b29e389 commit 47709c4
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 69 deletions.
6 changes: 5 additions & 1 deletion controllers/vm_controller.go
Expand Up @@ -58,7 +58,11 @@ func (r *VmReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Re
vm := kubevirtv1.VirtualMachine{}
if err := r.client.Get(ctx, req.NamespacedName, &vm); err != nil {
if errors.IsNotFound(err) {
// VM was deleted, so we can ignore it
// VM was deleted
vm.Name = req.Name
vm.Namespace = req.Namespace
metrics.SetVmWithVolume(&vm, nil, nil)

return ctrl.Result{}, nil
}
r.log.Error(err, "Error getting VM", "vm", req.NamespacedName)
Expand Down
4 changes: 2 additions & 2 deletions docs/metrics.md
Expand Up @@ -19,7 +19,7 @@ The increase in the number of rejected template validators, over the last hour.
The total number of rejected template validators. Type: Counter.
### kubevirt_ssp_template_validator_up
The total number of running virt-template-validator pods. Type: Gauge.
### kubevirt_ssp_vm_rbd_volume
VM with RBD mounted volume. Type: Gauge.
### kubevirt_ssp_vm_rbd_block_volume_without_rxbounce
VM with RBD mounted Block volume (without rxbounce option set). Type: Gauge.
## Developing new metrics
After developing new metrics or changing old ones, please run `make generate-doc` to regenerate this document.
4 changes: 2 additions & 2 deletions internal/operands/metrics/resources.go
Expand Up @@ -166,15 +166,15 @@ func getAlertRules() ([]promv1.Rule, error) {
},
{
Alert: "VirtualMachineCRCErrors",
Expr: intstr.FromString("(count(kubevirt_ssp_vm_rbd_volume{volume_mode=\"Block\", rxbounce_enabled=\"false\"} > 0) or vector(0)) > 0"),
Expr: intstr.FromString("(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0"),
Annotations: map[string]string{
"description": "{{ $value }} Virtual Machines are in risk of causing CRC errors and major service outages",
"summary": "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', it will report bad crc/signature errors and cluster performance will be severely degraded if krbd:rxbounce is not set.",
"runbook_url": fmt.Sprintf(runbookURLTemplate, "VirtualMachineCRCErrors"),
},
Labels: map[string]string{
severityAlertLabelKey: "warning",
healthImpactAlertLabelKey: "warning",
healthImpactAlertLabelKey: "none",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
Expand Down
36 changes: 22 additions & 14 deletions pkg/monitoring/metrics/rbd_metrics.go
@@ -1,7 +1,6 @@
package metrics

import (
"strconv"
"strings"

"github.com/machadovilaca/operator-observability/pkg/operatormetrics"
Expand All @@ -16,36 +15,45 @@ var (

vmRbdVolume = operatormetrics.NewGaugeVec(
operatormetrics.MetricOpts{
Name: metricPrefix + "vm_rbd_volume",
Help: "VM with RBD mounted volume",
Name: metricPrefix + "vm_rbd_block_volume_without_rxbounce",
Help: "VM with RBD mounted Block volume (without rxbounce option set)",
ExtraFields: map[string]string{
"StabilityLevel": "ALPHA",
},
},
[]string{"name", "namespace", "pv_name", "volume_mode", "rxbounce_enabled"},
[]string{"name", "namespace"},
)
)

func SetVmWithVolume(vm *kubevirtv1.VirtualMachine, pvc *k8sv1.PersistentVolumeClaim, pv *k8sv1.PersistentVolume) {
if pv.Spec.PersistentVolumeSource.CSI == nil || !strings.Contains(pv.Spec.PersistentVolumeSource.CSI.Driver, "rbd.csi.ceph.com") {
if pv == nil || pvc == nil {
vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(0)
return
}

// If the volume is not using the RBD CSI driver, or if it is not using the block mode, it is not impacted by https://bugzilla.redhat.com/2109455
if !usesRbdCsiDriver(pv) || *pvc.Spec.VolumeMode != k8sv1.PersistentVolumeBlock {
vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(0)
return
}

mounter, ok := pv.Spec.PersistentVolumeSource.CSI.VolumeAttributes["mounter"]
// If mounter is not set, it is using the default mounter, which is "rbd"
if ok && mounter != "rbd" {
vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(0)
return
}

rxbounce, ok := pv.Spec.PersistentVolumeSource.CSI.VolumeAttributes["mapOptions"]
if !ok {
rxbounce = ""
// If mapOptions is not set, or if it is set but does not contain "krbd:rxbounce", it might be impacted by https://bugzilla.redhat.com/2109455
if !ok || !strings.Contains(rxbounce, "krbd:rxbounce") {
vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(1)
return
}

vmRbdVolume.WithLabelValues(
vm.Name,
vm.Namespace,
pv.Name,
string(*pvc.Spec.VolumeMode),
strconv.FormatBool(strings.Contains(rxbounce, "krbd:rxbounce")),
).Set(1)
vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(0)
}

func usesRbdCsiDriver(pv *k8sv1.PersistentVolume) bool {
return pv.Spec.PersistentVolumeSource.CSI != nil && strings.Contains(pv.Spec.PersistentVolumeSource.CSI.Driver, "rbd.csi.ceph.com")
}
20 changes: 5 additions & 15 deletions pkg/monitoring/metrics/rbd_metrics_test.go
@@ -1,9 +1,6 @@
package metrics

import (
"strconv"
"strings"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

Expand Down Expand Up @@ -73,14 +70,7 @@ var _ = Describe("rbd_metrics", func() {
SetVmWithVolume(vm, pvc, pv)

dto := &ioprometheusclient.Metric{}
err := vmRbdVolume.WithLabelValues(
vm.Name,
vm.Namespace,
pv.Name,
string(*pvc.Spec.VolumeMode),
strconv.FormatBool(strings.Contains(mapOptions, "krbd:rxbounce")),
).Write(dto)

err := vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Write(dto)
Expect(err).ToNot(HaveOccurred())

if metricExists {
Expand All @@ -89,10 +79,10 @@ var _ = Describe("rbd_metrics", func() {
Expect(dto.GetGauge().GetValue()).To(Equal(float64(0)))
}
},
Entry("rbd driver and default mounter", rbdDriver, "", "krbd:rxbounce", true),
Entry("rbd driver and rbd mounter", rbdDriver, "rbd", "krbd:rxbounce", true),
Entry("non-rbd driver", "random", "rbd", "krbd:rxbounce", false),
Entry("non-rbd mounter", rbdDriver, "random", "krbd:rxbounce", false),
Entry("rbd driver and default mounter", rbdDriver, "", "krbd:rxbounce", false),
Entry("rbd driver and rbd mounter", rbdDriver, "rbd", "krbd:rxbounce", false),
Entry("non-rbd driver", "random", "rbd", "", false),
Entry("non-rbd mounter", rbdDriver, "random", "", false),
Entry("krbd:rxbounce not enabled", rbdDriver, "rbd", "random", true),
)
})
57 changes: 22 additions & 35 deletions tests/monitoring_test.go
Expand Up @@ -3,8 +3,8 @@ package tests
import (
"context"
"crypto/tls"
"errors"
"fmt"
"k8s.io/apimachinery/pkg/api/errors"
"net"
"net/http"
"strings"
Expand Down Expand Up @@ -175,7 +175,7 @@ var _ = Describe("Prometheus Alerts", func() {

AfterEach(func() {
vmError := apiClient.Delete(ctx, vm)
Expect(vmError).ToNot(HaveOccurred())
Expect(client.IgnoreNotFound(vmError)).ToNot(HaveOccurred())

if pvc != nil {
pvcError := apiClient.Delete(ctx, pvc)
Expand Down Expand Up @@ -271,22 +271,27 @@ var _ = Describe("Prometheus Alerts", func() {
return vmName
}

It("[test_id:TODO] Should not create kubevirt_ssp_vm_rbd_volume when not using DataVolume or PVC", func() {
vmName := createResources(false, true)
seriesShouldNotBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_volume{name='%s'}", vmName))
alertShouldNotBeActive("VirtualMachineCRCErrors")
})

It("[test_id:TODO] Should not fire VirtualMachineCRCErrors when rxbounce is enabled", func() {
vmName := createResources(true, true)
waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_volume{name='%s', rxbounce_enabled='true'}", vmName))
waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_block_volume_without_rxbounce{name='%s'} == 0", vmName))
alertShouldNotBeActive("VirtualMachineCRCErrors")
})

It("[test_id:10549] Should fire VirtualMachineCRCErrors when rxbounce is disabled", func() {
vmName := createResources(true, false)
waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_volume{name='%s', rxbounce_enabled='false'}", vmName))
waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_block_volume_without_rxbounce{name='%s'} == 1", vmName))
waitForAlertToActivate("VirtualMachineCRCErrors")

err := apiClient.Delete(ctx, vm)
Expect(err).ToNot(HaveOccurred())

Eventually(func(g Gomega) {

Check failure on line 288 in tests/monitoring_test.go

View workflow job for this annotation

GitHub Actions / build

ginkgo-linter: "Eventually": missing assertion method. Expected "Should()" or "ShouldNot()" (ginkgolinter)
err := apiClient.Get(ctx, types.NamespacedName{Name: vmName, Namespace: strategy.GetNamespace()}, vm)
g.Expect(err).To(MatchError(errors.IsNotFound, "IsNotFound"))
})

waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_block_volume_without_rxbounce{name='%s'} == 0", vmName))
alertShouldNotBeActive("VirtualMachineCRCErrors")
})
})
})
Expand All @@ -301,7 +306,7 @@ func checkAlert(alertName string) (*promApiv1.Alert, error) {
}

func waitForAlertToActivate(alertName string) {
Eventually(func() error {
EventuallyWithOffset(1, func() error {
alert, err := checkAlert(alertName)
if err != nil {
return err
Expand All @@ -314,55 +319,37 @@ func waitForAlertToActivate(alertName string) {
}

func alertShouldNotBeActive(alertName string) {
Eventually(func() error {
EventuallyWithOffset(1, func() error {
alert, err := checkAlert(alertName)
if err != nil {
return err
}
if alert == nil {
if alert == nil || alert.State == "inactive" {
return nil
}
return fmt.Errorf("alert %s found", alertName)
}, env.Timeout(), time.Second).ShouldNot(HaveOccurred())

Consistently(func() error {
ConsistentlyWithOffset(1, func() error {
alert, err := checkAlert(alertName)
if err != nil {
return err
}
if alert == nil {
if alert == nil || alert.State == "inactive" {
return nil
}
return fmt.Errorf("alert %s found", alertName)
}, env.ShortTimeout(), time.Second).ShouldNot(HaveOccurred())
}

func waitForSeriesToBeDetected(seriesName string) {
Eventually(func() bool {
EventuallyWithOffset(1, func() bool {
results, _, err := getPrometheusClient().Query(context.TODO(), seriesName, time.Now())
Expect(err).ShouldNot(HaveOccurred())
return results.String() != ""
}, env.Timeout(), 10*time.Second).Should(BeTrue())
}

func seriesShouldNotBeDetected(seriesName string) {
Eventually(func() bool {
results, _, err := getPrometheusClient().Query(context.TODO(), seriesName, time.Now())
if err != nil {
return false
}
return results.String() == ""
}, env.Timeout(), 10*time.Second).Should(BeTrue())

Consistently(func() bool {
results, _, err := getPrometheusClient().Query(context.TODO(), seriesName, time.Now())
if err != nil {
return false
}
return results.String() == ""
}, env.ShortTimeout(), 10*time.Second).Should(BeTrue())
}

func getAlertByName(alerts promApiv1.AlertsResult, alertName string) *promApiv1.Alert {
for _, alert := range alerts.Alerts {
if string(alert.Labels["alertname"]) == alertName {
Expand Down Expand Up @@ -436,7 +423,7 @@ func getAuthorizationTokenForPrometheus() string {
if strings.HasPrefix(secret.Name, "prometheus-k8s-token") {
tokenBytes, ok = secret.Data["token"]
if !ok {
return errors.New("token not found in secret data")
return fmt.Errorf("token not found in secret %s", secret.Name)
}
break
}
Expand Down

0 comments on commit 47709c4

Please sign in to comment.