Skip to content

Commit

Permalink
Fix VirtualMachineCRCErrors not getting resolved
Browse files Browse the repository at this point in the history
Signed-off-by: João Vilaça <jvilaca@redhat.com>
  • Loading branch information
machadovilaca committed Nov 29, 2023
1 parent 161c4cc commit 58ad69e
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 37 deletions.
6 changes: 5 additions & 1 deletion controllers/vm_controller.go
Expand Up @@ -58,7 +58,11 @@ func (r *VmReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Re
vm := kubevirtv1.VirtualMachine{}
if err := r.client.Get(ctx, req.NamespacedName, &vm); err != nil {
if errors.IsNotFound(err) {
// VM was deleted, so we can ignore it
// VM was deleted
vm.Name = req.Name
vm.Namespace = req.Namespace
metrics.SetVmWithVolume(&vm, nil, nil)

return ctrl.Result{}, nil
}
r.log.Error(err, "Error getting VM", "vm", req.NamespacedName)
Expand Down
4 changes: 2 additions & 2 deletions docs/metrics.md
Expand Up @@ -19,7 +19,7 @@ The increase in the number of rejected template validators, over the last hour.
The total number of rejected template validators. Type: Counter.
### kubevirt_ssp_template_validator_up
The total number of running virt-template-validator pods. Type: Gauge.
### kubevirt_ssp_vm_rbd_volume
VM with RBD mounted volume. Type: Gauge.
### kubevirt_ssp_vm_rbd_block_volume_without_rxbounce
VM with RBD mounted Block volume (without rxbounce option set). Type: Gauge.
## Developing new metrics
After developing new metrics or changing old ones, please run `make generate-doc` to regenerate this document.
4 changes: 2 additions & 2 deletions internal/operands/metrics/resources.go
Expand Up @@ -166,15 +166,15 @@ func getAlertRules() ([]promv1.Rule, error) {
},
{
Alert: "VirtualMachineCRCErrors",
Expr: intstr.FromString("(count(kubevirt_ssp_vm_rbd_volume{volume_mode=\"Block\", rxbounce_enabled=\"false\"} > 0) or vector(0)) > 0"),
Expr: intstr.FromString("(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0"),
Annotations: map[string]string{
"description": "{{ $value }} Virtual Machines are in risk of causing CRC errors and major service outages",
"summary": "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', it will report bad crc/signature errors and cluster performance will be severely degraded if krbd:rxbounce is not set.",
"runbook_url": fmt.Sprintf(runbookURLTemplate, "VirtualMachineCRCErrors"),
},
Labels: map[string]string{
severityAlertLabelKey: "warning",
healthImpactAlertLabelKey: "warning",
healthImpactAlertLabelKey: "none",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
Expand Down
36 changes: 22 additions & 14 deletions pkg/monitoring/metrics/rbd_metrics.go
@@ -1,7 +1,6 @@
package metrics

import (
"strconv"
"strings"

"github.com/machadovilaca/operator-observability/pkg/operatormetrics"
Expand All @@ -16,36 +15,45 @@ var (

vmRbdVolume = operatormetrics.NewGaugeVec(
operatormetrics.MetricOpts{
Name: metricPrefix + "vm_rbd_volume",
Help: "VM with RBD mounted volume",
Name: metricPrefix + "vm_rbd_block_volume_without_rxbounce",
Help: "VM with RBD mounted Block volume (without rxbounce option set)",
ExtraFields: map[string]string{
"StabilityLevel": "ALPHA",
},
},
[]string{"name", "namespace", "pv_name", "volume_mode", "rxbounce_enabled"},
[]string{"name", "namespace"},
)
)

func SetVmWithVolume(vm *kubevirtv1.VirtualMachine, pvc *k8sv1.PersistentVolumeClaim, pv *k8sv1.PersistentVolume) {
if pv.Spec.PersistentVolumeSource.CSI == nil || !strings.Contains(pv.Spec.PersistentVolumeSource.CSI.Driver, "rbd.csi.ceph.com") {
if pv == nil || pvc == nil {
vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(0)
return
}

// If the volume is not using the RBD CSI driver, or if it is not using the block mode, it is not impacted by https://bugzilla.redhat.com/2109455
if !usesRbdCsiDriver(pv) || *pvc.Spec.VolumeMode != k8sv1.PersistentVolumeBlock {
vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(0)
return
}

mounter, ok := pv.Spec.PersistentVolumeSource.CSI.VolumeAttributes["mounter"]
// If mounter is not set, it is using the default mounter, which is "rbd"
if ok && mounter != "rbd" {
vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(0)
return
}

rxbounce, ok := pv.Spec.PersistentVolumeSource.CSI.VolumeAttributes["mapOptions"]
if !ok {
rxbounce = ""
// If mapOptions is not set, or if it is set but does not contain "krbd:rxbounce", it might be impacted by https://bugzilla.redhat.com/2109455
if !ok || !strings.Contains(rxbounce, "krbd:rxbounce") {
vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(1)
return
}

vmRbdVolume.WithLabelValues(
vm.Name,
vm.Namespace,
pv.Name,
string(*pvc.Spec.VolumeMode),
strconv.FormatBool(strings.Contains(rxbounce, "krbd:rxbounce")),
).Set(1)
vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Set(0)
}

func usesRbdCsiDriver(pv *k8sv1.PersistentVolume) bool {
return pv.Spec.PersistentVolumeSource.CSI != nil && strings.Contains(pv.Spec.PersistentVolumeSource.CSI.Driver, "rbd.csi.ceph.com")
}
20 changes: 5 additions & 15 deletions pkg/monitoring/metrics/rbd_metrics_test.go
@@ -1,9 +1,6 @@
package metrics

import (
"strconv"
"strings"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

Expand Down Expand Up @@ -73,14 +70,7 @@ var _ = Describe("rbd_metrics", func() {
SetVmWithVolume(vm, pvc, pv)

dto := &ioprometheusclient.Metric{}
err := vmRbdVolume.WithLabelValues(
vm.Name,
vm.Namespace,
pv.Name,
string(*pvc.Spec.VolumeMode),
strconv.FormatBool(strings.Contains(mapOptions, "krbd:rxbounce")),
).Write(dto)

err := vmRbdVolume.WithLabelValues(vm.Name, vm.Namespace).Write(dto)
Expect(err).ToNot(HaveOccurred())

if metricExists {
Expand All @@ -89,10 +79,10 @@ var _ = Describe("rbd_metrics", func() {
Expect(dto.GetGauge().GetValue()).To(Equal(float64(0)))
}
},
Entry("rbd driver and default mounter", rbdDriver, "", "krbd:rxbounce", true),
Entry("rbd driver and rbd mounter", rbdDriver, "rbd", "krbd:rxbounce", true),
Entry("non-rbd driver", "random", "rbd", "krbd:rxbounce", false),
Entry("non-rbd mounter", rbdDriver, "random", "krbd:rxbounce", false),
Entry("rbd driver and default mounter", rbdDriver, "", "krbd:rxbounce", false),
Entry("rbd driver and rbd mounter", rbdDriver, "rbd", "krbd:rxbounce", false),
Entry("non-rbd driver", "random", "rbd", "", false),
Entry("non-rbd mounter", rbdDriver, "random", "", false),
Entry("krbd:rxbounce not enabled", rbdDriver, "rbd", "random", true),
)
})
11 changes: 8 additions & 3 deletions tests/monitoring_test.go
Expand Up @@ -287,6 +287,11 @@ var _ = Describe("Prometheus Alerts", func() {
vmName := createResources(true, false)
waitForSeriesToBeDetected(fmt.Sprintf("kubevirt_ssp_vm_rbd_volume{name='%s', rxbounce_enabled='false'}", vmName))
waitForAlertToActivate("VirtualMachineCRCErrors")

Eventually(func() error {
return apiClient.Delete(ctx, vm)
}).ShouldNot(Succeed())
alertShouldNotBeActive("VirtualMachineCRCErrors")
})
})
})
Expand All @@ -301,7 +306,7 @@ func checkAlert(alertName string) (*promApiv1.Alert, error) {
}

func waitForAlertToActivate(alertName string) {
Eventually(func() error {
EventuallyWithOffset(1, func() error {
alert, err := checkAlert(alertName)
if err != nil {
return err
Expand All @@ -314,7 +319,7 @@ func waitForAlertToActivate(alertName string) {
}

func alertShouldNotBeActive(alertName string) {
Eventually(func() error {
EventuallyWithOffset(1, func() error {
alert, err := checkAlert(alertName)
if err != nil {
return err
Expand All @@ -325,7 +330,7 @@ func alertShouldNotBeActive(alertName string) {
return fmt.Errorf("alert %s found", alertName)
}, env.Timeout(), time.Second).ShouldNot(HaveOccurred())

Consistently(func() error {
ConsistentlyWithOffset(1, func() error {
alert, err := checkAlert(alertName)
if err != nil {
return err
Expand Down

0 comments on commit 58ad69e

Please sign in to comment.