Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate an Event, and a Prometheus based Alert for non evictable VMs #5392

Merged
merged 7 commits into from Apr 20, 2021
30 changes: 30 additions & 0 deletions hack/prom-rule-ci/prom-rules-tests.yaml
Expand Up @@ -243,3 +243,33 @@ tests:
- eval_time: 1m
alertname: KubevirtVmHighMemoryUsage
exp_alerts: []

# VM eviction strategy is set but vm is not migratable
- interval: 1m
input_series:
- series: 'kubevirt_vmi_non_evictable{node="node1", namespace="ns-test", name="vm-evict-nonmigratable"}'
values: "1 1 1 1 1 1 1 1"

alert_rule_test:
- eval_time: 1m
alertname: VMCannotBeEvicted
exp_alerts:
- exp_annotations:
description: "Eviction policy for vm-evict-nonmigratable (on node node1) is set to Live Migration but the VM is not migratable"
summary: "The VM's eviction strategy is set to Live Migration but the VM is not migratable"
exp_labels:
severity: "warning"
name: "vm-evict-nonmigratable"
namespace: "ns-test"
node: "node1"

# VM eviction strategy is set and vm is migratable
- interval: 1m
input_series:
- series: 'kubevirt_vmi_non_evictable{node="node1", namespace="ns-test", name="vm-evict-migratable"}'
values: "0 0 0 0 0 0 0 0 "

alert_rule_test:
- eval_time: 1m
alertname: VMCannotBeEvicted
exp_alerts: []
4 changes: 4 additions & 0 deletions pkg/monitoring/vms/prometheus/BUILD.bazel
Expand Up @@ -10,6 +10,7 @@ go_library(
importpath = "kubevirt.io/kubevirt/pkg/monitoring/vms/prometheus",
visibility = ["//visibility:public"],
deps = [
"//pkg/controller:go_default_library",
"//pkg/virt-handler/cmd-client:go_default_library",
"//pkg/virt-launcher/virtwrap/stats:go_default_library",
"//pkg/virt-launcher/virtwrap/statsconv:go_default_library",
Expand All @@ -20,6 +21,7 @@ go_library(
"//staging/src/kubevirt.io/client-go/version:go_default_library",
"//vendor/github.com/prometheus/client_golang/prometheus:go_default_library",
"//vendor/github.com/prometheus/client_golang/prometheus/promhttp:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/client-go/tools/cache:go_default_library",
"//vendor/libvirt.org/libvirt-go:go_default_library",
],
Expand All @@ -38,9 +40,11 @@ go_test(
"//staging/src/kubevirt.io/client-go/api/v1:go_default_library",
"//staging/src/kubevirt.io/client-go/testutils:go_default_library",
"//vendor/github.com/onsi/ginkgo:go_default_library",
"//vendor/github.com/onsi/ginkgo/extensions/table:go_default_library",
"//vendor/github.com/onsi/gomega:go_default_library",
"//vendor/github.com/prometheus/client_golang/prometheus:go_default_library",
"//vendor/github.com/prometheus/client_model/go:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//vendor/libvirt.org/libvirt-go:go_default_library",
],
Expand Down
43 changes: 42 additions & 1 deletion pkg/monitoring/vms/prometheus/prometheus.go
Expand Up @@ -25,6 +25,7 @@ import (
"strings"
"time"

k8sv1 "k8s.io/api/core/v1"
"k8s.io/client-go/tools/cache"

libvirt "libvirt.org/libvirt-go"
Expand All @@ -36,6 +37,7 @@ import (
"kubevirt.io/client-go/kubecli"
"kubevirt.io/client-go/log"
"kubevirt.io/client-go/version"
"kubevirt.io/kubevirt/pkg/controller"
cmdclient "kubevirt.io/kubevirt/pkg/virt-handler/cmd-client"
"kubevirt.io/kubevirt/pkg/virt-launcher/virtwrap/stats"
)
Expand Down Expand Up @@ -68,6 +70,15 @@ var (
},
nil,
)

vmiEvictionBlockerDesc = prometheus.NewDesc(
"kubevirt_vmi_non_evictable",
"Indication for a VirtualMachine that its eviction strategy is set to Live Migration but is not migratable.",
[]string{
"node", "namespace", "name",
},
nil,
)
)

func tryToPushMetric(desc *prometheus.Desc, mv prometheus.Metric, err error, ch chan<- prometheus.Metric) {
Expand Down Expand Up @@ -444,6 +455,36 @@ func updateVMIsPhase(nodeName string, vmis []*k6tv1.VirtualMachineInstance, ch c
}
}

func updateVMIEvictionBlocker(nodeName string, vmis []*k6tv1.VirtualMachineInstance, ch chan<- prometheus.Metric) {
for _, vmi := range vmis {
mv, err := prometheus.NewConstMetric(
vmiEvictionBlockerDesc, prometheus.GaugeValue,
checkNonEvictableVMAndSetMetric(vmi),
nodeName, vmi.Namespace, vmi.Name,
)
if err != nil {
continue
}
ch <- mv
}
}

func checkNonEvictableVMAndSetMetric(vmi *k6tv1.VirtualMachineInstance) float64 {
setVal := 0.0
if vmi.IsEvictable() {
vmiIsMigratableCond := controller.NewVirtualMachineInstanceConditionManager().
GetCondition(vmi, k6tv1.VirtualMachineInstanceIsMigratable)

//As this metric is used for user alert we refer to be conservative - so if the VirtualMachineInstanceIsMigratable
//condition is still not set we treat the VM as if it's "not migratable"
if vmiIsMigratableCond == nil || vmiIsMigratableCond.Status == k8sv1.ConditionFalse {
setVal = 1.0
}

}
return setVal
}

func updateVersion(ch chan<- prometheus.Metric) {
verinfo := version.Get()
ch <- prometheus.MustNewConstMetric(
Expand Down Expand Up @@ -518,6 +559,7 @@ func (co *Collector) Collect(ch chan<- prometheus.Metric) {
co.concCollector.Collect(socketToVMIs, scraper, collectionTimeout)

updateVMIsPhase(co.nodeName, vmis, ch)
updateVMIEvictionBlocker(co.nodeName, vmis, ch)
return
}

Expand Down Expand Up @@ -581,7 +623,6 @@ func (ps *prometheusScraper) Report(socketFile string, vmi *k6tv1.VirtualMachine

vmiMetrics := newVmiMetrics(vmi, ps.ch)
vmiMetrics.updateMetrics(vmStats)

}

func Handler(MaxRequestsInFlight int) http.Handler {
Expand Down
56 changes: 56 additions & 0 deletions pkg/monitoring/vms/prometheus/prometheus_test.go
Expand Up @@ -22,9 +22,12 @@ package prometheus
import (
"fmt"

"github.com/onsi/ginkgo/extensions/table"
"github.com/prometheus/client_golang/prometheus"
io_prometheus_client "github.com/prometheus/client_model/go"
k8sv1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

libvirt "libvirt.org/libvirt-go"

. "github.com/onsi/ginkgo"
Expand Down Expand Up @@ -966,8 +969,61 @@ var _ = Describe("Prometheus", func() {
Expect(s).To(ContainSubstring("vcpu_0_cpu_2=true"))
})
})

Context("VMI Eviction blocker", func() {

liveMigrateEvictPolicy := k6tv1.EvictionStrategyLiveMigrate
table.DescribeTable("Add evictionion alert matrics", func(evictionPolicy *k6tv1.EvictionStrategy, migrateCondStatus k8sv1.ConditionStatus, expectedVal float64) {

ch := make(chan prometheus.Metric, 1)
defer close(ch)

vmis := createVMISForEviction(evictionPolicy, migrateCondStatus)
updateVMIEvictionBlocker("testNode", vmis, ch)

result := <-ch
dto := &io_prometheus_client.Metric{}
result.Write(dto)

Expect(result).ToNot(BeNil())
Expect(result.Desc().String()).To(ContainSubstring("kubevirt_vmi_non_evictable"))
Expect(dto.Gauge.GetValue()).To(BeEquivalentTo(expectedVal))
},
table.Entry("VMI Eviction policy set to LiveMigration and vm is not migratable", &liveMigrateEvictPolicy, k8sv1.ConditionFalse, 1.0),
table.Entry("VMI Eviction policy set to LiveMigration and vm migratable status is not known", &liveMigrateEvictPolicy, k8sv1.ConditionUnknown, 1.0),
table.Entry("VMI Eviction policy set to LiveMigration and vm is migratable", &liveMigrateEvictPolicy, k8sv1.ConditionTrue, 0.0),
table.Entry("VMI Eviction policy is not set and vm is not migratable", nil, k8sv1.ConditionFalse, 0.0),
table.Entry("VMI Eviction policy is not set and vm is migratable", nil, k8sv1.ConditionTrue, 0.0),
table.Entry("VMI Eviction policy is not set and vm migratable status is not known", nil, k8sv1.ConditionUnknown, 0.0),
)
})
})

func createVMISForEviction(evictionStrategy *k6tv1.EvictionStrategy, migratableCondStatus k8sv1.ConditionStatus) []*k6tv1.VirtualMachineInstance {

vmis := []*k6tv1.VirtualMachineInstance{
{
ObjectMeta: metav1.ObjectMeta{
Namespace: "test-ns",
Name: "testvmi",
},
},
}

if migratableCondStatus != k8sv1.ConditionUnknown {
vmis[0].Status.Conditions = []k6tv1.VirtualMachineInstanceCondition{
{
Type: k6tv1.VirtualMachineInstanceIsMigratable,
Status: migratableCondStatus,
},
}
}

vmis[0].Spec.EvictionStrategy = evictionStrategy

return vmis
}

var _ = Describe("Utility functions", func() {
Context("VMI Count map reporting", func() {
It("should handle missing VMs", func() {
Expand Down
3 changes: 3 additions & 0 deletions pkg/virt-handler/vm.go
Expand Up @@ -1012,6 +1012,9 @@ func (d *VirtualMachineController) updateVMIStatus(origVMI *v1.VirtualMachineIns
vmi.Status.Conditions = append(vmi.Status.Conditions, *liveMigrationCondition)
}
}
if vmi.IsEvictable() && liveMigrationCondition.Status == k8sv1.ConditionFalse {
d.recorder.Event(vmi, k8sv1.EventTypeWarning, v1.Migrated.String(), "EvictionStrategy is set but vmi is not migratable")
}
// Update the condition when GA is connected
channelConnected := false
if domain != nil {
Expand Down
12 changes: 12 additions & 0 deletions pkg/virt-operator/resource/generate/components/crds.go
Expand Up @@ -858,6 +858,18 @@ func NewPrometheusRuleSpec(ns string, workloadUpdatesEnabled bool) *promv1.Prome
"severity": "warning",
},
},
{
Alert: "VMCannotBeEvicted",
Expr: intstr.FromString("kubevirt_vmi_non_evictable > 0"),
For: "1m",
Annotations: map[string]string{
"description": "Eviction policy for {{ $labels.name }} (on node {{ $labels.node }}) is set to Live Migration but the VM is not migratable",
"summary": "The VM's eviction strategy is set to Live Migration but the VM is not migratable",
},
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it make sense to attempt to succinctly explain to the admin why this is a problem, that this would cause difficulty during node drain?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we do that as part of the run-book, where we include full details on the issue (including overview, mitigation, etc.) . Here we just explain the alert itself. If you still think it should go here as well I can add it

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PR for the runbook is here: kubevirt/monitoring#6

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok. fair point.

/lgtm

Labels: map[string]string{
"severity": "warning",
},
},
},
},
},
Expand Down
20 changes: 20 additions & 0 deletions tests/infra_test.go
Expand Up @@ -900,6 +900,26 @@ var _ = Describe("[Serial][owner:@sig-compute]Infrastructure", func() {
table.Entry("by IPv6", k8sv1.IPv6Protocol),
)

table.DescribeTable("should include VMI eviction blocker status for all running VMs", func(family k8sv1.IPFamily) {
if family == k8sv1.IPv6Protocol {
libnet.SkipWhenNotDualStackCluster(virtClient)
}

ip := getSupportedIP(metricsIPs, family)

metrics := collectMetrics(ip, "kubevirt_vmi_non_evictable")
By("Checking the collected metrics")
keys := getKeysFromMetrics(metrics)
for _, key := range keys {
value := metrics[key]
fmt.Fprintf(GinkgoWriter, "metric value was %f\n", value)
Expect(value).To(BeNumerically(">=", float64(0.0)))
}
},
table.Entry("[test_id:4148] by IPv4", k8sv1.IPv4Protocol),
table.Entry("by IPv6", k8sv1.IPv6Protocol),
)

table.DescribeTable("should include kubernetes labels to VMI metrics", func(family k8sv1.IPFamily) {
if family == k8sv1.IPv6Protocol {
libnet.SkipWhenNotDualStackCluster(virtClient)
Expand Down