Skip to content

Commit

Permalink
Merge pull request #3867 from whitewindmills/preempt-metrics-events
Browse files Browse the repository at this point in the history
feat: add metrics and events for policy preemption
  • Loading branch information
karmada-bot committed Aug 2, 2023
2 parents 924ef2f + f26a31f commit 946fc72
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 6 deletions.
48 changes: 42 additions & 6 deletions pkg/detector/preemption.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
package detector

import (
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/klog/v2"

policyv1alpha1 "github.com/karmada-io/karmada/pkg/apis/policy/v1alpha1"
"github.com/karmada-io/karmada/pkg/events"
"github.com/karmada-io/karmada/pkg/features"
"github.com/karmada-io/karmada/pkg/metrics"
"github.com/karmada-io/karmada/pkg/util"
"github.com/karmada-io/karmada/pkg/util/helper"
"github.com/karmada-io/karmada/pkg/util/names"
Expand Down Expand Up @@ -74,7 +77,7 @@ func (d *ResourceDetector) handleClusterPropagationPolicyPreemption(policy *poli
}

// preemptPropagationPolicy preempts resource template that is claimed by PropagationPolicy.
func (d *ResourceDetector) preemptPropagationPolicy(resourceTemplate *unstructured.Unstructured, policy *policyv1alpha1.PropagationPolicy) error {
func (d *ResourceDetector) preemptPropagationPolicy(resourceTemplate *unstructured.Unstructured, policy *policyv1alpha1.PropagationPolicy) (err error) {
rtLabels := resourceTemplate.GetLabels()
claimedPolicyNamespace := util.GetLabelValue(rtLabels, policyv1alpha1.PropagationPolicyNamespaceLabel)
claimedPolicyName := util.GetLabelValue(rtLabels, policyv1alpha1.PropagationPolicyNameLabel)
Expand Down Expand Up @@ -104,7 +107,18 @@ func (d *ResourceDetector) preemptPropagationPolicy(resourceTemplate *unstructur
return nil
}

if err := d.ClaimPolicyForObject(resourceTemplate, policy.Namespace, policy.Name); err != nil {
defer func() {
metrics.CountPolicyPreemption(err)
if err != nil {
d.EventRecorder.Eventf(resourceTemplate, corev1.EventTypeWarning, events.EventReasonPreemptPolicyFailed,
"Propagation policy(%s/%s) failed to preempt propagation policy(%s/%s): %v", policy.Namespace, policy.Name, claimedPolicyNamespace, claimedPolicyName, err)
return
}
d.EventRecorder.Eventf(resourceTemplate, corev1.EventTypeNormal, events.EventReasonPreemptPolicySucceed,
"Propagation policy(%s/%s) preempted propagation policy(%s/%s) successfully", policy.Namespace, policy.Name, claimedPolicyNamespace, claimedPolicyName)
}()

if err = d.ClaimPolicyForObject(resourceTemplate, policy.Namespace, policy.Name); err != nil {
klog.Errorf("Failed to claim new propagation policy(%s/%s) on resource template(%s, kind=%s, %s): %v.", policy.Namespace, policy.Name,
resourceTemplate.GetAPIVersion(), resourceTemplate.GetKind(), names.NamespacedKey(resourceTemplate.GetNamespace(), resourceTemplate.GetName()), err)
return err
Expand All @@ -115,13 +129,24 @@ func (d *ResourceDetector) preemptPropagationPolicy(resourceTemplate *unstructur
}

// preemptClusterPropagationPolicyDirectly directly preempts resource template claimed by ClusterPropagationPolicy regardless of priority.
func (d *ResourceDetector) preemptClusterPropagationPolicyDirectly(resourceTemplate *unstructured.Unstructured, policy *policyv1alpha1.PropagationPolicy) error {
func (d *ResourceDetector) preemptClusterPropagationPolicyDirectly(resourceTemplate *unstructured.Unstructured, policy *policyv1alpha1.PropagationPolicy) (err error) {
claimedPolicyName := util.GetLabelValue(resourceTemplate.GetLabels(), policyv1alpha1.ClusterPropagationPolicyLabel)
if claimedPolicyName == "" {
return nil
}

if err := d.ClaimPolicyForObject(resourceTemplate, policy.Namespace, policy.Name); err != nil {
defer func() {
metrics.CountPolicyPreemption(err)
if err != nil {
d.EventRecorder.Eventf(resourceTemplate, corev1.EventTypeWarning, events.EventReasonPreemptPolicyFailed,
"Propagation policy(%s/%s) failed to preempt cluster propagation policy(%s): %v", policy.Namespace, policy.Name, claimedPolicyName, err)
return
}
d.EventRecorder.Eventf(resourceTemplate, corev1.EventTypeNormal, events.EventReasonPreemptPolicySucceed,
"Propagation policy(%s/%s) preempted cluster propagation policy(%s) successfully", policy.Namespace, policy.Name, claimedPolicyName)
}()

if err = d.ClaimPolicyForObject(resourceTemplate, policy.Namespace, policy.Name); err != nil {
klog.Errorf("Failed to claim new propagation policy(%s/%s) on resource template(%s, kind=%s, %s) directly: %v.", policy.Namespace, policy.Name,
resourceTemplate.GetAPIVersion(), resourceTemplate.GetKind(), names.NamespacedKey(resourceTemplate.GetNamespace(), resourceTemplate.GetName()), err)
return err
Expand All @@ -132,7 +157,7 @@ func (d *ResourceDetector) preemptClusterPropagationPolicyDirectly(resourceTempl
}

// preemptClusterPropagationPolicy preempts resource template that is claimed by ClusterPropagationPolicy.
func (d *ResourceDetector) preemptClusterPropagationPolicy(resourceTemplate *unstructured.Unstructured, policy *policyv1alpha1.ClusterPropagationPolicy) error {
func (d *ResourceDetector) preemptClusterPropagationPolicy(resourceTemplate *unstructured.Unstructured, policy *policyv1alpha1.ClusterPropagationPolicy) (err error) {
claimedPolicyName := util.GetLabelValue(resourceTemplate.GetLabels(), policyv1alpha1.ClusterPropagationPolicyLabel)
if claimedPolicyName == "" {
return nil
Expand Down Expand Up @@ -160,7 +185,18 @@ func (d *ResourceDetector) preemptClusterPropagationPolicy(resourceTemplate *uns
return nil
}

if err := d.ClaimClusterPolicyForObject(resourceTemplate, policy.Name); err != nil {
defer func() {
metrics.CountPolicyPreemption(err)
if err != nil {
d.EventRecorder.Eventf(resourceTemplate, corev1.EventTypeWarning, events.EventReasonPreemptPolicyFailed,
"Cluster propagation policy(%s) failed to preempt cluster propagation policy(%s): %v", policy.Name, claimedPolicyName, err)
return
}
d.EventRecorder.Eventf(resourceTemplate, corev1.EventTypeNormal, events.EventReasonPreemptPolicySucceed,
"Cluster propagation policy(%s) preempted cluster propagation policy(%s) successfully", policy.Name, claimedPolicyName)
}()

if err = d.ClaimClusterPolicyForObject(resourceTemplate, policy.Name); err != nil {
klog.Errorf("Failed to claim new cluster propagation policy(%s) on resource template(%s, kind=%s, %s): %v.", policy.Name,
resourceTemplate.GetAPIVersion(), resourceTemplate.GetKind(), names.NamespacedKey(resourceTemplate.GetNamespace(), resourceTemplate.GetName()), err)
return err
Expand Down
4 changes: 4 additions & 0 deletions pkg/events/events.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,10 @@ const (
EventReasonGetDependenciesSucceed = "GetDependenciesSucceed"
// EventReasonGetDependenciesFailed indicates get dependencies of resource template failed.
EventReasonGetDependenciesFailed = "GetDependenciesFailed"
// EventReasonPreemptPolicySucceed indicates policy preemption of resource template succeeded.
EventReasonPreemptPolicySucceed = "PreemptPolicySucceed"
// EventReasonPreemptPolicyFailed indicates policy preemption of resource template failed.
EventReasonPreemptPolicyFailed = "PreemptPolicyFailed"
)

// Define events for ServiceImport objects.
Expand Down
12 changes: 12 additions & 0 deletions pkg/metrics/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ const (
policyApplyAttemptsMetricsName = "policy_apply_attempts_total"
syncWorkDurationMetricsName = "binding_sync_work_duration_seconds"
syncWorkloadDurationMetricsName = "work_sync_workload_duration_seconds"
policyPreemptionMetricsName = "policy_preemption_total"
)

var (
Expand Down Expand Up @@ -45,6 +46,11 @@ var (
Help: "Duration in seconds to sync the workload to a target cluster. By the result, 'error' means a work failed to sync workloads. Otherwise 'success'.",
Buckets: prometheus.ExponentialBuckets(0.001, 2, 12),
}, []string{"result"})

policyPreemptionCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: policyPreemptionMetricsName,
Help: "Number of preemption for the resource template. By the result, 'error' means a resource template failed to be preempted by other propagation policies. Otherwise 'success'.",
}, []string{"result"})
)

// ObserveFindMatchedPolicyLatency records the duration for the resource finding a matched policy.
Expand All @@ -68,6 +74,11 @@ func ObserveSyncWorkloadLatency(err error, start time.Time) {
syncWorkloadDurationHistogram.WithLabelValues(utilmetrics.GetResultByError(err)).Observe(utilmetrics.DurationInSeconds(start))
}

// CountPolicyPreemption records the numbers of policy preemption.
func CountPolicyPreemption(err error) {
policyPreemptionCounter.WithLabelValues(utilmetrics.GetResultByError(err)).Inc()
}

// ResourceCollectors returns the collectors about resources.
func ResourceCollectors() []prometheus.Collector {
return []prometheus.Collector{
Expand All @@ -76,6 +87,7 @@ func ResourceCollectors() []prometheus.Collector {
policyApplyAttempts,
syncWorkDurationHistogram,
syncWorkloadDurationHistogram,
policyPreemptionCounter,
}
}

Expand Down

0 comments on commit 946fc72

Please sign in to comment.