From ec17b93ac4b2f3220f0c44f045aac3fec6080e2f Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes Date: Tue, 15 Apr 2025 15:38:12 -0700 Subject: [PATCH 1/3] update placement status metric Signed-off-by: Britania Rodriguez Reyes --- .../clusterresourceplacement/controller.go | 38 +- .../controller_integration_test.go | 1422 +++++++++++++++-- pkg/utils/controller/metrics/metrics.go | 11 +- test/utils/metrics/metrics.go | 37 + 4 files changed, 1362 insertions(+), 146 deletions(-) create mode 100644 test/utils/metrics/metrics.go diff --git a/pkg/controllers/clusterresourceplacement/controller.go b/pkg/controllers/clusterresourceplacement/controller.go index f5376fb16..c4a4ff8ae 100644 --- a/pkg/controllers/clusterresourceplacement/controller.go +++ b/pkg/controllers/clusterresourceplacement/controller.go @@ -59,13 +59,14 @@ func (r *Reconciler) Reconcile(ctx context.Context, key controller.QueueKey) (ct return ctrl.Result{}, nil // ignore this unexpected error } startTime := time.Now() + crp := fleetv1beta1.ClusterResourcePlacement{} klog.V(2).InfoS("ClusterResourcePlacement reconciliation starts", "clusterResourcePlacement", name) defer func() { latency := time.Since(startTime).Milliseconds() klog.V(2).InfoS("ClusterResourcePlacement reconciliation ends", "clusterResourcePlacement", name, "latency", latency) + emitPlacementStatusMetric(&crp) }() - crp := fleetv1beta1.ClusterResourcePlacement{} if err := r.Client.Get(ctx, types.NamespacedName{Name: name}, &crp); err != nil { if apierrors.IsNotFound(err) { klog.V(4).InfoS("Ignoring NotFound clusterResourcePlacement", "clusterResourcePlacement", name) @@ -105,6 +106,7 @@ func (r *Reconciler) handleDelete(ctx context.Context, crp *fleetv1beta1.Cluster return ctrl.Result{}, err } + metrics.FleetPlacementStatusLastTimeStampSeconds.DeletePartialMatch(prometheus.Labels{"name": crp.Name}) controllerutil.RemoveFinalizer(crp, fleetv1beta1.ClusterResourcePlacementCleanupFinalizer) if err := r.Client.Update(ctx, crp); err != 
nil { klog.ErrorS(err, "Failed to remove crp finalizer", "clusterResourcePlacement", crpKObj) @@ -112,7 +114,7 @@ func (r *Reconciler) handleDelete(ctx context.Context, crp *fleetv1beta1.Cluster } klog.V(2).InfoS("Removed crp-cleanup finalizer", "clusterResourcePlacement", crpKObj) r.Recorder.Event(crp, corev1.EventTypeNormal, "PlacementCleanupFinalizerRemoved", "Deleted the snapshots and removed the placement cleanup finalizer") - metrics.FleetPlacementStatus.Delete(prometheus.Labels{"name": crp.Name}) + return ctrl.Result{}, nil } @@ -233,12 +235,10 @@ func (r *Reconciler) handleUpdate(ctx context.Context, crp *fleetv1beta1.Cluster klog.V(2).InfoS("Placement has finished the rollout process and reached the desired status", "clusterResourcePlacement", crpKObj, "generation", crp.Generation) r.Recorder.Event(crp, corev1.EventTypeNormal, "PlacementRolloutCompleted", "Placement has finished the rollout process and reached the desired status") } - metrics.FleetPlacementStatus.WithLabelValues(crp.Name).Set(1) // We don't need to requeue any request now by watching the binding changes return ctrl.Result{}, nil } - metrics.FleetPlacementStatus.WithLabelValues(crp.Name).Set(0) if !isClusterScheduled { // Note: // If the scheduledCondition is failed, it means the placement requirement cannot be satisfied fully. For example, @@ -1046,3 +1046,43 @@ func isRolloutCompleted(crp *fleetv1beta1.ClusterResourcePlacement) bool { func isCRPScheduled(crp *fleetv1beta1.ClusterResourcePlacement) bool { return condition.IsConditionStatusTrue(crp.GetCondition(string(fleetv1beta1.ClusterResourcePlacementScheduledConditionType)), crp.Generation) } + +// emitPlacementStatusMetric sets the FleetPlacementStatusLastTimeStampSeconds gauge to the current time for the +// first CRP condition that condition.IsConditionStatusTrue does not report as true for the CRP's generation, or +// for the metric-only "ClusterResourcePlacementCompleted" condition when every expected condition is true. +// Nothing is emitted for a CRP that was never fetched (empty name) or that is being deleted. +func emitPlacementStatusMetric(crp *fleetv1beta1.ClusterResourcePlacement) { + // Reconcile defers this call before the CRP is fetched, so it can run with a zero-value CRP (failed Get) and + // when the CRP is being deleted, in which case handleDelete removes the CRP's series; emitting in either case would leave a series that nothing deletes. + if crp.Name == "" || crp.DeletionTimestamp != nil { + return + } + + // Check CRP Scheduled condition. 
+ status := "nil" + cond := crp.GetCondition(string(fleetv1beta1.ClusterResourcePlacementScheduledConditionType)) + if !condition.IsConditionStatusTrue(cond, crp.Generation) { + if cond != nil { + status = string(cond.Status) + } + metrics.FleetPlacementStatusLastTimeStampSeconds.WithLabelValues(crp.Name, strconv.FormatInt(crp.Generation, 10), string(fleetv1beta1.ClusterResourcePlacementScheduledConditionType), status).SetToCurrentTime() + return + } + + // Check CRP expected conditions. + expectedCondTypes := determineExpectedCRPAndResourcePlacementStatusCondType(crp) + for _, condType := range expectedCondTypes { + cond = crp.GetCondition(string(condType.ClusterResourcePlacementConditionType())) + if !condition.IsConditionStatusTrue(cond, crp.Generation) { + if cond != nil { + status = string(cond.Status) + } + metrics.FleetPlacementStatusLastTimeStampSeconds.WithLabelValues(crp.Name, strconv.FormatInt(crp.Generation, 10), string(condType.ClusterResourcePlacementConditionType()), status).SetToCurrentTime() + return + } + } + + // Emit the "ClusterResourcePlacementCompleted" condition metric to indicate that the CRP has completed. + // This condition is used solely for metric reporting purposes. + metrics.FleetPlacementStatusLastTimeStampSeconds.WithLabelValues(crp.Name, strconv.FormatInt(crp.Generation, 10), "ClusterResourcePlacementCompleted", string(metav1.ConditionTrue)).SetToCurrentTime() +} diff --git a/pkg/controllers/clusterresourceplacement/controller_integration_test.go b/pkg/controllers/clusterresourceplacement/controller_integration_test.go index 3399a92b2..0cbc88b49 100644 --- a/pkg/controllers/clusterresourceplacement/controller_integration_test.go +++ b/pkg/controllers/clusterresourceplacement/controller_integration_test.go @@ -1,12 +1,9 @@ /* Copyright 2025 The KubeFleet Authors. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - +http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -24,6 +21,8 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + "github.com/prometheus/client_golang/prometheus" + prometheusclientmodel "github.com/prometheus/client_model/go" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" @@ -35,7 +34,9 @@ import ( placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" "go.goms.io/fleet/pkg/utils" "go.goms.io/fleet/pkg/utils/condition" + "go.goms.io/fleet/pkg/utils/controller/metrics" "go.goms.io/fleet/pkg/utils/resource" + metricsUtils "go.goms.io/fleet/test/utils/metrics" ) const ( @@ -82,6 +83,16 @@ var ( } ) +var ( + customRegistry *prometheus.Registry + crp *placementv1beta1.ClusterResourcePlacement + gotCRP *placementv1beta1.ClusterResourcePlacement + gotPolicySnapshot *placementv1beta1.ClusterSchedulingPolicySnapshot + gotResourceSnapshot *placementv1beta1.ClusterResourceSnapshot + member1Binding *placementv1beta1.ClusterResourceBinding + member2Binding *placementv1beta1.ClusterResourceBinding +) + func retrieveAndValidatePolicySnapshot(crp *placementv1beta1.ClusterResourcePlacement, want *placementv1beta1.ClusterSchedulingPolicySnapshot) *placementv1beta1.ClusterSchedulingPolicySnapshot { policySnapshotList := &placementv1beta1.ClusterSchedulingPolicySnapshotList{} Eventually(func() error { @@ -199,7 +210,7 @@ func createOverriddenClusterResourceBinding(cluster string, policySnapshot *plac return binding } -func createSynchronizedClusterResourceBinding(cluster string, policySnapshot *placementv1beta1.ClusterSchedulingPolicySnapshot, resourceSnapshot 
*placementv1beta1.ClusterResourceSnapshot) { +func createSynchronizedClusterResourceBinding(cluster string, policySnapshot *placementv1beta1.ClusterSchedulingPolicySnapshot, resourceSnapshot *placementv1beta1.ClusterResourceSnapshot) *placementv1beta1.ClusterResourceBinding { binding := createOverriddenClusterResourceBinding(cluster, policySnapshot, resourceSnapshot) cond := metav1.Condition{ Status: metav1.ConditionTrue, @@ -209,19 +220,153 @@ func createSynchronizedClusterResourceBinding(cluster string, policySnapshot *pl } meta.SetStatusCondition(&binding.Status.Conditions, cond) Expect(k8sClient.Status().Update(ctx, binding)).Should(Succeed(), "Failed to update the binding status") + return binding } -var _ = Describe("Test ClusterResourcePlacement Controller", func() { - Context("When creating new pickAll ClusterResourcePlacement", func() { +func createAvailableClusterResourceBinding(cluster string, policySnapshot *placementv1beta1.ClusterSchedulingPolicySnapshot, resourceSnapshot *placementv1beta1.ClusterResourceSnapshot) *placementv1beta1.ClusterResourceBinding { + binding := createSynchronizedClusterResourceBinding(cluster, policySnapshot, resourceSnapshot) + cond := metav1.Condition{ + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceBindingApplied), + Reason: condition.ApplySucceededReason, + ObservedGeneration: binding.Generation, + } + meta.SetStatusCondition(&binding.Status.Conditions, cond) + cond = metav1.Condition{ + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceBindingAvailable), + Reason: condition.AvailableReason, + ObservedGeneration: binding.Generation, + } + meta.SetStatusCondition(&binding.Status.Conditions, cond) + Expect(k8sClient.Status().Update(ctx, binding)).Should(Succeed(), "Failed to update the binding status") + return binding +} + +func updateClusterResourceBindingWithReportDiff(binding *placementv1beta1.ClusterResourceBinding) *placementv1beta1.ClusterResourceBinding { + cond := 
metav1.Condition{ + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceBindingWorkSynchronized), + Reason: condition.WorkSynchronizedReason, + ObservedGeneration: binding.Generation, + } + meta.SetStatusCondition(&binding.Status.Conditions, cond) + cond = metav1.Condition{ + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourcesDiffReportedConditionType), + Reason: condition.DiffReportedStatusTrueReason, + ObservedGeneration: binding.Generation, + } + meta.SetStatusCondition(&binding.Status.Conditions, cond) + Expect(k8sClient.Status().Update(ctx, binding)).Should(Succeed(), "Failed to update the binding status") + return binding +} + +func checkClusterSchedulingPolicySnapshot() *placementv1beta1.ClusterSchedulingPolicySnapshot { + policyHash, err := resource.HashOf(crp.Spec.Policy) + Expect(err).Should(Succeed(), "failed to create policy hash") + + wantPolicySnapshot := placementv1beta1.ClusterSchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf(placementv1beta1.PolicySnapshotNameFmt, crp.Name, 0), + Labels: map[string]string{ + placementv1beta1.CRPTrackingLabel: crp.Name, + placementv1beta1.IsLatestSnapshotLabel: strconv.FormatBool(true), + placementv1beta1.PolicyIndexLabel: strconv.Itoa(0), + }, + Annotations: map[string]string{ + placementv1beta1.CRPGenerationAnnotation: strconv.Itoa(int(crp.Generation)), + }, + OwnerReferences: []metav1.OwnerReference{ + crpOwnerReference, + }, + }, + Spec: placementv1beta1.SchedulingPolicySnapshotSpec{ + PolicyHash: []byte(policyHash), + }, + } + return retrieveAndValidatePolicySnapshot(crp, &wantPolicySnapshot) +} + +func checkClusterResourceSnapshot() *placementv1beta1.ClusterResourceSnapshot { + emptyResources := &placementv1beta1.ResourceSnapshotSpec{ + SelectedResources: []placementv1beta1.ResourceContent{}, + } + jsonBytes, err := resource.HashOf(emptyResources) + Expect(err).Should(Succeed(), "Failed to create resource hash") + + wantResourceSnapshot := 
&placementv1beta1.ClusterResourceSnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf(placementv1beta1.ResourceSnapshotNameFmt, crp.Name, 0), + Labels: map[string]string{ + placementv1beta1.CRPTrackingLabel: crp.Name, + placementv1beta1.IsLatestSnapshotLabel: strconv.FormatBool(true), + placementv1beta1.ResourceIndexLabel: strconv.Itoa(0), + }, + Annotations: map[string]string{ + placementv1beta1.NumberOfResourceSnapshotsAnnotation: strconv.Itoa(1), + placementv1beta1.ResourceGroupHashAnnotation: jsonBytes, + placementv1beta1.NumberOfEnvelopedObjectsAnnotation: strconv.Itoa(0), + }, + OwnerReferences: []metav1.OwnerReference{ + crpOwnerReference, + }, + }, + Spec: placementv1beta1.ResourceSnapshotSpec{ + SelectedResources: []placementv1beta1.ResourceContent{}, + }, + } + return retrieveAndValidateResourceSnapshot(crp, wantResourceSnapshot) +} + +func updateClusterSchedulingPolicySnapshotStatus(status metav1.ConditionStatus, clustersSelected bool) { + reason := ResourceScheduleSucceededReason + if status == metav1.ConditionFalse { + reason = ResourceScheduleFailedReason + } + + // Update scheduling condition + scheduledCondition := metav1.Condition{ + Type: string(placementv1beta1.PolicySnapshotScheduled), + Status: status, + Reason: reason, + ObservedGeneration: gotCRP.Generation, + } + meta.SetStatusCondition(&gotPolicySnapshot.Status.Conditions, scheduledCondition) + gotPolicySnapshot.Status.ObservedCRPGeneration = gotCRP.Generation + + // Only update ClusterDecisions if clustersSelected is true + if clustersSelected { + // Build cluster decisions + reasonStr := "valid" + selected := true + if status == metav1.ConditionFalse { + selected = false + reasonStr = "invalid" + } + gotPolicySnapshot.Status.ClusterDecisions = []placementv1beta1.ClusterDecision{ + { + ClusterName: member1Name, + Selected: selected, + Reason: reasonStr, + }, + { + ClusterName: member2Name, + Selected: selected, + Reason: reasonStr, + }, + } + } - var ( - crp 
*placementv1beta1.ClusterResourcePlacement - gotCRP *placementv1beta1.ClusterResourcePlacement - gotPolicySnapshot *placementv1beta1.ClusterSchedulingPolicySnapshot - gotResourceSnapshot *placementv1beta1.ClusterResourceSnapshot - ) + // Apply status update + Expect(k8sClient.Status().Update(ctx, gotPolicySnapshot)).Should(Succeed(), "Failed to update the policy snapshot status") +} +var _ = Describe("Test ClusterResourcePlacement Controller", func() { + Context("When creating new pickAll ClusterResourcePlacement", func() { BeforeEach(func() { + registerMetrics() + By("Create a new crp") crp = &placementv1beta1.ClusterResourcePlacement{ ObjectMeta: metav1.ObjectMeta{ @@ -243,61 +388,13 @@ var _ = Describe("Test ClusterResourcePlacement Controller", func() { } Expect(k8sClient.Create(ctx, crp)).Should(Succeed(), "Failed to create crp") - By("By checking clusterSchedulingPolicySnapshot") - policyHash, err := resource.HashOf(crp.Spec.Policy) - Expect(err).Should(Succeed(), "failed to create policy hash") - - wantPolicySnapshot := placementv1beta1.ClusterSchedulingPolicySnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf(placementv1beta1.PolicySnapshotNameFmt, crp.Name, 0), - Labels: map[string]string{ - placementv1beta1.CRPTrackingLabel: crp.Name, - placementv1beta1.IsLatestSnapshotLabel: strconv.FormatBool(true), - placementv1beta1.PolicyIndexLabel: strconv.Itoa(0), - }, - Annotations: map[string]string{ - placementv1beta1.CRPGenerationAnnotation: strconv.Itoa(int(crp.Generation)), - }, - OwnerReferences: []metav1.OwnerReference{ - crpOwnerReference, - }, - }, - Spec: placementv1beta1.SchedulingPolicySnapshotSpec{ - PolicyHash: []byte(policyHash), - }, - } - gotPolicySnapshot = retrieveAndValidatePolicySnapshot(crp, &wantPolicySnapshot) + By("Check clusterSchedulingPolicySnapshot") + gotPolicySnapshot = checkClusterSchedulingPolicySnapshot() - By("By checking clusterResourceSnapshot") - emptyResources := &placementv1beta1.ResourceSnapshotSpec{ - 
SelectedResources: []placementv1beta1.ResourceContent{}, - } - jsonBytes, err := resource.HashOf(emptyResources) - Expect(err).Should(Succeed(), "Failed to create resource hash") + By("Check clusterResourceSnapshot") + gotResourceSnapshot = checkClusterResourceSnapshot() - wantResourceSnapshot := &placementv1beta1.ClusterResourceSnapshot{ - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf(placementv1beta1.ResourceSnapshotNameFmt, crp.Name, 0), - Labels: map[string]string{ - placementv1beta1.CRPTrackingLabel: crp.Name, - placementv1beta1.IsLatestSnapshotLabel: strconv.FormatBool(true), - placementv1beta1.ResourceIndexLabel: strconv.Itoa(0), - }, - Annotations: map[string]string{ - placementv1beta1.NumberOfResourceSnapshotsAnnotation: strconv.Itoa(1), - placementv1beta1.ResourceGroupHashAnnotation: jsonBytes, - placementv1beta1.NumberOfEnvelopedObjectsAnnotation: strconv.Itoa(0), - }, - OwnerReferences: []metav1.OwnerReference{ - crpOwnerReference, - }, - }, - Spec: placementv1beta1.ResourceSnapshotSpec{ - SelectedResources: []placementv1beta1.ResourceContent{}, - }, - } - gotResourceSnapshot = retrieveAndValidateResourceSnapshot(crp, wantResourceSnapshot) - By("By checking CRP status") + By("Validate CRP status") wantCRP := &placementv1beta1.ClusterResourcePlacement{ ObjectMeta: metav1.ObjectMeta{ Name: testCRPName, @@ -315,28 +412,34 @@ var _ = Describe("Test ClusterResourcePlacement Controller", func() { }, }, } - gotCRP = retrieveAndValidateClusterResourcePlacement(testCRPName, wantCRP) + gotCRP = retrieveAndValidateClusterResourcePlacement(crp.Name, wantCRP) }) AfterEach(func() { By("Deleting crp") Expect(k8sClient.Delete(ctx, gotCRP)).Should(Succeed()) retrieveAndValidateCRPDeletion(gotCRP) + + By("Deleting clusterResourceBindings") + // The binding pointers are package-level and shared by every spec, so reset them after deletion; + // otherwise a later spec that creates no binding would try to delete a stale, already-deleted one. + if member1Binding != nil { + Expect(k8sClient.Delete(ctx, member1Binding)).Should(Succeed()) + member1Binding = nil + } + if member2Binding != nil { + Expect(k8sClient.Delete(ctx, member2Binding)).Should(Succeed()) + member2Binding = nil + } + + 
Expect(customRegistry.Unregister(metrics.FleetPlacementStatusLastTimeStampSeconds)).Should(BeTrue()) }) It("None of the clusters are selected", func() { - By("By updating clusterSchedulingPolicySnapshot status to schedule success") - scheduledCondition := metav1.Condition{ - Status: metav1.ConditionTrue, - Type: string(placementv1beta1.PolicySnapshotScheduled), - Reason: ResourceScheduleSucceededReason, - ObservedGeneration: gotCRP.Generation, - } - meta.SetStatusCondition(&gotPolicySnapshot.Status.Conditions, scheduledCondition) - gotPolicySnapshot.Status.ObservedCRPGeneration = gotCRP.Generation - Expect(k8sClient.Status().Update(ctx, gotPolicySnapshot)).Should(Succeed(), "Failed to update the policy snapshot status") + By("Update clusterSchedulingPolicySnapshot status to schedule success") + updateClusterSchedulingPolicySnapshotStatus(metav1.ConditionTrue, false) - By("By validating the CRP status has only scheduling condition") + By("Validate the CRP status has only scheduling condition") wantCRP := &placementv1beta1.ClusterResourcePlacement{ ObjectMeta: metav1.ObjectMeta{ Name: testCRPName, @@ -354,37 +453,101 @@ var _ = Describe("Test ClusterResourcePlacement Controller", func() { }, }, } - retrieveAndValidateClusterResourcePlacement(testCRPName, wantCRP) + gotCRP = retrieveAndValidateClusterResourcePlacement(testCRPName, wantCRP) + + By("Ensure placement status metric was emitted") + // A metric is emitted for every non-True (nil, Unknown, False) status as the CRP reconciles. + // In this case the pickAll policy selects no clusters, so there is nothing to roll out + // and the RolloutStarted condition is nil. 
+ wantMetrics := []*prometheusclientmodel.Metric{ + { + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: ptr.To(string(placementv1beta1.ClusterResourcePlacementScheduledConditionType))}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionUnknown))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }, + { + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: ptr.To(string(placementv1beta1.ClusterResourcePlacementRolloutStartedConditionType))}, + {Name: ptr.To("status"), Value: ptr.To("nil")}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }, + } + checkPlacementStatusMetric(customRegistry, wantMetrics) }) - It("Clusters are selected and resources are applied successfully", func() { - By("By updating clusterSchedulingPolicySnapshot status to schedule success") - scheduledCondition := metav1.Condition{ - Status: metav1.ConditionTrue, - Type: string(placementv1beta1.PolicySnapshotScheduled), - Reason: ResourceScheduleSucceededReason, - ObservedGeneration: gotCRP.Generation, + It("Clusters are not selected", func() { + By("Update clusterSchedulingPolicySnapshot status to schedule failed") + updateClusterSchedulingPolicySnapshotStatus(metav1.ConditionFalse, true) + + By("Validate the CRP status has only scheduling condition") + wantCRP := &placementv1beta1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: testCRPName, + Finalizers: []string{placementv1beta1.ClusterResourcePlacementCleanupFinalizer}, + }, + Spec: crp.Spec, + Status: placementv1beta1.ClusterResourcePlacementStatus{ + 
ObservedResourceIndex: "0", + Conditions: []metav1.Condition{ + { + Status: metav1.ConditionFalse, + Type: string(placementv1beta1.ClusterResourcePlacementScheduledConditionType), + Reason: ResourceScheduleFailedReason, + }, + }, + }, } - meta.SetStatusCondition(&gotPolicySnapshot.Status.Conditions, scheduledCondition) - gotPolicySnapshot.Status.ObservedCRPGeneration = gotCRP.Generation - gotPolicySnapshot.Status.ClusterDecisions = []placementv1beta1.ClusterDecision{ + gotCRP = retrieveAndValidateClusterResourcePlacement(testCRPName, wantCRP) + + By("Ensure placement status metric was emitted") + // A metric is emitted on each reconciliation in which the CRP has a non-True status. + // In this case we have 2 metrics for 1 condition type as Scheduled goes from `Unknown` to `False`. + wantMetrics := []*prometheusclientmodel.Metric{ { - ClusterName: member1Name, - Selected: true, - Reason: "valid", + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: ptr.To(string(placementv1beta1.ClusterResourcePlacementScheduledConditionType))}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionUnknown))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, }, { - ClusterName: member2Name, - Selected: true, - Reason: "valid", + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: ptr.To(string(placementv1beta1.ClusterResourcePlacementScheduledConditionType))}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionFalse))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, }, } - 
Expect(k8sClient.Status().Update(ctx, gotPolicySnapshot)).Should(Succeed(), "Failed to update the policy snapshot status") + checkPlacementStatusMetric(customRegistry, wantMetrics) + }) + + It("Clusters are selected and resources are applied successfully", func() { + By("Update clusterSchedulingPolicySnapshot status to schedule success") + updateClusterSchedulingPolicySnapshotStatus(metav1.ConditionTrue, true) - By("By creating clusterResourceBinding on member-1") - createOverriddenClusterResourceBinding(member1Name, gotPolicySnapshot, gotResourceSnapshot) + By("Create an overridden clusterResourceBinding on member-1") + member1Binding = createOverriddenClusterResourceBinding(member1Name, gotPolicySnapshot, gotResourceSnapshot) - By("By validating the CRP status") + By("Validate the CRP status") wantCRP := &placementv1beta1.ClusterResourcePlacement{ ObjectMeta: metav1.ObjectMeta{ Name: testCRPName, @@ -449,44 +612,167 @@ var _ = Describe("Test ClusterResourcePlacement Controller", func() { }, }, } - retrieveAndValidateClusterResourcePlacement(testCRPName, wantCRP) + gotCRP = retrieveAndValidateClusterResourcePlacement(testCRPName, wantCRP) + + By("Ensure placement status metric was emitted") + // There are two metrics because 1 member cluster has an unknown RolloutStarted status. + // We emit the !true status for CRP as it reconciles. In this case, CRP is still rolling out. 
+ wantMetrics := []*prometheusclientmodel.Metric{ + { + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: ptr.To(string(placementv1beta1.ClusterResourcePlacementScheduledConditionType))}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionUnknown))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }, + { + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: ptr.To(string(placementv1beta1.ClusterResourcePlacementRolloutStartedConditionType))}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionUnknown))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }, + } + checkPlacementStatusMetric(customRegistry, wantMetrics) - By("By creating a synchronized clusterResourceBinding on member-2") - createSynchronizedClusterResourceBinding(member2Name, gotPolicySnapshot, gotResourceSnapshot) + By("Create a synchronized clusterResourceBinding on member-2") + member2Binding = createSynchronizedClusterResourceBinding(member2Name, gotPolicySnapshot, gotResourceSnapshot) - wantCRP = &placementv1beta1.ClusterResourcePlacement{ - ObjectMeta: metav1.ObjectMeta{ - Name: testCRPName, - Finalizers: []string{placementv1beta1.ClusterResourcePlacementCleanupFinalizer}, + By("Validate the CRP status") + wantCRP.Status.Conditions = []metav1.Condition{ + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementScheduledConditionType), + Reason: ResourceScheduleSucceededReason, }, - Spec: crp.Spec, - Status: placementv1beta1.ClusterResourcePlacementStatus{ - 
ObservedResourceIndex: "0", + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementRolloutStartedConditionType), + Reason: condition.RolloutStartedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementOverriddenConditionType), + Reason: condition.OverrideNotSpecifiedReason, + }, + { + Status: metav1.ConditionUnknown, + Type: string(placementv1beta1.ClusterResourcePlacementWorkSynchronizedConditionType), + Reason: condition.WorkSynchronizedUnknownReason, + }, + } + wantCRP.Status.PlacementStatuses = []placementv1beta1.ResourcePlacementStatus{ + { + ClusterName: member1Name, Conditions: []metav1.Condition{ { Status: metav1.ConditionTrue, - Type: string(placementv1beta1.ClusterResourcePlacementScheduledConditionType), - Reason: ResourceScheduleSucceededReason, + Type: string(placementv1beta1.ResourceScheduledConditionType), + Reason: condition.ScheduleSucceededReason, }, { Status: metav1.ConditionTrue, - Type: string(placementv1beta1.ClusterResourcePlacementRolloutStartedConditionType), + Type: string(placementv1beta1.ResourceRolloutStartedConditionType), Reason: condition.RolloutStartedReason, }, { Status: metav1.ConditionTrue, - Type: string(placementv1beta1.ClusterResourcePlacementOverriddenConditionType), - Reason: condition.OverrideNotSpecifiedReason, + Type: string(placementv1beta1.ResourceOverriddenConditionType), + Reason: condition.OverriddenSucceededReason, }, { Status: metav1.ConditionUnknown, - Type: string(placementv1beta1.ClusterResourcePlacementWorkSynchronizedConditionType), + Type: string(placementv1beta1.ResourceWorkSynchronizedConditionType), Reason: condition.WorkSynchronizedUnknownReason, }, }, - PlacementStatuses: []placementv1beta1.ResourcePlacementStatus{ + }, + { + ClusterName: member2Name, + Conditions: []metav1.Condition{ + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceScheduledConditionType), + Reason: 
condition.ScheduleSucceededReason, + }, { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceRolloutStartedConditionType), + Reason: condition.RolloutStartedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceOverriddenConditionType), + Reason: condition.OverriddenSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceWorkSynchronizedConditionType), + Reason: condition.WorkSynchronizedReason, + }, + { + Status: metav1.ConditionUnknown, + Type: string(placementv1beta1.ResourcesAppliedConditionType), + Reason: condition.ApplyPendingReason, + }, + }, + }, + } + gotCRP = retrieveAndValidateClusterResourcePlacement(testCRPName, wantCRP) + By("Ensure placement status metric was emitted") + wantMetrics = append(wantMetrics, &prometheusclientmodel.Metric{ + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: ptr.To(string(placementv1beta1.ClusterResourcePlacementWorkSynchronizedConditionType))}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionUnknown))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }) + checkPlacementStatusMetric(customRegistry, wantMetrics) + }) + + It("Emit metrics when CRP spec updates with different generations", func() { + By("Update clusterSchedulingPolicySnapshot status to schedule success") + updateClusterSchedulingPolicySnapshotStatus(metav1.ConditionTrue, true) + + By("Create an overridden clusterResourceBinding on member-1") + member1Binding = createOverriddenClusterResourceBinding(member1Name, gotPolicySnapshot, gotResourceSnapshot) + + By("Validate the CRP status") + wantCRP := &placementv1beta1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: testCRPName, + Finalizers: 
[]string{placementv1beta1.ClusterResourcePlacementCleanupFinalizer}, + }, + Spec: crp.Spec, + Status: placementv1beta1.ClusterResourcePlacementStatus{ + ObservedResourceIndex: "0", + Conditions: []metav1.Condition{ + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementScheduledConditionType), + Reason: ResourceScheduleSucceededReason, + }, + { + Status: metav1.ConditionUnknown, + Type: string(placementv1beta1.ClusterResourcePlacementRolloutStartedConditionType), + Reason: condition.RolloutStartedUnknownReason, + }, + }, + PlacementStatuses: []placementv1beta1.ResourcePlacementStatus{ + { ClusterName: member1Name, Conditions: []metav1.Condition{ { @@ -519,32 +805,894 @@ var _ = Describe("Test ClusterResourcePlacement Controller", func() { Type: string(placementv1beta1.ResourceScheduledConditionType), Reason: condition.ScheduleSucceededReason, }, - { - Status: metav1.ConditionTrue, - Type: string(placementv1beta1.ResourceRolloutStartedConditionType), - Reason: condition.RolloutStartedReason, - }, - { - Status: metav1.ConditionTrue, - Type: string(placementv1beta1.ResourceOverriddenConditionType), - Reason: condition.OverriddenSucceededReason, - }, - { - Status: metav1.ConditionTrue, - Type: string(placementv1beta1.ResourceWorkSynchronizedConditionType), - Reason: condition.WorkSynchronizedReason, - }, { Status: metav1.ConditionUnknown, - Type: string(placementv1beta1.ResourcesAppliedConditionType), - Reason: condition.ApplyPendingReason, + Type: string(placementv1beta1.ResourceRolloutStartedConditionType), + Reason: condition.RolloutStartedUnknownReason, }, }, }, }, }, } - retrieveAndValidateClusterResourcePlacement(testCRPName, wantCRP) - }) - }) -}) + gotCRP = retrieveAndValidateClusterResourcePlacement(testCRPName, wantCRP) + + By("Ensure placement status metric was emitted") + wantMetrics := []*prometheusclientmodel.Metric{ + { + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, 
+ {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: ptr.To(string(placementv1beta1.ClusterResourcePlacementScheduledConditionType))}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionUnknown))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }, + { + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: ptr.To(string(placementv1beta1.ClusterResourcePlacementRolloutStartedConditionType))}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionUnknown))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }, + } + checkPlacementStatusMetric(customRegistry, wantMetrics) + + By("Create a synchronized clusterResourceBinding on member-2") + member2Binding = createSynchronizedClusterResourceBinding(member2Name, gotPolicySnapshot, gotResourceSnapshot) + + wantCRP.Status.Conditions = []metav1.Condition{ + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementScheduledConditionType), + Reason: ResourceScheduleSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementRolloutStartedConditionType), + Reason: condition.RolloutStartedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementOverriddenConditionType), + Reason: condition.OverrideNotSpecifiedReason, + }, + { + Status: metav1.ConditionUnknown, + Type: string(placementv1beta1.ClusterResourcePlacementWorkSynchronizedConditionType), + Reason: condition.WorkSynchronizedUnknownReason, + }, + } + wantCRP.Status.PlacementStatuses = []placementv1beta1.ResourcePlacementStatus{ + { + ClusterName: 
member1Name, + Conditions: []metav1.Condition{ + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceScheduledConditionType), + Reason: condition.ScheduleSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceRolloutStartedConditionType), + Reason: condition.RolloutStartedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceOverriddenConditionType), + Reason: condition.OverriddenSucceededReason, + }, + { + Status: metav1.ConditionUnknown, + Type: string(placementv1beta1.ResourceWorkSynchronizedConditionType), + Reason: condition.WorkSynchronizedUnknownReason, + }, + }, + }, + { + ClusterName: member2Name, + Conditions: []metav1.Condition{ + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceScheduledConditionType), + Reason: condition.ScheduleSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceRolloutStartedConditionType), + Reason: condition.RolloutStartedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceOverriddenConditionType), + Reason: condition.OverriddenSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceWorkSynchronizedConditionType), + Reason: condition.WorkSynchronizedReason, + }, + { + Status: metav1.ConditionUnknown, + Type: string(placementv1beta1.ResourcesAppliedConditionType), + Reason: condition.ApplyPendingReason, + }, + }, + }, + } + gotCRP = retrieveAndValidateClusterResourcePlacement(testCRPName, wantCRP) + + By("Ensure placement status metric was emitted") + wantMetrics = append(wantMetrics, &prometheusclientmodel.Metric{ + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: 
ptr.To(string(placementv1beta1.ClusterResourcePlacementWorkSynchronizedConditionType))}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionUnknown))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }) + checkPlacementStatusMetric(customRegistry, wantMetrics) + + By("Update CRP spec to add another resource selector") + gotCRP.Spec.ResourceSelectors = append(crp.Spec.ResourceSelectors, + placementv1beta1.ClusterResourceSelector{ + Group: corev1.GroupName, + Version: "v1", + Kind: "Namespace", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"region": "west"}, + }, + }) + Expect(k8sClient.Update(ctx, gotCRP)).Should(Succeed(), "Failed to update crp") + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: testCRPName}, crp)).Should(BeNil(), "Get() clusterResourcePlacement mismatch") + + By("Validate CRP status with new spec") + wantCRP.Spec.ResourceSelectors = gotCRP.Spec.ResourceSelectors + wantCRP.Status.Conditions = []metav1.Condition{ + { + Status: metav1.ConditionUnknown, + Type: string(placementv1beta1.ClusterResourcePlacementScheduledConditionType), + Reason: SchedulingUnknownReason, + ObservedGeneration: 2, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementRolloutStartedConditionType), + Reason: condition.RolloutStartedReason, + ObservedGeneration: 1, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementOverriddenConditionType), + Reason: condition.OverrideNotSpecifiedReason, + ObservedGeneration: 1, + }, + { + Status: metav1.ConditionUnknown, + Type: string(placementv1beta1.ClusterResourcePlacementWorkSynchronizedConditionType), + Reason: condition.WorkSynchronizedUnknownReason, + ObservedGeneration: 1, + }, + } + wantCRP.Status.PlacementStatuses = nil + gotCRP = retrieveAndValidateClusterResourcePlacement(testCRPName, wantCRP) + + By("Ensure placement status metric was 
emitted with different generations") + // When a CRP spec is updated, the generation of the CRP changes. Therefore, the observed generation for the conditions will also change. + // Should have multiples of same condition type with different generations. + // In this case we have 2 metrics for Scheduled condition type as crp generation goes from 1 to 2. + wantMetrics = append(wantMetrics, &prometheusclientmodel.Metric{ + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: ptr.To(string(placementv1beta1.ClusterResourcePlacementScheduledConditionType))}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionUnknown))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }) + checkPlacementStatusMetric(customRegistry, wantMetrics) + }) + + It("Emit metrics for complete CRP", func() { + By("Update clusterSchedulingPolicySnapshot status to schedule success") + updateClusterSchedulingPolicySnapshotStatus(metav1.ConditionTrue, true) + + By("Create a synchronized clusterResourceBinding on member-1") + member1Binding = createAvailableClusterResourceBinding(member1Name, gotPolicySnapshot, gotResourceSnapshot) + + By("Create synchronized clusterResourceBinding on member-2") + member2Binding = createAvailableClusterResourceBinding(member2Name, gotPolicySnapshot, gotResourceSnapshot) + + By("Validate the CRP status is Available") + wantCRP := &placementv1beta1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: testCRPName, + Finalizers: []string{placementv1beta1.ClusterResourcePlacementCleanupFinalizer}, + }, + Spec: crp.Spec, + Status: placementv1beta1.ClusterResourcePlacementStatus{ + ObservedResourceIndex: "0", + Conditions: []metav1.Condition{ + { + Status: metav1.ConditionTrue, + Type: 
string(placementv1beta1.ClusterResourcePlacementScheduledConditionType), + Reason: ResourceScheduleSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementRolloutStartedConditionType), + Reason: condition.RolloutStartedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementOverriddenConditionType), + Reason: condition.OverrideNotSpecifiedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementWorkSynchronizedConditionType), + Reason: condition.WorkSynchronizedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementAppliedConditionType), + Reason: condition.ApplySucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementAvailableConditionType), + Reason: condition.AvailableReason, + }, + }, + PlacementStatuses: []placementv1beta1.ResourcePlacementStatus{ + { + ClusterName: member1Name, + Conditions: []metav1.Condition{ + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceScheduledConditionType), + Reason: condition.ScheduleSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceRolloutStartedConditionType), + Reason: condition.RolloutStartedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceOverriddenConditionType), + Reason: condition.OverriddenSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceWorkSynchronizedConditionType), + Reason: condition.WorkSynchronizedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourcesAppliedConditionType), + Reason: condition.ApplySucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourcesAvailableConditionType), + Reason: condition.AvailableReason, + }, + 
}, + }, + { + ClusterName: member2Name, + Conditions: []metav1.Condition{ + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceScheduledConditionType), + Reason: condition.ScheduleSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceRolloutStartedConditionType), + Reason: condition.RolloutStartedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceOverriddenConditionType), + Reason: condition.OverriddenSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceWorkSynchronizedConditionType), + Reason: condition.WorkSynchronizedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourcesAppliedConditionType), + Reason: condition.ApplySucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourcesAvailableConditionType), + Reason: condition.AvailableReason, + }, + }, + }, + }, + }, + } + gotCRP = retrieveAndValidateClusterResourcePlacement(testCRPName, wantCRP) + + By("Ensure placement status metric was emitted") + wantMetrics := []*prometheusclientmodel.Metric{ + { + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: ptr.To(string(placementv1beta1.ClusterResourcePlacementScheduledConditionType))}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionUnknown))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }, + { + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: 
ptr.To(string(placementv1beta1.ClusterResourcePlacementRolloutStartedConditionType))}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionUnknown))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }, + { + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: ptr.To(string(placementv1beta1.ClusterResourcePlacementWorkSynchronizedConditionType))}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionUnknown))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }, + { + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(crp.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: ptr.To(string(placementv1beta1.ClusterResourcePlacementAppliedConditionType))}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionUnknown))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }, + { + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: ptr.To("ClusterResourcePlacementCompleted")}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionTrue))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }, + } + checkPlacementStatusMetric(customRegistry, wantMetrics) + }) + }) + + Context("When creating a ReportDiff ClusterResourcePlacement", func() { + BeforeEach(func() { + // Create a test registry + registerMetrics() + + By("Create a new crp") + crp = 
&placementv1beta1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: testCRPName, + }, + Spec: placementv1beta1.ClusterResourcePlacementSpec{ + ResourceSelectors: []placementv1beta1.ClusterResourceSelector{ + { + Group: corev1.GroupName, + Version: "v1", + Kind: "Namespace", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"region": "east"}, + }, + }, + }, + RevisionHistoryLimit: ptr.To(int32(1)), + Strategy: placementv1beta1.RolloutStrategy{ + ApplyStrategy: &placementv1beta1.ApplyStrategy{ + Type: placementv1beta1.ApplyStrategyTypeReportDiff, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, crp)).Should(Succeed(), "Failed to create crp") + + By("Check clusterSchedulingPolicySnapshot") + gotPolicySnapshot = checkClusterSchedulingPolicySnapshot() + + By("Check clusterResourceSnapshot") + gotResourceSnapshot = checkClusterResourceSnapshot() + + By("Check CRP status") + wantCRP := &placementv1beta1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: testCRPName, + Finalizers: []string{placementv1beta1.ClusterResourcePlacementCleanupFinalizer}, + }, + Spec: crp.Spec, + Status: placementv1beta1.ClusterResourcePlacementStatus{ + ObservedResourceIndex: "0", + Conditions: []metav1.Condition{ + { + Status: metav1.ConditionUnknown, + Type: string(placementv1beta1.ClusterResourcePlacementScheduledConditionType), + Reason: SchedulingUnknownReason, + }, + }, + }, + } + gotCRP = retrieveAndValidateClusterResourcePlacement(crp.Name, wantCRP) + }) + + AfterEach(func() { + By("Deleting crp") + Expect(k8sClient.Delete(ctx, gotCRP)).Should(Succeed()) + retrieveAndValidateCRPDeletion(gotCRP) + + By("Deleting clusterResourceBindings") + Expect(k8sClient.Delete(ctx, member1Binding)).Should(Succeed()) + Expect(k8sClient.Delete(ctx, member2Binding)).Should(Succeed()) + + Expect(customRegistry.Unregister(metrics.FleetPlacementStatusLastTimeStampSeconds)).Should(BeTrue()) + }) + + It("Emit metrics for ReportDiff Incomplete CRP", 
func() { + By("Update clusterSchedulingPolicySnapshot status to schedule success") + updateClusterSchedulingPolicySnapshotStatus(metav1.ConditionTrue, true) + + By("Create a synchronized clusterResourceBinding on member-1") + member1Binding = createSynchronizedClusterResourceBinding(member1Name, gotPolicySnapshot, gotResourceSnapshot) + + By("Create a synchronized clusterResourceBinding on member-2") + member2Binding = createSynchronizedClusterResourceBinding(member2Name, gotPolicySnapshot, gotResourceSnapshot) + + By("Validate CRP status") + wantCRP := &placementv1beta1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: testCRPName, + Finalizers: []string{placementv1beta1.ClusterResourcePlacementCleanupFinalizer}, + }, + Spec: crp.Spec, + Status: placementv1beta1.ClusterResourcePlacementStatus{ + ObservedResourceIndex: "0", + Conditions: []metav1.Condition{ + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementScheduledConditionType), + Reason: ResourceScheduleSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementRolloutStartedConditionType), + Reason: condition.RolloutStartedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementOverriddenConditionType), + Reason: condition.OverrideNotSpecifiedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementWorkSynchronizedConditionType), + Reason: condition.WorkSynchronizedReason, + }, + { + Status: metav1.ConditionUnknown, + Type: string(placementv1beta1.ClusterResourcePlacementDiffReportedConditionType), + Reason: condition.DiffReportedStatusUnknownReason, + }, + }, + PlacementStatuses: []placementv1beta1.ResourcePlacementStatus{ + { + ClusterName: member1Name, + Conditions: []metav1.Condition{ + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceScheduledConditionType), + Reason: 
condition.ScheduleSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceRolloutStartedConditionType), + Reason: condition.RolloutStartedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceOverriddenConditionType), + Reason: condition.OverriddenSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceWorkSynchronizedConditionType), + Reason: condition.WorkSynchronizedReason, + }, + { + Status: metav1.ConditionUnknown, + Type: string(placementv1beta1.ResourcesDiffReportedConditionType), + Reason: condition.DiffReportedStatusUnknownReason, + }, + }, + }, + { + ClusterName: member2Name, + Conditions: []metav1.Condition{ + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceScheduledConditionType), + Reason: condition.ScheduleSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceRolloutStartedConditionType), + Reason: condition.RolloutStartedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceOverriddenConditionType), + Reason: condition.OverriddenSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceWorkSynchronizedConditionType), + Reason: condition.WorkSynchronizedReason, + }, + { + Status: metav1.ConditionUnknown, + Type: string(placementv1beta1.ResourcesDiffReportedConditionType), + Reason: condition.DiffReportedStatusUnknownReason, + }, + }, + }, + }, + }, + } + gotCRP = retrieveAndValidateClusterResourcePlacement(testCRPName, wantCRP) + + By("Ensure placement status metric was emitted") + wantMetrics := []*prometheusclientmodel.Metric{ + { + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: 
ptr.To(string(placementv1beta1.ClusterResourcePlacementScheduledConditionType))}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionUnknown))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }, + { + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: ptr.To(string(placementv1beta1.ClusterResourcePlacementRolloutStartedConditionType))}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionUnknown))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }, + { + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: ptr.To(string(placementv1beta1.ClusterResourcePlacementWorkSynchronizedConditionType))}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionUnknown))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }, + { + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: ptr.To(string(placementv1beta1.ClusterResourcePlacementDiffReportedConditionType))}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionUnknown))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }, + } + checkPlacementStatusMetric(customRegistry, wantMetrics) + }) + + It("Emit metrics for ReportDiff Complete CRP", func() { + By("Update clusterSchedulingPolicySnapshot status to schedule success") + 
updateClusterSchedulingPolicySnapshotStatus(metav1.ConditionTrue, true) + + By("Create a synchronized clusterResourceBinding on member-1") + member1Binding = createOverriddenClusterResourceBinding(member1Name, gotPolicySnapshot, gotResourceSnapshot) + + By("Create a synchronized clusterResourceBinding on member-2") + member2Binding = createOverriddenClusterResourceBinding(member2Name, gotPolicySnapshot, gotResourceSnapshot) + + By("Validate CRP status") + wantCRP := &placementv1beta1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: testCRPName, + Finalizers: []string{placementv1beta1.ClusterResourcePlacementCleanupFinalizer}, + }, + Spec: crp.Spec, + Status: placementv1beta1.ClusterResourcePlacementStatus{ + ObservedResourceIndex: "0", + Conditions: []metav1.Condition{ + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementScheduledConditionType), + Reason: ResourceScheduleSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementRolloutStartedConditionType), + Reason: condition.RolloutStartedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementOverriddenConditionType), + Reason: condition.OverrideNotSpecifiedReason, + }, + { + Status: metav1.ConditionUnknown, + Type: string(placementv1beta1.ClusterResourcePlacementWorkSynchronizedConditionType), + Reason: condition.WorkSynchronizedUnknownReason, + }, + }, + PlacementStatuses: []placementv1beta1.ResourcePlacementStatus{ + { + ClusterName: member1Name, + Conditions: []metav1.Condition{ + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceScheduledConditionType), + Reason: condition.ScheduleSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceRolloutStartedConditionType), + Reason: condition.RolloutStartedReason, + }, + { + Status: metav1.ConditionTrue, + Type: 
string(placementv1beta1.ResourceOverriddenConditionType), + Reason: condition.OverriddenSucceededReason, + }, + { + Status: metav1.ConditionUnknown, + Type: string(placementv1beta1.ResourceWorkSynchronizedConditionType), + Reason: condition.WorkSynchronizedUnknownReason, + }, + }, + }, + { + ClusterName: member2Name, + Conditions: []metav1.Condition{ + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceScheduledConditionType), + Reason: condition.ScheduleSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceRolloutStartedConditionType), + Reason: condition.RolloutStartedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceOverriddenConditionType), + Reason: condition.OverriddenSucceededReason, + }, + { + Status: metav1.ConditionUnknown, + Type: string(placementv1beta1.ResourceWorkSynchronizedConditionType), + Reason: condition.WorkSynchronizedUnknownReason, + }, + }, + }, + }, + }, + } + gotCRP = retrieveAndValidateClusterResourcePlacement(testCRPName, wantCRP) + + By("Ensure placement status metric was emitted") + wantMetrics := []*prometheusclientmodel.Metric{ + { + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: ptr.To(string(placementv1beta1.ClusterResourcePlacementScheduledConditionType))}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionUnknown))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }, + { + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: 
ptr.To(string(placementv1beta1.ClusterResourcePlacementRolloutStartedConditionType))}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionUnknown))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }, + { + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: ptr.To(string(placementv1beta1.ClusterResourcePlacementWorkSynchronizedConditionType))}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionUnknown))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }, + } + checkPlacementStatusMetric(customRegistry, wantMetrics) + + By("Create a reportDiff clusterResourceBinding on member-1") + member1Binding = updateClusterResourceBindingWithReportDiff(member1Binding) + + By("Create a reportDiff clusterResourceBinding on member-2") + member2Binding = updateClusterResourceBindingWithReportDiff(member2Binding) + + By("Validate CRP status") + wantCRP.Status.Conditions = []metav1.Condition{ + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementScheduledConditionType), + Reason: ResourceScheduleSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementRolloutStartedConditionType), + Reason: condition.RolloutStartedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementOverriddenConditionType), + Reason: condition.OverrideNotSpecifiedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementWorkSynchronizedConditionType), + Reason: condition.WorkSynchronizedReason, + }, + { + Status: metav1.ConditionTrue, + Type: 
string(placementv1beta1.ClusterResourcePlacementDiffReportedConditionType), + Reason: condition.DiffReportedStatusTrueReason, + }, + } + wantCRP.Status.PlacementStatuses = []placementv1beta1.ResourcePlacementStatus{ + { + ClusterName: member1Name, + Conditions: []metav1.Condition{ + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceScheduledConditionType), + Reason: condition.ScheduleSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceRolloutStartedConditionType), + Reason: condition.RolloutStartedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceOverriddenConditionType), + Reason: condition.OverriddenSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceWorkSynchronizedConditionType), + Reason: condition.WorkSynchronizedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourcesDiffReportedConditionType), + Reason: condition.DiffReportedStatusTrueReason, + }, + }, + }, + { + ClusterName: member2Name, + Conditions: []metav1.Condition{ + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceScheduledConditionType), + Reason: condition.ScheduleSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceRolloutStartedConditionType), + Reason: condition.RolloutStartedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceOverriddenConditionType), + Reason: condition.OverriddenSucceededReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceWorkSynchronizedConditionType), + Reason: condition.WorkSynchronizedReason, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourcesDiffReportedConditionType), + Reason: condition.DiffReportedStatusTrueReason, + }, + }, + }, + } + gotCRP = retrieveAndValidateClusterResourcePlacement(testCRPName, wantCRP) 
+ + By("Ensure placement status complete metric was emitted") + wantMetrics = append(wantMetrics, + &prometheusclientmodel.Metric{ + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: ptr.To("ClusterResourcePlacementCompleted")}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionTrue))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }, + ) + checkPlacementStatusMetric(customRegistry, wantMetrics) + }) + }) +}) + +func registerMetrics() { + // Create a test registry + customRegistry = prometheus.NewRegistry() + Expect(customRegistry.Register(metrics.FleetPlacementStatusLastTimeStampSeconds)).Should(Succeed()) + metrics.FleetPlacementStatusLastTimeStampSeconds.Reset() +} + +func checkPlacementStatusMetric(registry *prometheus.Registry, wantMetrics []*prometheusclientmodel.Metric) { + metricFamilies, err := registry.Gather() + Expect(err).Should(Succeed()) + var placementStatusMetrics []*prometheusclientmodel.Metric + for _, mf := range metricFamilies { + if mf.GetName() == "fleet_workload_placement_status_last_timestamp_seconds" { + placementStatusMetrics = mf.GetMetric() + } + } + // Sort the emitted metrics for comparison + Expect(cmp.Diff(placementStatusMetrics, wantMetrics, metricsUtils.MetricsCmpOptions...)).Should(BeEmpty(), "Placement status metrics do not match diff (-got, +want):") +} diff --git a/pkg/utils/controller/metrics/metrics.go b/pkg/utils/controller/metrics/metrics.go index a1e5fd192..b6fa9187c 100644 --- a/pkg/utils/controller/metrics/metrics.go +++ b/pkg/utils/controller/metrics/metrics.go @@ -61,10 +61,11 @@ var ( Help: "Number of currently used workers per controller", }, []string{"controller"}) - FleetPlacementStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: 
"fleet_workload_placement_complete", - Help: "Placement complete status ", - }, []string{"name"}) + // FleetPlacementStatusLastTimeStampSeconds is a prometheus metric which keeps track of the last placement status. + FleetPlacementStatusLastTimeStampSeconds = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "fleet_workload_placement_status_last_timestamp_seconds", + Help: "Timestamp in seconds of the last current placement status condition of crp.", + }, []string{"name", "generation", "conditionType", "status"}) // FleetEvictionStatus is prometheus metrics which holds the // status of eviction completion. @@ -81,7 +82,7 @@ func init() { FleetReconcileTime, FleetWorkerCount, FleetActiveWorkers, - FleetPlacementStatus, + FleetPlacementStatusLastTimeStampSeconds, FleetEvictionStatus, ) } diff --git a/test/utils/metrics/metrics.go b/test/utils/metrics/metrics.go new file mode 100644 index 000000000..fbf326625 --- /dev/null +++ b/test/utils/metrics/metrics.go @@ -0,0 +1,37 @@ +/* +Copyright 2025 The KubeFleet Authors. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package metrics + +import ( + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + prometheusclientmodel "github.com/prometheus/client_model/go" +) + +var ( + MetricsCmpOptions = []cmp.Option{ + cmpopts.SortSlices(func(a, b *prometheusclientmodel.Metric) bool { + return a.GetGauge().GetValue() < b.GetGauge().GetValue() // sort by time + }), + cmpopts.SortSlices(func(a, b *prometheusclientmodel.LabelPair) bool { + if a.Name == nil || b.Name == nil { + return a.Name == nil + } + return *a.Name < *b.Name // Sort by label + }), + cmp.Comparer(func(a, b *prometheusclientmodel.Gauge) bool { + return (a.GetValue() > 0) == (b.GetValue() > 0) + }), + cmpopts.IgnoreUnexported(prometheusclientmodel.Metric{}, prometheusclientmodel.LabelPair{}, prometheusclientmodel.Gauge{}), + } +) From e99b049686980658f2bf8554da5723bc7f2585c2 Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes Date: Wed, 16 Apr 2025 18:30:36 -0700 Subject: [PATCH 2/3] address comments Signed-off-by: Britania Rodriguez Reyes --- .../clusterresourceplacement/controller.go | 8 +- .../controller_integration_test.go | 127 +++++++++++++++++- test/utils/metrics/metrics.go | 11 +- 3 files changed, 134 insertions(+), 12 deletions(-) diff --git a/pkg/controllers/clusterresourceplacement/controller.go b/pkg/controllers/clusterresourceplacement/controller.go index c4a4ff8ae..6da08eca0 100644 --- a/pkg/controllers/clusterresourceplacement/controller.go +++ b/pkg/controllers/clusterresourceplacement/controller.go @@ -1052,7 +1052,7 @@ func emitPlacementStatusMetric(crp *fleetv1beta1.ClusterResourcePlacement) { status := "nil" cond := crp.GetCondition(string(fleetv1beta1.ClusterResourcePlacementScheduledConditionType)) if !condition.IsConditionStatusTrue(cond, crp.Generation) { - if cond != nil { + if cond != nil && cond.ObservedGeneration == crp.Generation { status = string(cond.Status) } metrics.FleetPlacementStatusLastTimeStampSeconds.WithLabelValues(crp.Name, 
strconv.FormatInt(crp.Generation, 10), string(fleetv1beta1.ClusterResourcePlacementScheduledConditionType), status).SetToCurrentTime() @@ -1064,7 +1064,7 @@ func emitPlacementStatusMetric(crp *fleetv1beta1.ClusterResourcePlacement) { for _, condType := range expectedCondTypes { cond = crp.GetCondition(string(condType.ClusterResourcePlacementConditionType())) if !condition.IsConditionStatusTrue(cond, crp.Generation) { - if cond != nil { + if cond != nil && cond.ObservedGeneration == crp.Generation { status = string(cond.Status) } metrics.FleetPlacementStatusLastTimeStampSeconds.WithLabelValues(crp.Name, strconv.FormatInt(crp.Generation, 10), string(condType.ClusterResourcePlacementConditionType()), status).SetToCurrentTime() @@ -1072,7 +1072,7 @@ func emitPlacementStatusMetric(crp *fleetv1beta1.ClusterResourcePlacement) { } } - // Emit the "ClusterResourcePlacementCompleted" condition metric to indicate that the CRP has completed. + // Emit the "Completed" condition metric to indicate that the CRP has completed. // This condition is used solely for metric reporting purposes. 
- metrics.FleetPlacementStatusLastTimeStampSeconds.WithLabelValues(crp.Name, strconv.FormatInt(crp.Generation, 10), "ClusterResourcePlacementCompleted", string(metav1.ConditionTrue)).SetToCurrentTime() + metrics.FleetPlacementStatusLastTimeStampSeconds.WithLabelValues(crp.Name, strconv.FormatInt(crp.Generation, 10), "Completed", string(metav1.ConditionTrue)).SetToCurrentTime() } diff --git a/pkg/controllers/clusterresourceplacement/controller_integration_test.go b/pkg/controllers/clusterresourceplacement/controller_integration_test.go index 0cbc88b49..9825a022f 100644 --- a/pkg/controllers/clusterresourceplacement/controller_integration_test.go +++ b/pkg/controllers/clusterresourceplacement/controller_integration_test.go @@ -319,7 +319,7 @@ func checkClusterResourceSnapshot() *placementv1beta1.ClusterResourceSnapshot { return retrieveAndValidateResourceSnapshot(crp, wantResourceSnapshot) } -func updateClusterSchedulingPolicySnapshotStatus(status metav1.ConditionStatus, clustersSelected bool) { +func updateClusterSchedulingPolicySnapshotStatus(status metav1.ConditionStatus, clustersSelected bool) *placementv1beta1.ClusterSchedulingPolicySnapshot { reason := ResourceScheduleSucceededReason if status == metav1.ConditionFalse { reason = ResourceScheduleFailedReason @@ -360,6 +360,7 @@ func updateClusterSchedulingPolicySnapshotStatus(status metav1.ConditionStatus, // Apply status update Expect(k8sClient.Status().Update(ctx, gotPolicySnapshot)).Should(Succeed(), "Failed to update the policy snapshot status") + return retrieveAndValidatePolicySnapshot(gotCRP, gotPolicySnapshot) } var _ = Describe("Test ClusterResourcePlacement Controller", func() { @@ -745,7 +746,7 @@ var _ = Describe("Test ClusterResourcePlacement Controller", func() { It("Emit metrics when CRP spec updates with different generations", func() { By("Update clusterSchedulingPolicySnapshot status to schedule success") - updateClusterSchedulingPolicySnapshotStatus(metav1.ConditionTrue, true) + 
gotPolicySnapshot = updateClusterSchedulingPolicySnapshotStatus(metav1.ConditionTrue, true) By("Create an overridden clusterResourceBinding on member-1") member1Binding = createOverriddenClusterResourceBinding(member1Name, gotPolicySnapshot, gotResourceSnapshot) @@ -1002,6 +1003,124 @@ var _ = Describe("Test ClusterResourcePlacement Controller", func() { }, }) checkPlacementStatusMetric(customRegistry, wantMetrics) + + By("Update clusterSchedulingPolicySnapshot status to schedule success") + // Update the annotation to match the CRP generation, which is now 2 + gotPolicySnapshot.Annotations[placementv1beta1.CRPGenerationAnnotation] = strconv.FormatInt(gotCRP.Generation, 10) + gotPolicySnapshot = retrieveAndValidatePolicySnapshot(gotCRP, gotPolicySnapshot) + gotPolicySnapshot = updateClusterSchedulingPolicySnapshotStatus(metav1.ConditionTrue, true) + + By("Validate CRP status with new observed generations for conditions") + wantCRP.Status.Conditions = []metav1.Condition{ + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementScheduledConditionType), + Reason: ResourceScheduleSucceededReason, + ObservedGeneration: 2, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementRolloutStartedConditionType), + Reason: condition.RolloutStartedReason, + ObservedGeneration: 2, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ClusterResourcePlacementOverriddenConditionType), + Reason: condition.OverrideNotSpecifiedReason, + ObservedGeneration: 2, + }, + { + Status: metav1.ConditionUnknown, + Type: string(placementv1beta1.ClusterResourcePlacementWorkSynchronizedConditionType), + Reason: condition.WorkSynchronizedUnknownReason, + ObservedGeneration: 2, + }, + } + wantCRP.Status.PlacementStatuses = []placementv1beta1.ResourcePlacementStatus{ + { + ClusterName: member1Name, + Conditions: []metav1.Condition{ + { + Status: metav1.ConditionTrue, + Type: 
string(placementv1beta1.ResourceScheduledConditionType), + Reason: condition.ScheduleSucceededReason, + ObservedGeneration: 2, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceRolloutStartedConditionType), + Reason: condition.RolloutStartedReason, + ObservedGeneration: 2, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceOverriddenConditionType), + Reason: condition.OverriddenSucceededReason, + ObservedGeneration: 2, + }, + { + Status: metav1.ConditionUnknown, + Type: string(placementv1beta1.ResourceWorkSynchronizedConditionType), + Reason: condition.WorkSynchronizedUnknownReason, + ObservedGeneration: 2, + }, + }, + }, + { + ClusterName: member2Name, + Conditions: []metav1.Condition{ + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceScheduledConditionType), + Reason: condition.ScheduleSucceededReason, + ObservedGeneration: 2, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceRolloutStartedConditionType), + Reason: condition.RolloutStartedReason, + ObservedGeneration: 2, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceOverriddenConditionType), + Reason: condition.OverriddenSucceededReason, + ObservedGeneration: 2, + }, + { + Status: metav1.ConditionTrue, + Type: string(placementv1beta1.ResourceWorkSynchronizedConditionType), + Reason: condition.WorkSynchronizedReason, + ObservedGeneration: 2, + }, + { + Status: metav1.ConditionUnknown, + Type: string(placementv1beta1.ResourcesAppliedConditionType), + Reason: condition.ApplyPendingReason, + ObservedGeneration: 2, + }, + }, + }, + } + gotCRP = retrieveAndValidateClusterResourcePlacement(testCRPName, wantCRP) + + By("Ensure placement status metric was emitted with different generations") + // When a CRP spec is updated, the generation of the CRP changes. Therefore, the observed generation for the conditions will also change. 
+ // Should have multiples of same condition type with different generations. + // In this case we have 2 metrics for different condition types as crp updates and its generation goes from 1 to 2. + wantMetrics = append(wantMetrics, &prometheusclientmodel.Metric{ + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, + {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, + {Name: ptr.To("conditionType"), Value: ptr.To(string(placementv1beta1.ClusterResourcePlacementWorkSynchronizedConditionType))}, + {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionUnknown))}, + }, + Gauge: &prometheusclientmodel.Gauge{ + Value: ptr.To(float64(time.Now().UnixNano()) / 1e9), + }, + }) + checkPlacementStatusMetric(customRegistry, wantMetrics) }) It("Emit metrics for complete CRP", func() { @@ -1181,7 +1300,7 @@ var _ = Describe("Test ClusterResourcePlacement Controller", func() { Label: []*prometheusclientmodel.LabelPair{ {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, - {Name: ptr.To("conditionType"), Value: ptr.To("ClusterResourcePlacementCompleted")}, + {Name: ptr.To("conditionType"), Value: ptr.To("Completed")}, {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionTrue))}, }, Gauge: &prometheusclientmodel.Gauge{ @@ -1664,7 +1783,7 @@ var _ = Describe("Test ClusterResourcePlacement Controller", func() { Label: []*prometheusclientmodel.LabelPair{ {Name: ptr.To("name"), Value: ptr.To(gotCRP.Name)}, {Name: ptr.To("generation"), Value: ptr.To(strconv.FormatInt(gotCRP.Generation, 10))}, - {Name: ptr.To("conditionType"), Value: ptr.To("ClusterResourcePlacementCompleted")}, + {Name: ptr.To("conditionType"), Value: ptr.To("Completed")}, {Name: ptr.To("status"), Value: ptr.To(string(corev1.ConditionTrue))}, }, Gauge: &prometheusclientmodel.Gauge{ diff --git a/test/utils/metrics/metrics.go 
b/test/utils/metrics/metrics.go index fbf326625..d618240e6 100644 --- a/test/utils/metrics/metrics.go +++ b/test/utils/metrics/metrics.go @@ -10,6 +10,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + +// Package metrics provides utilities for metrics. package metrics import ( @@ -19,15 +21,16 @@ import ( ) var ( + // MetricsCmpOptions defines comparison options for Prometheus metric structures. + // - Sorting metric value and its labels for deterministic ordering, + // - Comparing gauge values based on whether they were meaningfully set (i.e., > 0), + // - Ignoring unexported fields to avoid false mismatches due to internal state. MetricsCmpOptions = []cmp.Option{ cmpopts.SortSlices(func(a, b *prometheusclientmodel.Metric) bool { return a.GetGauge().GetValue() < b.GetGauge().GetValue() // sort by time }), cmpopts.SortSlices(func(a, b *prometheusclientmodel.LabelPair) bool { - if a.Name == nil || b.Name == nil { - return a.Name == nil - } - return *a.Name < *b.Name // Sort by label + return a.GetName() < b.GetName() // Sort by label }), cmp.Comparer(func(a, b *prometheusclientmodel.Gauge) bool { return (a.GetValue() > 0) == (b.GetValue() > 0) From c7ef88605766f0cb8ef4cdcca9132a4fe0297ed9 Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes Date: Thu, 17 Apr 2025 10:02:01 -0700 Subject: [PATCH 3/3] move defer for emit metrics Signed-off-by: Britania Rodriguez Reyes --- pkg/controllers/clusterresourceplacement/controller.go | 6 ++---- .../clusterresourceplacement/controller_integration_test.go | 5 ++++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pkg/controllers/clusterresourceplacement/controller.go b/pkg/controllers/clusterresourceplacement/controller.go index 6da08eca0..616143d01 100644 --- a/pkg/controllers/clusterresourceplacement/controller.go +++ b/pkg/controllers/clusterresourceplacement/controller.go @@ 
-59,14 +59,13 @@ func (r *Reconciler) Reconcile(ctx context.Context, key controller.QueueKey) (ct return ctrl.Result{}, nil // ignore this unexpected error } startTime := time.Now() - crp := fleetv1beta1.ClusterResourcePlacement{} klog.V(2).InfoS("ClusterResourcePlacement reconciliation starts", "clusterResourcePlacement", name) defer func() { latency := time.Since(startTime).Milliseconds() klog.V(2).InfoS("ClusterResourcePlacement reconciliation ends", "clusterResourcePlacement", name, "latency", latency) - emitPlacementStatusMetric(&crp) }() + crp := fleetv1beta1.ClusterResourcePlacement{} if err := r.Client.Get(ctx, types.NamespacedName{Name: name}, &crp); err != nil { if apierrors.IsNotFound(err) { klog.V(4).InfoS("Ignoring NotFound clusterResourcePlacement", "clusterResourcePlacement", name) @@ -88,7 +87,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, key controller.QueueKey) (ct return ctrl.Result{}, controller.NewUpdateIgnoreConflictError(err) } } - + defer emitPlacementStatusMetric(&crp) return r.handleUpdate(ctx, &crp) } @@ -114,7 +113,6 @@ func (r *Reconciler) handleDelete(ctx context.Context, crp *fleetv1beta1.Cluster } klog.V(2).InfoS("Removed crp-cleanup finalizer", "clusterResourcePlacement", crpKObj) r.Recorder.Event(crp, corev1.EventTypeNormal, "PlacementCleanupFinalizerRemoved", "Deleted the snapshots and removed the placement cleanup finalizer") - return ctrl.Result{}, nil } diff --git a/pkg/controllers/clusterresourceplacement/controller_integration_test.go b/pkg/controllers/clusterresourceplacement/controller_integration_test.go index 9825a022f..67cd5d48c 100644 --- a/pkg/controllers/clusterresourceplacement/controller_integration_test.go +++ b/pkg/controllers/clusterresourceplacement/controller_integration_test.go @@ -1,9 +1,12 @@ /* Copyright 2025 The KubeFleet Authors. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.