From 1ebc680d16a7368dbd4fe44ebee4c372b0199086 Mon Sep 17 00:00:00 2001
From: dddddai
Date: Wed, 25 May 2022 16:35:53 +0800
Subject: [PATCH] add cluster success threshold

Signed-off-by: dddddai
---
 cmd/agent/app/agent.go                        |  2 ++
 cmd/agent/app/options/options.go              |  3 ++
 .../app/controllermanager.go                  |  2 ++
 cmd/controller-manager/app/options/options.go |  3 ++
 pkg/controllers/context/context.go            |  2 ++
 .../status/cluster_condition_cache.go         | 18 +++++++++--
 .../status/cluster_condition_cache_test.go    | 31 ++++++++++++++++---
 .../status/cluster_status_controller.go       |  9 ++++--
 8 files changed, 59 insertions(+), 11 deletions(-)

diff --git a/cmd/agent/app/agent.go b/cmd/agent/app/agent.go
index 477def79e07c..9d08c75df7ad 100644
--- a/cmd/agent/app/agent.go
+++ b/cmd/agent/app/agent.go
@@ -185,6 +185,7 @@ func setupControllers(mgr controllerruntime.Manager, opts *options.Options, stop
         ClusterStatusUpdateFrequency:      opts.ClusterStatusUpdateFrequency,
         ClusterLeaseDuration:              opts.ClusterLeaseDuration,
         ClusterLeaseRenewIntervalFraction: opts.ClusterLeaseRenewIntervalFraction,
+        ClusterSuccessThreshold:           opts.ClusterSuccessThreshold,
         ClusterFailureThreshold:           opts.ClusterFailureThreshold,
         ClusterCacheSyncTimeout:           opts.ClusterCacheSyncTimeout,
         ClusterAPIQPS:                     opts.ClusterAPIQPS,
@@ -223,6 +224,7 @@ func startClusterStatusController(ctx controllerscontext.Context) (bool, error)
         ClusterStatusUpdateFrequency:      ctx.Opts.ClusterStatusUpdateFrequency,
         ClusterLeaseDuration:              ctx.Opts.ClusterLeaseDuration,
         ClusterLeaseRenewIntervalFraction: ctx.Opts.ClusterLeaseRenewIntervalFraction,
+        ClusterSuccessThreshold:           ctx.Opts.ClusterSuccessThreshold,
         ClusterFailureThreshold:           ctx.Opts.ClusterFailureThreshold,
         ClusterCacheSyncTimeout:           ctx.Opts.ClusterCacheSyncTimeout,
         RateLimiterOptions:                ctx.Opts.RateLimiterOptions,
diff --git a/cmd/agent/app/options/options.go b/cmd/agent/app/options/options.go
index cf303c2fdd3a..b9bb83efd26c 100644
--- a/cmd/agent/app/options/options.go
+++ b/cmd/agent/app/options/options.go
@@ -38,6 +38,8 @@ type Options struct {
     // ClusterLeaseRenewIntervalFraction is a fraction coordinated with ClusterLeaseDuration that
     // how long the current holder of a lease has last updated the lease.
     ClusterLeaseRenewIntervalFraction float64
+    // ClusterSuccessThreshold is the duration of successes for the cluster to be considered healthy after recovery.
+    ClusterSuccessThreshold metav1.Duration
     // ClusterFailureThreshold is the duration of failure for the cluster to be considered unhealthy.
     ClusterFailureThreshold metav1.Duration
     // ClusterAPIQPS is the QPS to use while talking with cluster kube-apiserver.
@@ -100,6 +102,7 @@ func (o *Options) AddFlags(fs *pflag.FlagSet, allControllers []string) {
         "Specifies the expiration period of a cluster lease.")
     fs.Float64Var(&o.ClusterLeaseRenewIntervalFraction, "cluster-lease-renew-interval-fraction", 0.25,
         "Specifies the cluster lease renew interval fraction.")
+    fs.DurationVar(&o.ClusterSuccessThreshold.Duration, "cluster-success-threshold", 30*time.Second, "The duration of successes for the cluster to be considered healthy after recovery.")
     fs.DurationVar(&o.ClusterFailureThreshold.Duration, "cluster-failure-threshold", 30*time.Second, "The duration of failure for the cluster to be considered unhealthy.")
     fs.Float32Var(&o.ClusterAPIQPS, "cluster-api-qps", 40.0, "QPS to use while talking with cluster kube-apiserver. Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags.")
     fs.IntVar(&o.ClusterAPIBurst, "cluster-api-burst", 60, "Burst to use while talking with cluster kube-apiserver. Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags.")
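Note on the new option: both karmada-agent and (below) karmada-controller-manager register --cluster-success-threshold right next to the existing --cluster-failure-threshold, with the same 30s default. A minimal, self-contained sketch of how the pair parses is shown here for illustration only; it is not code from this patch. The "demo" flag-set name and the plain time.Duration variables are made up, while the real options store metav1.Duration.

    package main

    import (
        "fmt"
        "time"

        "github.com/spf13/pflag"
    )

    func main() {
        var success, failure time.Duration
        fs := pflag.NewFlagSet("demo", pflag.ExitOnError)
        // Same flag names and defaults as registered in the patch above.
        fs.DurationVar(&success, "cluster-success-threshold", 30*time.Second,
            "The duration of successes for the cluster to be considered healthy after recovery.")
        fs.DurationVar(&failure, "cluster-failure-threshold", 30*time.Second,
            "The duration of failure for the cluster to be considered unhealthy.")
        // Overriding only one flag leaves the other at its 30s default.
        _ = fs.Parse([]string{"--cluster-success-threshold=1m"})
        fmt.Println(success, failure) // 1m0s 30s
    }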
diff --git a/cmd/controller-manager/app/controllermanager.go b/cmd/controller-manager/app/controllermanager.go
index 8a4575661950..4f5dabd845d2 100644
--- a/cmd/controller-manager/app/controllermanager.go
+++ b/cmd/controller-manager/app/controllermanager.go
@@ -233,6 +233,7 @@ func startClusterStatusController(ctx controllerscontext.Context) (enabled bool,
         ClusterStatusUpdateFrequency:      opts.ClusterStatusUpdateFrequency,
         ClusterLeaseDuration:              opts.ClusterLeaseDuration,
         ClusterLeaseRenewIntervalFraction: opts.ClusterLeaseRenewIntervalFraction,
+        ClusterSuccessThreshold:           opts.ClusterSuccessThreshold,
         ClusterFailureThreshold:           opts.ClusterFailureThreshold,
         ClusterCacheSyncTimeout:           opts.ClusterCacheSyncTimeout,
         RateLimiterOptions:                ctx.Opts.RateLimiterOptions,
@@ -493,6 +494,7 @@ func setupControllers(mgr controllerruntime.Manager, opts *options.Options, stop
         FailoverEvictionTimeout:           opts.FailoverEvictionTimeout,
         ClusterLeaseDuration:              opts.ClusterLeaseDuration,
         ClusterLeaseRenewIntervalFraction: opts.ClusterLeaseRenewIntervalFraction,
+        ClusterSuccessThreshold:           opts.ClusterSuccessThreshold,
         ClusterFailureThreshold:           opts.ClusterFailureThreshold,
         ClusterCacheSyncTimeout:           opts.ClusterCacheSyncTimeout,
         ClusterAPIQPS:                     opts.ClusterAPIQPS,
diff --git a/cmd/controller-manager/app/options/options.go b/cmd/controller-manager/app/options/options.go
index 982cae509cf2..f6e1fa40772e 100644
--- a/cmd/controller-manager/app/options/options.go
+++ b/cmd/controller-manager/app/options/options.go
@@ -46,6 +46,8 @@ type Options struct {
     // ClusterLeaseRenewIntervalFraction is a fraction coordinated with ClusterLeaseDuration that
     // how long the current holder of a lease has last updated the lease.
     ClusterLeaseRenewIntervalFraction float64
+    // ClusterSuccessThreshold is the duration of successes for the cluster to be considered healthy after recovery.
+    ClusterSuccessThreshold metav1.Duration
     // ClusterFailureThreshold is the duration of failure for the cluster to be considered unhealthy.
     ClusterFailureThreshold metav1.Duration
     // ClusterMonitorPeriod represents cluster-controller monitoring period, i.e. how often does
@@ -136,6 +138,7 @@ func (o *Options) AddFlags(flags *pflag.FlagSet, allControllers, disabledByDefau
         "Specifies the expiration period of a cluster lease.")
     flags.Float64Var(&o.ClusterLeaseRenewIntervalFraction, "cluster-lease-renew-interval-fraction", 0.25,
         "Specifies the cluster lease renew interval fraction.")
+    flags.DurationVar(&o.ClusterSuccessThreshold.Duration, "cluster-success-threshold", 30*time.Second, "The duration of successes for the cluster to be considered healthy after recovery.")
     flags.DurationVar(&o.ClusterFailureThreshold.Duration, "cluster-failure-threshold", 30*time.Second, "The duration of failure for the cluster to be considered unhealthy.")
     flags.DurationVar(&o.ClusterMonitorPeriod.Duration, "cluster-monitor-period", 5*time.Second,
         "Specifies how often karmada-controller-manager monitors cluster health status.")
diff --git a/pkg/controllers/context/context.go b/pkg/controllers/context/context.go
index 45e5d493f839..8404551c69f5 100644
--- a/pkg/controllers/context/context.go
+++ b/pkg/controllers/context/context.go
@@ -40,6 +40,8 @@ type Options struct {
     // ClusterLeaseRenewIntervalFraction is a fraction coordinated with ClusterLeaseDuration that
     // how long the current holder of a lease has last updated the lease.
     ClusterLeaseRenewIntervalFraction float64
+    // ClusterSuccessThreshold is the duration of successes for the cluster to be considered healthy after recovery.
+    ClusterSuccessThreshold metav1.Duration
     // ClusterFailureThreshold is the duration of failure for the cluster to be considered unhealthy.
     ClusterFailureThreshold metav1.Duration
     // ClusterCacheSyncTimeout is the timeout period waiting for cluster cache to sync.
diff --git a/pkg/controllers/status/cluster_condition_cache.go b/pkg/controllers/status/cluster_condition_cache.go
index f66a4ebb0e94..866bf50ad500 100644
--- a/pkg/controllers/status/cluster_condition_cache.go
+++ b/pkg/controllers/status/cluster_condition_cache.go
@@ -19,6 +19,8 @@ type clusterData struct {
 
 type clusterConditionStore struct {
     clusterDataMap sync.Map
+    // successThreshold is the duration of successes for the cluster to be considered healthy after recovery.
+    successThreshold time.Duration
     // failureThreshold is the duration of failure for the cluster to be considered unhealthy.
     failureThreshold time.Duration
 }
@@ -46,9 +48,19 @@ func (c *clusterConditionStore) thresholdAdjustedReadyCondition(cluster *cluster
         }
         c.update(cluster.Name, saved)
     }
-    if observedReadyCondition.Status != metav1.ConditionTrue &&
-        curReadyCondition.Status == metav1.ConditionTrue &&
-        now.Before(saved.thresholdStartTime.Add(c.failureThreshold)) {
+
+    var threshold time.Duration
+    if observedReadyCondition.Status == metav1.ConditionTrue {
+        threshold = c.successThreshold
+    } else {
+        threshold = c.failureThreshold
+    }
+
+    // we only care about true/not true
+    // for unknown->false, just return the observed ready condition
+    if ((observedReadyCondition.Status == metav1.ConditionTrue && curReadyCondition.Status != metav1.ConditionTrue) ||
+        (observedReadyCondition.Status != metav1.ConditionTrue && curReadyCondition.Status == metav1.ConditionTrue)) &&
+        now.Before(saved.thresholdStartTime.Add(threshold)) {
         // retain old status until threshold exceeded to avoid network unstable problems.
         return curReadyCondition
     }
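The core behavioral change is the hunk above: a Ready flip in either direction is now held back until it has persisted for the matching threshold (successThreshold when the newly observed condition is True, failureThreshold otherwise), and an Unknown-to-False transition is passed through immediately. A standalone sketch of that decision follows; the function and parameter names (holdOldCondition, flipStart) are made up here, only the logic is taken from the patch.

    package main

    import (
        "fmt"
        "time"

        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    )

    // holdOldCondition reports whether the previously published Ready status should be
    // retained: a True<->not-True flip is ignored until it has lasted for the matching
    // threshold, mirroring what thresholdAdjustedReadyCondition does above.
    func holdOldCondition(observed, current metav1.ConditionStatus, flipStart time.Time,
        successThreshold, failureThreshold time.Duration) bool {
        threshold := failureThreshold
        if observed == metav1.ConditionTrue {
            threshold = successThreshold
        }
        flipped := (observed == metav1.ConditionTrue) != (current == metav1.ConditionTrue)
        return flipped && time.Now().Before(flipStart.Add(threshold))
    }

    func main() {
        // Recovered 10s ago with a 30s success threshold: keep reporting NotReady for now.
        fmt.Println(holdOldCondition(metav1.ConditionTrue, metav1.ConditionFalse,
            time.Now().Add(-10*time.Second), 30*time.Second, 30*time.Second)) // true
        // Unknown -> False is not a True/not-True flip, so the observed status is used at once.
        fmt.Println(holdOldCondition(metav1.ConditionFalse, metav1.ConditionUnknown,
            time.Now(), 30*time.Second, 30*time.Second)) // false
    }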
diff --git a/pkg/controllers/status/cluster_condition_cache_test.go b/pkg/controllers/status/cluster_condition_cache_test.go
index e374505af15c..72268d79c7d7 100644
--- a/pkg/controllers/status/cluster_condition_cache_test.go
+++ b/pkg/controllers/status/cluster_condition_cache_test.go
@@ -11,6 +11,7 @@ import (
 )
 
 func TestThresholdAdjustedReadyCondition(t *testing.T) {
+    clusterSuccessThreshold := 30 * time.Second
     clusterFailureThreshold := 30 * time.Second
 
     tests := []struct {
@@ -65,7 +66,7 @@ func TestThresholdAdjustedReadyCondition(t *testing.T) {
             },
         },
         {
-            name: "cluster becomes not ready but still not reach threshold",
+            name: "cluster becomes not ready but still not reach failure threshold",
             clusterData: &clusterData{
                 readyCondition:     metav1.ConditionFalse,
                 thresholdStartTime: time.Now().Add(-clusterFailureThreshold / 2),
@@ -84,7 +85,7 @@ func TestThresholdAdjustedReadyCondition(t *testing.T) {
             },
         },
         {
-            name: "cluster becomes not ready and reaches threshold",
+            name: "cluster becomes not ready and reaches failure threshold",
             clusterData: &clusterData{
                 readyCondition:     metav1.ConditionFalse,
                 thresholdStartTime: time.Now().Add(-clusterFailureThreshold),
@@ -122,10 +123,29 @@ func TestThresholdAdjustedReadyCondition(t *testing.T) {
             },
         },
         {
-            name: "cluster recovers",
+            name: "cluster recovers but still not reach success threshold",
             clusterData: &clusterData{
-                readyCondition:     metav1.ConditionFalse,
-                thresholdStartTime: time.Now().Add(-3 * clusterFailureThreshold),
+                readyCondition:     metav1.ConditionTrue,
+                thresholdStartTime: time.Now().Add(-clusterSuccessThreshold / 2),
+            },
+            currentCondition: &metav1.Condition{
+                Type:   clusterv1alpha1.ClusterConditionReady,
+                Status: metav1.ConditionFalse,
+            },
+            observedCondition: &metav1.Condition{
+                Type:   clusterv1alpha1.ClusterConditionReady,
+                Status: metav1.ConditionTrue,
+            },
+            expectedCondition: &metav1.Condition{
+                Type:   clusterv1alpha1.ClusterConditionReady,
+                Status: metav1.ConditionFalse,
+            },
+        },
+        {
+            name: "cluster recovers and reaches success threshold",
+            clusterData: &clusterData{
+                readyCondition:     metav1.ConditionTrue,
+                thresholdStartTime: time.Now().Add(-clusterSuccessThreshold),
             },
             currentCondition: &metav1.Condition{
                 Type:   clusterv1alpha1.ClusterConditionReady,
@@ -145,6 +165,7 @@ func TestThresholdAdjustedReadyCondition(t *testing.T) {
     for _, tt := range tests {
         t.Run(tt.name, func(t *testing.T) {
             cache := clusterConditionStore{
+                successThreshold: clusterSuccessThreshold,
                 failureThreshold: clusterFailureThreshold,
             }
 
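The new table entries cover both directions of the flip (hold NotReady during the success window, then report Ready once it elapses). If the unknown-to-false path called out in the code comment ever needs explicit coverage, one more entry in the same shape as the cases above would do it. The entry below is only a suggestion, not part of this patch:

    {
        name: "cluster ready condition changes from unknown to not ready",
        clusterData: &clusterData{
            readyCondition:     metav1.ConditionUnknown,
            thresholdStartTime: time.Now(),
        },
        currentCondition: &metav1.Condition{
            Type:   clusterv1alpha1.ClusterConditionReady,
            Status: metav1.ConditionUnknown,
        },
        observedCondition: &metav1.Condition{
            Type:   clusterv1alpha1.ClusterConditionReady,
            Status: metav1.ConditionFalse,
        },
        expectedCondition: &metav1.Condition{
            Type:   clusterv1alpha1.ClusterConditionReady,
            Status: metav1.ConditionFalse,
        },
    },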
diff --git a/pkg/controllers/status/cluster_status_controller.go b/pkg/controllers/status/cluster_status_controller.go
index 556a6baf32fb..196702f6f9bf 100644
--- a/pkg/controllers/status/cluster_status_controller.go
+++ b/pkg/controllers/status/cluster_status_controller.go
@@ -77,6 +77,8 @@ type ClusterStatusController struct {
     ClusterLeaseRenewIntervalFraction float64
     // ClusterLeaseControllers store clusters and their corresponding lease controllers.
     ClusterLeaseControllers sync.Map
+    // ClusterSuccessThreshold is the duration of successes for the cluster to be considered healthy after recovery.
+    ClusterSuccessThreshold metav1.Duration
     // ClusterFailureThreshold is the duration of failure for the cluster to be considered unhealthy.
     ClusterFailureThreshold metav1.Duration
     // clusterConditionCache stores the condition status of each cluster.
@@ -117,6 +119,7 @@ func (c *ClusterStatusController) Reconcile(ctx context.Context, req controllerr
 // SetupWithManager creates a controller and register to controller manager.
 func (c *ClusterStatusController) SetupWithManager(mgr controllerruntime.Manager) error {
     c.clusterConditionCache = clusterConditionStore{
+        successThreshold: c.ClusterSuccessThreshold.Duration,
         failureThreshold: c.ClusterFailureThreshold.Duration,
     }
     return controllerruntime.NewControllerManagedBy(mgr).For(&clusterv1alpha1.Cluster{}).WithEventFilter(c.PredicateFunc).WithOptions(controller.Options{
@@ -149,7 +152,7 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu
     }
 
     // skip collecting cluster status if not ready
-    if online && healthy {
+    if online && healthy && readyCondition.Status == metav1.ConditionTrue {
         // get or create informer for pods and nodes in member cluster
         clusterInformerManager, err := c.buildInformerForCluster(cluster)
         if err != nil {
@@ -433,7 +436,7 @@ func getNodeSummary(nodes []*corev1.Node) *clusterv1alpha1.NodeSummary {
         }
     }
 
-    var nodeSummary = &clusterv1alpha1.NodeSummary{}
+    nodeSummary := &clusterv1alpha1.NodeSummary{}
     nodeSummary.TotalNum = int32(totalNum)
     nodeSummary.ReadyNum = int32(readyNum)
 
@@ -445,7 +448,7 @@ func getResourceSummary(nodes []*corev1.Node, pods []*corev1.Pod) *clusterv1alph
     allocating := getAllocatingResource(pods)
     allocated := getAllocatedResource(pods)
 
-    var resourceSummary = &clusterv1alpha1.ResourceSummary{}
+    resourceSummary := &clusterv1alpha1.ResourceSummary{}
     resourceSummary.Allocatable = allocatable
     resourceSummary.Allocating = allocating
     resourceSummary.Allocated = allocated
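One practical effect of the syncClusterStatus change: detailed status collection (informers, node and pod summaries) now also requires the Ready condition returned by the condition cache to be True, not just a successful online/health probe, so a cluster that has only just recovered is left alone until its condition actually reports Ready. A trivial sketch of the new gate, with a made-up helper name (the patch inlines this expression directly in syncClusterStatus):

    package main

    import (
        "fmt"

        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    )

    // shouldCollectDetailedStatus mirrors the condition added to syncClusterStatus.
    func shouldCollectDetailedStatus(online, healthy bool, ready metav1.ConditionStatus) bool {
        return online && healthy && ready == metav1.ConditionTrue
    }

    func main() {
        // Probes succeed, but the threshold-adjusted Ready condition is still False
        // inside the success-threshold window, so collection is skipped for now.
        fmt.Println(shouldCollectDetailedStatus(true, true, metav1.ConditionFalse)) // false
    }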