From 1ebc680d16a7368dbd4fe44ebee4c372b0199086 Mon Sep 17 00:00:00 2001
From: dddddai
Date: Wed, 25 May 2022 16:35:53 +0800
Subject: [PATCH] add cluster success threshold

Signed-off-by: dddddai
---
 cmd/agent/app/agent.go                        |  2 ++
 cmd/agent/app/options/options.go              |  3 ++
 .../app/controllermanager.go                  |  2 ++
 cmd/controller-manager/app/options/options.go |  3 ++
 pkg/controllers/context/context.go            |  2 ++
 .../status/cluster_condition_cache.go         | 18 +++++++++--
 .../status/cluster_condition_cache_test.go    | 31 ++++++++++++++++---
 .../status/cluster_status_controller.go       |  9 ++++--
 8 files changed, 59 insertions(+), 11 deletions(-)

diff --git a/cmd/agent/app/agent.go b/cmd/agent/app/agent.go
index 477def79e07c..9d08c75df7ad 100644
--- a/cmd/agent/app/agent.go
+++ b/cmd/agent/app/agent.go
@@ -185,6 +185,7 @@ func setupControllers(mgr controllerruntime.Manager, opts *options.Options, stop
         ClusterStatusUpdateFrequency:      opts.ClusterStatusUpdateFrequency,
         ClusterLeaseDuration:              opts.ClusterLeaseDuration,
         ClusterLeaseRenewIntervalFraction: opts.ClusterLeaseRenewIntervalFraction,
+        ClusterSuccessThreshold:           opts.ClusterSuccessThreshold,
         ClusterFailureThreshold:           opts.ClusterFailureThreshold,
         ClusterCacheSyncTimeout:           opts.ClusterCacheSyncTimeout,
         ClusterAPIQPS:                     opts.ClusterAPIQPS,
@@ -223,6 +224,7 @@ func startClusterStatusController(ctx controllerscontext.Context) (bool, error)
         ClusterStatusUpdateFrequency:      ctx.Opts.ClusterStatusUpdateFrequency,
         ClusterLeaseDuration:              ctx.Opts.ClusterLeaseDuration,
         ClusterLeaseRenewIntervalFraction: ctx.Opts.ClusterLeaseRenewIntervalFraction,
+        ClusterSuccessThreshold:           ctx.Opts.ClusterSuccessThreshold,
         ClusterFailureThreshold:           ctx.Opts.ClusterFailureThreshold,
         ClusterCacheSyncTimeout:           ctx.Opts.ClusterCacheSyncTimeout,
         RateLimiterOptions:                ctx.Opts.RateLimiterOptions,
diff --git a/cmd/agent/app/options/options.go b/cmd/agent/app/options/options.go
index cf303c2fdd3a..b9bb83efd26c 100644
--- a/cmd/agent/app/options/options.go
+++ b/cmd/agent/app/options/options.go
@@ -38,6 +38,8 @@ type Options struct {
     // ClusterLeaseRenewIntervalFraction is a fraction coordinated with ClusterLeaseDuration that
     // how long the current holder of a lease has last updated the lease.
     ClusterLeaseRenewIntervalFraction float64
+    // ClusterSuccessThreshold is the duration of successes for the cluster to be considered healthy after recovery.
+    ClusterSuccessThreshold metav1.Duration
     // ClusterFailureThreshold is the duration of failure for the cluster to be considered unhealthy.
     ClusterFailureThreshold metav1.Duration
     // ClusterAPIQPS is the QPS to use while talking with cluster kube-apiserver.
@@ -100,6 +102,7 @@ func (o *Options) AddFlags(fs *pflag.FlagSet, allControllers []string) {
         "Specifies the expiration period of a cluster lease.")
     fs.Float64Var(&o.ClusterLeaseRenewIntervalFraction, "cluster-lease-renew-interval-fraction", 0.25,
         "Specifies the cluster lease renew interval fraction.")
+    fs.DurationVar(&o.ClusterSuccessThreshold.Duration, "cluster-success-threshold", 30*time.Second, "The duration of successes for the cluster to be considered healthy after recovery.")
     fs.DurationVar(&o.ClusterFailureThreshold.Duration, "cluster-failure-threshold", 30*time.Second, "The duration of failure for the cluster to be considered unhealthy.")
     fs.Float32Var(&o.ClusterAPIQPS, "cluster-api-qps", 40.0, "QPS to use while talking with cluster kube-apiserver. Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags.")
     fs.IntVar(&o.ClusterAPIBurst, "cluster-api-burst", 60, "Burst to use while talking with cluster kube-apiserver. Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags.")
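Note on the new option: both karmada-agent and (below) karmada-controller-manager register --cluster-success-threshold right next to the existing --cluster-failure-threshold, with the same 30s default. A minimal, self-contained sketch of how the pair parses is shown here for illustration only; it is not code from this patch. The "demo" flag-set name and the plain time.Duration variables are made up, while the real options store metav1.Duration.

    package main

    import (
        "fmt"
        "time"

        "github.com/spf13/pflag"
    )

    func main() {
        var success, failure time.Duration
        fs := pflag.NewFlagSet("demo", pflag.ExitOnError)
        // Same flag names and defaults as registered in the patch above.
        fs.DurationVar(&success, "cluster-success-threshold", 30*time.Second,
            "The duration of successes for the cluster to be considered healthy after recovery.")
        fs.DurationVar(&failure, "cluster-failure-threshold", 30*time.Second,
            "The duration of failure for the cluster to be considered unhealthy.")
        // Overriding only one flag leaves the other at its 30s default.
        _ = fs.Parse([]string{"--cluster-success-threshold=1m"})
        fmt.Println(success, failure) // 1m0s 30s
    }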
diff --git a/cmd/controller-manager/app/controllermanager.go b/cmd/controller-manager/app/controllermanager.go
index 8a4575661950..4f5dabd845d2 100644
--- a/cmd/controller-manager/app/controllermanager.go
+++ b/cmd/controller-manager/app/controllermanager.go
@@ -233,6 +233,7 @@ func startClusterStatusController(ctx controllerscontext.Context) (enabled bool,
         ClusterStatusUpdateFrequency:      opts.ClusterStatusUpdateFrequency,
         ClusterLeaseDuration:              opts.ClusterLeaseDuration,
         ClusterLeaseRenewIntervalFraction: opts.ClusterLeaseRenewIntervalFraction,
+        ClusterSuccessThreshold:           opts.ClusterSuccessThreshold,
         ClusterFailureThreshold:           opts.ClusterFailureThreshold,
         ClusterCacheSyncTimeout:           opts.ClusterCacheSyncTimeout,
         RateLimiterOptions:                ctx.Opts.RateLimiterOptions,
@@ -493,6 +494,7 @@ func setupControllers(mgr controllerruntime.Manager, opts *options.Options, stop
         FailoverEvictionTimeout:           opts.FailoverEvictionTimeout,
         ClusterLeaseDuration:              opts.ClusterLeaseDuration,
         ClusterLeaseRenewIntervalFraction: opts.ClusterLeaseRenewIntervalFraction,
+        ClusterSuccessThreshold:           opts.ClusterSuccessThreshold,
         ClusterFailureThreshold:           opts.ClusterFailureThreshold,
         ClusterCacheSyncTimeout:           opts.ClusterCacheSyncTimeout,
         ClusterAPIQPS:                     opts.ClusterAPIQPS,
diff --git a/cmd/controller-manager/app/options/options.go b/cmd/controller-manager/app/options/options.go
index 982cae509cf2..f6e1fa40772e 100644
--- a/cmd/controller-manager/app/options/options.go
+++ b/cmd/controller-manager/app/options/options.go
@@ -46,6 +46,8 @@ type Options struct {
     // ClusterLeaseRenewIntervalFraction is a fraction coordinated with ClusterLeaseDuration that
     // how long the current holder of a lease has last updated the lease.
     ClusterLeaseRenewIntervalFraction float64
+    // ClusterSuccessThreshold is the duration of successes for the cluster to be considered healthy after recovery.
+    ClusterSuccessThreshold metav1.Duration
     // ClusterFailureThreshold is the duration of failure for the cluster to be considered unhealthy.
     ClusterFailureThreshold metav1.Duration
     // ClusterMonitorPeriod represents cluster-controller monitoring period, i.e. how often does
@@ -136,6 +138,7 @@ func (o *Options) AddFlags(flags *pflag.FlagSet, allControllers, disabledByDefau
         "Specifies the expiration period of a cluster lease.")
     flags.Float64Var(&o.ClusterLeaseRenewIntervalFraction, "cluster-lease-renew-interval-fraction", 0.25,
         "Specifies the cluster lease renew interval fraction.")
+    flags.DurationVar(&o.ClusterSuccessThreshold.Duration, "cluster-success-threshold", 30*time.Second, "The duration of successes for the cluster to be considered healthy after recovery.")
     flags.DurationVar(&o.ClusterFailureThreshold.Duration, "cluster-failure-threshold", 30*time.Second, "The duration of failure for the cluster to be considered unhealthy.")
     flags.DurationVar(&o.ClusterMonitorPeriod.Duration, "cluster-monitor-period", 5*time.Second,
         "Specifies how often karmada-controller-manager monitors cluster health status.")
diff --git a/pkg/controllers/context/context.go b/pkg/controllers/context/context.go
index 45e5d493f839..8404551c69f5 100644
--- a/pkg/controllers/context/context.go
+++ b/pkg/controllers/context/context.go
@@ -40,6 +40,8 @@ type Options struct {
     // ClusterLeaseRenewIntervalFraction is a fraction coordinated with ClusterLeaseDuration that
     // how long the current holder of a lease has last updated the lease.
     ClusterLeaseRenewIntervalFraction float64
+    // ClusterSuccessThreshold is the duration of successes for the cluster to be considered healthy after recovery.
+    ClusterSuccessThreshold metav1.Duration
     // ClusterFailureThreshold is the duration of failure for the cluster to be considered unhealthy.
     ClusterFailureThreshold metav1.Duration
     // ClusterCacheSyncTimeout is the timeout period waiting for cluster cache to sync.
diff --git a/pkg/controllers/status/cluster_condition_cache.go b/pkg/controllers/status/cluster_condition_cache.go
index f66a4ebb0e94..866bf50ad500 100644
--- a/pkg/controllers/status/cluster_condition_cache.go
+++ b/pkg/controllers/status/cluster_condition_cache.go
@@ -19,6 +19,8 @@ type clusterData struct {
 
 type clusterConditionStore struct {
     clusterDataMap sync.Map
+    // successThreshold is the duration of successes for the cluster to be considered healthy after recovery.
+    successThreshold time.Duration
     // failureThreshold is the duration of failure for the cluster to be considered unhealthy.
     failureThreshold time.Duration
 }
@@ -46,9 +48,19 @@ func (c *clusterConditionStore) thresholdAdjustedReadyCondition(cluster *cluster
         }
         c.update(cluster.Name, saved)
     }
-    if observedReadyCondition.Status != metav1.ConditionTrue &&
-        curReadyCondition.Status == metav1.ConditionTrue &&
-        now.Before(saved.thresholdStartTime.Add(c.failureThreshold)) {
+
+    var threshold time.Duration
+    if observedReadyCondition.Status == metav1.ConditionTrue {
+        threshold = c.successThreshold
+    } else {
+        threshold = c.failureThreshold
+    }
+
+    // we only care about true/not true
+    // for unknown->false, just return the observed ready condition
+    if ((observedReadyCondition.Status == metav1.ConditionTrue && curReadyCondition.Status != metav1.ConditionTrue) ||
+        (observedReadyCondition.Status != metav1.ConditionTrue && curReadyCondition.Status == metav1.ConditionTrue)) &&
+        now.Before(saved.thresholdStartTime.Add(threshold)) {
         // retain old status until threshold exceeded to avoid network unstable problems.
         return curReadyCondition
     }
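The core behavioral change is the hunk above: a Ready flip in either direction is now held back until it has persisted for the matching threshold (successThreshold when the newly observed condition is True, failureThreshold otherwise), and an Unknown-to-False transition is passed through immediately. A standalone sketch of that decision follows; the function and parameter names (holdOldCondition, flipStart) are made up here, only the logic is taken from the patch.

    package main

    import (
        "fmt"
        "time"

        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    )

    // holdOldCondition reports whether the previously published Ready status should be
    // retained: a True<->not-True flip is ignored until it has lasted for the matching
    // threshold, mirroring what thresholdAdjustedReadyCondition does above.
    func holdOldCondition(observed, current metav1.ConditionStatus, flipStart time.Time,
        successThreshold, failureThreshold time.Duration) bool {
        threshold := failureThreshold
        if observed == metav1.ConditionTrue {
            threshold = successThreshold
        }
        flipped := (observed == metav1.ConditionTrue) != (current == metav1.ConditionTrue)
        return flipped && time.Now().Before(flipStart.Add(threshold))
    }

    func main() {
        // Recovered 10s ago with a 30s success threshold: keep reporting NotReady for now.
        fmt.Println(holdOldCondition(metav1.ConditionTrue, metav1.ConditionFalse,
            time.Now().Add(-10*time.Second), 30*time.Second, 30*time.Second)) // true
        // Unknown -> False is not a True/not-True flip, so the observed status is used at once.
        fmt.Println(holdOldCondition(metav1.ConditionFalse, metav1.ConditionUnknown,
            time.Now(), 30*time.Second, 30*time.Second)) // false
    }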
diff --git a/pkg/controllers/status/cluster_condition_cache_test.go b/pkg/controllers/status/cluster_condition_cache_test.go
index e374505af15c..72268d79c7d7 100644
--- a/pkg/controllers/status/cluster_condition_cache_test.go
+++ b/pkg/controllers/status/cluster_condition_cache_test.go
@@ -11,6 +11,7 @@ import (
 )
 
 func TestThresholdAdjustedReadyCondition(t *testing.T) {
+    clusterSuccessThreshold := 30 * time.Second
     clusterFailureThreshold := 30 * time.Second
 
     tests := []struct {
@@ -65,7 +66,7 @@ func TestThresholdAdjustedReadyCondition(t *testing.T) {
             },
         },
         {
-            name: "cluster becomes not ready but still not reach threshold",
+            name: "cluster becomes not ready but still not reach failure threshold",
             clusterData: &clusterData{
                 readyCondition:     metav1.ConditionFalse,
                 thresholdStartTime: time.Now().Add(-clusterFailureThreshold / 2),
@@ -84,7 +85,7 @@ func TestThresholdAdjustedReadyCondition(t *testing.T) {
             },
         },
         {
-            name: "cluster becomes not ready and reaches threshold",
+            name: "cluster becomes not ready and reaches failure threshold",
             clusterData: &clusterData{
                 readyCondition:     metav1.ConditionFalse,
                 thresholdStartTime: time.Now().Add(-clusterFailureThreshold),
@@ -122,10 +123,29 @@ func TestThresholdAdjustedReadyCondition(t *testing.T) {
             },
         },
         {
-            name: "cluster recovers",
+            name: "cluster recovers but still not reach success threshold",
             clusterData: &clusterData{
-                readyCondition:     metav1.ConditionFalse,
-                thresholdStartTime: time.Now().Add(-3 * clusterFailureThreshold),
+                readyCondition:     metav1.ConditionTrue,
+                thresholdStartTime: time.Now().Add(-clusterSuccessThreshold / 2),
+            },
+            currentCondition: &metav1.Condition{
+                Type:   clusterv1alpha1.ClusterConditionReady,
+                Status: metav1.ConditionFalse,
+            },
+            observedCondition: &metav1.Condition{
+                Type:   clusterv1alpha1.ClusterConditionReady,
+                Status: metav1.ConditionTrue,
+            },
+            expectedCondition: &metav1.Condition{
+                Type:   clusterv1alpha1.ClusterConditionReady,
+                Status: metav1.ConditionFalse,
+            },
+        },
+        {
+            name: "cluster recovers and reaches success threshold",
+            clusterData: &clusterData{
+                readyCondition:     metav1.ConditionTrue,
+                thresholdStartTime: time.Now().Add(-clusterSuccessThreshold),
             },
             currentCondition: &metav1.Condition{
                 Type:   clusterv1alpha1.ClusterConditionReady,
@@ -145,6 +165,7 @@ func TestThresholdAdjustedReadyCondition(t *testing.T) {
     for _, tt := range tests {
         t.Run(tt.name, func(t *testing.T) {
             cache := clusterConditionStore{
+                successThreshold: clusterSuccessThreshold,
                 failureThreshold: clusterFailureThreshold,
             }
 
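The new table entries cover both directions of the flip (hold NotReady during the success window, then report Ready once it elapses). If the unknown-to-false path called out in the code comment ever needs explicit coverage, one more entry in the same shape as the cases above would do it. The entry below is only a suggestion, not part of this patch:

    {
        name: "cluster ready condition changes from unknown to not ready",
        clusterData: &clusterData{
            readyCondition:     metav1.ConditionUnknown,
            thresholdStartTime: time.Now(),
        },
        currentCondition: &metav1.Condition{
            Type:   clusterv1alpha1.ClusterConditionReady,
            Status: metav1.ConditionUnknown,
        },
        observedCondition: &metav1.Condition{
            Type:   clusterv1alpha1.ClusterConditionReady,
            Status: metav1.ConditionFalse,
        },
        expectedCondition: &metav1.Condition{
            Type:   clusterv1alpha1.ClusterConditionReady,
            Status: metav1.ConditionFalse,
        },
    },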
diff --git a/pkg/controllers/status/cluster_status_controller.go b/pkg/controllers/status/cluster_status_controller.go
index 556a6baf32fb..196702f6f9bf 100644
--- a/pkg/controllers/status/cluster_status_controller.go
+++ b/pkg/controllers/status/cluster_status_controller.go
@@ -77,6 +77,8 @@ type ClusterStatusController struct {
     ClusterLeaseRenewIntervalFraction float64
     // ClusterLeaseControllers store clusters and their corresponding lease controllers.
     ClusterLeaseControllers sync.Map
+    // ClusterSuccessThreshold is the duration of successes for the cluster to be considered healthy after recovery.
+    ClusterSuccessThreshold metav1.Duration
     // ClusterFailureThreshold is the duration of failure for the cluster to be considered unhealthy.
     ClusterFailureThreshold metav1.Duration
     // clusterConditionCache stores the condition status of each cluster.
@@ -117,6 +119,7 @@ func (c *ClusterStatusController) Reconcile(ctx context.Context, req controllerr
 // SetupWithManager creates a controller and register to controller manager.
 func (c *ClusterStatusController) SetupWithManager(mgr controllerruntime.Manager) error {
     c.clusterConditionCache = clusterConditionStore{
+        successThreshold: c.ClusterSuccessThreshold.Duration,
         failureThreshold: c.ClusterFailureThreshold.Duration,
     }
     return controllerruntime.NewControllerManagedBy(mgr).For(&clusterv1alpha1.Cluster{}).WithEventFilter(c.PredicateFunc).WithOptions(controller.Options{
@@ -149,7 +152,7 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu
     }
 
     // skip collecting cluster status if not ready
-    if online && healthy {
+    if online && healthy && readyCondition.Status == metav1.ConditionTrue {
         // get or create informer for pods and nodes in member cluster
         clusterInformerManager, err := c.buildInformerForCluster(cluster)
         if err != nil {
@@ -433,7 +436,7 @@ func getNodeSummary(nodes []*corev1.Node) *clusterv1alpha1.NodeSummary {
         }
     }
 
-    var nodeSummary = &clusterv1alpha1.NodeSummary{}
+    nodeSummary := &clusterv1alpha1.NodeSummary{}
     nodeSummary.TotalNum = int32(totalNum)
     nodeSummary.ReadyNum = int32(readyNum)
 
@@ -445,7 +448,7 @@ func getResourceSummary(nodes []*corev1.Node, pods []*corev1.Pod) *clusterv1alph
     allocating := getAllocatingResource(pods)
     allocated := getAllocatedResource(pods)
 
-    var resourceSummary = &clusterv1alpha1.ResourceSummary{}
+    resourceSummary := &clusterv1alpha1.ResourceSummary{}
     resourceSummary.Allocatable = allocatable
     resourceSummary.Allocating = allocating
     resourceSummary.Allocated = allocated
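One practical effect of the syncClusterStatus change: detailed status collection (informers, node and pod summaries) now also requires the Ready condition returned by the condition cache to be True, not just a successful online/health probe, so a cluster that has only just recovered is left alone until its condition actually reports Ready. A trivial sketch of the new gate, with a made-up helper name (the patch inlines this expression directly in syncClusterStatus):

    package main

    import (
        "fmt"

        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    )

    // shouldCollectDetailedStatus mirrors the condition added to syncClusterStatus.
    func shouldCollectDetailedStatus(online, healthy bool, ready metav1.ConditionStatus) bool {
        return online && healthy && ready == metav1.ConditionTrue
    }

    func main() {
        // Probes succeed, but the threshold-adjusted Ready condition is still False
        // inside the success-threshold window, so collection is skipped for now.
        fmt.Println(shouldCollectDetailedStatus(true, true, metav1.ConditionFalse)) // false
    }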