
add cluster success threshold
Signed-off-by: dddddai <dddwq@foxmail.com>
dddddai committed Jun 16, 2022
1 parent 801d187 commit 1ebc680
Showing 8 changed files with 59 additions and 11 deletions.
2 changes: 2 additions & 0 deletions cmd/agent/app/agent.go
@@ -185,6 +185,7 @@ func setupControllers(mgr controllerruntime.Manager, opts *options.Options, stop
ClusterStatusUpdateFrequency: opts.ClusterStatusUpdateFrequency,
ClusterLeaseDuration: opts.ClusterLeaseDuration,
ClusterLeaseRenewIntervalFraction: opts.ClusterLeaseRenewIntervalFraction,
ClusterSuccessThreshold: opts.ClusterSuccessThreshold,
ClusterFailureThreshold: opts.ClusterFailureThreshold,
ClusterCacheSyncTimeout: opts.ClusterCacheSyncTimeout,
ClusterAPIQPS: opts.ClusterAPIQPS,
@@ -223,6 +224,7 @@ func startClusterStatusController(ctx controllerscontext.Context) (bool, error)
ClusterStatusUpdateFrequency: ctx.Opts.ClusterStatusUpdateFrequency,
ClusterLeaseDuration: ctx.Opts.ClusterLeaseDuration,
ClusterLeaseRenewIntervalFraction: ctx.Opts.ClusterLeaseRenewIntervalFraction,
ClusterSuccessThreshold: ctx.Opts.ClusterSuccessThreshold,
ClusterFailureThreshold: ctx.Opts.ClusterFailureThreshold,
ClusterCacheSyncTimeout: ctx.Opts.ClusterCacheSyncTimeout,
RateLimiterOptions: ctx.Opts.RateLimiterOptions,
3 changes: 3 additions & 0 deletions cmd/agent/app/options/options.go
@@ -38,6 +38,8 @@ type Options struct {
// ClusterLeaseRenewIntervalFraction is a fraction, coordinated with ClusterLeaseDuration, that
// determines how often the current holder of a lease should renew it.
ClusterLeaseRenewIntervalFraction float64
// ClusterSuccessThreshold is the duration of successes for the cluster to be considered healthy after recovery.
ClusterSuccessThreshold metav1.Duration
// ClusterFailureThreshold is the duration of failure for the cluster to be considered unhealthy.
ClusterFailureThreshold metav1.Duration
// ClusterAPIQPS is the QPS to use while talking with cluster kube-apiserver.
@@ -100,6 +102,7 @@ func (o *Options) AddFlags(fs *pflag.FlagSet, allControllers []string) {
"Specifies the expiration period of a cluster lease.")
fs.Float64Var(&o.ClusterLeaseRenewIntervalFraction, "cluster-lease-renew-interval-fraction", 0.25,
"Specifies the cluster lease renew interval fraction.")
fs.DurationVar(&o.ClusterSuccessThreshold.Duration, "cluster-success-threshold", 30*time.Second, "The duration of successes for the cluster to be considered healthy after recovery.")
fs.DurationVar(&o.ClusterFailureThreshold.Duration, "cluster-failure-threshold", 30*time.Second, "The duration of failure for the cluster to be considered unhealthy.")
fs.Float32Var(&o.ClusterAPIQPS, "cluster-api-qps", 40.0, "QPS to use while talking with cluster kube-apiserver. Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags.")
fs.IntVar(&o.ClusterAPIBurst, "cluster-api-burst", 60, "Burst to use while talking with cluster kube-apiserver. Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags.")
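For reference, a minimal standalone sketch of how a duration flag like the one registered above behaves; the "demo" flag set and the parsed value "1m" are invented for illustration, and the real options store the result in a metav1.Duration rather than a bare time.Duration.

package main

import (
	"fmt"
	"time"

	"github.com/spf13/pflag"
)

func main() {
	// Standalone flag set; the real flag is registered on the karmada-agent
	// and karmada-controller-manager option sets instead.
	fs := pflag.NewFlagSet("demo", pflag.ContinueOnError)

	var successThreshold time.Duration
	fs.DurationVar(&successThreshold, "cluster-success-threshold", 30*time.Second,
		"The duration of successes for the cluster to be considered healthy after recovery.")

	// pflag accepts Go duration strings such as "30s" or "1m".
	if err := fs.Parse([]string{"--cluster-success-threshold=1m"}); err != nil {
		panic(err)
	}
	fmt.Println(successThreshold) // prints "1m0s"
}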
2 changes: 2 additions & 0 deletions cmd/controller-manager/app/controllermanager.go
@@ -233,6 +233,7 @@ func startClusterStatusController(ctx controllerscontext.Context) (enabled bool,
ClusterStatusUpdateFrequency: opts.ClusterStatusUpdateFrequency,
ClusterLeaseDuration: opts.ClusterLeaseDuration,
ClusterLeaseRenewIntervalFraction: opts.ClusterLeaseRenewIntervalFraction,
ClusterSuccessThreshold: opts.ClusterSuccessThreshold,
ClusterFailureThreshold: opts.ClusterFailureThreshold,
ClusterCacheSyncTimeout: opts.ClusterCacheSyncTimeout,
RateLimiterOptions: ctx.Opts.RateLimiterOptions,
@@ -493,6 +494,7 @@ func setupControllers(mgr controllerruntime.Manager, opts *options.Options, stop
FailoverEvictionTimeout: opts.FailoverEvictionTimeout,
ClusterLeaseDuration: opts.ClusterLeaseDuration,
ClusterLeaseRenewIntervalFraction: opts.ClusterLeaseRenewIntervalFraction,
ClusterSuccessThreshold: opts.ClusterSuccessThreshold,
ClusterFailureThreshold: opts.ClusterFailureThreshold,
ClusterCacheSyncTimeout: opts.ClusterCacheSyncTimeout,
ClusterAPIQPS: opts.ClusterAPIQPS,
3 changes: 3 additions & 0 deletions cmd/controller-manager/app/options/options.go
@@ -46,6 +46,8 @@ type Options struct {
// ClusterLeaseRenewIntervalFraction is a fraction, coordinated with ClusterLeaseDuration, that
// determines how often the current holder of a lease should renew it.
ClusterLeaseRenewIntervalFraction float64
// ClusterSuccessThreshold is the duration of successes for the cluster to be considered healthy after recovery.
ClusterSuccessThreshold metav1.Duration
// ClusterFailureThreshold is the duration of failure for the cluster to be considered unhealthy.
ClusterFailureThreshold metav1.Duration
// ClusterMonitorPeriod represents cluster-controller monitoring period, i.e. how often does
@@ -136,6 +138,7 @@ func (o *Options) AddFlags(flags *pflag.FlagSet, allControllers, disabledByDefau
"Specifies the expiration period of a cluster lease.")
flags.Float64Var(&o.ClusterLeaseRenewIntervalFraction, "cluster-lease-renew-interval-fraction", 0.25,
"Specifies the cluster lease renew interval fraction.")
flags.DurationVar(&o.ClusterSuccessThreshold.Duration, "cluster-success-threshold", 30*time.Second, "The duration of successes for the cluster to be considered healthy after recovery.")
flags.DurationVar(&o.ClusterFailureThreshold.Duration, "cluster-failure-threshold", 30*time.Second, "The duration of failure for the cluster to be considered unhealthy.")
flags.DurationVar(&o.ClusterMonitorPeriod.Duration, "cluster-monitor-period", 5*time.Second,
"Specifies how often karmada-controller-manager monitors cluster health status.")
2 changes: 2 additions & 0 deletions pkg/controllers/context/context.go
@@ -40,6 +40,8 @@ type Options struct {
// ClusterLeaseRenewIntervalFraction is a fraction, coordinated with ClusterLeaseDuration, that
// determines how often the current holder of a lease should renew it.
ClusterLeaseRenewIntervalFraction float64
// ClusterSuccessThreshold is the duration of successes for the cluster to be considered healthy after recovery.
ClusterSuccessThreshold metav1.Duration
// ClusterFailureThreshold is the duration of failure for the cluster to be considered unhealthy.
ClusterFailureThreshold metav1.Duration
// ClusterCacheSyncTimeout is the timeout period waiting for cluster cache to sync.
18 changes: 15 additions & 3 deletions pkg/controllers/status/cluster_condition_cache.go
@@ -19,6 +19,8 @@ type clusterData struct {

type clusterConditionStore struct {
clusterDataMap sync.Map
// successThreshold is the duration of successes for the cluster to be considered healthy after recovery.
successThreshold time.Duration
// failureThreshold is the duration of failure for the cluster to be considered unhealthy.
failureThreshold time.Duration
}
@@ -46,9 +48,19 @@ func (c *clusterConditionStore) thresholdAdjustedReadyCondition(cluster *cluster
}
c.update(cluster.Name, saved)
}
if observedReadyCondition.Status != metav1.ConditionTrue &&
curReadyCondition.Status == metav1.ConditionTrue &&
now.Before(saved.thresholdStartTime.Add(c.failureThreshold)) {

var threshold time.Duration
if observedReadyCondition.Status == metav1.ConditionTrue {
threshold = c.successThreshold
} else {
threshold = c.failureThreshold
}

// we only care about true/not true
// for unknown->false, just return the observed ready condition
if ((observedReadyCondition.Status == metav1.ConditionTrue && curReadyCondition.Status != metav1.ConditionTrue) ||
(observedReadyCondition.Status != metav1.ConditionTrue && curReadyCondition.Status == metav1.ConditionTrue)) &&
now.Before(saved.thresholdStartTime.Add(threshold)) {
// retain the old status until the threshold is exceeded, to avoid flapping caused by an unstable network.
return curReadyCondition
}
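To make the damping logic above easier to follow, here is a dependency-free sketch of the same idea, assuming plain strings in place of metav1.ConditionStatus and state for a single cluster instead of a sync.Map; the names conditionCache and adjust are invented for the sketch and do not exist in the repository.

package main

import (
	"fmt"
	"time"
)

// conditionCache is a single-cluster stand-in for clusterConditionStore.
type conditionCache struct {
	successThreshold time.Duration // how long "ready" must persist before it is reported
	failureThreshold time.Duration // how long "not ready" must persist before it is reported
	observedStatus   string        // latest probed status ("True", "False", "Unknown")
	thresholdStart   time.Time     // when observedStatus last changed
}

// adjust returns the status that should be reported, given the currently
// reported status and the freshly observed one.
func (c *conditionCache) adjust(observed, current string, now time.Time) string {
	if observed != c.observedStatus {
		// The probed status changed: restart the threshold timer.
		c.observedStatus = observed
		c.thresholdStart = now
	}

	// Recovery is damped by successThreshold, degradation by failureThreshold.
	threshold := c.failureThreshold
	if observed == "True" {
		threshold = c.successThreshold
	}

	// Only True <-> not-True transitions are damped; within the threshold
	// window the previously reported status is retained.
	flipped := (observed == "True") != (current == "True")
	if flipped && now.Before(c.thresholdStart.Add(threshold)) {
		return current
	}
	return observed
}

func main() {
	c := &conditionCache{successThreshold: 30 * time.Second, failureThreshold: 30 * time.Second}
	start := time.Now()
	fmt.Println(c.adjust("True", "False", start))                     // "False": recovery not trusted yet
	fmt.Println(c.adjust("True", "False", start.Add(31*time.Second))) // "True": success threshold exceeded
}

The effect is that a flap shorter than the configured window never reaches the reported condition: degradation is held back by failureThreshold, and with this commit recovery is held back by successThreshold as well.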
31 changes: 26 additions & 5 deletions pkg/controllers/status/cluster_condition_cache_test.go
@@ -11,6 +11,7 @@ import (
)

func TestThresholdAdjustedReadyCondition(t *testing.T) {
clusterSuccessThreshold := 30 * time.Second
clusterFailureThreshold := 30 * time.Second

tests := []struct {
@@ -65,7 +66,7 @@ func TestThresholdAdjustedReadyCondition(t *testing.T) {
},
},
{
name: "cluster becomes not ready but still not reach threshold",
name: "cluster becomes not ready but still not reach failure threshold",
clusterData: &clusterData{
readyCondition: metav1.ConditionFalse,
thresholdStartTime: time.Now().Add(-clusterFailureThreshold / 2),
@@ -84,7 +85,7 @@ func TestThresholdAdjustedReadyCondition(t *testing.T) {
},
},
{
name: "cluster becomes not ready and reaches threshold",
name: "cluster becomes not ready and reaches failure threshold",
clusterData: &clusterData{
readyCondition: metav1.ConditionFalse,
thresholdStartTime: time.Now().Add(-clusterFailureThreshold),
@@ -122,10 +123,29 @@ func TestThresholdAdjustedReadyCondition(t *testing.T) {
},
},
{
name: "cluster recovers",
name: "cluster recovers but still not reach success threshold",
clusterData: &clusterData{
readyCondition: metav1.ConditionFalse,
thresholdStartTime: time.Now().Add(-3 * clusterFailureThreshold),
readyCondition: metav1.ConditionTrue,
thresholdStartTime: time.Now().Add(-clusterSuccessThreshold / 2),
},
currentCondition: &metav1.Condition{
Type: clusterv1alpha1.ClusterConditionReady,
Status: metav1.ConditionFalse,
},
observedCondition: &metav1.Condition{
Type: clusterv1alpha1.ClusterConditionReady,
Status: metav1.ConditionTrue,
},
expectedCondition: &metav1.Condition{
Type: clusterv1alpha1.ClusterConditionReady,
Status: metav1.ConditionFalse,
},
},
{
name: "cluster recovers and reaches success threshold",
clusterData: &clusterData{
readyCondition: metav1.ConditionTrue,
thresholdStartTime: time.Now().Add(-clusterSuccessThreshold),
},
currentCondition: &metav1.Condition{
Type: clusterv1alpha1.ClusterConditionReady,
@@ -145,6 +165,7 @@ func TestThresholdAdjustedReadyCondition(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
cache := clusterConditionStore{
successThreshold: clusterSuccessThreshold,
failureThreshold: clusterFailureThreshold,
}

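Continuing the earlier sketch (not the real clusterConditionStore), the two new recovery cases could be checked in the same table-driven style; TestAdjustSketch and its case names are invented here.

package main

import (
	"testing"
	"time"
)

// TestAdjustSketch mirrors the "recovers but still not reach success threshold"
// and "recovers and reaches success threshold" cases above, but exercises the
// illustrative conditionCache sketch rather than the real clusterConditionStore.
func TestAdjustSketch(t *testing.T) {
	tests := []struct {
		name     string
		elapsed  time.Duration
		expected string
	}{
		{name: "recovery within success threshold is held back", elapsed: 15 * time.Second, expected: "False"},
		{name: "recovery past success threshold is reported", elapsed: 30 * time.Second, expected: "True"},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			c := &conditionCache{successThreshold: 30 * time.Second, failureThreshold: 30 * time.Second}
			start := time.Now()
			c.adjust("True", "False", start) // prime the cache: recovery first observed at "start"
			if got := c.adjust("True", "False", start.Add(tt.elapsed)); got != tt.expected {
				t.Errorf("got %q, want %q", got, tt.expected)
			}
		})
	}
}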
9 changes: 6 additions & 3 deletions pkg/controllers/status/cluster_status_controller.go
@@ -77,6 +77,8 @@ type ClusterStatusController struct {
ClusterLeaseRenewIntervalFraction float64
// ClusterLeaseControllers store clusters and their corresponding lease controllers.
ClusterLeaseControllers sync.Map
// ClusterSuccessThreshold is the duration of successes for the cluster to be considered healthy after recovery.
ClusterSuccessThreshold metav1.Duration
// ClusterFailureThreshold is the duration of failure for the cluster to be considered unhealthy.
ClusterFailureThreshold metav1.Duration
// clusterConditionCache stores the condition status of each cluster.
@@ -117,6 +119,7 @@ func (c *ClusterStatusController) Reconcile(ctx context.Context, req controllerr
// SetupWithManager creates a controller and register to controller manager.
func (c *ClusterStatusController) SetupWithManager(mgr controllerruntime.Manager) error {
c.clusterConditionCache = clusterConditionStore{
successThreshold: c.ClusterSuccessThreshold.Duration,
failureThreshold: c.ClusterFailureThreshold.Duration,
}
return controllerruntime.NewControllerManagedBy(mgr).For(&clusterv1alpha1.Cluster{}).WithEventFilter(c.PredicateFunc).WithOptions(controller.Options{
@@ -149,7 +152,7 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu
}

// skip collecting cluster status if not ready
if online && healthy {
if online && healthy && readyCondition.Status == metav1.ConditionTrue {
// get or create informer for pods and nodes in member cluster
clusterInformerManager, err := c.buildInformerForCluster(cluster)
if err != nil {
@@ -433,7 +436,7 @@ func getNodeSummary(nodes []*corev1.Node) *clusterv1alpha1.NodeSummary {
}
}

var nodeSummary = &clusterv1alpha1.NodeSummary{}
nodeSummary := &clusterv1alpha1.NodeSummary{}
nodeSummary.TotalNum = int32(totalNum)
nodeSummary.ReadyNum = int32(readyNum)

@@ -445,7 +448,7 @@ func getResourceSummary(nodes []*corev1.Node, pods []*corev1.Pod) *clusterv1alph
allocating := getAllocatingResource(pods)
allocated := getAllocatedResource(pods)

var resourceSummary = &clusterv1alpha1.ResourceSummary{}
resourceSummary := &clusterv1alpha1.ResourceSummary{}
resourceSummary.Allocatable = allocatable
resourceSummary.Allocating = allocating
resourceSummary.Allocated = allocated
