Add cluster success threshold #1884

Merged: 1 commit, Jun 17, 2022

2 changes: 2 additions & 0 deletions cmd/agent/app/agent.go
@@ -185,6 +185,7 @@ func setupControllers(mgr controllerruntime.Manager, opts *options.Options, stop
ClusterStatusUpdateFrequency: opts.ClusterStatusUpdateFrequency,
ClusterLeaseDuration: opts.ClusterLeaseDuration,
ClusterLeaseRenewIntervalFraction: opts.ClusterLeaseRenewIntervalFraction,
ClusterSuccessThreshold: opts.ClusterSuccessThreshold,
ClusterFailureThreshold: opts.ClusterFailureThreshold,
ClusterCacheSyncTimeout: opts.ClusterCacheSyncTimeout,
ClusterAPIQPS: opts.ClusterAPIQPS,
@@ -223,6 +224,7 @@ func startClusterStatusController(ctx controllerscontext.Context) (bool, error)
ClusterStatusUpdateFrequency: ctx.Opts.ClusterStatusUpdateFrequency,
ClusterLeaseDuration: ctx.Opts.ClusterLeaseDuration,
ClusterLeaseRenewIntervalFraction: ctx.Opts.ClusterLeaseRenewIntervalFraction,
ClusterSuccessThreshold: ctx.Opts.ClusterSuccessThreshold,
ClusterFailureThreshold: ctx.Opts.ClusterFailureThreshold,
ClusterCacheSyncTimeout: ctx.Opts.ClusterCacheSyncTimeout,
RateLimiterOptions: ctx.Opts.RateLimiterOptions,
3 changes: 3 additions & 0 deletions cmd/agent/app/options/options.go
@@ -38,6 +38,8 @@ type Options struct {
// ClusterLeaseRenewIntervalFraction is a fraction coordinated with ClusterLeaseDuration that
// how long the current holder of a lease has last updated the lease.
ClusterLeaseRenewIntervalFraction float64
// ClusterSuccessThreshold is the duration of successes for the cluster to be considered healthy after recovery.
ClusterSuccessThreshold metav1.Duration
// ClusterFailureThreshold is the duration of failure for the cluster to be considered unhealthy.
ClusterFailureThreshold metav1.Duration
// ClusterAPIQPS is the QPS to use while talking with cluster kube-apiserver.
@@ -100,6 +102,7 @@ func (o *Options) AddFlags(fs *pflag.FlagSet, allControllers []string) {
"Specifies the expiration period of a cluster lease.")
fs.Float64Var(&o.ClusterLeaseRenewIntervalFraction, "cluster-lease-renew-interval-fraction", 0.25,
"Specifies the cluster lease renew interval fraction.")
fs.DurationVar(&o.ClusterSuccessThreshold.Duration, "cluster-success-threshold", 30*time.Second, "The duration of successes for the cluster to be considered healthy after recovery.")
fs.DurationVar(&o.ClusterFailureThreshold.Duration, "cluster-failure-threshold", 30*time.Second, "The duration of failure for the cluster to be considered unhealthy.")
fs.Float32Var(&o.ClusterAPIQPS, "cluster-api-qps", 40.0, "QPS to use while talking with cluster kube-apiserver. Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags.")
fs.IntVar(&o.ClusterAPIBurst, "cluster-api-burst", 60, "Burst to use while talking with cluster kube-apiserver. Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags.")
2 changes: 2 additions & 0 deletions cmd/controller-manager/app/controllermanager.go
@@ -233,6 +233,7 @@ func startClusterStatusController(ctx controllerscontext.Context) (enabled bool,
ClusterStatusUpdateFrequency: opts.ClusterStatusUpdateFrequency,
ClusterLeaseDuration: opts.ClusterLeaseDuration,
ClusterLeaseRenewIntervalFraction: opts.ClusterLeaseRenewIntervalFraction,
ClusterSuccessThreshold: opts.ClusterSuccessThreshold,
ClusterFailureThreshold: opts.ClusterFailureThreshold,
ClusterCacheSyncTimeout: opts.ClusterCacheSyncTimeout,
RateLimiterOptions: ctx.Opts.RateLimiterOptions,
@@ -493,6 +494,7 @@ func setupControllers(mgr controllerruntime.Manager, opts *options.Options, stop
FailoverEvictionTimeout: opts.FailoverEvictionTimeout,
ClusterLeaseDuration: opts.ClusterLeaseDuration,
ClusterLeaseRenewIntervalFraction: opts.ClusterLeaseRenewIntervalFraction,
ClusterSuccessThreshold: opts.ClusterSuccessThreshold,
ClusterFailureThreshold: opts.ClusterFailureThreshold,
ClusterCacheSyncTimeout: opts.ClusterCacheSyncTimeout,
ClusterAPIQPS: opts.ClusterAPIQPS,
3 changes: 3 additions & 0 deletions cmd/controller-manager/app/options/options.go
@@ -46,6 +46,8 @@ type Options struct {
// ClusterLeaseRenewIntervalFraction is a fraction coordinated with ClusterLeaseDuration that
// how long the current holder of a lease has last updated the lease.
ClusterLeaseRenewIntervalFraction float64
// ClusterSuccessThreshold is the duration of successes for the cluster to be considered healthy after recovery.
ClusterSuccessThreshold metav1.Duration
// ClusterFailureThreshold is the duration of failure for the cluster to be considered unhealthy.
ClusterFailureThreshold metav1.Duration
// ClusterMonitorPeriod represents cluster-controller monitoring period, i.e. how often does
@@ -136,6 +138,7 @@ func (o *Options) AddFlags(flags *pflag.FlagSet, allControllers, disabledByDefau
"Specifies the expiration period of a cluster lease.")
flags.Float64Var(&o.ClusterLeaseRenewIntervalFraction, "cluster-lease-renew-interval-fraction", 0.25,
"Specifies the cluster lease renew interval fraction.")
flags.DurationVar(&o.ClusterSuccessThreshold.Duration, "cluster-success-threshold", 30*time.Second, "The duration of successes for the cluster to be considered healthy after recovery.")
flags.DurationVar(&o.ClusterFailureThreshold.Duration, "cluster-failure-threshold", 30*time.Second, "The duration of failure for the cluster to be considered unhealthy.")
flags.DurationVar(&o.ClusterMonitorPeriod.Duration, "cluster-monitor-period", 5*time.Second,
"Specifies how often karmada-controller-manager monitors cluster health status.")
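Both karmada-agent and karmada-controller-manager register the new `--cluster-success-threshold` flag next to the existing `--cluster-failure-threshold`, with the same 30s default. The standalone sketch below is not part of this change; it only illustrates how a `metav1.Duration` field binds to a pflag duration flag the way the two options files above do, and how a value such as `--cluster-success-threshold=1m` would parse. The program name "thresholds-demo" and the hard-coded argument list are made up for the example.

```go
package main

import (
	"fmt"
	"time"

	"github.com/spf13/pflag"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// options mirrors the two threshold fields added in this change.
type options struct {
	ClusterSuccessThreshold metav1.Duration
	ClusterFailureThreshold metav1.Duration
}

func main() {
	o := &options{}
	fs := pflag.NewFlagSet("thresholds-demo", pflag.ExitOnError)

	// metav1.Duration embeds time.Duration, so its address can be handed
	// straight to DurationVar, exactly as the options files above do.
	fs.DurationVar(&o.ClusterSuccessThreshold.Duration, "cluster-success-threshold", 30*time.Second,
		"The duration of successes for the cluster to be considered healthy after recovery.")
	fs.DurationVar(&o.ClusterFailureThreshold.Duration, "cluster-failure-threshold", 30*time.Second,
		"The duration of failure for the cluster to be considered unhealthy.")

	// Illustrative arguments; in the real components these come from the CLI.
	_ = fs.Parse([]string{"--cluster-success-threshold=1m"})

	fmt.Println(o.ClusterSuccessThreshold.Duration) // 1m0s
	fmt.Println(o.ClusterFailureThreshold.Duration) // 30s (default kept)
}
```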
2 changes: 2 additions & 0 deletions pkg/controllers/context/context.go
@@ -40,6 +40,8 @@ type Options struct {
// ClusterLeaseRenewIntervalFraction is a fraction coordinated with ClusterLeaseDuration that
// how long the current holder of a lease has last updated the lease.
ClusterLeaseRenewIntervalFraction float64
// ClusterSuccessThreshold is the duration of successes for the cluster to be considered healthy after recovery.
ClusterSuccessThreshold metav1.Duration
// ClusterFailureThreshold is the duration of failure for the cluster to be considered unhealthy.
ClusterFailureThreshold metav1.Duration
// ClusterCacheSyncTimeout is the timeout period waiting for cluster cache to sync.
18 changes: 15 additions & 3 deletions pkg/controllers/status/cluster_condition_cache.go
@@ -19,6 +19,8 @@ type clusterData struct {

type clusterConditionStore struct {
clusterDataMap sync.Map
// successThreshold is the duration of successes for the cluster to be considered healthy after recovery.
successThreshold time.Duration
// failureThreshold is the duration of failure for the cluster to be considered unhealthy.
failureThreshold time.Duration
}
@@ -46,9 +48,19 @@ func (c *clusterConditionStore) thresholdAdjustedReadyCondition(cluster *cluster
}
c.update(cluster.Name, saved)
}
- if observedReadyCondition.Status != metav1.ConditionTrue &&
- curReadyCondition.Status == metav1.ConditionTrue &&
- now.Before(saved.thresholdStartTime.Add(c.failureThreshold)) {
+
+ var threshold time.Duration
+ if observedReadyCondition.Status == metav1.ConditionTrue {
+ threshold = c.successThreshold
+ } else {
+ threshold = c.failureThreshold
+ }
+
+ // we only care about true/not true
+ // for unknown->false, just return the observed ready condition
+ if ((observedReadyCondition.Status == metav1.ConditionTrue && curReadyCondition.Status != metav1.ConditionTrue) ||
+ (observedReadyCondition.Status != metav1.ConditionTrue && curReadyCondition.Status == metav1.ConditionTrue)) &&
+ now.Before(saved.thresholdStartTime.Add(threshold)) {
// retain old status until threshold exceeded to avoid network unstable problems.
return curReadyCondition
}
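For readers skimming the hunk above: the old code only damped the True-to-not-True direction against failureThreshold, while the new code picks successThreshold when the observed condition is True and failureThreshold otherwise, then holds the previously published status for a flip in either direction until the chosen threshold has elapsed (Unknown-to-False passes straight through). The self-contained sketch below restates that decision; `shouldHoldCondition` and its signature are invented for illustration and are not the actual `thresholdAdjustedReadyCondition` method.

```go
package main

import (
	"fmt"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// shouldHoldCondition reports whether the previously published ready status
// should be retained instead of the freshly observed one. The published status
// is held while the observed flip (True <-> not True) is younger than the
// threshold that applies to it: successThreshold when the cluster looks ready
// again, failureThreshold when it looks not ready.
func shouldHoldCondition(observed, published metav1.ConditionStatus,
	flipStart, now time.Time, successThreshold, failureThreshold time.Duration) bool {

	observedReady := observed == metav1.ConditionTrue
	publishedReady := published == metav1.ConditionTrue
	if observedReady == publishedReady {
		return false // no True <-> not-True transition, nothing to damp
	}

	threshold := failureThreshold
	if observedReady {
		threshold = successThreshold
	}
	return now.Before(flipStart.Add(threshold))
}

func main() {
	start := time.Now()
	// A cluster that just recovered: keep reporting False for the first 30s.
	fmt.Println(shouldHoldCondition(metav1.ConditionTrue, metav1.ConditionFalse,
		start, start.Add(10*time.Second), 30*time.Second, 30*time.Second)) // true: hold False
	fmt.Println(shouldHoldCondition(metav1.ConditionTrue, metav1.ConditionFalse,
		start, start.Add(40*time.Second), 30*time.Second, 30*time.Second)) // false: report True
}
```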
31 changes: 26 additions & 5 deletions pkg/controllers/status/cluster_condition_cache_test.go
@@ -11,6 +11,7 @@ import (
)

func TestThresholdAdjustedReadyCondition(t *testing.T) {
clusterSuccessThreshold := 30 * time.Second
clusterFailureThreshold := 30 * time.Second

tests := []struct {
@@ -65,7 +66,7 @@ func TestThresholdAdjustedReadyCondition(t *testing.T) {
},
},
{
name: "cluster becomes not ready but still not reach threshold",
name: "cluster becomes not ready but still not reach failure threshold",
clusterData: &clusterData{
readyCondition: metav1.ConditionFalse,
thresholdStartTime: time.Now().Add(-clusterFailureThreshold / 2),
@@ -84,7 +85,7 @@ func TestThresholdAdjustedReadyCondition(t *testing.T) {
},
},
{
name: "cluster becomes not ready and reaches threshold",
name: "cluster becomes not ready and reaches failure threshold",
clusterData: &clusterData{
readyCondition: metav1.ConditionFalse,
thresholdStartTime: time.Now().Add(-clusterFailureThreshold),
@@ -122,10 +123,29 @@ func TestThresholdAdjustedReadyCondition(t *testing.T) {
},
},
{
name: "cluster recovers",
name: "cluster recovers but still not reach success threshold",
clusterData: &clusterData{
readyCondition: metav1.ConditionFalse,
thresholdStartTime: time.Now().Add(-3 * clusterFailureThreshold),
readyCondition: metav1.ConditionTrue,
thresholdStartTime: time.Now().Add(-clusterSuccessThreshold / 2),
},
currentCondition: &metav1.Condition{
Type: clusterv1alpha1.ClusterConditionReady,
Status: metav1.ConditionFalse,
},
observedCondition: &metav1.Condition{
Type: clusterv1alpha1.ClusterConditionReady,
Status: metav1.ConditionTrue,
},
expectedCondition: &metav1.Condition{
Type: clusterv1alpha1.ClusterConditionReady,
Status: metav1.ConditionFalse,
},
},
{
name: "cluster recovers and reaches success threshold",
clusterData: &clusterData{
readyCondition: metav1.ConditionTrue,
thresholdStartTime: time.Now().Add(-clusterSuccessThreshold),
},
currentCondition: &metav1.Condition{
Type: clusterv1alpha1.ClusterConditionReady,
@@ -145,6 +165,7 @@ func TestThresholdAdjustedReadyCondition(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
cache := clusterConditionStore{
successThreshold: clusterSuccessThreshold,
failureThreshold: clusterFailureThreshold,
}

9 changes: 6 additions & 3 deletions pkg/controllers/status/cluster_status_controller.go
@@ -77,6 +77,8 @@ type ClusterStatusController struct {
ClusterLeaseRenewIntervalFraction float64
// ClusterLeaseControllers store clusters and their corresponding lease controllers.
ClusterLeaseControllers sync.Map
// ClusterSuccessThreshold is the duration of successes for the cluster to be considered healthy after recovery.
ClusterSuccessThreshold metav1.Duration
// ClusterFailureThreshold is the duration of failure for the cluster to be considered unhealthy.
ClusterFailureThreshold metav1.Duration
// clusterConditionCache stores the condition status of each cluster.
@@ -117,6 +119,7 @@ func (c *ClusterStatusController) Reconcile(ctx context.Context, req controllerr
// SetupWithManager creates a controller and register to controller manager.
func (c *ClusterStatusController) SetupWithManager(mgr controllerruntime.Manager) error {
c.clusterConditionCache = clusterConditionStore{
successThreshold: c.ClusterSuccessThreshold.Duration,
failureThreshold: c.ClusterFailureThreshold.Duration,
}
return controllerruntime.NewControllerManagedBy(mgr).For(&clusterv1alpha1.Cluster{}).WithEventFilter(c.PredicateFunc).WithOptions(controller.Options{
@@ -149,7 +152,7 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu
}

// skip collecting cluster status if not ready
if online && healthy {
if online && healthy && readyCondition.Status == metav1.ConditionTrue {
// get or create informer for pods and nodes in member cluster
clusterInformerManager, err := c.buildInformerForCluster(cluster)
if err != nil {
@@ -433,7 +436,7 @@ func getNodeSummary(nodes []*corev1.Node) *clusterv1alpha1.NodeSummary {
}
}

- var nodeSummary = &clusterv1alpha1.NodeSummary{}
+ nodeSummary := &clusterv1alpha1.NodeSummary{}
nodeSummary.TotalNum = int32(totalNum)
nodeSummary.ReadyNum = int32(readyNum)

@@ -445,7 +448,7 @@ func getResourceSummary(nodes []*corev1.Node, pods []*corev1.Pod) *clusterv1alph
allocating := getAllocatingResource(pods)
allocated := getAllocatedResource(pods)

- var resourceSummary = &clusterv1alpha1.ResourceSummary{}
+ resourceSummary := &clusterv1alpha1.ResourceSummary{}
resourceSummary.Allocatable = allocatable
resourceSummary.Allocating = allocating
resourceSummary.Allocated = allocated
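The syncClusterStatus hunk above also tightens the guard on status collection: informers are only built and node, pod, and resource summaries only gathered once the cluster is online, passes the health probe, and its threshold-adjusted Ready condition is True. The helper below is a hedged restatement of that guard; `shouldCollectStatus` does not exist in the real controller and the nil check is added defensively for this sketch only.

```go
package status

import metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

// shouldCollectStatus spells out the post-change condition under which the
// controller proceeds to collect member-cluster status.
func shouldCollectStatus(online, healthy bool, readyCondition *metav1.Condition) bool {
	return online && healthy &&
		readyCondition != nil && // defensive for this sketch
		readyCondition.Status == metav1.ConditionTrue
}
```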