Commit

validate cluster n times in rolling update
zetaab committed Apr 8, 2020
1 parent ef10d54 commit 7e35133
Showing 4 changed files with 31 additions and 26 deletions.
10 changes: 8 additions & 2 deletions cmd/kops/rollingupdatecluster.go
@@ -121,6 +121,9 @@ type RollingUpdateOptions struct {
// ValidationTimeout is the timeout for validation to succeed after the drain and pause
ValidationTimeout time.Duration

// ValidateTimes is the number of times the cluster needs to validate after a single node update
ValidateTimes int32

// MasterInterval is the minimum time to wait after stopping a master node. This does not include drain and validate time.
MasterInterval time.Duration

@@ -158,6 +161,7 @@ func (o *RollingUpdateOptions) InitDefaults() {

o.PostDrainDelay = 5 * time.Second
o.ValidationTimeout = 15 * time.Minute
o.ValidateTimes = 2
}

func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command {
@@ -177,6 +181,7 @@ func NewCmdRollingUpdateCluster(f *util.Factory, out io.Writer) *cobra.Command {
cmd.Flags().BoolVar(&options.CloudOnly, "cloudonly", options.CloudOnly, "Perform rolling update without confirming progress with k8s")

cmd.Flags().DurationVar(&options.ValidationTimeout, "validation-timeout", options.ValidationTimeout, "Maximum time to wait for a cluster to validate")
cmd.Flags().Int32Var(&options.ValidateTimes, "validate-times", options.ValidateTimes, "Number of times the cluster needs to validate after a single node update")
cmd.Flags().DurationVar(&options.MasterInterval, "master-interval", options.MasterInterval, "Time to wait between restarting masters")
cmd.Flags().DurationVar(&options.NodeInterval, "node-interval", options.NodeInterval, "Time to wait between restarting nodes")
cmd.Flags().DurationVar(&options.BastionInterval, "bastion-interval", options.BastionInterval, "Time to wait between restarting bastions")
@@ -333,9 +338,10 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
ClusterName: options.ClusterName,
PostDrainDelay: options.PostDrainDelay,
ValidationTimeout: options.ValidationTimeout,
ValidateTimes: options.ValidateTimes,
ValidateSucceeded: 0,
// TODO should we expose this to the UI?
ValidateTickDuration: 30 * time.Second,
ValidateSuccessDuration: 10 * time.Second,
ValidateTickDuration: 30 * time.Second,
}

err = d.AdjustNeedUpdate(groups, cluster, list)
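For context, the new option is plumbed the same way as the existing timeouts: InitDefaults sets ValidateTimes to 2, the --validate-times flag can override it, and RunRollingUpdateCluster copies the value (plus a zeroed ValidateSucceeded counter) into the RollingUpdateCluster. Below is a minimal standalone sketch of that flag wiring, assuming cobra/pflag and made-up names (rollingUpdateOptions, initDefaults) rather than the actual kops types.

// Minimal sketch (not kops itself) of defaulting an int32 option and
// exposing it as a --validate-times flag, mirroring the change above.
package main

import (
	"fmt"

	"github.com/spf13/cobra"
)

// rollingUpdateOptions is a made-up stand-in for kops' RollingUpdateOptions.
type rollingUpdateOptions struct {
	ValidateTimes int32
}

// initDefaults mirrors the InitDefaults change above: validate twice by default.
func (o *rollingUpdateOptions) initDefaults() {
	o.ValidateTimes = 2
}

func main() {
	opts := &rollingUpdateOptions{}
	opts.initDefaults()

	cmd := &cobra.Command{
		Use: "rolling-update",
		Run: func(cmd *cobra.Command, args []string) {
			fmt.Printf("requiring %d successful validation(s) after each node update\n", opts.ValidateTimes)
		},
	}
	// The current value doubles as the default shown in --help, so the
	// initDefaults value of 2 applies unless --validate-times is passed.
	cmd.Flags().Int32Var(&opts.ValidateTimes, "validate-times", opts.ValidateTimes,
		"Number of times the cluster must validate after a single node update")

	if err := cmd.Execute(); err != nil {
		fmt.Println(err)
	}
}

With defaults like these, passing --validate-times=3 to the rolling update command would require three consecutive successful validations per node instead of the default two.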
10 changes: 6 additions & 4 deletions pkg/instancegroups/instancegroups.go
@@ -430,10 +430,12 @@ func (c *RollingUpdateCluster) validateClusterWithDuration(duration time.Duratio
func (c *RollingUpdateCluster) tryValidateCluster(duration time.Duration) bool {
result, err := c.ClusterValidator.Validate()

if err == nil && len(result.Failures) == 0 && c.ValidateSuccessDuration > 0 {
klog.Infof("Cluster validated; revalidating in %s to make sure it does not flap.", c.ValidateSuccessDuration)
time.Sleep(c.ValidateSuccessDuration)
result, err = c.ClusterValidator.Validate()
if err == nil && len(result.Failures) == 0 && c.ValidateTimes > 0 {
c.ValidateSucceeded++
if c.ValidateSucceeded < c.ValidateTimes {
klog.Infof("Cluster validated; revalidating %d time(s) to make sure it does not flap.", c.ValidateTimes-c.ValidateSucceeded)
return false
}
}

if err != nil {
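The instancegroups.go change replaces the old ValidateSuccessDuration sleep-and-revalidate with a counter: tryValidateCluster only reports success once the cluster has validated ValidateTimes times, and the surrounding validateClusterWithDuration presumably keeps polling on ValidateTickDuration until then or until the timeout. Below is a minimal sketch of that consecutive-success pattern with hypothetical names (waitForStableCluster, validator); it is not the kops implementation, and it chooses to reset the counter on failure, which this hunk does not show.

// Minimal sketch of requiring N consecutive successful validations before
// a rolling update moves on to the next node.
package main

import (
	"errors"
	"fmt"
	"time"
)

// validator is a stand-in for kops' ClusterValidator: nil means the
// cluster currently looks healthy.
type validator func() error

// waitForStableCluster polls the validator every tick and returns nil once
// it has seen validateTimes consecutive successes, or an error on timeout.
// In this sketch a failure resets the counter, so a flapping cluster has to
// settle before the rollout continues.
func waitForStableCluster(v validator, validateTimes int32, tick, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	var succeeded int32
	for time.Now().Before(deadline) {
		if err := v(); err != nil {
			succeeded = 0
			fmt.Printf("cluster not yet healthy: %v\n", err)
		} else {
			succeeded++
			if succeeded >= validateTimes {
				return nil
			}
			fmt.Printf("cluster validated; revalidating %d more time(s)\n", validateTimes-succeeded)
		}
		time.Sleep(tick)
	}
	return errors.New("cluster did not validate within the timeout")
}

func main() {
	calls := 0
	flaky := func() error {
		calls++
		if calls == 2 { // fail once to show the counter resetting
			return errors.New("node not ready")
		}
		return nil
	}
	if err := waitForStableCluster(flaky, 2, 10*time.Millisecond, time.Second); err != nil {
		fmt.Println("rollout blocked:", err)
		return
	}
	fmt.Println("cluster stable, moving to the next node")
}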
8 changes: 5 additions & 3 deletions pkg/instancegroups/rollingupdate.go
@@ -64,9 +64,11 @@ type RollingUpdateCluster struct {
// ValidateTickDuration is the amount of time to wait between cluster validation attempts
ValidateTickDuration time.Duration

// ValidateSuccessDuration is the amount of time a cluster must continue to validate successfully
// before updating the next node
ValidateSuccessDuration time.Duration
// ValidateTimes is the number of times the cluster needs to validate after a single node update
ValidateTimes int32

// ValidateSucceeded is the number of times the cluster has already validated successfully
ValidateSucceeded int32
}

// AdjustNeedUpdate adjusts the set of instances that need updating, using factors outside those known by the cloud implementation
29 changes: 12 additions & 17 deletions pkg/instancegroups/rollingupdate_test.go
@@ -59,16 +59,17 @@ func getTestSetup() (*RollingUpdateCluster, *awsup.MockAWSCloud, *kopsapi.Cluste
cluster.Name = "test.k8s.local"

c := &RollingUpdateCluster{
Cloud: mockcloud,
MasterInterval: 1 * time.Millisecond,
NodeInterval: 1 * time.Millisecond,
BastionInterval: 1 * time.Millisecond,
Force: false,
K8sClient: k8sClient,
ClusterValidator: &successfulClusterValidator{},
FailOnValidate: true,
ValidateTickDuration: 1 * time.Millisecond,
ValidateSuccessDuration: 5 * time.Millisecond,
Cloud: mockcloud,
MasterInterval: 1 * time.Millisecond,
NodeInterval: 1 * time.Millisecond,
BastionInterval: 1 * time.Millisecond,
Force: false,
K8sClient: k8sClient,
ClusterValidator: &successfulClusterValidator{},
FailOnValidate: true,
ValidateTickDuration: 1 * time.Millisecond,
ValidateTimes: 1,
ValidateSucceeded: 0,
}

return c, mockcloud, cluster
@@ -511,6 +512,7 @@ func (v *flappingClusterValidator) Validate() (*validation.ValidationCluster, er

func TestRollingUpdateFlappingValidation(t *testing.T) {
c, cloud, cluster := getTestSetup()
c.ValidateTimes = 3

// This should only take a few milliseconds,
// but we have to pad to allow for random delays (e.g. GC)
@@ -977,7 +979,6 @@ func TestRollingUpdateMaxUnavailableAllNeedUpdate(t *testing.T) {
c, cloud, cluster := getTestSetup()

concurrentTest := newConcurrentTest(t, cloud, 0, true)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest
cloud.MockEC2 = concurrentTest

@@ -1000,7 +1001,6 @@ func TestRollingUpdateMaxUnavailableAllButOneNeedUpdate(t *testing.T) {
c, cloud, cluster := getTestSetup()

concurrentTest := newConcurrentTest(t, cloud, 0, false)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest
cloud.MockEC2 = concurrentTest

@@ -1022,7 +1022,6 @@ func TestRollingUpdateMaxUnavailableAllNeedUpdateMaster(t *testing.T) {
c, cloud, cluster := getTestSetup()

concurrentTest := newConcurrentTest(t, cloud, 0, true)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest
cloud.MockEC2 = concurrentTest

@@ -1074,7 +1073,6 @@ func TestRollingUpdateMaxSurgeAllNeedUpdate(t *testing.T) {
c, cloud, cluster := getTestSetup()

concurrentTest := newConcurrentTest(t, cloud, 2, true)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest
cloud.MockAutoscaling = &concurrentTestAutoscaling{
AutoScalingAPI: cloud.MockAutoscaling,
@@ -1101,7 +1099,6 @@ func TestRollingUpdateMaxSurgeAllButOneNeedUpdate(t *testing.T) {
c, cloud, cluster := getTestSetup()

concurrentTest := newConcurrentTest(t, cloud, 2, false)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest
cloud.MockAutoscaling = &concurrentTestAutoscaling{
AutoScalingAPI: cloud.MockAutoscaling,
@@ -1248,7 +1245,6 @@ func TestRollingUpdateMaxSurgeAllNeedUpdateOneAlreadyDetached(t *testing.T) {
detached: map[string]bool{},
}

c.ValidateSuccessDuration = 0
c.ClusterValidator = alreadyDetachedTest
cloud.MockAutoscaling = &alreadyDetachedTestAutoscaling{
AutoScalingAPI: cloud.MockAutoscaling,
@@ -1277,7 +1273,6 @@ func TestRollingUpdateMaxSurgeAllNeedUpdateMaxAlreadyDetached(t *testing.T) {

// Should behave the same as TestRollingUpdateMaxUnavailableAllNeedUpdate
concurrentTest := newConcurrentTest(t, cloud, 0, true)
c.ValidateSuccessDuration = 0
c.ClusterValidator = concurrentTest
cloud.MockEC2 = concurrentTest

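The test changes drop the old ValidateSuccessDuration knob and instead set ValidateTimes explicitly: 1 in getTestSetup, 3 in the flapping-validation test. Below is a hypothetical standalone analogue of that flapping test, written against the waitForStableCluster sketch above rather than kops' flappingClusterValidator, and placed in the same package as that sketch.

// Hypothetical test for the waitForStableCluster sketch above (not kops'
// flappingClusterValidator): the validator fails on its second call, so with
// validateTimes=3 the counter resets once and the helper keeps polling until
// it sees three clean validations in a row.
package main

import (
	"errors"
	"testing"
	"time"
)

func TestFlappingValidation(t *testing.T) {
	call := 0
	flapping := func() error {
		call++
		if call == 2 {
			return errors.New("simulated flap")
		}
		return nil
	}
	if err := waitForStableCluster(flapping, 3, time.Millisecond, time.Second); err != nil {
		t.Fatalf("expected the cluster to stabilise, got: %v", err)
	}
	// 1 success, 1 flap, then 3 consecutive successes = 5 calls in total.
	if call != 5 {
		t.Fatalf("expected 5 validation attempts, got %d", call)
	}
}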
