Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Try validating multiple times before updating instancegroup #9165

Merged
merged 3 commits into from
May 29, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 20 additions & 51 deletions pkg/instancegroups/instancegroups.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,16 +88,8 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(ctx context.Context, c

if isBastion {
klog.V(3).Info("Not validating the cluster as instance is a bastion.")
} else if c.CloudOnly {
klog.V(3).Info("Not validating cluster as validation is turned off via the cloud-only flag.")
} else {
if err = c.validateCluster(); err != nil {
if c.FailOnValidate {
return err
}
klog.V(2).Infof("Ignoring cluster validation error: %v", err)
klog.Info("Cluster validation failed, but proceeding since fail-on-validate-error is set to false")
}
} else if err = c.maybeValidate("", 1); err != nil {
johngmyers marked this conversation as resolved.
Show resolved Hide resolved
return err
johngmyers marked this conversation as resolved.
Show resolved Hide resolved
}

if !c.CloudOnly {
Expand Down Expand Up @@ -156,7 +148,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(ctx context.Context, c
klog.Infof("waiting for %v after detaching instance", sleepAfterTerminate)
time.Sleep(sleepAfterTerminate)

if err := c.maybeValidate(c.ValidationTimeout, "detaching"); err != nil {
if err := c.maybeValidate(" after detaching instance", c.ValidateCount); err != nil {
return err
}
noneReady = false
Expand Down Expand Up @@ -185,7 +177,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(ctx context.Context, c
return waitForPendingBeforeReturningError(runningDrains, terminateChan, err)
}

err = c.maybeValidate(c.ValidationTimeout, "removing")
err = c.maybeValidate(" after terminating instance", c.ValidateCount)
if err != nil {
return waitForPendingBeforeReturningError(runningDrains, terminateChan, err)
}
Expand Down Expand Up @@ -231,7 +223,7 @@ func (c *RollingUpdateCluster) rollingUpdateInstanceGroup(ctx context.Context, c
}
}

err = c.maybeValidate(c.ValidationTimeout, "removing")
err = c.maybeValidate(" after terminating instance", c.ValidateCount)
if err != nil {
return err
}
Expand Down Expand Up @@ -383,42 +375,34 @@ func (c *RollingUpdateCluster) drainTerminateAndWait(ctx context.Context, u *clo
return nil
}

func (c *RollingUpdateCluster) maybeValidate(validationTimeout time.Duration, operation string) error {
func (c *RollingUpdateCluster) maybeValidate(operation string, validateCount int) error {
if c.CloudOnly {
klog.Warningf("Not validating cluster as cloudonly flag is set.")

} else {
klog.Info("Validating the cluster.")

if err := c.validateClusterWithDuration(validationTimeout); err != nil {
if err := c.validateClusterWithTimeout(validateCount); err != nil {

if c.FailOnValidate {
klog.Errorf("Cluster did not validate within %s", validationTimeout)
return fmt.Errorf("error validating cluster after %s a node: %v", operation, err)
klog.Errorf("Cluster did not validate within %s", c.ValidationTimeout)
return fmt.Errorf("error validating cluster%s: %v", operation, err)
}

klog.Warningf("Cluster validation failed after %s instance, proceeding since fail-on-validate is set to false: %v", operation, err)
klog.Warningf("Cluster validation failed%s, proceeding since fail-on-validate is set to false: %v", operation, err)
}
}
return nil
}

// validateClusterWithDuration runs validation.ValidateCluster until either we get positive result or the timeout expires
func (c *RollingUpdateCluster) validateClusterWithDuration(validationTimeout time.Duration) error {
ctx, cancel := context.WithTimeout(context.Background(), validationTimeout)
// validateClusterWithTimeout runs validation.ValidateCluster until either we get positive result or the timeout expires
func (c *RollingUpdateCluster) validateClusterWithTimeout(validateCount int) error {
ctx, cancel := context.WithTimeout(context.Background(), c.ValidationTimeout)
defer cancel()

if c.tryValidateCluster(ctx) {
return nil
}

return fmt.Errorf("cluster did not validate within a duration of %q", validationTimeout)
}

func (c *RollingUpdateCluster) tryValidateCluster(ctx context.Context) bool {
if c.ValidateCount == 0 {
if validateCount == 0 {
klog.Warningf("skipping cluster validation because validate-count was 0")
return true
return nil
}

successCount := 0
Expand All @@ -428,9 +412,9 @@ func (c *RollingUpdateCluster) tryValidateCluster(ctx context.Context) bool {
result, err := c.ClusterValidator.Validate()
if err == nil && len(result.Failures) == 0 {
successCount++
if successCount >= c.ValidateCount {
if successCount >= validateCount {
klog.Info("Cluster validated.")
return true
return nil
} else {
klog.Infof("Cluster validated; revalidating in %s to make sure it does not flap.", c.ValidateSuccessDuration)
time.Sleep(c.ValidateSuccessDuration)
Expand All @@ -441,7 +425,7 @@ func (c *RollingUpdateCluster) tryValidateCluster(ctx context.Context) bool {
if err != nil {
if ctx.Err() != nil {
klog.Infof("Cluster did not validate within deadline: %v.", err)
return false
break
}
klog.Infof("Cluster did not validate, will retry in %q: %v.", c.ValidateTickDuration, err)
} else if len(result.Failures) > 0 {
Expand All @@ -451,7 +435,7 @@ func (c *RollingUpdateCluster) tryValidateCluster(ctx context.Context) bool {
}
if ctx.Err() != nil {
klog.Infof("Cluster did not pass validation within deadline: %s.", strings.Join(messages, ", "))
return false
break
}
klog.Infof("Cluster did not pass validation, will retry in %q: %s.", c.ValidateTickDuration, strings.Join(messages, ", "))
}
Expand All @@ -463,23 +447,8 @@ func (c *RollingUpdateCluster) tryValidateCluster(ctx context.Context) bool {
// TODO: Should we check if we have enough time left before the deadline?
time.Sleep(c.ValidateTickDuration)
}
}

// validateCluster runs our validation methods on the K8s Cluster.
func (c *RollingUpdateCluster) validateCluster() error {
result, err := c.ClusterValidator.Validate()
if err != nil {
return fmt.Errorf("cluster %q did not validate: %v", c.ClusterName, err)
}
if len(result.Failures) > 0 {
messages := []string{}
for _, failure := range result.Failures {
messages = append(messages, failure.Message)
}
return fmt.Errorf("cluster %q did not pass validation: %s", c.ClusterName, strings.Join(messages, ", "))
}

return nil
return fmt.Errorf("cluster did not validate within a duration of %q", c.ValidationTimeout)
}

// detachInstance detaches a Cloud Instance
Expand Down