Skip to content

Commit

Permalink
deploymentwatcher: fail early whenever possible
Browse files Browse the repository at this point in the history
Given a deployment that has a progress_deadline, if a task group runs
out of reschedule attempts, fail the deployment right away instead of
waiting until the progress_deadline is reached.

See #17260
  • Loading branch information
nicoche committed May 29, 2023
1 parent 087ac3a commit 7a639a9
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 11 deletions.
33 changes: 22 additions & 11 deletions nomad/deploymentwatcher/deployment_watcher.go
Expand Up @@ -615,12 +615,30 @@ func (w *deploymentWatcher) handleAllocUpdate(allocs []*structs.AllocListStub) (
continue
}

// Determine if the update block for this group is progress based
progressBased := dstate.ProgressDeadline != 0
hasProgressDeadline := dstate.ProgressDeadline != 0
failDeployment := false

// Fail on the first unhealthy allocation if no progress deadline is specified.
if !hasProgressDeadline && alloc.DeploymentStatus.IsUnhealthy() {
w.logger.Debug("failing deployment because an allocation failed and the deployment is not progress based", alloc.ID)
failDeployment = true
}

if hasProgressDeadline && alloc.DeploymentStatus.IsUnhealthy() &&
deployment.Active() {
reschedulePolicy := w.j.LookupTaskGroup(alloc.TaskGroup).ReschedulePolicy
isRescheduleEligible := alloc.RescheduleEligible(reschedulePolicy, time.Unix(alloc.ModifyTime, 0))
if !isRescheduleEligible {
// We have run out of reschedule attempts: do not wait for the progress deadline to expire because
// we can fail early
w.logger.Debug("failing deployment because an allocation has failed and the task group has run out of reschedule attempts", alloc.ID)
failDeployment = true
}
}

// Check if the allocation has failed and we need to mark it for allow
// replacements
if progressBased && alloc.DeploymentStatus.IsUnhealthy() &&
if !failDeployment && alloc.DeploymentStatus.IsUnhealthy() &&
deployment.Active() && !alloc.DesiredTransition.ShouldReschedule() {
res.allowReplacements = append(res.allowReplacements, alloc.ID)
continue
Expand All @@ -631,19 +649,12 @@ func (w *deploymentWatcher) handleAllocUpdate(allocs []*structs.AllocListStub) (
res.createEval = true
}

// If the group is using a progress deadline, we don't have to do anything.
if progressBased {
continue
}

// Fail on the first bad allocation
if alloc.DeploymentStatus.IsUnhealthy() {
if failDeployment {
// Check if the group has autorevert set
if dstate.AutoRevert {
res.rollback = true
}

// Since we have an unhealthy allocation, fail the deployment
res.failDeployment = true
}

Expand Down
42 changes: 42 additions & 0 deletions nomad/structs/structs.go
Expand Up @@ -11069,6 +11069,48 @@ type AllocListStub struct {
ModifyTime int64
}

// RescheduleEligible reports whether the allocation may still be rescheduled
// under reschedulePolicy, given the events recorded in its RescheduleTracker.
//
// A nil policy, or a policy that is neither unlimited nor grants a positive
// number of attempts, is never eligible. An unlimited policy is always
// eligible. Otherwise the allocation is eligible while the number of attempts
// counted by rescheduleInfo relative to failTime is below the policy's
// Attempts.
func (a *AllocListStub) RescheduleEligible(reschedulePolicy *ReschedulePolicy, failTime time.Time) bool {
	switch {
	case reschedulePolicy == nil:
		return false
	case reschedulePolicy.Unlimited:
		return true
	case reschedulePolicy.Attempts <= 0:
		// Rescheduling is disabled for this policy.
		return false
	}

	// From here on the policy is bounded with at least one attempt allowed,
	// so an allocation with no recorded events is trivially eligible.
	tracker := a.RescheduleTracker
	if tracker == nil || len(tracker.Events) == 0 {
		return true
	}

	used, _ := a.rescheduleInfo(reschedulePolicy, failTime)
	return used < reschedulePolicy.Attempts
}

// rescheduleInfo returns how many recorded reschedule events fall within the
// policy's Interval before failTime, together with the policy's Attempts.
//
// It returns (0, 0) for a nil policy. When the allocation has no tracker, or
// the policy grants no attempts, the count is zero and Attempts is returned
// unchanged.
func (a *AllocListStub) rescheduleInfo(reschedulePolicy *ReschedulePolicy, failTime time.Time) (int, int) {
	if reschedulePolicy == nil {
		return 0, 0
	}

	limit := reschedulePolicy.Attempts
	if a.RescheduleTracker == nil || limit <= 0 {
		return 0, limit
	}

	// Both values are loop-invariant. UnixNano does not depend on the
	// time's location, so no UTC conversion is needed before taking it.
	window := reschedulePolicy.Interval.Nanoseconds()
	failedAt := failTime.UnixNano()

	// RescheduleTime is compared directly against a UnixNano value, so it is
	// presumably a Unix timestamp in nanoseconds -- confirm against the
	// tracker's definition.
	used := 0
	for _, event := range a.RescheduleTracker.Events {
		if failedAt-event.RescheduleTime < window {
			used++
		}
	}
	return used, limit
}

// SetEventDisplayMessages populates the display message if its not already
// set, a temporary fix to handle old allocations that don't have it. This
// method will be removed in a future release.
Expand Down

0 comments on commit 7a639a9

Please sign in to comment.