drain: use client status to determine drain is complete (#14348)
If an allocation is slow to stop because of `kill_timeout` or `shutdown_delay`,
the node drain is marked as complete prematurely, even though drain monitoring
will continue to report allocation migrations. This impacts the UI or API
clients that monitor node draining in order to shut down nodes.

This changeset updates the behavior to wait until the client status of all
drained allocs is terminal before marking the node as done draining.
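
To make the distinction concrete, here is a minimal, self-contained Go sketch of the two notions of "terminal". The type and method names are illustrative only (this is not Nomad's actual `structs.Allocation` API), but it shows why a server-side check lets the drain complete too early:

```go
package main

import "fmt"

// Alloc is a stripped-down, hypothetical allocation used only to illustrate
// the two notions of "terminal"; it is not Nomad's structs.Allocation.
type Alloc struct {
	DesiredStatus string // what the server wants: "run", "stop", "evict"
	ClientStatus  string // what the client last reported: "pending", "running", "complete", "failed", "lost"
}

// ClientTerminal is true only once the client has reported the alloc finished.
func (a *Alloc) ClientTerminal() bool {
	switch a.ClientStatus {
	case "complete", "failed", "lost":
		return true
	default:
		return false
	}
}

// ServerTerminal is true as soon as the scheduler has decided the alloc should
// stop, even if the task is still running out its kill_timeout on the client.
func (a *Alloc) ServerTerminal() bool {
	return a.DesiredStatus == "stop" || a.DesiredStatus == "evict" || a.ClientTerminal()
}

func main() {
	// A drained alloc that has been told to stop but is still shutting down:
	a := &Alloc{DesiredStatus: "stop", ClientStatus: "running"}
	fmt.Println(a.ServerTerminal()) // true:  a server-side check would let the drain complete here
	fmt.Println(a.ClientTerminal()) // false: a client-side check keeps the drain open until the alloc really stops
}
```
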
tgross committed Apr 13, 2023
1 parent 38d0a2f commit f91bf84
Showing 7 changed files with 362 additions and 216 deletions.
3 changes: 3 additions & 0 deletions .changelog/14348.txt
@@ -0,0 +1,3 @@
```release-note:bug
drain: Fixed a bug where drains would complete based on the server status and not the client status of an allocation
```
43 changes: 43 additions & 0 deletions e2e/nodedrain/input/drain_killtimeout.nomad
@@ -0,0 +1,43 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: MPL-2.0

job "drain_killtimeout" {

constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}

group "group" {

task "task" {
driver = "docker"

kill_timeout = "30s" # matches the agent's max_kill_timeout

config {
image = "busybox:1"
command = "/bin/sh"
args = ["local/script.sh"]
}

# this job traps SIGINT so that we can assert that we've forced the drain
# to wait until the client status has been updated
template {
data = <<EOF
#!/bin/sh
trap 'sleep 60' 2
sleep 600
EOF

destination = "local/script.sh"
change_mode = "noop"
}

resources {
cpu = 256
memory = 128
}
}
}
}
57 changes: 57 additions & 0 deletions e2e/nodedrain/node_drain_test.go
@@ -30,6 +30,7 @@ func TestNodeDrain(t *testing.T) {
t.Run("IgnoreSystem", testIgnoreSystem)
t.Run("EphemeralMigrate", testEphemeralMigrate)
t.Run("KeepIneligible", testKeepIneligible)
t.Run("KillTimeout", testKillTimeout)
t.Run("DeadlineFlag", testDeadlineFlag)
t.Run("ForceFlag", testForceFlag)
}
@@ -184,6 +185,62 @@ func testKeepIneligible(t *testing.T) {
}
}

// testKillTimeout tests that we block drains until the client status has been
// updated, not the server status.
func testKillTimeout(t *testing.T) {

nomadClient := e2eutil.NomadClient(t)
t.Cleanup(cleanupDrainState(t))

jobID := "test-node-drain-" + uuid.Short()

must.NoError(t, e2eutil.Register(jobID, "./input/drain_killtimeout.nomad"))
allocs := waitForRunningAllocs(t, nomadClient, jobID, 1)

t.Cleanup(cleanupJobState(t, jobID))
oldAllocID := allocs[0].ID
oldNodeID := allocs[0].NodeID

t.Logf("draining node %v", oldNodeID)
out, err := e2eutil.Command(
"nomad", "node", "drain",
"-enable", "-yes", "-detach", oldNodeID)
must.NoError(t, err, must.Sprintf("'nomad node drain %v' failed: %v\n%v", oldNodeID, err, out))

// the job will hang with kill_timeout for up to 30s, so we want to assert
// that we don't complete draining before that window expires. But we also
// can't guarantee we've started this assertion with exactly 30s left on the
// clock, so cut the deadline close without going over to avoid test
// flakiness
t.Log("waiting for kill_timeout to expire")
must.Wait(t, wait.ContinualSuccess(
wait.BoolFunc(func() bool {
node, _, err := nomadClient.Nodes().Info(oldNodeID, nil)
must.NoError(t, err)
return node.DrainStrategy != nil
}),
wait.Timeout(time.Second*25),
wait.Gap(500*time.Millisecond),
))

// the allocation will then get force-killed, so wait for the alloc to
// eventually be migrated and for the node's drain to be complete
t.Log("waiting for migration to complete")
newAllocs := waitForAllocDrainComplete(t, nomadClient, jobID,
oldAllocID, oldNodeID, time.Second*60)
must.Len(t, 1, newAllocs, must.Sprint("expected 1 new alloc"))

must.Wait(t, wait.InitialSuccess(
wait.BoolFunc(func() bool {
node, _, err := nomadClient.Nodes().Info(oldNodeID, nil)
must.NoError(t, err)
return node.DrainStrategy == nil
}),
wait.Timeout(time.Second*5),
wait.Gap(500*time.Millisecond),
))
}

// testDeadlineFlag tests the enforcement of the node drain deadline so that
// allocations are moved even if max_parallel says we should be waiting
func testDeadlineFlag(t *testing.T) {
2 changes: 1 addition & 1 deletion nomad/drainer/draining_node.go
@@ -76,7 +76,7 @@ func (n *drainingNode) IsDone() (bool, error) {
}

// If there is a non-terminal we aren't done
-if !alloc.TerminalStatus() {
+if !alloc.ClientTerminalStatus() {
return false, nil
}
}
10 changes: 5 additions & 5 deletions nomad/drainer/watch_jobs.go
@@ -367,10 +367,10 @@ func handleTaskGroup(snap *state.StateSnapshot, batch bool, tg *structs.TaskGroup
}

// Check if the alloc should be considered migrated. A migrated
-// allocation is one that is terminal, is on a draining
-// allocation, and has only happened since our last handled index to
+// allocation is one that is terminal on the client, is on a draining
+// allocation, and has been updated since our last handled index to
// avoid emitting many duplicate migrate events.
-if alloc.TerminalStatus() &&
+if alloc.ClientTerminalStatus() &&
onDrainingNode &&
alloc.ModifyIndex > lastHandledIndex {
result.migrated = append(result.migrated, alloc)
@@ -385,8 +385,8 @@ func handleTaskGroup(snap *state.StateSnapshot, batch bool, tg *structs.TaskGroup

// An alloc can't be considered for migration if:
// - It isn't on a draining node
-// - It is already terminal
-if !onDrainingNode || alloc.TerminalStatus() {
+// - It is already terminal on the client
+if !onDrainingNode || alloc.ClientTerminalStatus() {
continue
}

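The comment in the hunk above lists three conditions for treating an alloc as migrated. As a rough sketch only (hypothetical names, not the real `handleTaskGroup`), the predicate after this change amounts to the following, with the index comparison keeping the drainer from emitting the same migration event on every pass:

```go
package main

import "fmt"

// candidate is a hypothetical, minimal view of an allocation for illustration;
// the field and function names are not the real drainer's.
type candidate struct {
	ClientStatus   string // "running", "complete", "failed", "lost"
	OnDrainingNode bool   // placed on a node that is currently draining
	ModifyIndex    uint64 // raft index of the alloc's last update
}

func clientTerminal(status string) bool {
	return status == "complete" || status == "failed" || status == "lost"
}

// isMigrated mirrors the three conditions in the diff: terminal on the client,
// on a draining node, and updated since the last index we handled.
func isMigrated(a candidate, lastHandledIndex uint64) bool {
	return clientTerminal(a.ClientStatus) &&
		a.OnDrainingNode &&
		a.ModifyIndex > lastHandledIndex
}

func main() {
	a := candidate{ClientStatus: "complete", OnDrainingNode: true, ModifyIndex: 120}
	fmt.Println(isMigrated(a, 100)) // true:  newly observed client-terminal alloc on a draining node
	fmt.Println(isMigrated(a, 130)) // false: already handled, so no duplicate migrate event
}
```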