drain: use client status to determine drain is complete (#14348)
If an allocation is slow to stop because of `kill_timeout` or `shutdown_delay`,
the node drain is marked as complete prematurely, even though drain monitoring
will continue to report allocation migrations. This impacts the UI or API
clients that monitor node draining in order to shut down nodes.

This changeset updates the behavior to wait until the client status of all
drained allocs is terminal before marking the node as done draining.
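
To make the distinction concrete, here is a minimal, self-contained Go sketch of the two notions of "terminal". The type and method names are illustrative only (this is not Nomad's actual `structs.Allocation` API), but it shows why a server-side check lets the drain complete too early:

```go
package main

import "fmt"

// Alloc is a stripped-down, hypothetical allocation used only to illustrate
// the two notions of "terminal"; it is not Nomad's structs.Allocation.
type Alloc struct {
	DesiredStatus string // what the server wants: "run", "stop", "evict"
	ClientStatus  string // what the client last reported: "pending", "running", "complete", "failed", "lost"
}

// ClientTerminal is true only once the client has reported the alloc finished.
func (a *Alloc) ClientTerminal() bool {
	switch a.ClientStatus {
	case "complete", "failed", "lost":
		return true
	default:
		return false
	}
}

// ServerTerminal is true as soon as the scheduler has decided the alloc should
// stop, even if the task is still running out its kill_timeout on the client.
func (a *Alloc) ServerTerminal() bool {
	return a.DesiredStatus == "stop" || a.DesiredStatus == "evict" || a.ClientTerminal()
}

func main() {
	// A drained alloc that has been told to stop but is still shutting down:
	a := &Alloc{DesiredStatus: "stop", ClientStatus: "running"}
	fmt.Println(a.ServerTerminal()) // true:  a server-side check would let the drain complete here
	fmt.Println(a.ClientTerminal()) // false: a client-side check keeps the drain open until the alloc really stops
}
```
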
tgross committed Apr 13, 2023
1 parent 38d0a2f commit f91bf84
Showing 7 changed files with 362 additions and 216 deletions.
3 changes: 3 additions & 0 deletions .changelog/14348.txt
@@ -0,0 +1,3 @@
```release-note:bug
drain: Fixed a bug where drains would complete based on the server status and not the client status of an allocation
```
43 changes: 43 additions & 0 deletions e2e/nodedrain/input/drain_killtimeout.nomad
@@ -0,0 +1,43 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: MPL-2.0

job "drain_killtimeout" {

constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}

group "group" {

task "task" {
driver = "docker"

kill_timeout = "30s" # matches the agent's max_kill_timeout

config {
image = "busybox:1"
command = "/bin/sh"
args = ["local/script.sh"]
}

# this job traps SIGINT so that we can assert that we've forced the drain
# to wait until the client status has been updated
template {
data = <<EOF
#!/bin/sh
trap 'sleep 60' 2
sleep 600
EOF

destination = "local/script.sh"
change_mode = "noop"
}

resources {
cpu = 256
memory = 128
}
}
}
}
57 changes: 57 additions & 0 deletions e2e/nodedrain/node_drain_test.go
@@ -30,6 +30,7 @@ func TestNodeDrain(t *testing.T) {
t.Run("IgnoreSystem", testIgnoreSystem)
t.Run("EphemeralMigrate", testEphemeralMigrate)
t.Run("KeepIneligible", testKeepIneligible)
t.Run("KillTimeout", testKillTimeout)
t.Run("DeadlineFlag", testDeadlineFlag)
t.Run("ForceFlag", testForceFlag)
}
@@ -184,6 +185,62 @@ func testKeepIneligible(t *testing.T) {
}
}

// testKillTimeout tests that we block drains until the client status has been
// updated, not the server status.
func testKillTimeout(t *testing.T) {

nomadClient := e2eutil.NomadClient(t)
t.Cleanup(cleanupDrainState(t))

jobID := "test-node-drain-" + uuid.Short()

must.NoError(t, e2eutil.Register(jobID, "./input/drain_killtimeout.nomad"))
allocs := waitForRunningAllocs(t, nomadClient, jobID, 1)

t.Cleanup(cleanupJobState(t, jobID))
oldAllocID := allocs[0].ID
oldNodeID := allocs[0].NodeID

t.Logf("draining node %v", oldNodeID)
out, err := e2eutil.Command(
"nomad", "node", "drain",
"-enable", "-yes", "-detach", oldNodeID)
must.NoError(t, err, must.Sprintf("'nomad node drain %v' failed: %v\n%v", oldNodeID, err, out))

// the job will hang with kill_timeout for up to 30s, so we want to assert
// that we don't complete draining before that window expires. But we also
// can't guarantee we've started this assertion with exactly 30s left on the
// clock, so cut the deadline close without going over to avoid test
// flakiness
t.Log("waiting for kill_timeout to expire")
must.Wait(t, wait.ContinualSuccess(
wait.BoolFunc(func() bool {
node, _, err := nomadClient.Nodes().Info(oldNodeID, nil)
must.NoError(t, err)
return node.DrainStrategy != nil
}),
wait.Timeout(time.Second*25),
wait.Gap(500*time.Millisecond),
))

// the allocation will then get force-killed, so wait for the alloc to
// eventually be migrated and for the node's drain to be complete
t.Log("waiting for migration to complete")
newAllocs := waitForAllocDrainComplete(t, nomadClient, jobID,
oldAllocID, oldNodeID, time.Second*60)
must.Len(t, 1, newAllocs, must.Sprint("expected 1 new alloc"))

must.Wait(t, wait.InitialSuccess(
wait.BoolFunc(func() bool {
node, _, err := nomadClient.Nodes().Info(oldNodeID, nil)
must.NoError(t, err)
return node.DrainStrategy == nil
}),
wait.Timeout(time.Second*5),
wait.Gap(500*time.Millisecond),
))
}

// testDeadlineFlag tests the enforcement of the node drain deadline so that
// allocations are moved even if max_parallel says we should be waiting
func testDeadlineFlag(t *testing.T) {
2 changes: 1 addition & 1 deletion nomad/drainer/draining_node.go
@@ -76,7 +76,7 @@ func (n *drainingNode) IsDone() (bool, error) {
}

// If there is a non-terminal we aren't done
-if !alloc.TerminalStatus() {
+if !alloc.ClientTerminalStatus() {
return false, nil
}
}
10 changes: 5 additions & 5 deletions nomad/drainer/watch_jobs.go
@@ -367,10 +367,10 @@ func handleTaskGroup(snap *state.StateSnapshot, batch bool, tg *structs.TaskGroup
}

// Check if the alloc should be considered migrated. A migrated
-// allocation is one that is terminal, is on a draining
-// allocation, and has only happened since our last handled index to
+// allocation is one that is terminal on the client, is on a draining
+// allocation, and has been updated since our last handled index to
// avoid emitting many duplicate migrate events.
-if alloc.TerminalStatus() &&
+if alloc.ClientTerminalStatus() &&
onDrainingNode &&
alloc.ModifyIndex > lastHandledIndex {
result.migrated = append(result.migrated, alloc)
@@ -385,8 +385,8 @@ func handleTaskGroup(snap *state.StateSnapshot, batch bool, tg *structs.TaskGroup

// An alloc can't be considered for migration if:
// - It isn't on a draining node
-// - It is already terminal
-if !onDrainingNode || alloc.TerminalStatus() {
+// - It is already terminal on the client
+if !onDrainingNode || alloc.ClientTerminalStatus() {
continue
}

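The comment in the hunk above lists three conditions for treating an alloc as migrated. As a rough sketch only (hypothetical names, not the real `handleTaskGroup`), the predicate after this change amounts to the following, with the index comparison keeping the drainer from emitting the same migration event on every pass:

```go
package main

import "fmt"

// candidate is a hypothetical, minimal view of an allocation for illustration;
// the field and function names are not the real drainer's.
type candidate struct {
	ClientStatus   string // "running", "complete", "failed", "lost"
	OnDrainingNode bool   // placed on a node that is currently draining
	ModifyIndex    uint64 // raft index of the alloc's last update
}

func clientTerminal(status string) bool {
	return status == "complete" || status == "failed" || status == "lost"
}

// isMigrated mirrors the three conditions in the diff: terminal on the client,
// on a draining node, and updated since the last index we handled.
func isMigrated(a candidate, lastHandledIndex uint64) bool {
	return clientTerminal(a.ClientStatus) &&
		a.OnDrainingNode &&
		a.ModifyIndex > lastHandledIndex
}

func main() {
	a := candidate{ClientStatus: "complete", OnDrainingNode: true, ModifyIndex: 120}
	fmt.Println(isMigrated(a, 100)) // true:  newly observed client-terminal alloc on a draining node
	fmt.Println(isMigrated(a, 130)) // false: already handled, so no duplicate migrate event
}
```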