From 0735ed361b12efd28a2ea6a39fa31467fa62694d Mon Sep 17 00:00:00 2001
From: Tim Gross <tgross@hashicorp.com>
Date: Tue, 27 Sep 2022 10:07:35 -0400
Subject: [PATCH] backport: CSI: skip node unpublish on GC'd or down nodes
 (#14720)

---
 .changelog/14720.txt  |  3 +++
 nomad/csi_endpoint.go | 16 ++++++++++++++++
 2 files changed, 19 insertions(+)
 create mode 100644 .changelog/14720.txt

diff --git a/.changelog/14720.txt b/.changelog/14720.txt
new file mode 100644
index 000000000000..c6ee35f40bc5
--- /dev/null
+++ b/.changelog/14720.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+csi: Fixed a bug where volume claims on lost or garbage-collected nodes could not be freed
+```
diff --git a/nomad/csi_endpoint.go b/nomad/csi_endpoint.go
index f63097d8e890..7c0d8a675cad 100644
--- a/nomad/csi_endpoint.go
+++ b/nomad/csi_endpoint.go
@@ -636,6 +636,22 @@ func (v *CSIVolume) nodeUnpublishVolume(vol *structs.CSIVolume, claim *structs.C
 		return err
 	}
 
+	// If the node has been GC'd or is down, we can't send it a node
+	// unpublish. We have to assume the node has already unpublished
+	// the volume on its end. If it hasn't, any controller unpublish
+	// may hang or return an error and will need to be retried.
+	if claim.NodeID != "" {
+		node, err := snap.NodeByID(memdb.NewWatchSet(), claim.NodeID)
+		if err != nil {
+			return err
+		}
+		if node == nil || node.Status == structs.NodeStatusDown {
+			v.logger.Debug("skipping node unpublish for down or GC'd node")
+			claim.State = structs.CSIVolumeClaimStateNodeDetached
+			return v.checkpointClaim(vol, claim)
+		}
+	}
+
 	if claim.AllocationID != "" {
 		err := v.nodeUnpublishVolumeImpl(vol, claim)
 		if err != nil {