From 0735ed361b12efd28a2ea6a39fa31467fa62694d Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Tue, 27 Sep 2022 10:07:35 -0400 Subject: [PATCH] backport: CSI: skip node unpublish on GC'd or down nodes (#14720) --- .changelog/14720.txt | 3 +++ nomad/csi_endpoint.go | 16 ++++++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 .changelog/14720.txt diff --git a/.changelog/14720.txt b/.changelog/14720.txt new file mode 100644 index 000000000000..c6ee35f40bc5 --- /dev/null +++ b/.changelog/14720.txt @@ -0,0 +1,3 @@ +```release-note:bug +csi: Fixed a bug where volume claims on lost or garbage collected nodes could not be freed +``` diff --git a/nomad/csi_endpoint.go b/nomad/csi_endpoint.go index f63097d8e890..7c0d8a675cad 100644 --- a/nomad/csi_endpoint.go +++ b/nomad/csi_endpoint.go @@ -636,6 +636,22 @@ func (v *CSIVolume) nodeUnpublishVolume(vol *structs.CSIVolume, claim *structs.C return err } + // If the node has been GC'd or is down, we can't send it a node + // unpublish. We need to assume the node has unpublished at its + // end. If it hasn't, any controller unpublish will potentially + // hang or error and need to be retried. + if claim.NodeID != "" { + node, err := snap.NodeByID(memdb.NewWatchSet(), claim.NodeID) + if err != nil { + return err + } + if node == nil || node.Status == structs.NodeStatusDown { + v.logger.Debug("skipping node unpublish for down or GC'd node") + claim.State = structs.CSIVolumeClaimStateNodeDetached + return v.checkpointClaim(vol, claim) + } + } + if claim.AllocationID != "" { err := v.nodeUnpublishVolumeImpl(vol, claim) if err != nil {