-
Notifications
You must be signed in to change notification settings - Fork 38.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Don't try to attach volumes which are already attached to other nodes #45346
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -25,10 +25,11 @@ import ( | |||
|
||||
"github.com/golang/glog" | ||||
"k8s.io/apimachinery/pkg/util/wait" | ||||
"k8s.io/kubernetes/pkg/api/v1" | ||||
"k8s.io/kubernetes/pkg/controller/volume/attachdetach/cache" | ||||
"k8s.io/kubernetes/pkg/controller/volume/attachdetach/statusupdater" | ||||
"k8s.io/kubernetes/pkg/util/goroutinemap/exponentialbackoff" | ||||
"k8s.io/kubernetes/pkg/volume/util/nestedpendingoperations" | ||||
"k8s.io/kubernetes/pkg/volume" | ||||
"k8s.io/kubernetes/pkg/volume/util/operationexecutor" | ||||
) | ||||
|
||||
|
@@ -125,6 +126,41 @@ func (rc *reconciler) syncStates() { | |||
rc.attacherDetacher.VerifyVolumesAreAttached(volumesPerNode, rc.actualStateOfWorld) | ||||
} | ||||
|
||||
// isMultiAttachForbidden checks if attaching this volume to multiple nodes is definitely not allowed/possible. | ||||
// In its current form, this function can only reliably say for which volumes it's definitely forbidden. If it returns | ||||
// false, it is not guaranteed that multi-attach is actually supported by the volume type and we must rely on the | ||||
// attacher to fail fast in such cases. | ||||
// Please see https://github.com/kubernetes/kubernetes/issues/40669 and https://github.com/kubernetes/kubernetes/pull/40148#discussion_r98055047 | ||||
func (rc *reconciler) isMultiAttachForbidden(volumeSpec *volume.Spec) bool { | ||||
if volumeSpec.Volume != nil { | ||||
// Check for volume types which are known to fail slow or cause trouble when trying to multi-attach | ||||
if volumeSpec.Volume.AzureDisk != nil || | ||||
volumeSpec.Volume.Cinder != nil { | ||||
return true | ||||
} | ||||
} | ||||
|
||||
// Only if this volume is a persistent volume, we have reliable information on whether it's allowed or not to | ||||
// multi-attach. We trust in the individual volume implementations to not allow unsupported access modes | ||||
if volumeSpec.PersistentVolume != nil { | ||||
if len(volumeSpec.PersistentVolume.Spec.AccessModes) == 0 { | ||||
// No access mode specified so we don't know for sure. Let the attacher fail if needed | ||||
return false | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this shouldn't happen, log an error There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. AccessModes is optional according to kubernetes/pkg/api/v1/types.go Line 477 in 17d33ea
|
||||
} | ||||
|
||||
// check if this volume is allowed to be attached to multiple pods/nodes, if yes, return false | ||||
for _, accessMode := range volumeSpec.PersistentVolume.Spec.AccessModes { | ||||
if accessMode == v1.ReadWriteMany || accessMode == v1.ReadOnlyMany { | ||||
return false | ||||
} | ||||
} | ||||
return true | ||||
} | ||||
|
||||
// we don't know if it's supported or not and let the attacher fail later in cases it's not supported | ||||
return false | ||||
} | ||||
|
||||
func (rc *reconciler) reconcile() { | ||||
// Detaches are triggered before attaches so that volumes referenced by | ||||
// pods that are rescheduled to a different node are detached first. | ||||
|
@@ -133,6 +169,16 @@ func (rc *reconciler) reconcile() { | |||
for _, attachedVolume := range rc.actualStateOfWorld.GetAttachedVolumes() { | ||||
if !rc.desiredStateOfWorld.VolumeExists( | ||||
attachedVolume.VolumeName, attachedVolume.NodeName) { | ||||
|
||||
// Don't even try to start an operation if there is already one running | ||||
// This check must be done before we do any other checks, as otherwise the other checks | ||||
// may pass while at the same time the volume leaves the pending state, resulting in | ||||
// double detach attempts | ||||
if rc.attacherDetacher.IsOperationPending(attachedVolume.VolumeName, "") { | ||||
glog.V(10).Infof("Operation for volume %q is already running. Can't start detach for %q", attachedVolume.VolumeName, attachedVolume.NodeName) | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I might forget, is this also a bug fix or just an improvement? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please see our old discussion at #40148 (review) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. got it. |
||||
continue | ||||
} | ||||
|
||||
// Set the detach request time | ||||
elapsedTime, err := rc.actualStateOfWorld.SetDetachRequestTime(attachedVolume.VolumeName, attachedVolume.NodeName) | ||||
if err != nil { | ||||
|
@@ -177,10 +223,8 @@ func (rc *reconciler) reconcile() { | |||
glog.Infof(attachedVolume.GenerateMsgDetailed("attacherDetacher.DetachVolume started", fmt.Sprintf("This volume is not safe to detach, but maxWaitForUnmountDuration %v expired, force detaching", rc.maxWaitForUnmountDuration))) | ||||
} | ||||
} | ||||
if err != nil && | ||||
!nestedpendingoperations.IsAlreadyExists(err) && | ||||
!exponentialbackoff.IsExponentialBackoff(err) { | ||||
// Ignore nestedpendingoperations.IsAlreadyExists && exponentialbackoff.IsExponentialBackoff errors, they are expected. | ||||
if err != nil && !exponentialbackoff.IsExponentialBackoff(err) { | ||||
// Ignore exponentialbackoff.IsExponentialBackoff errors, they are expected. | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why remove isAlreadyExists error? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please see our old discussion at #40148 (review) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now I am thinking although reconciler is currently the only place issue those volume operations, it might not be true in the future. So it might be better to put the isAlreadyExists error back so we know what is the problem. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The same thought was the reason to remove the check. Please note that the check was there to actually prevent the logging of the error, as it was probably considered "normal" that it happened. Before this PR, the reconciler would try to start the operation every 100ms, only succeeding the first time and erroring for every try that followed. This check was the reason that there was never an error message shown. Now that we have the |
||||
// Log all other errors. | ||||
glog.Errorf(attachedVolume.GenerateErrorDetailed("attacherDetacher.DetachVolume failed to start", err).Error()) | ||||
} | ||||
|
@@ -195,16 +239,28 @@ func (rc *reconciler) reconcile() { | |||
glog.V(5).Infof(volumeToAttach.GenerateMsgDetailed("Volume attached--touching", "")) | ||||
rc.actualStateOfWorld.ResetDetachRequestTime(volumeToAttach.VolumeName, volumeToAttach.NodeName) | ||||
} else { | ||||
// Don't even try to start an operation if there is already one running | ||||
if rc.attacherDetacher.IsOperationPending(volumeToAttach.VolumeName, "") { | ||||
glog.V(10).Infof("Operation for volume %q is already running. Can't start attach for %q", volumeToAttach.VolumeName, volumeToAttach.NodeName) | ||||
continue | ||||
} | ||||
|
||||
if rc.isMultiAttachForbidden(volumeToAttach.VolumeSpec) { | ||||
nodes := rc.actualStateOfWorld.GetNodesForVolume(volumeToAttach.VolumeName) | ||||
if len(nodes) > 0 { | ||||
glog.V(4).Infof("Volume %q is already exclusively attached to node %q and can't be attached to %q", volumeToAttach.VolumeName, nodes, volumeToAttach.NodeName) | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if it is not spamming, V(3) is more helpful. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This one is spamming (10 times per second) if it happens. On Azure it always happened while volumes were being moved to other nodes as long as it was not detached from the old node yet. |
||||
continue | ||||
} | ||||
} | ||||
|
||||
// Volume/Node doesn't exist, spawn a goroutine to attach it | ||||
glog.V(5).Infof(volumeToAttach.GenerateMsgDetailed("Starting attacherDetacher.AttachVolume", "")) | ||||
err := rc.attacherDetacher.AttachVolume(volumeToAttach.VolumeToAttach, rc.actualStateOfWorld) | ||||
if err == nil { | ||||
glog.Infof(volumeToAttach.GenerateMsgDetailed("attacherDetacher.AttachVolume started", "")) | ||||
} | ||||
if err != nil && | ||||
!nestedpendingoperations.IsAlreadyExists(err) && | ||||
!exponentialbackoff.IsExponentialBackoff(err) { | ||||
// Ignore nestedpendingoperations.IsAlreadyExists && exponentialbackoff.IsExponentialBackoff errors, they are expected. | ||||
if err != nil && !exponentialbackoff.IsExponentialBackoff(err) { | ||||
// Ignore exponentialbackoff.IsExponentialBackoff errors, they are expected. | ||||
// Log all other errors. | ||||
glog.Errorf(volumeToAttach.GenerateErrorDetailed("attacherDetacher.AttachVolume failed to start", err).Error()) | ||||
} | ||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
aws too?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
according to this page https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes
There are 6 types of storage that support multi-write, while 13 of them not. Maybe we can make a map of it as a constant, and check whether the volume type is one of them?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd suggest to do this refactoring after this PR is merged. Currently it's hard for me to do extensive testing as I'm working on non-k8s projects atm.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
make a note here for follow up.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done: #45596