Automated cherry pick of #84181: Lower AWS DescribeVolume frequency #85675: Fix AWS eventual consistency of AttachDisk #89894

Merged
80 changes: 61 additions & 19 deletions staging/src/k8s.io/legacy-cloud-providers/aws/aws.go
@@ -219,13 +219,15 @@ const nodeWithImpairedVolumes = "NodeWithImpairedVolumes"
const (
// volumeAttachmentStatusConsecutiveErrorLimit is the number of consecutive errors we will ignore when waiting for a volume to attach/detach
volumeAttachmentStatusConsecutiveErrorLimit = 10
// most attach/detach operations on AWS finish within 1-4 seconds
// By using 1 second starting interval with a backoff of 1.8
// we get - [1, 1.8, 3.24, 5.832000000000001, 10.4976]
// in total we wait for 2601 seconds
volumeAttachmentStatusInitialDelay = 1 * time.Second
volumeAttachmentStatusFactor = 1.8
volumeAttachmentStatusSteps = 13

// Attach typically takes 2-5 seconds (average is 2). Asking before 2 seconds is just a waste of API quota.
volumeAttachmentStatusInitialDelay = 2 * time.Second
// Detach typically takes 5-10 seconds (average is 6). Asking before 5 seconds is just a waste of API quota.
volumeDetachmentStatusInitialDelay = 5 * time.Second
// After the initial delay, poll attach/detach with exponential backoff (2046 seconds total)
volumeAttachmentStatusPollDelay = 2 * time.Second
volumeAttachmentStatusFactor = 2
volumeAttachmentStatusSteps = 11

// createTag* is configuration of exponential backoff for CreateTag call. We
// retry mainly because if we create an object, we cannot tag it until it is
@@ -2069,17 +2071,24 @@ func (c *Cloud) applyUnSchedulableTaint(nodeName types.NodeName, reason string)

// waitForAttachmentStatus polls until the attachment status is the expected value
// On success, it returns the last attachment state.
func (d *awsDisk) waitForAttachmentStatus(status string) (*ec2.VolumeAttachment, error) {
func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expectedDevice string) (*ec2.VolumeAttachment, error) {
backoff := wait.Backoff{
Duration: volumeAttachmentStatusInitialDelay,
Duration: volumeAttachmentStatusPollDelay,
Factor: volumeAttachmentStatusFactor,
Steps: volumeAttachmentStatusSteps,
}

// Because of rate limiting, we often see errors from describeVolume
// Because of rate limiting, we often see errors from describeVolume.
// Or AWS eventual consistency returns unexpected data.
// So we tolerate a limited number of failures.
// But once we see more than 10 errors in a row, we return the error
describeErrorCount := 0
// But once we see more than 10 errors in a row, we return the error.
errorCount := 0

// Attach/detach usually takes time. It does not make sense to start
// polling DescribeVolumes before some initial delay to let AWS
// process the request.
time.Sleep(getInitialAttachDetachDelay(status))

var attachment *ec2.VolumeAttachment

err := wait.ExponentialBackoff(backoff, func() (bool, error) {
@@ -2102,8 +2111,8 @@ func (d *awsDisk) waitForAttachmentStatus(status string) (*ec2.VolumeAttachment,
return false, err
}
}
describeErrorCount++
if describeErrorCount > volumeAttachmentStatusConsecutiveErrorLimit {
errorCount++
if errorCount > volumeAttachmentStatusConsecutiveErrorLimit {
// report the error
return false, err
}
@@ -2112,8 +2121,6 @@ func (d *awsDisk) waitForAttachmentStatus(status string) (*ec2.VolumeAttachment,
return false, nil
}

describeErrorCount = 0

if len(info.Attachments) > 1 {
// Shouldn't happen; log so we know if it is
klog.Warningf("Found multiple attachments for volume %q: %v", d.awsID, info)
@@ -2135,15 +2142,42 @@ func (d *awsDisk) waitForAttachmentStatus(status string) (*ec2.VolumeAttachment,
if attachmentStatus == "" {
attachmentStatus = "detached"
}
if attachment != nil {
// AWS eventual consistency can go back in time.
// For example, we're waiting for a volume to be attached as /dev/xvdba, but AWS can tell us it's
// attached as /dev/xvdbb, where it was attached before and it was already detached.
// Retry a couple of times, hoping AWS starts reporting the right status.
device := aws.StringValue(attachment.Device)
if expectedDevice != "" && device != "" && device != expectedDevice {
klog.Warningf("Expected device %s %s for volume %s, but found device %s %s", expectedDevice, status, d.name, device, attachmentStatus)
errorCount++
if errorCount > volumeAttachmentStatusConsecutiveErrorLimit {
// report the error
return false, fmt.Errorf("attachment of disk %q failed: requested device %q but found %q", d.name, expectedDevice, device)
}
return false, nil
}
instanceID := aws.StringValue(attachment.InstanceId)
if expectedInstance != "" && instanceID != "" && instanceID != expectedInstance {
klog.Warningf("Expected instance %s/%s for volume %s, but found instance %s/%s", expectedInstance, status, d.name, instanceID, attachmentStatus)
errorCount++
if errorCount > volumeAttachmentStatusConsecutiveErrorLimit {
// report the error
return false, fmt.Errorf("attachment of disk %q failed: requested device %q but found %q", d.name, expectedDevice, device)
}
return false, nil
}
}

if attachmentStatus == status {
// Attachment is in requested state, finish waiting
return true, nil
}
// continue waiting
errorCount = 0
klog.V(2).Infof("Waiting for volume %q state: actual=%s, desired=%s", d.awsID, attachmentStatus, status)
return false, nil
})

return attachment, err
}
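
For reference, here is a minimal sketch (not part of the PR) of the arithmetic behind the timing comments above. wait.ExponentialBackoff calls the condition up to Steps times and sleeps between calls, so with volumeAttachmentStatusPollDelay = 2s, factor 2 and 11 steps the sleeps sum to 2 + 4 + ... + 1024 = 2046 seconds, on top of the 2s (attach) or 5s (detach) initial delay. The helper below is hypothetical; only the constant values are taken from the diff.

package main

import (
	"fmt"
	"time"
)

// totalWait computes the worst-case wall-clock time of the polling loop:
// the initial delay plus the backoff sleeps between the Steps condition calls.
func totalWait(initial, poll time.Duration, factor float64, steps int) time.Duration {
	total := initial
	d := poll
	for i := 0; i < steps-1; i++ { // Steps condition calls => Steps-1 sleeps in between
		total += d
		d = time.Duration(float64(d) * factor)
	}
	return total
}

func main() {
	fmt.Println(totalWait(2*time.Second, 2*time.Second, 2, 11)) // attach: 34m8s (2s initial + 2046s of backoff sleeps)
	fmt.Println(totalWait(5*time.Second, 2*time.Second, 2, 11)) // detach: 34m11s (5s initial + 2046s of backoff sleeps)
}
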

@@ -2280,7 +2314,7 @@ func (c *Cloud) AttachDisk(diskName KubernetesVolumeID, nodeName types.NodeName)
klog.V(2).Infof("AttachVolume volume=%q instance=%q request returned %v", disk.awsID, awsInstance.awsID, attachResponse)
}

attachment, err := disk.waitForAttachmentStatus("attached")
attachment, err := disk.waitForAttachmentStatus("attached", awsInstance.awsID, ec2Device)

if err != nil {
if err == wait.ErrWaitTimeout {
@@ -2300,6 +2334,7 @@ func (c *Cloud) AttachDisk(diskName KubernetesVolumeID, nodeName types.NodeName)
return "", fmt.Errorf("unexpected state: attachment nil after attached %q to %q", diskName, nodeName)
}
if ec2Device != aws.StringValue(attachment.Device) {
// Already checked in waitForAttachmentStatus(), but just to be sure...
return "", fmt.Errorf("disk attachment of %q to %q failed: requested device %q but found %q", diskName, nodeName, ec2Device, aws.StringValue(attachment.Device))
}
if awsInstance.awsID != aws.StringValue(attachment.InstanceId) {
@@ -2357,7 +2392,7 @@ func (c *Cloud) DetachDisk(diskName KubernetesVolumeID, nodeName types.NodeName)
return "", errors.New("no response from DetachVolume")
}

attachment, err := diskInfo.disk.waitForAttachmentStatus("detached")
attachment, err := diskInfo.disk.waitForAttachmentStatus("detached", awsInstance.awsID, "")
if err != nil {
return "", err
}
@@ -4619,3 +4654,10 @@ func setNodeDisk(
}
volumeMap[volumeID] = check
}

func getInitialAttachDetachDelay(status string) time.Duration {
if status == "detached" {
return volumeDetachmentStatusInitialDelay
}
return volumeAttachmentStatusInitialDelay
}
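
As a rough illustration (not code from the PR), the overall pattern the change implements is: an operation-dependent initial delay, then exponential-backoff polling that tolerates a bounded run of consecutive errors or eventually-consistent mismatches before surfacing them. The standalone sketch below mirrors that logic under those assumptions; waitForStatus and the describe callback are hypothetical stand-ins, not the real awsDisk method.

package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

const consecutiveErrorLimit = 10

// waitForStatus waits until describe() reports the desired state, mirroring the
// initial-delay and error-tolerance behaviour of the patched waitForAttachmentStatus.
func waitForStatus(desired string, describe func() (string, error)) error {
	// Detach is slower than attach, so it gets a longer initial delay
	// (cf. getInitialAttachDetachDelay in the diff above).
	if desired == "detached" {
		time.Sleep(5 * time.Second)
	} else {
		time.Sleep(2 * time.Second)
	}

	backoff := wait.Backoff{Duration: 2 * time.Second, Factor: 2, Steps: 11}
	errorCount := 0
	return wait.ExponentialBackoff(backoff, func() (bool, error) {
		actual, err := describe()
		if err != nil {
			// Tolerate transient failures (e.g. DescribeVolumes rate limiting),
			// but give up after too many consecutive errors.
			errorCount++
			if errorCount > consecutiveErrorLimit {
				return false, err
			}
			return false, nil
		}
		if actual != desired {
			errorCount = 0 // a successful describe resets the error streak
			return false, nil
		}
		return true, nil
	})
}

func main() {
	calls := 0
	err := waitForStatus("attached", func() (string, error) {
		calls++
		if calls < 3 {
			return "attaching", nil
		}
		return "attached", nil
	})
	fmt.Println("done:", err)
}
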