Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Apply taint when a volume is stuck in attaching state #55558

Merged
Changes from all commits
Commits
File filter...
Filter file types
Jump to…
Jump to file or symbol
Failed to load files and symbols.

Always

Just for now

@@ -49,9 +49,14 @@ go_library(
"//vendor/github.com/prometheus/client_golang/prometheus:go_default_library",
"//vendor/gopkg.in/gcfg.v1:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/types:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library",
"//vendor/k8s.io/client-go/kubernetes:go_default_library",
"//vendor/k8s.io/client-go/kubernetes/scheme:go_default_library",
"//vendor/k8s.io/client-go/kubernetes/typed/core/v1:go_default_library",
"//vendor/k8s.io/client-go/tools/record:go_default_library",
],
)

@@ -41,13 +41,18 @@ import (
"github.com/aws/aws-sdk-go/service/kms"
"github.com/golang/glog"
"github.com/prometheus/client_golang/prometheus"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/record"

"path"

"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes/scheme"
v1core "k8s.io/client-go/kubernetes/typed/core/v1"
"k8s.io/kubernetes/pkg/api/v1/service"
"k8s.io/kubernetes/pkg/cloudprovider"
"k8s.io/kubernetes/pkg/controller"
@@ -144,6 +149,12 @@ const ServiceAnnotationLoadBalancerBEProtocol = "service.beta.kubernetes.io/aws-
// For example: "Key1=Val1,Key2=Val2,KeyNoVal1=,KeyNoVal2"
const ServiceAnnotationLoadBalancerAdditionalTags = "service.beta.kubernetes.io/aws-load-balancer-additional-resource-tags"

// Event key when a volume is stuck on attaching state when being attached to a volume
const volumeAttachmentStuck = "VolumeAttachmentStuck"

// Indicates that a node has volumes stuck in attaching state and hence it is not fit for scheduling more pods
const nodeWithImpairedVolumes = "NodeWithImpairedVolumes"

const (
// volumeAttachmentConsecutiveErrorLimit is the number of consecutive errors we will ignore when waiting for a volume to attach/detach
volumeAttachmentStatusConsecutiveErrorLimit = 10
@@ -398,6 +409,11 @@ type Cloud struct {

instanceCache instanceCache

clientBuilder controller.ControllerClientBuilder
kubeClient clientset.Interface
eventBroadcaster record.EventBroadcaster
eventRecorder record.EventRecorder

// We keep an active list of devices we have assigned but not yet
// attached, to avoid a race condition where we assign a device mapping
// and then get a second request before we attach the volume
@@ -957,7 +973,14 @@ func newAWSCloud(config io.Reader, awsServices Services) (*Cloud, error) {
}

// Initialize passes a Kubernetes clientBuilder interface to the cloud provider
func (c *Cloud) Initialize(clientBuilder controller.ControllerClientBuilder) {}
func (c *Cloud) Initialize(clientBuilder controller.ControllerClientBuilder) {
c.clientBuilder = clientBuilder
c.kubeClient = clientBuilder.ClientOrDie("cloud-provider")
c.eventBroadcaster = record.NewBroadcaster()
c.eventBroadcaster.StartLogging(glog.Infof)
c.eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: v1core.New(c.kubeClient.CoreV1().RESTClient()).Events("")})
c.eventRecorder = c.eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "aws-cloudprovider"})
}

// Clusters returns the list of clusters.
func (c *Cloud) Clusters() (cloudprovider.Clusters, bool) {
@@ -1525,6 +1548,28 @@ func (d *awsDisk) describeVolume() (*ec2.Volume, error) {
return volumes[0], nil
}

// applyUnSchedulableTaint applies a unschedulable taint to a node after verifying
// if node has become unusable because of volumes getting stuck in attaching state.
func (c *Cloud) applyUnSchedulableTaint(nodeName types.NodeName, reason string) {
node, fetchErr := c.kubeClient.CoreV1().Nodes().Get(string(nodeName), metav1.GetOptions{})

This comment has been minimized.

Copy link
@jhorwit2

jhorwit2 Dec 2, 2017

Member

AddOrUpdateTaintOnNode immediately calls get for the node, so this is causing 2x the calls. You could check for not found error below to accomplish the same log w/ less API calls.

This comment has been minimized.

Copy link
@gnufied

gnufied Dec 4, 2017

Author Member

We still need the node object below to create event that is emitted in next line.

if fetchErr != nil {
glog.Errorf("Error fetching node %s with %v", nodeName, fetchErr)
return
}

taint := &v1.Taint{
Key: nodeWithImpairedVolumes,
Value: "true",
Effect: v1.TaintEffectNoSchedule,
}
err := controller.AddOrUpdateTaintOnNode(c.kubeClient, string(nodeName), taint)
if err != nil {
glog.Errorf("Error applying taint to node %s with error %v", nodeName, err)
return
}
c.eventRecorder.Eventf(node, v1.EventTypeWarning, volumeAttachmentStuck, reason)
}

// waitForAttachmentStatus polls until the attachment status is the expected value
// On success, it returns the last attachment state.
func (d *awsDisk) waitForAttachmentStatus(status string) (*ec2.VolumeAttachment, error) {
@@ -1741,7 +1786,11 @@ func (c *Cloud) AttachDisk(diskName KubernetesVolumeID, nodeName types.NodeName,
}

attachment, err := disk.waitForAttachmentStatus("attached")

if err != nil {
if err == wait.ErrWaitTimeout {
c.applyUnSchedulableTaint(nodeName, "Volume stuck in attaching state - node needs reboot to fix impaired state.")
}
return "", err
}

ProTip! Use n and p to navigate between commits in a pull request.
You can’t perform that action at this time.