Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CloudNodeLifecycleController: check node existence before shutdown status #90737

Merged
merged 3 commits into from
May 5, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
65 changes: 31 additions & 34 deletions pkg/controller/cloud/node_lifecycle_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,24 +147,6 @@ func (c *CloudNodeLifecycleController) MonitorNodes() {
continue
}

// we need to check this first to get taint working in similar in all cloudproviders
// current problem is that shutdown nodes are not working in similar way ie. all cloudproviders
// does not delete node from kubernetes cluster when instance it is shutdown see issue #46442
shutdown, err := shutdownInCloudProvider(context.TODO(), c.cloud, node)
if err != nil {
klog.Errorf("error checking if node %s is shutdown: %v", node.Name, err)
}

if shutdown && err == nil {
// if node is shutdown add shutdown taint
err = controller.AddOrUpdateTaintOnNode(c.kubeClient, node.Name, ShutdownTaint)
if err != nil {
klog.Errorf("failed to apply shutdown taint to node %s, it may have been deleted.", node.Name)
}
// Continue checking the remaining nodes since the current one is shutdown.
continue
}

// At this point the node has NotReady status, we need to check if the node has been removed
// from the cloud provider. If node cannot be found in cloudprovider, then delete the node
exists, err := ensureNodeExistsByProviderID(context.TODO(), instances, node)
Expand All @@ -173,26 +155,41 @@ func (c *CloudNodeLifecycleController) MonitorNodes() {
continue
}

if exists {
// Continue checking the remaining nodes since the current one is fine.
continue
}
if !exists {
// Current node does not exist, we should delete it, its taints do not matter anymore

klog.V(2).Infof("deleting node since it is no longer present in cloud provider: %s", node.Name)
klog.V(2).Infof("deleting node since it is no longer present in cloud provider: %s", node.Name)

ref := &v1.ObjectReference{
Kind: "Node",
Name: node.Name,
UID: types.UID(node.UID),
Namespace: "",
}
ref := &v1.ObjectReference{
Kind: "Node",
Name: node.Name,
UID: types.UID(node.UID),
Namespace: "",
}

c.recorder.Eventf(ref, v1.EventTypeNormal,
fmt.Sprintf("Deleting node %v because it does not exist in the cloud provider", node.Name),
"Node %s event: %s", node.Name, deleteNodeEvent)

c.recorder.Eventf(ref, v1.EventTypeNormal,
fmt.Sprintf("Deleting node %v because it does not exist in the cloud provider", node.Name),
"Node %s event: %s", node.Name, deleteNodeEvent)
if err := c.kubeClient.CoreV1().Nodes().Delete(context.TODO(), node.Name, metav1.DeleteOptions{}); err != nil {
klog.Errorf("unable to delete node %q: %v", node.Name, err)
}
} else {
// Node exists. We need to check shutdown status here so the shutdown taint behaves
// consistently across all cloud providers. The current problem is that shutdown nodes
// are not handled uniformly, i.e. not all cloud providers delete the node from the
// Kubernetes cluster when its instance is shut down; see issue #46442
shutdown, err := shutdownInCloudProvider(context.TODO(), c.cloud, node)
if err != nil {
klog.Errorf("error checking if node %s is shutdown: %v", node.Name, err)
}

if err := c.kubeClient.CoreV1().Nodes().Delete(context.TODO(), node.Name, metav1.DeleteOptions{}); err != nil {
klog.Errorf("unable to delete node %q: %v", node.Name, err)
if shutdown && err == nil {
// if the node is shut down, add the shutdown taint
err = controller.AddOrUpdateTaintOnNode(c.kubeClient, node.Name, ShutdownTaint)
if err != nil {
klog.Errorf("failed to apply shutdown taint to node %s, it may have been deleted.", node.Name)
}
}
}
}
}
Expand Down
38 changes: 37 additions & 1 deletion pkg/controller/cloud/node_lifecycle_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -275,14 +275,17 @@ func Test_NodesShutdown(t *testing.T) {
updatedNodes []*v1.Node
}{
{
name: "node is not ready and was shutdown",
name: "node is not ready and was shutdown, but exists",
fnh: &testutil.FakeNodeHandler{
Existing: []*v1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
},
Spec: v1.NodeSpec{
ProviderID: "node0",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Expand All @@ -300,6 +303,7 @@ func Test_NodesShutdown(t *testing.T) {
},
fakeCloud: &fakecloud.Cloud{
NodeShutdown: true,
ExistsByProviderID: true,
ErrShutdownByProviderID: nil,
},
updatedNodes: []*v1.Node{
Expand All @@ -309,6 +313,7 @@ func Test_NodesShutdown(t *testing.T) {
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
},
Spec: v1.NodeSpec{
ProviderID: "node0",
Taints: []v1.Taint{
*ShutdownTaint,
},
Expand Down Expand Up @@ -416,6 +421,37 @@ func Test_NodesShutdown(t *testing.T) {
},
updatedNodes: []*v1.Node{},
},
{
name: "node is shutdown but provider says it does not exist",
fnh: &testutil.FakeNodeHandler{
Existing: []*v1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionUnknown,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
},
},
},
},
},
Clientset: fake.NewSimpleClientset(),
UpdatedNodes: []*v1.Node{},
},
fakeCloud: &fakecloud.Cloud{
NodeShutdown: true,
ExistsByProviderID: false,
ErrShutdownByProviderID: nil,
},
updatedNodes: []*v1.Node{}, // should be empty because node does not exist
},
}

for _, testcase := range testcases {
Expand Down