From 84dfaace5d0c26bf4e4ccd05297c9acb8aa00ddf Mon Sep 17 00:00:00 2001 From: Indeed Date: Mon, 4 May 2020 10:57:01 -0700 Subject: [PATCH 1/3] add test about shutdown but non-existing node. --- .../cloud/node_lifecycle_controller_test.go | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/pkg/controller/cloud/node_lifecycle_controller_test.go b/pkg/controller/cloud/node_lifecycle_controller_test.go index 974daea95bcb3..9db1cfd45051f 100644 --- a/pkg/controller/cloud/node_lifecycle_controller_test.go +++ b/pkg/controller/cloud/node_lifecycle_controller_test.go @@ -416,6 +416,37 @@ func Test_NodesShutdown(t *testing.T) { }, updatedNodes: []*v1.Node{}, }, + { + name: "node is shutdown but provider says it does not exist", + fnh: &testutil.FakeNodeHandler{ + Existing: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local), + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local), + }, + }, + }, + }, + }, + Clientset: fake.NewSimpleClientset(), + UpdatedNodes: []*v1.Node{}, + }, + fakeCloud: &fakecloud.Cloud{ + NodeShutdown: true, + ExistsByProviderID: false, + ErrShutdownByProviderID: nil, + }, + updatedNodes: []*v1.Node{}, // should be empty because node does not exist + }, } for _, testcase := range testcases { From 78ace246701b9e726867a155b527692c1de8d2f8 Mon Sep 17 00:00:00 2001 From: Indeed Date: Mon, 4 May 2020 11:35:40 -0700 Subject: [PATCH 2/3] check node existence before shutdown status. --- .../cloud/node_lifecycle_controller.go | 65 +++++++++---------- 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/pkg/controller/cloud/node_lifecycle_controller.go b/pkg/controller/cloud/node_lifecycle_controller.go index 078b2bf17c20d..8509a712e55bd 100644 --- a/pkg/controller/cloud/node_lifecycle_controller.go +++ b/pkg/controller/cloud/node_lifecycle_controller.go @@ -147,24 +147,6 @@ func (c *CloudNodeLifecycleController) MonitorNodes() { continue } - // we need to check this first to get taint working in similar in all cloudproviders - // current problem is that shutdown nodes are not working in similar way ie. all cloudproviders - // does not delete node from kubernetes cluster when instance it is shutdown see issue #46442 - shutdown, err := shutdownInCloudProvider(context.TODO(), c.cloud, node) - if err != nil { - klog.Errorf("error checking if node %s is shutdown: %v", node.Name, err) - } - - if shutdown && err == nil { - // if node is shutdown add shutdown taint - err = controller.AddOrUpdateTaintOnNode(c.kubeClient, node.Name, ShutdownTaint) - if err != nil { - klog.Errorf("failed to apply shutdown taint to node %s, it may have been deleted.", node.Name) - } - // Continue checking the remaining nodes since the current one is shutdown. - continue - } - // At this point the node has NotReady status, we need to check if the node has been removed // from the cloud provider. If node cannot be found in cloudprovider, then delete the node exists, err := ensureNodeExistsByProviderID(context.TODO(), instances, node) @@ -173,26 +155,41 @@ func (c *CloudNodeLifecycleController) MonitorNodes() { continue } - if exists { - // Continue checking the remaining nodes since the current one is fine. - continue - } + if !exists { + // Current node does not exist, we should delete it, its taints do not matter anymore - klog.V(2).Infof("deleting node since it is no longer present in cloud provider: %s", node.Name) + klog.V(2).Infof("deleting node since it is no longer present in cloud provider: %s", node.Name) - ref := &v1.ObjectReference{ - Kind: "Node", - Name: node.Name, - UID: types.UID(node.UID), - Namespace: "", - } + ref := &v1.ObjectReference{ + Kind: "Node", + Name: node.Name, + UID: types.UID(node.UID), + Namespace: "", + } + + c.recorder.Eventf(ref, v1.EventTypeNormal, + fmt.Sprintf("Deleting node %v because it does not exist in the cloud provider", node.Name), + "Node %s event: %s", node.Name, deleteNodeEvent) - c.recorder.Eventf(ref, v1.EventTypeNormal, - fmt.Sprintf("Deleting node %v because it does not exist in the cloud provider", node.Name), - "Node %s event: %s", node.Name, deleteNodeEvent) + if err := c.kubeClient.CoreV1().Nodes().Delete(context.TODO(), node.Name, metav1.DeleteOptions{}); err != nil { + klog.Errorf("unable to delete node %q: %v", node.Name, err) + } + } else { + // Node exists. We need to check this to get taint working in similar in all cloudproviders + // current problem is that shutdown nodes are not working in similar way ie. all cloudproviders + // does not delete node from kubernetes cluster when instance it is shutdown see issue #46442 + shutdown, err := shutdownInCloudProvider(context.TODO(), c.cloud, node) + if err != nil { + klog.Errorf("error checking if node %s is shutdown: %v", node.Name, err) + } - if err := c.kubeClient.CoreV1().Nodes().Delete(context.TODO(), node.Name, metav1.DeleteOptions{}); err != nil { - klog.Errorf("unable to delete node %q: %v", node.Name, err) + if shutdown && err == nil { + // if node is shutdown add shutdown taint + err = controller.AddOrUpdateTaintOnNode(c.kubeClient, node.Name, ShutdownTaint) + if err != nil { + klog.Errorf("failed to apply shutdown taint to node %s, it may have been deleted.", node.Name) + } + } } } } From 3117b6a08469cdaa61fa4283287fc8694185425a Mon Sep 17 00:00:00 2001 From: Indeed Date: Mon, 4 May 2020 11:37:18 -0700 Subject: [PATCH 3/3] fix case where node exists but shutdown. --- pkg/controller/cloud/node_lifecycle_controller_test.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pkg/controller/cloud/node_lifecycle_controller_test.go b/pkg/controller/cloud/node_lifecycle_controller_test.go index 9db1cfd45051f..6188e75fb4046 100644 --- a/pkg/controller/cloud/node_lifecycle_controller_test.go +++ b/pkg/controller/cloud/node_lifecycle_controller_test.go @@ -275,7 +275,7 @@ func Test_NodesShutdown(t *testing.T) { updatedNodes []*v1.Node }{ { - name: "node is not ready and was shutdown", + name: "node is not ready and was shutdown, but exists", fnh: &testutil.FakeNodeHandler{ Existing: []*v1.Node{ { @@ -283,6 +283,9 @@ func Test_NodesShutdown(t *testing.T) { Name: "node0", CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local), }, + Spec: v1.NodeSpec{ + ProviderID: "node0", + }, Status: v1.NodeStatus{ Conditions: []v1.NodeCondition{ { @@ -300,6 +303,7 @@ func Test_NodesShutdown(t *testing.T) { }, fakeCloud: &fakecloud.Cloud{ NodeShutdown: true, + ExistsByProviderID: true, ErrShutdownByProviderID: nil, }, updatedNodes: []*v1.Node{ @@ -309,6 +313,7 @@ func Test_NodesShutdown(t *testing.T) { CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local), }, Spec: v1.NodeSpec{ + ProviderID: "node0", Taints: []v1.Taint{ *ShutdownTaint, },