From ebf9dacb27a2ef40a30cebbffc84edffe0b26e14 Mon Sep 17 00:00:00 2001 From: ravisantoshgudimetla Date: Wed, 31 Jul 2019 10:05:58 -0700 Subject: [PATCH 1/4] Convert tbe e2e to integration test --- test/e2e/scheduling/BUILD | 3 - test/e2e/scheduling/taint_based_evictions.go | 193 ---------------- test/integration/scheduler/BUILD | 3 + test/integration/scheduler/taint_test.go | 231 +++++++++++++++++++ 4 files changed, 234 insertions(+), 196 deletions(-) delete mode 100644 test/e2e/scheduling/taint_based_evictions.go diff --git a/test/e2e/scheduling/BUILD b/test/e2e/scheduling/BUILD index cb6f9752fed02..42a2997ef8bbf 100644 --- a/test/e2e/scheduling/BUILD +++ b/test/e2e/scheduling/BUILD @@ -11,7 +11,6 @@ go_library( "predicates.go", "preemption.go", "priorities.go", - "taint_based_evictions.go", "taints.go", "ubernetes_lite.go", "ubernetes_lite_volumes.go", @@ -23,14 +22,12 @@ go_library( "//pkg/apis/extensions:go_default_library", "//pkg/apis/scheduling:go_default_library", "//pkg/scheduler/algorithm/priorities/util:go_default_library", - "//pkg/scheduler/api:go_default_library", "//staging/src/k8s.io/api/apps/v1:go_default_library", "//staging/src/k8s.io/api/core/v1:go_default_library", "//staging/src/k8s.io/api/scheduling/v1:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/api/resource:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", - "//staging/src/k8s.io/apimachinery/pkg/fields:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/runtime:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/util/intstr:go_default_library", diff --git a/test/e2e/scheduling/taint_based_evictions.go b/test/e2e/scheduling/taint_based_evictions.go deleted file mode 100644 index 686cfc7b8d811..0000000000000 --- a/test/e2e/scheduling/taint_based_evictions.go +++ /dev/null @@ -1,193 +0,0 @@ -/* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package scheduling - -import ( - "errors" - "fmt" - "time" - - "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/fields" - clientset "k8s.io/client-go/kubernetes" - schedulerapi "k8s.io/kubernetes/pkg/scheduler/api" - "k8s.io/kubernetes/test/e2e/framework" - - . 
"github.com/onsi/ginkgo" -) - -func newUnreachableNoExecuteTaint() *v1.Taint { - return &v1.Taint{ - Key: schedulerapi.TaintNodeUnreachable, - Effect: v1.TaintEffectNoExecute, - } -} - -func getTolerationSeconds(tolerations []v1.Toleration) (int64, error) { - for _, t := range tolerations { - if t.Key == schedulerapi.TaintNodeUnreachable && t.Effect == v1.TaintEffectNoExecute && t.Operator == v1.TolerationOpExists { - return *t.TolerationSeconds, nil - } - } - return 0, errors.New("cannot find toleration") -} - -var _ = SIGDescribe("TaintBasedEvictions [Serial]", func() { - f := framework.NewDefaultFramework("sched-taint-based-evictions") - var cs clientset.Interface - var ns string - - BeforeEach(func() { - cs = f.ClientSet - ns = f.Namespace.Name - // skip if TaintBasedEvictions is not enabled - // TODO(Huang-Wei): remove this when TaintBasedEvictions is GAed - framework.SkipUnlessTaintBasedEvictionsEnabled() - // it's required to run on a cluster that has more than 1 node - // otherwise node lifecycle manager enters a fully disruption mode - framework.SkipUnlessNodeCountIsAtLeast(2) - }) - - // This test verifies that when a node becomes unreachable - // 1. node lifecycle manager generate a status change: [NodeReady=true, status=ConditionUnknown] - // 1. it's applied with node.kubernetes.io/unreachable=:NoExecute taint - // 2. pods without toleration are applied with toleration with tolerationSeconds=300 - // 3. pods with toleration and without tolerationSeconds won't be modified, and won't be evicted - // 4. pods with toleration and with tolerationSeconds won't be modified, and will be evicted after tolerationSeconds - // When network issue recovers, it's expected to see: - // 5. node lifecycle manager generate a status change: [NodeReady=true, status=ConditionTrue] - // 6. 
node.kubernetes.io/unreachable=:NoExecute taint is taken off the node - It("Checks that the node becomes unreachable", func() { - // find an available node - nodeName := GetNodeThatCanRunPod(f) - By("Finding an available node " + nodeName) - - // pod0 is a pod with unschedulable=:NoExecute toleration, and tolerationSeconds=0s - // pod1 is a pod with unschedulable=:NoExecute toleration, and tolerationSeconds=200s - // pod2 is a pod without any toleration - base := "taint-based-eviction" - tolerationSeconds := []int64{0, 200} - numPods := len(tolerationSeconds) + 1 - By(fmt.Sprintf("Preparing %v pods", numPods)) - pods := make([]*v1.Pod, numPods) - zero := int64(0) - // build pod0, pod1 - for i := 0; i < numPods-1; i++ { - pods[i] = createPausePod(f, pausePodConfig{ - Name: fmt.Sprintf("%v-%v", base, i), - NodeName: nodeName, - Tolerations: []v1.Toleration{ - { - Key: schedulerapi.TaintNodeUnreachable, - Operator: v1.TolerationOpExists, - Effect: v1.TaintEffectNoExecute, - TolerationSeconds: &tolerationSeconds[i], - }, - }, - DeletionGracePeriodSeconds: &zero, - }) - } - // build pod2 - pods[numPods-1] = createPausePod(f, pausePodConfig{ - Name: fmt.Sprintf("%v-%v", base, numPods-1), - NodeName: nodeName, - }) - - By("Verifying all pods are running properly") - for _, pod := range pods { - framework.ExpectNoError(framework.WaitForPodRunningInNamespace(cs, pod)) - } - - // get the node API object - nodeSelector := fields.OneTermEqualSelector("metadata.name", nodeName) - nodeList, err := cs.CoreV1().Nodes().List(metav1.ListOptions{FieldSelector: nodeSelector.String()}) - if err != nil || len(nodeList.Items) != 1 { - framework.Failf("expected no err, got %v; expected len(nodes) = 1, got %v", err, len(nodeList.Items)) - } - node := nodeList.Items[0] - - By(fmt.Sprintf("Blocking traffic from node %s to the master", nodeName)) - host, err := framework.GetNodeExternalIP(&node) - // TODO(Huang-Wei): make this case work for local provider - // if err != nil { - // host, err = framework.GetNodeInternalIP(&node) - // } - framework.ExpectNoError(err) - masterAddresses := framework.GetAllMasterAddresses(cs) - taint := newUnreachableNoExecuteTaint() - - defer func() { - By(fmt.Sprintf("Unblocking traffic from node %s to the master", node.Name)) - for _, masterAddress := range masterAddresses { - framework.UnblockNetwork(host, masterAddress) - } - - if CurrentGinkgoTestDescription().Failed { - framework.Failf("Current e2e test has failed, so return from here.") - return - } - - By(fmt.Sprintf("Expecting to see node %q becomes Ready", nodeName)) - framework.WaitForNodeToBeReady(cs, nodeName, time.Minute*1) - By("Expecting to see unreachable=:NoExecute taint is taken off") - err := framework.WaitForNodeHasTaintOrNot(cs, nodeName, taint, false, time.Second*30) - framework.ExpectNoError(err) - }() - - for _, masterAddress := range masterAddresses { - framework.BlockNetwork(host, masterAddress) - } - - By(fmt.Sprintf("Expecting to see node %q becomes NotReady", nodeName)) - if !framework.WaitForNodeToBeNotReady(cs, nodeName, time.Minute*3) { - framework.Failf("node %q doesn't turn to NotReady after 3 minutes", nodeName) - } - By("Expecting to see unreachable=:NoExecute taint is applied") - err = framework.WaitForNodeHasTaintOrNot(cs, nodeName, taint, true, time.Second*30) - framework.ExpectNoError(err) - - By("Expecting pod0 to be evicted immediately") - err = framework.WaitForPodCondition(cs, ns, pods[0].Name, "pod0 terminating", time.Second*15, func(pod *v1.Pod) (bool, error) { - // as node is unreachable, 
pod0 is expected to be in Terminating status - // rather than getting deleted - if pod.DeletionTimestamp != nil { - return true, nil - } - return false, nil - }) - framework.ExpectNoError(err) - - By("Expecting pod2 to be updated with a toleration with tolerationSeconds=300") - err = framework.WaitForPodCondition(cs, ns, pods[2].Name, "pod2 updated with tolerationSeconds=300", time.Second*15, func(pod *v1.Pod) (bool, error) { - if seconds, err := getTolerationSeconds(pod.Spec.Tolerations); err == nil { - return seconds == 300, nil - } - return false, nil - }) - framework.ExpectNoError(err) - - By("Expecting pod1 to be unchanged") - livePod1, err := cs.CoreV1().Pods(pods[1].Namespace).Get(pods[1].Name, metav1.GetOptions{}) - framework.ExpectNoError(err) - seconds, err := getTolerationSeconds(livePod1.Spec.Tolerations) - framework.ExpectNoError(err) - if seconds != 200 { - framework.Failf("expect tolerationSeconds of pod1 is 200, but got %v", seconds) - } - }) -}) diff --git a/test/integration/scheduler/BUILD b/test/integration/scheduler/BUILD index fd1ced4b9d12e..0b2e438bf953b 100644 --- a/test/integration/scheduler/BUILD +++ b/test/integration/scheduler/BUILD @@ -39,6 +39,7 @@ go_test( "//pkg/scheduler/plugins/v1alpha1:go_default_library", "//pkg/volume:go_default_library", "//pkg/volume/testing:go_default_library", + "//plugin/pkg/admission/defaulttolerationseconds:go_default_library", "//plugin/pkg/admission/podtolerationrestriction:go_default_library", "//plugin/pkg/admission/podtolerationrestriction/apis/podtolerationrestriction:go_default_library", "//staging/src/k8s.io/api/core/v1:go_default_library", @@ -53,6 +54,7 @@ go_test( "//staging/src/k8s.io/apimachinery/pkg/util/rand:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/util/sets:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library", + "//staging/src/k8s.io/apiserver/pkg/admission:go_default_library", "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library", "//staging/src/k8s.io/apiserver/pkg/util/feature/testing:go_default_library", "//staging/src/k8s.io/client-go/informers:go_default_library", @@ -62,6 +64,7 @@ go_test( "//staging/src/k8s.io/client-go/rest:go_default_library", "//staging/src/k8s.io/client-go/tools/cache:go_default_library", "//staging/src/k8s.io/client-go/tools/record:go_default_library", + "//test/e2e/framework:go_default_library", "//test/integration/framework:go_default_library", "//test/utils:go_default_library", "//test/utils/image:go_default_library", diff --git a/test/integration/scheduler/taint_test.go b/test/integration/scheduler/taint_test.go index ee10f1f123177..65587f30ad26b 100644 --- a/test/integration/scheduler/taint_test.go +++ b/test/integration/scheduler/taint_test.go @@ -27,6 +27,7 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apiserver/pkg/admission" utilfeature "k8s.io/apiserver/pkg/util/feature" utilfeaturetesting "k8s.io/apiserver/pkg/util/feature/testing" "k8s.io/client-go/informers" @@ -36,8 +37,11 @@ import ( "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/scheduler/algorithmprovider" schedulerapi "k8s.io/kubernetes/pkg/scheduler/api" + "k8s.io/kubernetes/plugin/pkg/admission/defaulttolerationseconds" "k8s.io/kubernetes/plugin/pkg/admission/podtolerationrestriction" pluginapi "k8s.io/kubernetes/plugin/pkg/admission/podtolerationrestriction/apis/podtolerationrestriction" + "k8s.io/kubernetes/test/e2e/framework/pod" + 
imageutils "k8s.io/kubernetes/test/utils/image" ) func newPod(nsName, name string, req, limit v1.ResourceList) *v1.Pod { @@ -571,3 +575,230 @@ func TestTaintNodeByCondition(t *testing.T) { }) } } + +// TestTaintBasedEvictions tests related cases for the TaintBasedEvictions feature +func TestTaintBasedEvictions(t *testing.T) { + // we need at least 2 nodes to prevent lifecycle manager from entering "fully-disrupted" mode + nodeCount := 3 + zero := int64(0) + gracePeriod := int64(1) + testPod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "testpod1", DeletionGracePeriodSeconds: &zero}, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + {Name: "container", Image: imageutils.GetPauseImageName()}, + }, + Tolerations: []v1.Toleration{ + { + Key: schedulerapi.TaintNodeNotReady, + Operator: v1.TolerationOpExists, + Effect: v1.TaintEffectNoExecute, + }, + }, + TerminationGracePeriodSeconds: &gracePeriod, + }, + } + tolerationSeconds := []int64{200, 300, 0} + tests := []struct { + name string + nodeTaints []v1.Taint + nodeConditions []v1.NodeCondition + pod *v1.Pod + waitForPodCondition string + }{ + { + name: "Taint based evictions for NodeNotReady and 200 tolerationseconds", + nodeTaints: []v1.Taint{{Key: schedulerapi.TaintNodeNotReady, Effect: v1.TaintEffectNoExecute}}, + nodeConditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}}, + pod: testPod, + waitForPodCondition: "updated with tolerationSeconds of 200", + }, + { + name: "Taint based evictions for NodeNotReady with no pod tolerations", + nodeTaints: []v1.Taint{{Key: schedulerapi.TaintNodeNotReady, Effect: v1.TaintEffectNoExecute}}, + nodeConditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}}, + pod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "testpod1"}, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + {Name: "container", Image: imageutils.GetPauseImageName()}, + }, + }, + }, + waitForPodCondition: "updated with tolerationSeconds=300", + }, + { + name: "Taint based evictions for NodeNotReady and 0 tolerationseconds", + nodeTaints: []v1.Taint{{Key: schedulerapi.TaintNodeNotReady, Effect: v1.TaintEffectNoExecute}}, + nodeConditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}}, + pod: testPod, + waitForPodCondition: "terminating", + }, + { + name: "Taint based evictions for NodeUnreachable", + nodeTaints: []v1.Taint{{Key: schedulerapi.TaintNodeUnreachable, Effect: v1.TaintEffectNoExecute}}, + nodeConditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionUnknown}}, + }, + } + + // Enable TaintBasedEvictions + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.TaintBasedEvictions, true)() + // ApplyFeatureGates() is called to ensure TaintNodesByCondition related logic is applied/restored properly. + defer algorithmprovider.ApplyFeatureGates()() + + // Build admission chain handler. + podTolerations := podtolerationrestriction.NewPodTolerationsPlugin(&pluginapi.Configuration{}) + admission := admission.NewChainHandler( + podTolerations, + defaulttolerationseconds.NewDefaultTolerationSeconds(), + ) + for i, test := range tests { + t.Run(test.name, func(t *testing.T) { + context := initTestMaster(t, "taint-based-evictions", admission) + // Build clientset and informers for controllers. 
+ externalClientset := kubernetes.NewForConfigOrDie(&restclient.Config{ + QPS: -1, + Host: context.httpServer.URL, + ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) + externalInformers := informers.NewSharedInformerFactory(externalClientset, time.Second) + podTolerations.SetExternalKubeClientSet(externalClientset) + podTolerations.SetExternalKubeInformerFactory(externalInformers) + + context = initTestScheduler(t, context, true, nil) + cs := context.clientSet + informers := context.informerFactory + _, err := cs.CoreV1().Namespaces().Create(context.ns) + if err != nil { + t.Errorf("Failed to create namespace %+v", err) + } + + // Start NodeLifecycleController for taint. + nc, err := nodelifecycle.NewNodeLifecycleController( + informers.Coordination().V1beta1().Leases(), + informers.Core().V1().Pods(), + informers.Core().V1().Nodes(), + informers.Apps().V1().DaemonSets(), + cs, + 5*time.Second, // Node monitor grace period + time.Minute, // Node startup grace period + time.Millisecond, // Node monitor period + time.Second, // Pod eviction timeout + 100, // Eviction limiter QPS + 100, // Secondary eviction limiter QPS + 50, // Large cluster threshold + 0.55, // Unhealthy zone threshold + true, // Run taint manager + true, // Use taint based evictions + false, // Enabled TaintNodeByCondition feature + ) + if err != nil { + t.Errorf("Failed to create node controller: %v", err) + return + } + + go nc.Run(context.stopCh) + + // Waiting for all controller sync. + externalInformers.Start(context.stopCh) + externalInformers.WaitForCacheSync(context.stopCh) + informers.Start(context.stopCh) + informers.WaitForCacheSync(context.stopCh) + + nodeRes := v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("4000m"), + v1.ResourceMemory: resource.MustParse("16Gi"), + v1.ResourcePods: resource.MustParse("110"), + } + + var nodes []*v1.Node + for i := 0; i < nodeCount; i++ { + nodes = append(nodes, &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("node-%d", i), + Labels: map[string]string{v1.LabelZoneRegion: "region1", v1.LabelZoneFailureDomain: "zone1"}, + }, + Spec: v1.NodeSpec{}, + Status: v1.NodeStatus{ + Capacity: nodeRes, + Allocatable: nodeRes, + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + }, + }, + }, + }) + if _, err := cs.CoreV1().Nodes().Create(nodes[i]); err != nil { + t.Errorf("Failed to create node, err: %v", err) + } + } + neededNode := nodes[1] + if test.pod != nil { + test.pod.Name = fmt.Sprintf("testpod-%d", i) + if len(test.pod.Spec.Tolerations) > 0 { + test.pod.Spec.Tolerations[0].TolerationSeconds = &tolerationSeconds[i] + } + + test.pod, err = cs.CoreV1().Pods(context.ns.Name).Create(test.pod) + if err != nil { + t.Fatalf("Test Failed: error: %v, while creating pod", err) + } + + if err := waitForPodToSchedule(cs, test.pod); err != nil { + t.Errorf("Failed to schedule pod %s/%s on the node, err: %v", + test.pod.Namespace, test.pod.Name, err) + } + test.pod, err = cs.CoreV1().Pods(context.ns.Name).Get(test.pod.Name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Test Failed: error: %v, while creating pod", err) + } + neededNode, err = cs.CoreV1().Nodes().Get(test.pod.Spec.NodeName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Error while getting node associated with pod %v with err %v", test.pod.Name, err) + } + } + + neededNode.Status.Conditions = test.nodeConditions + // Update node condition. 
+ err = updateNodeStatus(cs, neededNode) + if err != nil { + t.Fatalf("Cannot update node: %v", err) + } + + if err := waitForNodeTaints(cs, neededNode, test.nodeTaints); err != nil { + t.Errorf("Failed to taint node in test %d <%s>, err: %v", i, neededNode.Name, err) + } + + if test.pod != nil { + err = pod.WaitForPodCondition(cs, context.ns.Name, test.pod.Name, test.waitForPodCondition, time.Second*15, func(pod *v1.Pod) (bool, error) { + // as node is unreachable, pod0 is expected to be in Terminating status + // rather than getting deleted + if tolerationSeconds[i] == 0 { + return pod.DeletionTimestamp != nil, nil + } + if seconds, err := getTolerationSeconds(pod.Spec.Tolerations); err == nil { + return seconds == tolerationSeconds[i], nil + } + return false, nil + }) + if err != nil { + pod, _ := cs.CoreV1().Pods(context.ns.Name).Get(test.pod.Name, metav1.GetOptions{}) + t.Fatalf("Error: %v, Expected test pod to be %s but it's %v", err, test.waitForPodCondition, pod) + } + cleanupPods(cs, t, []*v1.Pod{test.pod}) + } + cleanupNodes(cs, t) + waitForSchedulerCacheCleanup(context.scheduler, t) + }) + } +} + +func getTolerationSeconds(tolerations []v1.Toleration) (int64, error) { + for _, t := range tolerations { + if t.Key == schedulerapi.TaintNodeNotReady && t.Effect == v1.TaintEffectNoExecute && t.Operator == v1.TolerationOpExists { + return *t.TolerationSeconds, nil + } + } + return 0, fmt.Errorf("cannot find toleration") +} From 8b29ae2ef94ab45786513039b62cdadabfaeb134 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Thu, 17 Oct 2019 01:25:12 -0700 Subject: [PATCH 2/4] Ensure TaintBasedEviction int test not rely on TaintNodeByConditions --- test/integration/scheduler/taint_test.go | 2 +- test/integration/scheduler/util.go | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/test/integration/scheduler/taint_test.go b/test/integration/scheduler/taint_test.go index 65587f30ad26b..0ed4ce5ab4dca 100644 --- a/test/integration/scheduler/taint_test.go +++ b/test/integration/scheduler/taint_test.go @@ -689,7 +689,7 @@ func TestTaintBasedEvictions(t *testing.T) { 0.55, // Unhealthy zone threshold true, // Run taint manager true, // Use taint based evictions - false, // Enabled TaintNodeByCondition feature + true, // Enabled TaintNodeByCondition feature ) if err != nil { t.Errorf("Failed to create node controller: %v", err) diff --git a/test/integration/scheduler/util.go b/test/integration/scheduler/util.go index 6351fbeee60c4..4c2ebaa238b9b 100644 --- a/test/integration/scheduler/util.go +++ b/test/integration/scheduler/util.go @@ -393,7 +393,8 @@ func nodeTainted(cs clientset.Interface, nodeName string, taints []v1.Taint) wai return false, err } - if len(taints) != len(node.Spec.Taints) { + // node.Spec.Taints may have more taints + if len(taints) > len(node.Spec.Taints) { return false, nil } From b44f8c28ed577b890de3c26a9a1604d673383d58 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Mon, 4 Nov 2019 16:20:28 -0800 Subject: [PATCH 3/4] Fix a TaintBasedEviction integration test flake --- test/integration/scheduler/taint_test.go | 39 ++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/test/integration/scheduler/taint_test.go b/test/integration/scheduler/taint_test.go index 0ed4ce5ab4dca..08edaadda6b36 100644 --- a/test/integration/scheduler/taint_test.go +++ b/test/integration/scheduler/taint_test.go @@ -733,6 +733,34 @@ func TestTaintBasedEvictions(t *testing.T) { t.Errorf("Failed to create node, err: %v", err) } } + + // Regularly send heartbeat event to 
APIServer so that the cluster doesn't enter fullyDisruption mode. + // TODO(Huang-Wei): use "NodeDisruptionExclusion" feature to simply the below logic when it's beta. + var heartbeatChans []chan struct{} + for i := 0; i < nodeCount; i++ { + heartbeatChans = append(heartbeatChans, make(chan struct{})) + } + for i := 0; i < nodeCount; i++ { + // Spin up goroutines to send heartbeat event to APIServer periodically. + go func(i int) { + for { + select { + case <-heartbeatChans[i]: + return + case <-time.Tick(2 * time.Second): + nodes[i].Status.Conditions = []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Now(), + }, + } + updateNodeStatus(cs, nodes[i]) + } + } + }(i) + } + neededNode := nodes[1] if test.pod != nil { test.pod.Name = fmt.Sprintf("testpod-%d", i) @@ -759,6 +787,13 @@ func TestTaintBasedEvictions(t *testing.T) { } } + for i := 0; i < nodeCount; i++ { + // Stop the neededNode's heartbeat goroutine. + if neededNode.Name == fmt.Sprintf("node-%d", i) { + heartbeatChans[i] <- struct{}{} + break + } + } neededNode.Status.Conditions = test.nodeConditions // Update node condition. err = updateNodeStatus(cs, neededNode) @@ -788,6 +823,10 @@ func TestTaintBasedEvictions(t *testing.T) { } cleanupPods(cs, t, []*v1.Pod{test.pod}) } + // Close all heartbeat channels. + for i := 0; i < nodeCount; i++ { + close(heartbeatChans[i]) + } cleanupNodes(cs, t) waitForSchedulerCacheCleanup(context.scheduler, t) }) From a2cbdde957eb02046923e29130b7d1a85c1f881e Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Thu, 7 Nov 2019 15:59:20 -0800 Subject: [PATCH 4/4] Update test logic to simulate NodeReady/False and NodeReady/Unknown events correctly - optimize code to use one loop to spin up goroutines - add `defer cleanupTest()` to avoid goroutine leaks - use only one heartbeat channel --- test/integration/scheduler/taint_test.go | 129 ++++++++++++++--------- 1 file changed, 81 insertions(+), 48 deletions(-) diff --git a/test/integration/scheduler/taint_test.go b/test/integration/scheduler/taint_test.go index 08edaadda6b36..ab61f525d5efa 100644 --- a/test/integration/scheduler/taint_test.go +++ b/test/integration/scheduler/taint_test.go @@ -19,11 +19,13 @@ package scheduler // This file tests the Taint feature. 
import ( + "errors" "fmt" "testing" "time" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" @@ -40,7 +42,7 @@ import ( "k8s.io/kubernetes/plugin/pkg/admission/defaulttolerationseconds" "k8s.io/kubernetes/plugin/pkg/admission/podtolerationrestriction" pluginapi "k8s.io/kubernetes/plugin/pkg/admission/podtolerationrestriction/apis/podtolerationrestriction" - "k8s.io/kubernetes/test/e2e/framework/pod" + "k8s.io/kubernetes/test/e2e/framework" imageutils "k8s.io/kubernetes/test/utils/image" ) @@ -582,6 +584,7 @@ func TestTaintBasedEvictions(t *testing.T) { nodeCount := 3 zero := int64(0) gracePeriod := int64(1) + heartbeatInternal := time.Second * 2 testPod := &v1.Pod{ ObjectMeta: metav1.ObjectMeta{Name: "testpod1", DeletionGracePeriodSeconds: &zero}, Spec: v1.PodSpec{ @@ -642,9 +645,9 @@ func TestTaintBasedEvictions(t *testing.T) { } // Enable TaintBasedEvictions - defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.TaintBasedEvictions, true)() + defer utilfeaturetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.TaintBasedEvictions, true)() // ApplyFeatureGates() is called to ensure TaintNodesByCondition related logic is applied/restored properly. - defer algorithmprovider.ApplyFeatureGates()() + defer algorithmprovider.ApplyFeatureGates() // Build admission chain handler. podTolerations := podtolerationrestriction.NewPodTolerationsPlugin(&pluginapi.Configuration{}) @@ -655,6 +658,7 @@ func TestTaintBasedEvictions(t *testing.T) { for i, test := range tests { t.Run(test.name, func(t *testing.T) { context := initTestMaster(t, "taint-based-evictions", admission) + // Build clientset and informers for controllers. externalClientset := kubernetes.NewForConfigOrDie(&restclient.Config{ QPS: -1, @@ -665,6 +669,7 @@ func TestTaintBasedEvictions(t *testing.T) { podTolerations.SetExternalKubeInformerFactory(externalInformers) context = initTestScheduler(t, context, true, nil) + defer cleanupTest(t, context) cs := context.clientSet informers := context.informerFactory _, err := cs.CoreV1().Namespaces().Create(context.ns) @@ -723,8 +728,9 @@ func TestTaintBasedEvictions(t *testing.T) { Allocatable: nodeRes, Conditions: []v1.NodeCondition{ { - Type: v1.NodeReady, - Status: v1.ConditionTrue, + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Now(), }, }, }, @@ -734,33 +740,6 @@ func TestTaintBasedEvictions(t *testing.T) { } } - // Regularly send heartbeat event to APIServer so that the cluster doesn't enter fullyDisruption mode. - // TODO(Huang-Wei): use "NodeDisruptionExclusion" feature to simply the below logic when it's beta. - var heartbeatChans []chan struct{} - for i := 0; i < nodeCount; i++ { - heartbeatChans = append(heartbeatChans, make(chan struct{})) - } - for i := 0; i < nodeCount; i++ { - // Spin up goroutines to send heartbeat event to APIServer periodically. 
- go func(i int) { - for { - select { - case <-heartbeatChans[i]: - return - case <-time.Tick(2 * time.Second): - nodes[i].Status.Conditions = []v1.NodeCondition{ - { - Type: v1.NodeReady, - Status: v1.ConditionTrue, - LastHeartbeatTime: metav1.Now(), - }, - } - updateNodeStatus(cs, nodes[i]) - } - } - }(i) - } - neededNode := nodes[1] if test.pod != nil { test.pod.Name = fmt.Sprintf("testpod-%d", i) @@ -787,18 +766,53 @@ func TestTaintBasedEvictions(t *testing.T) { } } + // Regularly send heartbeat event to APIServer so that the cluster doesn't enter fullyDisruption mode. + // TODO(Huang-Wei): use "NodeDisruptionExclusion" feature to simply the below logic when it's beta. for i := 0; i < nodeCount; i++ { - // Stop the neededNode's heartbeat goroutine. - if neededNode.Name == fmt.Sprintf("node-%d", i) { - heartbeatChans[i] <- struct{}{} - break + var conditions []v1.NodeCondition + // If current node is not + if neededNode.Name != nodes[i].Name { + conditions = []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + }, + } + } else { + c, err := nodeReadyStatus(test.nodeConditions) + if err != nil { + t.Error(err) + } + // Need to distinguish NodeReady/False and NodeReady/Unknown. + // If we try to update the node with condition NotReady/False, i.e. expect a NotReady:NoExecute taint + // we need to keep sending the update event to keep it alive, rather than just sending once. + if c == v1.ConditionFalse { + conditions = test.nodeConditions + } else if c == v1.ConditionUnknown { + // If it's expected to update the node with condition NotReady/Unknown, + // i.e. expect a Unreachable:NoExecute taint, + // we need to only send the update event once to simulate the network unreachable scenario. + nodeCopy := nodeCopyWithConditions(nodes[i], test.nodeConditions) + if err := updateNodeStatus(cs, nodeCopy); err != nil && !apierrors.IsNotFound(err) { + t.Errorf("Cannot update node: %v", err) + } + continue + } } - } - neededNode.Status.Conditions = test.nodeConditions - // Update node condition. - err = updateNodeStatus(cs, neededNode) - if err != nil { - t.Fatalf("Cannot update node: %v", err) + // Keeping sending NodeReady/True or NodeReady/False events. + go func(i int) { + for { + select { + case <-context.stopCh: + return + case <-time.Tick(heartbeatInternal): + nodeCopy := nodeCopyWithConditions(nodes[i], conditions) + if err := updateNodeStatus(cs, nodeCopy); err != nil && !apierrors.IsNotFound(err) { + t.Errorf("Cannot update node: %v", err) + } + } + } + }(i) } if err := waitForNodeTaints(cs, neededNode, test.nodeTaints); err != nil { @@ -806,7 +820,7 @@ func TestTaintBasedEvictions(t *testing.T) { } if test.pod != nil { - err = pod.WaitForPodCondition(cs, context.ns.Name, test.pod.Name, test.waitForPodCondition, time.Second*15, func(pod *v1.Pod) (bool, error) { + err = framework.WaitForPodCondition(cs, context.ns.Name, test.pod.Name, test.waitForPodCondition, time.Second*15, func(pod *v1.Pod) (bool, error) { // as node is unreachable, pod0 is expected to be in Terminating status // rather than getting deleted if tolerationSeconds[i] == 0 { @@ -823,10 +837,6 @@ func TestTaintBasedEvictions(t *testing.T) { } cleanupPods(cs, t, []*v1.Pod{test.pod}) } - // Close all heartbeat channels. 
- for i := 0; i < nodeCount; i++ { - close(heartbeatChans[i]) - } cleanupNodes(cs, t) waitForSchedulerCacheCleanup(context.scheduler, t) }) @@ -841,3 +851,26 @@ func getTolerationSeconds(tolerations []v1.Toleration) (int64, error) { } return 0, fmt.Errorf("cannot find toleration") } + +// nodeReadyStatus returns the status of first condition with type NodeReady. +// If none of the condition is of type NodeReady, returns an error. +func nodeReadyStatus(conditions []v1.NodeCondition) (v1.ConditionStatus, error) { + for _, c := range conditions { + if c.Type != v1.NodeReady { + continue + } + // Just return the first condition with type NodeReady + return c.Status, nil + } + return v1.ConditionFalse, errors.New("None of the conditions is of type NodeReady") +} + +func nodeCopyWithConditions(node *v1.Node, conditions []v1.NodeCondition) *v1.Node { + copy := node.DeepCopy() + copy.ResourceVersion = "0" + copy.Status.Conditions = conditions + for i := range copy.Status.Conditions { + copy.Status.Conditions[i].LastHeartbeatTime = metav1.Now() + } + return copy +}
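The patch series above exercises one contract end to end: once a NoExecute taint (not-ready or unreachable) lands on a node, a pod is evicted only after the tolerationSeconds of its matching toleration expire, and a pod created without any matching toleration is expected to come back with a default tolerationSeconds=300 added by the defaulttolerationseconds admission plugin. Below is a minimal, standalone Go sketch of the toleration read-back that the test relies on; it reuses the getTolerationSeconds helper added in taint_test.go, while the main function and the locally declared TaintNodeNotReady constant are illustrative additions assumed to carry the same value as schedulerapi.TaintNodeNotReady ("node.kubernetes.io/not-ready").

package main

import (
	"errors"
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// TaintNodeNotReady is assumed to mirror schedulerapi.TaintNodeNotReady: the key
// of the NoExecute taint that NodeLifecycleController places on a node whose
// Ready condition is False.
const TaintNodeNotReady = "node.kubernetes.io/not-ready"

// getTolerationSeconds matches the helper in taint_test.go: it returns the
// tolerationSeconds of the NotReady:NoExecute/Exists toleration, if present.
func getTolerationSeconds(tolerations []v1.Toleration) (int64, error) {
	for _, t := range tolerations {
		if t.Key == TaintNodeNotReady && t.Effect == v1.TaintEffectNoExecute && t.Operator == v1.TolerationOpExists {
			return *t.TolerationSeconds, nil
		}
	}
	return 0, errors.New("cannot find toleration")
}

func main() {
	// A pod that tolerates the not-ready taint for 200 seconds, mirroring the
	// first case in TestTaintBasedEvictions.
	seconds := int64(200)
	pod := &v1.Pod{
		Spec: v1.PodSpec{
			Tolerations: []v1.Toleration{{
				Key:               TaintNodeNotReady,
				Operator:          v1.TolerationOpExists,
				Effect:            v1.TaintEffectNoExecute,
				TolerationSeconds: &seconds,
			}},
		},
	}
	got, err := getTolerationSeconds(pod.Spec.Tolerations)
	if err != nil {
		panic(err)
	}
	fmt.Printf("pod tolerates %s:NoExecute for %ds before eviction\n", TaintNodeNotReady, got)
}

This is the same check the integration test performs after the admission chain runs: a pod created with its own toleration keeps it unchanged, while a pod created without one should report tolerationSeconds of 300.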