e2e: add basic topology updater test

Co-authored-by: Swati Sehgal <swsehgal@redhat.com> Co-authored-by: Francesco Romani <fromani@redhat.com> Signed-off-by: Artyom Lukianov <alukiano@redhat.com>
kubernetes-sigs · Jun 14, 2022 · 60c564f · 60c564f
1 parent fa1f653
commit 60c564f
Show file tree

Hide file tree

Showing 5 changed files with 752 additions and 2 deletions.
diff --git a/go.mod b/go.mod
@@ -22,12 +22,13 @@ require (
 	google.golang.org/protobuf v1.27.1
 	gopkg.in/yaml.v2 v2.4.0
 	k8s.io/api v0.23.1
+	k8s.io/apiextensions-apiserver v0.23.1
 	k8s.io/apimachinery v0.23.1
 	k8s.io/client-go v0.23.1
 	k8s.io/klog/v2 v2.30.0
-	k8s.io/kubectl v0.23.1
 	k8s.io/kubelet v0.23.1
 	k8s.io/kubernetes v1.23.1
+	k8s.io/utils v0.0.0-20210930125809-cb0fa318a74b
 	sigs.k8s.io/yaml v1.2.0
 )
 
@@ -197,9 +198,9 @@ require (
 	k8s.io/kube-openapi v0.0.0-20211115234752-e816edb12b65 // indirect
 	k8s.io/kube-proxy v0.0.0 // indirect
 	k8s.io/kube-scheduler v0.0.0 // indirect
+	k8s.io/kubectl v0.23.1 // indirect
 	k8s.io/legacy-cloud-providers v0.0.0 // indirect
 	k8s.io/mount-utils v0.23.1 // indirect
-	k8s.io/utils v0.0.0-20210930125809-cb0fa318a74b // indirect
 	sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.0.25 // indirect
 	sigs.k8s.io/json v0.0.0-20211020170558-c049b76a60c6 // indirect
 	sigs.k8s.io/structured-merge-diff/v4 v4.1.2 // indirect

diff --git a/test/e2e/topology_updater.go b/test/e2e/topology_updater.go
@@ -0,0 +1,246 @@
+/*
+Copyright 2020-2022 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package e2e
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha1"
+	topologyclientset "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/generated/clientset/versioned"
+	"github.com/onsi/ginkgo"
+	"github.com/onsi/gomega"
+
+	v1 "k8s.io/api/core/v1"
+	apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
+	extclient "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
+	"k8s.io/kubernetes/test/e2e/framework"
+	e2ekubelet "k8s.io/kubernetes/test/e2e/framework/kubelet"
+	e2enetwork "k8s.io/kubernetes/test/e2e/framework/network"
+
+	testutils "sigs.k8s.io/node-feature-discovery/test/e2e/utils"
+)
+
+var _ = SIGDescribe("Node Feature Discovery topology updater", func() {
+	var (
+		extClient           *extclient.Clientset
+		topologyClient      *topologyclientset.Clientset
+		crd                 *apiextensionsv1.CustomResourceDefinition
+		topologyUpdaterNode *v1.Node
+		workerNodes         []v1.Node
+		kubeletConfig       *kubeletconfig.KubeletConfiguration
+	)
+
+	f := framework.NewDefaultFramework("node-topology-updater")
+
+	ginkgo.BeforeEach(func() {
+		var err error
+
+		if extClient == nil {
+			extClient, err = extclient.NewForConfig(f.ClientConfig())
+			gomega.Expect(err).NotTo(gomega.HaveOccurred())
+		}
+
+		if topologyClient == nil {
+			topologyClient, err = topologyclientset.NewForConfig(f.ClientConfig())
+			gomega.Expect(err).NotTo(gomega.HaveOccurred())
+		}
+
+		ginkgo.By("Creating the node resource topologies CRD")
+		crd, err = testutils.CreateNodeResourceTopologies(extClient)
+		gomega.Expect(err).NotTo(gomega.HaveOccurred())
+
+		err = testutils.ConfigureRBAC(f.ClientSet, f.Namespace.Name)
+		gomega.Expect(err).NotTo(gomega.HaveOccurred())
+
+		image := fmt.Sprintf("%s:%s", *dockerRepo, *dockerTag)
+		f.PodClient().CreateSync(testutils.NFDMasterPod(image, false))
+
+		// Create nfd-master service
+		masterService, err := testutils.CreateService(f.ClientSet, f.Namespace.Name)
+		gomega.Expect(err).NotTo(gomega.HaveOccurred())
+
+		ginkgo.By("Waiting for the nfd-master service to be up")
+		gomega.Expect(e2enetwork.WaitForService(f.ClientSet, f.Namespace.Name, masterService.Name, true, time.Second, 10*time.Second)).NotTo(gomega.HaveOccurred())
+
+		ginkgo.By("Creating nfd-topology-updater daemonset")
+		topologyUpdaterDaemonSet := testutils.NFDTopologyUpdaterDaemonSet(fmt.Sprintf("%s:%s", *dockerRepo, *dockerTag), []string{})
+		topologyUpdaterDaemonSet, err = f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Create(context.TODO(), topologyUpdaterDaemonSet, metav1.CreateOptions{})
+		gomega.Expect(err).NotTo(gomega.HaveOccurred())
+
+		ginkgo.By("Waiting for daemonset pods to be ready")
+		gomega.Expect(testutils.WaitForPodsReady(f.ClientSet, f.Namespace.Name, topologyUpdaterDaemonSet.Spec.Template.Labels["name"], 5)).NotTo(gomega.HaveOccurred())
+
+		label := labels.SelectorFromSet(map[string]string{"name": topologyUpdaterDaemonSet.Spec.Template.Labels["name"]})
+		pods, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).List(context.TODO(), metav1.ListOptions{LabelSelector: label.String()})
+		gomega.Expect(err).NotTo(gomega.HaveOccurred())
+		gomega.Expect(pods.Items).ToNot(gomega.BeEmpty())
+
+		topologyUpdaterNode, err = f.ClientSet.CoreV1().Nodes().Get(context.TODO(), pods.Items[0].Spec.NodeName, metav1.GetOptions{})
+		gomega.Expect(err).NotTo(gomega.HaveOccurred())
+
+		kubeletConfig, err = e2ekubelet.GetCurrentKubeletConfig(topologyUpdaterNode.Name, "", true)
+		gomega.Expect(err).NotTo(gomega.HaveOccurred())
+
+		workerNodes, err = testutils.GetWorkerNodes(f)
+		gomega.Expect(err).NotTo(gomega.HaveOccurred())
+	})
+
+	ginkgo.Context("with single nfd-master pod", func() {
+		ginkgo.It("should fill the node resource topologies CR with the data", func() {
+			nodeTopology := testutils.GetNodeTopology(topologyClient, topologyUpdaterNode.Name)
+			isValid := testutils.IsValidNodeTopology(nodeTopology, kubeletConfig)
+			gomega.Expect(isValid).To(gomega.BeTrue(), "received invalid topology: %v", nodeTopology)
+		})
+
+		ginkgo.It("it should not account for any cpus if a container doesn't request exclusive cpus (best effort QOS)", func() {
+			ginkgo.By("getting the initial topology information")
+			initialNodeTopo := testutils.GetNodeTopology(topologyClient, topologyUpdaterNode.Name)
+			ginkgo.By("creating a pod consuming resources from the shared, non-exclusive CPU pool (best-effort QoS)")
+			sleeperPod := testutils.BestEffortSleeperPod()
+
+			podMap := make(map[string]*v1.Pod)
+			pod := f.PodClient().CreateSync(sleeperPod)
+			podMap[pod.Name] = pod
+			defer testutils.DeletePodsAsync(f, podMap)
+
+			cooldown := 30 * time.Second
+			ginkgo.By(fmt.Sprintf("getting the updated topology - sleeping for %v", cooldown))
+			// the object, hance the resource version must NOT change, so we can only sleep
+			time.Sleep(cooldown)
+			ginkgo.By("checking the changes in the updated topology - expecting none")
+			finalNodeTopo := testutils.GetNodeTopology(topologyClient, topologyUpdaterNode.Name)
+
+			initialAllocRes := testutils.AllocatableResourceListFromNodeResourceTopology(initialNodeTopo)
+			finalAllocRes := testutils.AllocatableResourceListFromNodeResourceTopology(finalNodeTopo)
+			if len(initialAllocRes) == 0 || len(finalAllocRes) == 0 {
+				ginkgo.Fail(fmt.Sprintf("failed to find allocatable resources from node topology initial=%v final=%v", initialAllocRes, finalAllocRes))
+			}
+			zoneName, resName, cmp, ok := testutils.CompareAllocatableResources(initialAllocRes, finalAllocRes)
+			framework.Logf("zone=%q resource=%q cmp=%v ok=%v", zoneName, resName, cmp, ok)
+			if !ok {
+				ginkgo.Fail(fmt.Sprintf("failed to compare allocatable resources from node topology initial=%v final=%v", initialAllocRes, finalAllocRes))
+			}
+
+			// This is actually a workaround.
+			// Depending on the (random, by design) order on which ginkgo runs the tests, a test which exclusively allocates CPUs may run before.
+			// We cannot (nor should) care about what runs before this test, but we know that this may happen.
+			// The proper solution is to wait for ALL the container requesting exclusive resources to be gone before to end the related test.
+			// To date, we don't yet have a clean way to wait for these pod (actually containers) to be completely gone
+			// (hence, releasing the exclusively allocated CPUs) before to end the test, so this test can run with some leftovers hanging around,
+			// which makes the accounting harder. And this is what we handle here.
+			isGreaterEqual := (cmp >= 0)
+			gomega.Expect(isGreaterEqual).To(gomega.BeTrue(), fmt.Sprintf("final allocatable resources not restored - cmp=%d initial=%v final=%v", cmp, initialAllocRes, finalAllocRes))
+		})
+
+		ginkgo.It("it should not account for any cpus if a container doesn't request exclusive cpus (guaranteed QOS, nonintegral cpu request)", func() {
+			ginkgo.By("getting the initial topology information")
+			initialNodeTopo := testutils.GetNodeTopology(topologyClient, topologyUpdaterNode.Name)
+			ginkgo.By("creating a pod consuming resources from the shared, non-exclusive CPU pool (guaranteed QoS, nonintegral request)")
+			sleeperPod := testutils.GuaranteedSleeperPod("500m")
+
+			podMap := make(map[string]*v1.Pod)
+			pod := f.PodClient().CreateSync(sleeperPod)
+			podMap[pod.Name] = pod
+			defer testutils.DeletePodsAsync(f, podMap)
+
+			cooldown := 30 * time.Second
+			ginkgo.By(fmt.Sprintf("getting the updated topology - sleeping for %v", cooldown))
+			// the object, hance the resource version must NOT change, so we can only sleep
+			time.Sleep(cooldown)
+			ginkgo.By("checking the changes in the updated topology - expecting none")
+			finalNodeTopo := testutils.GetNodeTopology(topologyClient, topologyUpdaterNode.Name)
+
+			initialAllocRes := testutils.AllocatableResourceListFromNodeResourceTopology(initialNodeTopo)
+			finalAllocRes := testutils.AllocatableResourceListFromNodeResourceTopology(finalNodeTopo)
+			if len(initialAllocRes) == 0 || len(finalAllocRes) == 0 {
+				ginkgo.Fail(fmt.Sprintf("failed to find allocatable resources from node topology initial=%v final=%v", initialAllocRes, finalAllocRes))
+			}
+			zoneName, resName, cmp, ok := testutils.CompareAllocatableResources(initialAllocRes, finalAllocRes)
+			framework.Logf("zone=%q resource=%q cmp=%v ok=%v", zoneName, resName, cmp, ok)
+			if !ok {
+				ginkgo.Fail(fmt.Sprintf("failed to compare allocatable resources from node topology initial=%v final=%v", initialAllocRes, finalAllocRes))
+			}
+
+			// This is actually a workaround.
+			// Depending on the (random, by design) order on which ginkgo runs the tests, a test which exclusively allocates CPUs may run before.
+			// We cannot (nor should) care about what runs before this test, but we know that this may happen.
+			// The proper solution is to wait for ALL the container requesting exclusive resources to be gone before to end the related test.
+			// To date, we don't yet have a clean way to wait for these pod (actually containers) to be completely gone
+			// (hence, releasing the exclusively allocated CPUs) before to end the test, so this test can run with some leftovers hanging around,
+			// which makes the accounting harder. And this is what we handle here.
+			isGreaterEqual := (cmp >= 0)
+			gomega.Expect(isGreaterEqual).To(gomega.BeTrue(), fmt.Sprintf("final allocatable resources not restored - cmp=%d initial=%v final=%v", cmp, initialAllocRes, finalAllocRes))
+		})
+
+		ginkgo.It("it should account for containers requesting exclusive cpus", func() {
+			nodes, err := testutils.FilterNodesWithEnoughCores(workerNodes, "1000m")
+			gomega.Expect(err).NotTo(gomega.HaveOccurred())
+			if len(nodes) < 1 {
+				ginkgo.Skip("not enough allocatable cores for this test")
+			}
+
+			ginkgo.By("getting the initial topology information")
+			initialNodeTopo := testutils.GetNodeTopology(topologyClient, topologyUpdaterNode.Name)
+			ginkgo.By("creating a pod consuming exclusive CPUs")
+			sleeperPod := testutils.GuaranteedSleeperPod("1000m")
+
+			podMap := make(map[string]*v1.Pod)
+			pod := f.PodClient().CreateSync(sleeperPod)
+			podMap[pod.Name] = pod
+			defer testutils.DeletePodsAsync(f, podMap)
+
+			ginkgo.By("getting the updated topology")
+			var finalNodeTopo *v1alpha1.NodeResourceTopology
+			gomega.Eventually(func() bool {
+				finalNodeTopo, err = topologyClient.TopologyV1alpha1().NodeResourceTopologies().Get(context.TODO(), topologyUpdaterNode.Name, metav1.GetOptions{})
+				if err != nil {
+					framework.Logf("failed to get the node topology resource: %v", err)
+					return false
+				}
+				return finalNodeTopo.ObjectMeta.ResourceVersion != initialNodeTopo.ObjectMeta.ResourceVersion
+			}, time.Minute, 5*time.Second).Should(gomega.BeTrue(), "didn't get updated node topology info")
+			ginkgo.By("checking the changes in the updated topology")
+
+			initialAllocRes := testutils.AllocatableResourceListFromNodeResourceTopology(initialNodeTopo)
+			finalAllocRes := testutils.AllocatableResourceListFromNodeResourceTopology(finalNodeTopo)
+			if len(initialAllocRes) == 0 || len(finalAllocRes) == 0 {
+				ginkgo.Fail(fmt.Sprintf("failed to find allocatable resources from node topology initial=%v final=%v", initialAllocRes, finalAllocRes))
+			}
+			zoneName, resName, isLess := testutils.LessAllocatableResources(initialAllocRes, finalAllocRes)
+			framework.Logf("zone=%q resource=%q isLess=%v", zoneName, resName, isLess)
+			gomega.Expect(isLess).To(gomega.BeTrue(), fmt.Sprintf("final allocatable resources not decreased - initial=%v final=%v", initialAllocRes, finalAllocRes))
+		})
+
+	})
+
+	ginkgo.JustAfterEach(func() {
+		err := testutils.DeconfigureRBAC(f.ClientSet, f.Namespace.Name)
+		if err != nil {
+			framework.Logf("failed to delete RBAC resources: %v", err)
+		}
+
+		err = extClient.ApiextensionsV1().CustomResourceDefinitions().Delete(context.TODO(), crd.Name, metav1.DeleteOptions{})
+		if err != nil {
+			framework.Logf("failed to delete node resources topologies CRD: %v", err)
+		}
+	})
+})
diff --git a/test/e2e/utils/node.go b/test/e2e/utils/node.go
@@ -0,0 +1,87 @@
+/*
+Copyright 2021-2022 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package utils
+
+import (
+	"context"
+	"fmt"
+
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/kubernetes/test/e2e/framework"
+)
+
+const (
+	// RoleWorker contains the worker role
+	RoleWorker = "worker"
+)
+
+const (
+	// LabelRole contains the key for the role label
+	LabelRole = "node-role.kubernetes.io"
+	// LabelHostname contains the key for the hostname label
+	LabelHostname = "kubernetes.io/hostname"
+)
+
+// GetWorkerNodes returns all nodes labeled as worker
+func GetWorkerNodes(f *framework.Framework) ([]v1.Node, error) {
+	return GetNodesByRole(f, RoleWorker)
+}
+
+// GetByRole returns all nodes with the specified role
+func GetNodesByRole(f *framework.Framework, role string) ([]v1.Node, error) {
+	selector, err := labels.Parse(fmt.Sprintf("%s/%s=", LabelRole, role))
+	if err != nil {
+		return nil, err
+	}
+	return GetNodesBySelector(f, selector)
+}
+
+// GetBySelector returns all nodes with the specified selector
+func GetNodesBySelector(f *framework.Framework, selector labels.Selector) ([]v1.Node, error) {
+	nodes, err := f.ClientSet.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{LabelSelector: selector.String()})
+	if err != nil {
+		return nil, err
+	}
+	return nodes.Items, nil
+}
+
+// FilterNodesWithEnoughCores returns all nodes with at least the amount of given CPU allocatable
+func FilterNodesWithEnoughCores(nodes []v1.Node, cpuAmount string) ([]v1.Node, error) {
+	requestCpu := resource.MustParse(cpuAmount)
+	framework.Logf("checking request %v on %d nodes", requestCpu, len(nodes))
+
+	resNodes := []v1.Node{}
+	for _, node := range nodes {
+		availCpu, ok := node.Status.Allocatable[v1.ResourceCPU]
+		if !ok || availCpu.IsZero() {
+			return nil, fmt.Errorf("node %q has no allocatable CPU", node.Name)
+		}
+
+		if availCpu.Cmp(requestCpu) < 1 {
+			framework.Logf("node %q available cpu %v requested cpu %v", node.Name, availCpu, requestCpu)
+			continue
+		}
+
+		framework.Logf("node %q has enough resources, cluster OK", node.Name)
+		resNodes = append(resNodes, node)
+	}
+
+	return resNodes, nil
+}