e2e: node: add test to check device-requiring pods are cleaned up
Make sure orphaned pods (pods deleted while the kubelet is down) are
handled correctly.
Outline:
1. create a pod (not static pod)
2. stop kubelet
3. while kubelet is down, force delete the pod on API server (see the sketch below)
4. restart kubelet
The pod becomes an orphaned pod and is expected to be killed by HandlePodCleanups.

There is a similar test already, but here we want to check device
assignment.
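
For context, step 3 of the outline ("force delete the pod on API server") boils down
to a zero-grace-period delete. The test goes through the existing deletePodSyncByName
helper; the snippet below is only an illustrative client-go sketch of the same
operation, where forceDeletePod and cs are made-up names and kubernetes / metav1 are
the usual k8s.io/client-go/kubernetes and k8s.io/apimachinery/pkg/apis/meta/v1 aliases:

// forceDeletePod removes the pod object from the API server immediately
// (grace period 0), without waiting for the (stopped) kubelet to acknowledge it.
func forceDeletePod(ctx context.Context, cs kubernetes.Interface, ns, name string) error {
	return cs.CoreV1().Pods(ns).Delete(ctx, name, *metav1.NewDeleteOptions(0))
}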

Signed-off-by: Francesco Romani <fromani@redhat.com>
ffromani committed Aug 8, 2023
1 parent 22cddaf commit b6aaf8c
90 changes: 74 additions & 16 deletions test/e2e_node/device_plugin_test.go
@@ -240,7 +240,7 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
if err != nil {
framework.ExpectNoError(err, "getting pod resources assignment after pod restart")
}
err = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1})
err, _ = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1})
framework.ExpectNoError(err, "inconsistent device assignment after pod restart")

ginkgo.By("Creating another pod")
@@ -259,9 +259,9 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
if err != nil {
framework.ExpectNoError(err, "getting pod resources assignment after pod restart")
}
err = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1})
err, _ = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1})
framework.ExpectNoError(err, "inconsistent device assignment after extra container restart - pod1")
err = checkPodResourcesAssignment(v1PodResources, pod2.Namespace, pod2.Name, pod2.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID2})
err, _ = checkPodResourcesAssignment(v1PodResources, pod2.Namespace, pod2.Name, pod2.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID2})
framework.ExpectNoError(err, "inconsistent device assignment after extra container restart - pod2")
})

@@ -279,7 +279,7 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {

pod1, err = e2epod.NewPodClient(f).Get(ctx, pod1.Name, metav1.GetOptions{})
framework.ExpectNoError(err)
framework.Logf("testing pod: UID=%s namespace=%s name=%s ready=%v", pod1.UID, pod1.Namespace, pod1.Name, podutils.IsPodReady(pod1))
framework.Logf("testing pod: pre-restart UID=%s namespace=%s name=%s ready=%v", pod1.UID, pod1.Namespace, pod1.Name, podutils.IsPodReady(pod1))

ginkgo.By("Restarting Kubelet")
restartKubelet(true)
@@ -305,9 +305,9 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
}).WithTimeout(time.Minute).ShouldNot(gomega.HaveOccurred(),
"the same pod instance not running across kubelet restarts, workload should not be perturbed by kubelet restarts")

pod1, err = e2epod.NewPodClient(f).Get(ctx, pod1.Name, metav1.GetOptions{})
pod2, err := e2epod.NewPodClient(f).Get(ctx, pod1.Name, metav1.GetOptions{})
framework.ExpectNoError(err)
framework.Logf("testing pod: UID=%s namespace=%s name=%s ready=%v", pod1.UID, pod1.Namespace, pod1.Name, podutils.IsPodReady(pod1))
framework.Logf("testing pod: post-restart UID=%s namespace=%s name=%s ready=%v", pod2.UID, pod2.Namespace, pod2.Name, podutils.IsPodReady(pod2))

// crosscheck that the device assignment is preserved and stable from the perspective of the kubelet.
// note we don't check again the logs of the container: the check is done at startup, the container
@@ -319,7 +319,7 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
return err
}, 30*time.Second, framework.Poll).ShouldNot(gomega.HaveOccurred(), "cannot fetch the compute resource assignment after kubelet restart")

err = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1})
err, _ = checkPodResourcesAssignment(v1PodResources, pod2.Namespace, pod2.Name, pod2.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1})
framework.ExpectNoError(err, "inconsistent device assignment after pod restart")
})

@@ -388,7 +388,7 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
return err
}, 30*time.Second, framework.Poll).ShouldNot(gomega.HaveOccurred(), "cannot fetch the compute resource assignment after kubelet restart")

err = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1})
err, _ = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1})
framework.ExpectNoError(err, "inconsistent device assignment after pod restart")
})

@@ -442,7 +442,7 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
return err
}, 30*time.Second, framework.Poll).ShouldNot(gomega.HaveOccurred(), "cannot fetch the compute resource assignment after kubelet restart")

err = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1})
err, _ = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1})
framework.ExpectNoError(err, "inconsistent device assignment after pod restart")

ginkgo.By("Creating another pod")
@@ -498,7 +498,7 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
return err
}, 30*time.Second, framework.Poll).ShouldNot(gomega.HaveOccurred(), "cannot fetch the compute resource assignment after kubelet restart")

err = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1})
err, _ = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1})
framework.ExpectNoError(err, "inconsistent device assignment after pod restart")

ginkgo.By("Re-Register resources by deleting the plugin pod")
@@ -530,6 +530,62 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
}).WithTimeout(time.Minute).ShouldNot(gomega.HaveOccurred(),
"the same pod instance not running across kubelet restarts, workload should not be perturbed by device plugins restarts")
})

ginkgo.It("[OrphanedPods] Ensures pods consuming devices deleted while kubelet is down are cleaned up correctly", func(ctx context.Context) {
podRECMD := fmt.Sprintf("devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep %s", sleepIntervalWithRestart)
pod := e2epod.NewPodClient(f).CreateSync(makeBusyboxPod(SampleDeviceResourceName, podRECMD))

deviceIDRE := "stub devices: (Dev-[0-9]+)"
devID, err := parseLog(f, pod.Name, pod.Name, deviceIDRE)
framework.ExpectNoError(err, "getting logs for pod %q", pod.Name)
gomega.Expect(devID).To(gomega.Not(gomega.BeEmpty()), "the pod requested a device but started successfully without one")

pod, err = e2epod.NewPodClient(f).Get(ctx, pod.Name, metav1.GetOptions{})
framework.ExpectNoError(err)

ginkgo.By("stopping the kubelet")
startKubelet := stopKubelet()

// wait until the kubelet health check fails
gomega.Eventually(ctx, func() bool {
ok := kubeletHealthCheck(kubeletHealthCheckURL)
framework.Logf("kubelet health check at %q value=%v", kubeletHealthCheckURL, ok)
return ok
}, f.Timeouts.PodStart, 1*time.Second).Should(gomega.BeFalse())

framework.Logf("Delete the pod while the kubelet is not running")
// deletePodSyncByName force-deletes the pod, removing it from the kubelet's config
deletePodSyncByName(f, pod.Name)

framework.Logf("Starting the kubelet")
startKubelet()

// wait until the kubelet health check succeeds
gomega.Eventually(ctx, func() bool {
ok := kubeletHealthCheck(kubeletHealthCheckURL)
framework.Logf("kubelet health check at %q value=%v", kubeletHealthCheckURL, ok)
return ok
}, f.Timeouts.PodStart, 1*time.Second).Should(gomega.BeTrue())

framework.Logf("wait for the pod %v to disappear", pod.Name)
gomega.Eventually(ctx, func(ctx context.Context) error {
err := checkMirrorPodDisappear(f.ClientSet, pod.Name, pod.Namespace)
framework.Logf("pod %s/%s disappear check err=%v", pod.Namespace, pod.Name, err)
return err
}, f.Timeouts.PodDelete, 1*time.Second).Should(gomega.BeNil())

waitForAllContainerRemoval(pod.Name, pod.Namespace)

ginkgo.By("Verifying the device assignment after device plugin restart using podresources API")
gomega.Eventually(ctx, func() error {
v1PodResources, err = getV1NodeDevices()
return err
}, 30*time.Second, framework.Poll).ShouldNot(gomega.HaveOccurred(), "cannot fetch the compute resource assignment after kubelet restart")
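// The pod was force-deleted while the kubelet was down, so on restart HandlePodCleanups
// is expected to have cleaned it up and released its device: the pod must no longer show
// up in the podresources response, i.e. we expect a non-nil error and allocated=false.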
err, allocated := checkPodResourcesAssignment(v1PodResources, pod.Namespace, pod.Name, pod.Spec.Containers[0].Name, SampleDeviceResourceName, []string{})
if err == nil || allocated {
framework.Fail(fmt.Sprintf("stale device assignment after pod deletion while kubelet was down allocated=%v error=%v", allocated, err))
}
})
})
}

@@ -727,7 +783,7 @@ func testDevicePluginNodeReboot(f *framework.Framework, pluginSockDir string) {
return err
}, 30*time.Second, framework.Poll).ShouldNot(gomega.HaveOccurred(), "cannot fetch the compute resource assignment after kubelet restart")

err = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1})
err, _ = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1})
framework.ExpectNoError(err, "inconsistent device assignment after node reboot")

})
@@ -795,7 +851,7 @@ func parseLog(f *framework.Framework, podName string, contName string, re string
return matches[1], nil
}

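// checkPodResourcesAssignment verifies that the given container is reported by the podresources
// API with exactly the expected devices for resourceName. The boolean return reports whether the
// container was found in the response at all, which lets callers assert that a deleted pod no
// longer shows up.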
func checkPodResourcesAssignment(v1PodRes *kubeletpodresourcesv1.ListPodResourcesResponse, podNamespace, podName, containerName, resourceName string, devs []string) error {
func checkPodResourcesAssignment(v1PodRes *kubeletpodresourcesv1.ListPodResourcesResponse, podNamespace, podName, containerName, resourceName string, devs []string) (error, bool) {
for _, podRes := range v1PodRes.PodResources {
if podRes.Namespace != podNamespace || podRes.Name != podName {
continue
@@ -807,10 +863,12 @@ func checkPodResourcesAssignment(v1PodRes *kubeletpodresourcesv1.ListPodResource
return matchContainerDevices(podNamespace+"/"+podName+"/"+containerName, contRes.Devices, resourceName, devs)
}
}
return fmt.Errorf("no resources found for %s/%s/%s", podNamespace, podName, containerName)
err := fmt.Errorf("no resources found for %s/%s/%s", podNamespace, podName, containerName)
framework.Logf("%v", err)
return err, false
}

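// matchContainerDevices compares the devices reported for resourceName against the expected set.
// The boolean is always true, because at this point the container was found in the podresources
// response; the error is non-nil when the assigned devices differ from the expected ones.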
func matchContainerDevices(ident string, contDevs []*kubeletpodresourcesv1.ContainerDevices, resourceName string, devs []string) error {
func matchContainerDevices(ident string, contDevs []*kubeletpodresourcesv1.ContainerDevices, resourceName string, devs []string) (error, bool) {
expected := sets.NewString(devs...)
assigned := sets.NewString()
for _, contDev := range contDevs {
@@ -823,9 +881,9 @@ func matchContainerDevices(ident string, contDevs []*kubeletpodresourcesv1.Conta
assignedStr := strings.Join(assigned.UnsortedList(), ",")
framework.Logf("%s: devices expected %q assigned %q", ident, expectedStr, assignedStr)
if !assigned.Equal(expected) {
return fmt.Errorf("device allocation mismatch for %s expected %s assigned %s", ident, expectedStr, assignedStr)
return fmt.Errorf("device allocation mismatch for %s expected %s assigned %s", ident, expectedStr, assignedStr), true
}
return nil
return nil, true
}

// getSampleDevicePluginPod returns the Sample Device Plugin pod to be used e2e tests.