Automated cherry pick of #118497: Fix the deletion of rejected pods #118841

Merged
50 changes: 50 additions & 0 deletions pkg/kubelet/kubelet_pods.go
@@ -1176,6 +1176,21 @@ func (kl *Kubelet) HandlePodCleanups(ctx context.Context) error {
metrics.RestartedPodTotal.WithLabelValues("true").Add(float64(restartCountStatic))
metrics.RestartedPodTotal.WithLabelValues("").Add(float64(restartCount))

// Complete termination of deleted pods that are not runtime pods (don't have
// running containers), are terminal, and are not known to pod workers.
// An example is pods rejected during kubelet admission that have never
// started before (i.e. do not have an orphaned pod).
// Adding the pods with SyncPodKill to pod workers allows proceeding with the
// force-deletion of such pods, while preventing re-entry of this routine in
// the next invocation of HandlePodCleanups.
for _, pod := range kl.filterTerminalPodsToDelete(allPods, runningRuntimePods, workingPods) {
klog.V(3).InfoS("Handling termination and deletion of the pod to pod workers", "pod", klog.KObj(pod), "podUID", pod.UID)
kl.podWorkers.UpdatePod(UpdatePodOptions{
UpdateType: kubetypes.SyncPodKill,
Pod: pod,
})
}

// Finally, terminate any pods that are observed in the runtime but not present in the list of
// known running pods from config. If we do terminate running runtime pods that will happen
// asynchronously in the background and those will be processed in the next invocation of
@@ -1249,6 +1264,41 @@ func (kl *Kubelet) HandlePodCleanups(ctx context.Context) error {
return nil
}

// filterTerminalPodsToDelete returns terminal pods which are ready to be
// deleted by the status manager, but are not yet known to pod workers.
// First, the check for a deletionTimestamp is a performance optimization, as we
// don't need to do anything with terminal pods that have no deletionTimestamp.
// Second, the check for a terminal phase avoids a race condition of triggering
// deletion for Pending pods which are not yet added to pod workers.
// Third, pods known to pod workers are skipped because their lifecycle is
// already handled by the pod workers.
// Finally, we skip runtime pods, as their termination is handled separately in
// the HandlePodCleanups routine.
func (kl *Kubelet) filterTerminalPodsToDelete(allPods []*v1.Pod, runningRuntimePods []*kubecontainer.Pod, workingPods map[types.UID]PodWorkerSync) map[types.UID]*v1.Pod {
terminalPodsToDelete := make(map[types.UID]*v1.Pod)
for _, pod := range allPods {
if pod.DeletionTimestamp == nil {
// skip pods which don't have a deletion timestamp
continue
}
if !podutil.IsPodPhaseTerminal(pod.Status.Phase) {
// skip the non-terminal pods
continue
}
if _, knownPod := workingPods[pod.UID]; knownPod {
// skip pods known to pod workers
continue
}
terminalPodsToDelete[pod.UID] = pod
}
for _, runningRuntimePod := range runningRuntimePods {
// skip running runtime pods - they are handled by a dedicated routine
// which terminates the containers
delete(terminalPodsToDelete, runningRuntimePod.ID)
}
return terminalPodsToDelete
}

// splitPodsByStatic separates a list of desired pods from the pod manager into
// regular or static pods. Mirror pods are not valid config sources (a mirror pod
// being created cannot cause the Kubelet to start running a static pod) and are
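For illustration only, not part of this diff: a minimal, self-contained sketch of the selection logic that filterTerminalPodsToDelete implements, with the kubelet-internal inputs (the PodWorkerSync map and the running runtime pods) reduced to plain UID sets. The helper names and the example pods below are hypothetical.

package main

import (
	"fmt"
	"time"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
)

// isPhaseTerminal mirrors podutil.IsPodPhaseTerminal: only Failed and
// Succeeded count as terminal phases.
func isPhaseTerminal(phase v1.PodPhase) bool {
	return phase == v1.PodFailed || phase == v1.PodSucceeded
}

// filterTerminalPodsToDelete is a simplified stand-in for the kubelet method
// above; working pods and running runtime pods are passed as plain UID sets
// so the sketch has no kubelet-internal dependencies.
func filterTerminalPodsToDelete(allPods []*v1.Pod, runningRuntimePods, workingPods map[types.UID]bool) map[types.UID]*v1.Pod {
	out := make(map[types.UID]*v1.Pod)
	for _, pod := range allPods {
		if pod.DeletionTimestamp == nil {
			continue // nothing to do until the API server marks the pod for deletion
		}
		if !isPhaseTerminal(pod.Status.Phase) {
			continue // avoid racing with Pending pods not yet added to pod workers
		}
		if workingPods[pod.UID] {
			continue // pod workers already own this pod's lifecycle
		}
		out[pod.UID] = pod
	}
	for uid := range runningRuntimePods {
		delete(out, uid) // running runtime pods are terminated by a separate routine
	}
	return out
}

func main() {
	now := metav1.NewTime(time.Now())
	rejected := &v1.Pod{ // e.g. rejected at admission, never started
		ObjectMeta: metav1.ObjectMeta{Name: "rejected", UID: "uid-1", DeletionTimestamp: &now},
		Status:     v1.PodStatus{Phase: v1.PodFailed},
	}
	pending := &v1.Pod{ // deleted but still Pending: skipped
		ObjectMeta: metav1.ObjectMeta{Name: "pending", UID: "uid-2", DeletionTimestamp: &now},
		Status:     v1.PodStatus{Phase: v1.PodPending},
	}
	got := filterTerminalPodsToDelete([]*v1.Pod{rejected, pending}, map[types.UID]bool{}, map[types.UID]bool{})
	fmt.Println(len(got)) // 1: only the rejected, terminal pod is selected
}

Each pod selected this way is then handed to the pod workers with UpdateType SyncPodKill (first hunk above), which lets termination complete so the status manager can proceed with force-deletion.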
8 changes: 5 additions & 3 deletions pkg/kubelet/status/status_manager.go
@@ -888,14 +888,16 @@ func (m *manager) canBeDeleted(pod *v1.Pod, status v1.PodStatus, podIsFinished b
if pod.DeletionTimestamp == nil || kubetypes.IsMirrorPod(pod) {
return false
}
// Delay deletion of pods until the phase is terminal.
// Delay deletion of pods until the phase is terminal, based on pod.Status
// which comes from pod manager.
if !podutil.IsPodPhaseTerminal(pod.Status.Phase) {
klog.V(3).InfoS("Delaying pod deletion as the phase is non-terminal", "phase", status.Phase, "pod", klog.KObj(pod), "podUID", pod.UID)
// For debugging purposes we also log the kubelet's local phase, when the deletion is delayed.
klog.V(3).InfoS("Delaying pod deletion as the phase is non-terminal", "phase", pod.Status.Phase, "localPhase", status.Phase, "pod", klog.KObj(pod), "podUID", pod.UID)
return false
}
// If this is an update completing pod termination then we know the pod termination is finished.
if podIsFinished {
klog.V(3).InfoS("The pod termination is finished as SyncTerminatedPod completes its execution", "phase", status.Phase, "pod", klog.KObj(pod), "podUID", pod.UID)
klog.V(3).InfoS("The pod termination is finished as SyncTerminatedPod completes its execution", "phase", pod.Status.Phase, "localPhase", status.Phase, "pod", klog.KObj(pod), "podUID", pod.UID)
return true
}
return false
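For illustration only, not part of this diff: a minimal sketch of the gating order that canBeDeleted applies after this change. The names are hypothetical, isPhaseTerminal stands in for podutil.IsPodPhaseTerminal, and the real method may include checks not visible in this hunk.

package sketch

import (
	v1 "k8s.io/api/core/v1"
)

// isPhaseTerminal stands in for podutil.IsPodPhaseTerminal.
func isPhaseTerminal(phase v1.PodPhase) bool {
	return phase == v1.PodFailed || phase == v1.PodSucceeded
}

// canBeDeletedSketch reproduces the gating order visible in the hunk above.
// pod.Status comes from the pod manager (the API server's view), while
// localStatus is the kubelet's locally computed status, kept for logging.
func canBeDeletedSketch(pod *v1.Pod, localStatus v1.PodStatus, podIsFinished, isMirror bool) bool {
	if pod.DeletionTimestamp == nil || isMirror {
		return false // not marked for deletion, or a mirror pod
	}
	if !isPhaseTerminal(pod.Status.Phase) {
		return false // delay deletion until the pod manager reports a terminal phase
	}
	if podIsFinished {
		return true // the update completing SyncTerminatedPod finishes the deletion
	}
	return false
}

The point of the change is that the gate uses pod.Status.Phase as reported by the pod manager, while the kubelet's locally computed status.Phase is now only logged for debugging.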
139 changes: 139 additions & 0 deletions test/e2e_node/restart_test.go
@@ -381,6 +381,145 @@ var _ = SIGDescribe("Restart [Serial] [Slow] [Disruptive]", func() {
return checkMirrorPodDisappear(ctx, f.ClientSet, pod.Name, pod.Namespace)
}, f.Timeouts.PodDelete, f.Timeouts.Poll).Should(gomega.BeNil())
})
// Regression test for https://issues.k8s.io/118472
ginkgo.It("should force-delete non-admissible pods created and deleted during kubelet restart", func(ctx context.Context) {
podName := "rejected-deleted-pod" + string(uuid.NewUUID())
gracePeriod := int64(30)
nodeName := getNodeName(ctx, f)
podSpec := e2epod.MustMixinRestrictedPodSecurity(&v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: podName,
Namespace: f.Namespace.Name,
},
Spec: v1.PodSpec{
NodeName: nodeName,
NodeSelector: map[string]string{
"this-label": "does-not-exist-on-any-nodes",
},
TerminationGracePeriodSeconds: &gracePeriod,
RestartPolicy: v1.RestartPolicyNever,
Containers: []v1.Container{
{
Name: podName,
Image: imageutils.GetPauseImageName(),
},
},
},
})
ginkgo.By("Stopping the kubelet")
startKubelet := stopKubelet()

// wait until the kubelet health check fails
gomega.Eventually(ctx, func() bool {
return kubeletHealthCheck(kubeletHealthCheckURL)
}, f.Timeouts.PodStart, f.Timeouts.Poll).Should(gomega.BeFalse())

// Create the pod bound to the node. It will remain in the Pending
// phase as Kubelet is down.
ginkgo.By(fmt.Sprintf("Creating a pod (%v/%v)", f.Namespace.Name, podName))
pod := e2epod.NewPodClient(f).Create(ctx, podSpec)

ginkgo.By(fmt.Sprintf("Deleting the pod (%v/%v) to set a deletion timestamp", pod.Namespace, pod.Name))
err := e2epod.NewPodClient(f).Delete(ctx, pod.Name, metav1.DeleteOptions{GracePeriodSeconds: &gracePeriod})
framework.ExpectNoError(err, "Failed to delete the pod: %q", pod.Name)

// Restart Kubelet so that it proceeds with deletion
ginkgo.By("Starting the kubelet")
startKubelet()

// wait until the kubelet health check succeeds
gomega.Eventually(ctx, func() bool {
return kubeletHealthCheck(kubeletHealthCheckURL)
}, f.Timeouts.PodStart, f.Timeouts.Poll).Should(gomega.BeTrue())

// Wait for the Kubelet to be ready.
gomega.Eventually(ctx, func(ctx context.Context) bool {
nodes, err := e2enode.TotalReady(ctx, f.ClientSet)
framework.ExpectNoError(err)
return nodes == 1
}, time.Minute, f.Timeouts.Poll).Should(gomega.BeTrue())

ginkgo.By(fmt.Sprintf("After the kubelet is restarted, verify the pod (%v/%v) is deleted by kubelet", pod.Namespace, pod.Name))
gomega.Eventually(ctx, func(ctx context.Context) error {
return checkMirrorPodDisappear(ctx, f.ClientSet, pod.Name, pod.Namespace)
}, f.Timeouts.PodDelete, f.Timeouts.Poll).Should(gomega.BeNil())
})
// Regression test for an extended scenario for https://issues.k8s.io/118472
ginkgo.It("should force-delete non-admissible pods that was admitted and running before kubelet restart", func(ctx context.Context) {
nodeLabelKey := "custom-label-key-required"
nodeLabelValueRequired := "custom-label-value-required-for-admission"
podName := "rejected-deleted-run" + string(uuid.NewUUID())
gracePeriod := int64(30)
nodeName := getNodeName(ctx, f)
pod := e2epod.MustMixinRestrictedPodSecurity(&v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: podName,
Namespace: f.Namespace.Name,
},
Spec: v1.PodSpec{
NodeSelector: map[string]string{
nodeLabelKey: nodeLabelValueRequired,
},
NodeName: nodeName,
TerminationGracePeriodSeconds: &gracePeriod,
RestartPolicy: v1.RestartPolicyNever,
Containers: []v1.Container{
{
Name: podName,
Image: imageutils.GetPauseImageName(),
},
},
},
})

ginkgo.By(fmt.Sprintf("Adding node label for node (%v) to allow admission of pod (%v/%v)", nodeName, f.Namespace.Name, podName))
e2enode.AddOrUpdateLabelOnNode(f.ClientSet, nodeName, nodeLabelKey, nodeLabelValueRequired)
ginkgo.DeferCleanup(func() { e2enode.RemoveLabelOffNode(f.ClientSet, nodeName, nodeLabelKey) })

// Create the pod bound to the node. It will start, but will be rejected after kubelet restart.
ginkgo.By(fmt.Sprintf("Creating a pod (%v/%v)", f.Namespace.Name, podName))
pod = e2epod.NewPodClient(f).Create(ctx, pod)

ginkgo.By(fmt.Sprintf("Waiting for the pod (%v/%v) to be running", f.Namespace.Name, pod.Name))
err := e2epod.WaitForPodNameRunningInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name)
framework.ExpectNoError(err, "Failed to await for the pod to be running: (%v/%v)", f.Namespace.Name, pod.Name)

ginkgo.By("Stopping the kubelet")
startKubelet := stopKubelet()

// wait until the kubelet health check fails
gomega.Eventually(ctx, func() bool {
return kubeletHealthCheck(kubeletHealthCheckURL)
}, f.Timeouts.PodStart, f.Timeouts.Poll).Should(gomega.BeFalse())

ginkgo.By(fmt.Sprintf("Deleting the pod (%v/%v) to set a deletion timestamp", pod.Namespace, pod.Name))
err = e2epod.NewPodClient(f).Delete(ctx, pod.Name, metav1.DeleteOptions{GracePeriodSeconds: &gracePeriod})
framework.ExpectNoError(err, "Failed to delete the pod: %q", pod.Name)

ginkgo.By(fmt.Sprintf("Removing node label for node (%v) to ensure the pod (%v/%v) is rejected after kubelet restart", nodeName, f.Namespace.Name, podName))
e2enode.RemoveLabelOffNode(f.ClientSet, nodeName, nodeLabelKey)

// Restart Kubelet so that it proceeds with deletion
ginkgo.By("Starting the kubelet")
startKubelet()

// wait until the kubelet health check succeeds
gomega.Eventually(ctx, func() bool {
return kubeletHealthCheck(kubeletHealthCheckURL)
}, f.Timeouts.PodStart, f.Timeouts.Poll).Should(gomega.BeTrue())

// Wait for the Kubelet to be ready.
gomega.Eventually(ctx, func(ctx context.Context) bool {
nodes, err := e2enode.TotalReady(ctx, f.ClientSet)
framework.ExpectNoError(err)
return nodes == 1
}, time.Minute, f.Timeouts.Poll).Should(gomega.BeTrue())

ginkgo.By(fmt.Sprintf("Once Kubelet is restarted, verify the pod (%v/%v) is deleted by kubelet", pod.Namespace, pod.Name))
gomega.Eventually(ctx, func(ctx context.Context) error {
return checkMirrorPodDisappear(ctx, f.ClientSet, pod.Name, pod.Namespace)
}, f.Timeouts.PodDelete, f.Timeouts.Poll).Should(gomega.BeNil())
})
})

})