Automated cherry pick of #118497: Fix the deletion of rejected pods #118841

Merged
50 changes: 50 additions & 0 deletions pkg/kubelet/kubelet_pods.go
@@ -1176,6 +1176,21 @@ func (kl *Kubelet) HandlePodCleanups(ctx context.Context) error {
metrics.RestartedPodTotal.WithLabelValues("true").Add(float64(restartCountStatic))
metrics.RestartedPodTotal.WithLabelValues("").Add(float64(restartCount))

// Complete termination of deleted pods that are not runtime pods (don't have
// running containers), are terminal, and are not known to pod workers.
// An example is pods rejected during kubelet admission that have never
// started before (i.e. do not have an orphaned pod).
// Adding the pods with SyncPodKill to pod workers allows proceeding with the
// force-deletion of such pods, while preventing re-entry of this routine in
// the next invocation of HandlePodCleanups.
for _, pod := range kl.filterTerminalPodsToDelete(allPods, runningRuntimePods, workingPods) {
klog.V(3).InfoS("Handling termination and deletion of the pod to pod workers", "pod", klog.KObj(pod), "podUID", pod.UID)
kl.podWorkers.UpdatePod(UpdatePodOptions{
UpdateType: kubetypes.SyncPodKill,
Pod: pod,
})
}

// Finally, terminate any pods that are observed in the runtime but not present in the list of
// known running pods from config. If we do terminate running runtime pods that will happen
// asynchronously in the background and those will be processed in the next invocation of
@@ -1249,6 +1264,41 @@ func (kl *Kubelet) HandlePodCleanups(ctx context.Context) error {
return nil
}

// filterTerminalPodsToDelete returns terminal pods which are ready to be
// deleted by the status manager, but are not yet known to pod workers.
// First, the check for a deletionTimestamp is a performance optimization, as we
// don't need to do anything with terminal pods that have no deletionTimestamp.
// Second, the check for a terminal phase avoids a race condition of triggering
// deletion for Pending pods which are not yet added to pod workers.
// Third, pods known to pod workers are skipped because their lifecycle is
// already handled by the pod workers.
// Finally, we skip runtime pods, as their termination is handled separately in
// the HandlePodCleanups routine.
func (kl *Kubelet) filterTerminalPodsToDelete(allPods []*v1.Pod, runningRuntimePods []*kubecontainer.Pod, workingPods map[types.UID]PodWorkerSync) map[types.UID]*v1.Pod {
terminalPodsToDelete := make(map[types.UID]*v1.Pod)
for _, pod := range allPods {
if pod.DeletionTimestamp == nil {
// skip pods which don't have a deletion timestamp
continue
}
if !podutil.IsPodPhaseTerminal(pod.Status.Phase) {
// skip the non-terminal pods
continue
}
if _, knownPod := workingPods[pod.UID]; knownPod {
// skip pods known to pod workers
continue
}
terminalPodsToDelete[pod.UID] = pod
}
for _, runningRuntimePod := range runningRuntimePods {
// skip running runtime pods - they are handled by a dedicated routine
// which terminates the containers
delete(terminalPodsToDelete, runningRuntimePod.ID)
}
return terminalPodsToDelete
}

// splitPodsByStatic separates a list of desired pods from the pod manager into
// regular or static pods. Mirror pods are not valid config sources (a mirror pod
// being created cannot cause the Kubelet to start running a static pod) and are
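For illustration only, not part of this diff: a minimal, self-contained sketch of the selection logic that filterTerminalPodsToDelete implements, with the kubelet-internal inputs (the PodWorkerSync map and the running runtime pods) reduced to plain UID sets. The helper names and the example pods below are hypothetical.

package main

import (
	"fmt"
	"time"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
)

// isPhaseTerminal mirrors podutil.IsPodPhaseTerminal: only Failed and
// Succeeded count as terminal phases.
func isPhaseTerminal(phase v1.PodPhase) bool {
	return phase == v1.PodFailed || phase == v1.PodSucceeded
}

// filterTerminalPodsToDelete is a simplified stand-in for the kubelet method
// above; working pods and running runtime pods are passed as plain UID sets
// so the sketch has no kubelet-internal dependencies.
func filterTerminalPodsToDelete(allPods []*v1.Pod, runningRuntimePods, workingPods map[types.UID]bool) map[types.UID]*v1.Pod {
	out := make(map[types.UID]*v1.Pod)
	for _, pod := range allPods {
		if pod.DeletionTimestamp == nil {
			continue // nothing to do until the API server marks the pod for deletion
		}
		if !isPhaseTerminal(pod.Status.Phase) {
			continue // avoid racing with Pending pods not yet added to pod workers
		}
		if workingPods[pod.UID] {
			continue // pod workers already own this pod's lifecycle
		}
		out[pod.UID] = pod
	}
	for uid := range runningRuntimePods {
		delete(out, uid) // running runtime pods are terminated by a separate routine
	}
	return out
}

func main() {
	now := metav1.NewTime(time.Now())
	rejected := &v1.Pod{ // e.g. rejected at admission, never started
		ObjectMeta: metav1.ObjectMeta{Name: "rejected", UID: "uid-1", DeletionTimestamp: &now},
		Status:     v1.PodStatus{Phase: v1.PodFailed},
	}
	pending := &v1.Pod{ // deleted but still Pending: skipped
		ObjectMeta: metav1.ObjectMeta{Name: "pending", UID: "uid-2", DeletionTimestamp: &now},
		Status:     v1.PodStatus{Phase: v1.PodPending},
	}
	got := filterTerminalPodsToDelete([]*v1.Pod{rejected, pending}, map[types.UID]bool{}, map[types.UID]bool{})
	fmt.Println(len(got)) // 1: only the rejected, terminal pod is selected
}

Each pod selected this way is then handed to the pod workers with UpdateType SyncPodKill (first hunk above), which lets termination complete so the status manager can proceed with force-deletion.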
8 changes: 5 additions & 3 deletions pkg/kubelet/status/status_manager.go
@@ -888,14 +888,16 @@ func (m *manager) canBeDeleted(pod *v1.Pod, status v1.PodStatus, podIsFinished b
if pod.DeletionTimestamp == nil || kubetypes.IsMirrorPod(pod) {
return false
}
// Delay deletion of pods until the phase is terminal.
// Delay deletion of pods until the phase is terminal, based on pod.Status
// which comes from pod manager.
if !podutil.IsPodPhaseTerminal(pod.Status.Phase) {
klog.V(3).InfoS("Delaying pod deletion as the phase is non-terminal", "phase", status.Phase, "pod", klog.KObj(pod), "podUID", pod.UID)
// For debugging purposes we also log the kubelet's local phase, when the deletion is delayed.
klog.V(3).InfoS("Delaying pod deletion as the phase is non-terminal", "phase", pod.Status.Phase, "localPhase", status.Phase, "pod", klog.KObj(pod), "podUID", pod.UID)
return false
}
// If this is an update completing pod termination then we know the pod termination is finished.
if podIsFinished {
klog.V(3).InfoS("The pod termination is finished as SyncTerminatedPod completes its execution", "phase", status.Phase, "pod", klog.KObj(pod), "podUID", pod.UID)
klog.V(3).InfoS("The pod termination is finished as SyncTerminatedPod completes its execution", "phase", pod.Status.Phase, "localPhase", status.Phase, "pod", klog.KObj(pod), "podUID", pod.UID)
return true
}
return false
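For illustration only, not part of this diff: a minimal sketch of the gating order that canBeDeleted applies after this change. The names are hypothetical, isPhaseTerminal stands in for podutil.IsPodPhaseTerminal, and the real method may include checks not visible in this hunk.

package sketch

import (
	v1 "k8s.io/api/core/v1"
)

// isPhaseTerminal stands in for podutil.IsPodPhaseTerminal.
func isPhaseTerminal(phase v1.PodPhase) bool {
	return phase == v1.PodFailed || phase == v1.PodSucceeded
}

// canBeDeletedSketch reproduces the gating order visible in the hunk above.
// pod.Status comes from the pod manager (the API server's view), while
// localStatus is the kubelet's locally computed status, kept for logging.
func canBeDeletedSketch(pod *v1.Pod, localStatus v1.PodStatus, podIsFinished, isMirror bool) bool {
	if pod.DeletionTimestamp == nil || isMirror {
		return false // not marked for deletion, or a mirror pod
	}
	if !isPhaseTerminal(pod.Status.Phase) {
		return false // delay deletion until the pod manager reports a terminal phase
	}
	if podIsFinished {
		return true // the update completing SyncTerminatedPod finishes the deletion
	}
	return false
}

The point of the change is that the gate uses pod.Status.Phase as reported by the pod manager, while the kubelet's locally computed status.Phase is now only logged for debugging.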
139 changes: 139 additions & 0 deletions test/e2e_node/restart_test.go
@@ -381,6 +381,145 @@ var _ = SIGDescribe("Restart [Serial] [Slow] [Disruptive]", func() {
return checkMirrorPodDisappear(ctx, f.ClientSet, pod.Name, pod.Namespace)
}, f.Timeouts.PodDelete, f.Timeouts.Poll).Should(gomega.BeNil())
})
// Regression test for https://issues.k8s.io/118472
ginkgo.It("should force-delete non-admissible pods created and deleted during kubelet restart", func(ctx context.Context) {
podName := "rejected-deleted-pod" + string(uuid.NewUUID())
gracePeriod := int64(30)
nodeName := getNodeName(ctx, f)
podSpec := e2epod.MustMixinRestrictedPodSecurity(&v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: podName,
Namespace: f.Namespace.Name,
},
Spec: v1.PodSpec{
NodeName: nodeName,
NodeSelector: map[string]string{
"this-label": "does-not-exist-on-any-nodes",
},
TerminationGracePeriodSeconds: &gracePeriod,
RestartPolicy: v1.RestartPolicyNever,
Containers: []v1.Container{
{
Name: podName,
Image: imageutils.GetPauseImageName(),
},
},
},
})
ginkgo.By("Stopping the kubelet")
startKubelet := stopKubelet()

// wait until the kubelet health check fails
gomega.Eventually(ctx, func() bool {
return kubeletHealthCheck(kubeletHealthCheckURL)
}, f.Timeouts.PodStart, f.Timeouts.Poll).Should(gomega.BeFalse())

// Create the pod bound to the node. It will remain in the Pending
// phase as Kubelet is down.
ginkgo.By(fmt.Sprintf("Creating a pod (%v/%v)", f.Namespace.Name, podName))
pod := e2epod.NewPodClient(f).Create(ctx, podSpec)

ginkgo.By(fmt.Sprintf("Deleting the pod (%v/%v) to set a deletion timestamp", pod.Namespace, pod.Name))
err := e2epod.NewPodClient(f).Delete(ctx, pod.Name, metav1.DeleteOptions{GracePeriodSeconds: &gracePeriod})
framework.ExpectNoError(err, "Failed to delete the pod: %q", pod.Name)

// Restart Kubelet so that it proceeds with deletion
ginkgo.By("Starting the kubelet")
startKubelet()

// wait until the kubelet health check succeeds
gomega.Eventually(ctx, func() bool {
return kubeletHealthCheck(kubeletHealthCheckURL)
}, f.Timeouts.PodStart, f.Timeouts.Poll).Should(gomega.BeTrue())

// Wait for the Kubelet to be ready.
gomega.Eventually(ctx, func(ctx context.Context) bool {
nodes, err := e2enode.TotalReady(ctx, f.ClientSet)
framework.ExpectNoError(err)
return nodes == 1
}, time.Minute, f.Timeouts.Poll).Should(gomega.BeTrue())

ginkgo.By(fmt.Sprintf("After the kubelet is restarted, verify the pod (%v/%v) is deleted by kubelet", pod.Namespace, pod.Name))
gomega.Eventually(ctx, func(ctx context.Context) error {
return checkMirrorPodDisappear(ctx, f.ClientSet, pod.Name, pod.Namespace)
}, f.Timeouts.PodDelete, f.Timeouts.Poll).Should(gomega.BeNil())
})
// Regression test for an extended scenario for https://issues.k8s.io/118472
ginkgo.It("should force-delete non-admissible pods that was admitted and running before kubelet restart", func(ctx context.Context) {
nodeLabelKey := "custom-label-key-required"
nodeLabelValueRequired := "custom-label-value-required-for-admission"
podName := "rejected-deleted-run" + string(uuid.NewUUID())
gracePeriod := int64(30)
nodeName := getNodeName(ctx, f)
pod := e2epod.MustMixinRestrictedPodSecurity(&v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: podName,
Namespace: f.Namespace.Name,
},
Spec: v1.PodSpec{
NodeSelector: map[string]string{
nodeLabelKey: nodeLabelValueRequired,
},
NodeName: nodeName,
TerminationGracePeriodSeconds: &gracePeriod,
RestartPolicy: v1.RestartPolicyNever,
Containers: []v1.Container{
{
Name: podName,
Image: imageutils.GetPauseImageName(),
},
},
},
})

ginkgo.By(fmt.Sprintf("Adding node label for node (%v) to allow admission of pod (%v/%v)", nodeName, f.Namespace.Name, podName))
e2enode.AddOrUpdateLabelOnNode(f.ClientSet, nodeName, nodeLabelKey, nodeLabelValueRequired)
ginkgo.DeferCleanup(func() { e2enode.RemoveLabelOffNode(f.ClientSet, nodeName, nodeLabelKey) })

// Create the pod bound to the node. It will start, but will be rejected after kubelet restart.
ginkgo.By(fmt.Sprintf("Creating a pod (%v/%v)", f.Namespace.Name, podName))
pod = e2epod.NewPodClient(f).Create(ctx, pod)

ginkgo.By(fmt.Sprintf("Waiting for the pod (%v/%v) to be running", f.Namespace.Name, pod.Name))
err := e2epod.WaitForPodNameRunningInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name)
framework.ExpectNoError(err, "Failed to await for the pod to be running: (%v/%v)", f.Namespace.Name, pod.Name)

ginkgo.By("Stopping the kubelet")
startKubelet := stopKubelet()

// wait until the kubelet health check fails
gomega.Eventually(ctx, func() bool {
return kubeletHealthCheck(kubeletHealthCheckURL)
}, f.Timeouts.PodStart, f.Timeouts.Poll).Should(gomega.BeFalse())

ginkgo.By(fmt.Sprintf("Deleting the pod (%v/%v) to set a deletion timestamp", pod.Namespace, pod.Name))
err = e2epod.NewPodClient(f).Delete(ctx, pod.Name, metav1.DeleteOptions{GracePeriodSeconds: &gracePeriod})
framework.ExpectNoError(err, "Failed to delete the pod: %q", pod.Name)

ginkgo.By(fmt.Sprintf("Removing node label for node (%v) to ensure the pod (%v/%v) is rejected after kubelet restart", nodeName, f.Namespace.Name, podName))
e2enode.RemoveLabelOffNode(f.ClientSet, nodeName, nodeLabelKey)

// Restart Kubelet so that it proceeds with deletion
ginkgo.By("Starting the kubelet")
startKubelet()

// wait until the kubelet health check succeeds
gomega.Eventually(ctx, func() bool {
return kubeletHealthCheck(kubeletHealthCheckURL)
}, f.Timeouts.PodStart, f.Timeouts.Poll).Should(gomega.BeTrue())

// Wait for the Kubelet to be ready.
gomega.Eventually(ctx, func(ctx context.Context) bool {
nodes, err := e2enode.TotalReady(ctx, f.ClientSet)
framework.ExpectNoError(err)
return nodes == 1
}, time.Minute, f.Timeouts.Poll).Should(gomega.BeTrue())

ginkgo.By(fmt.Sprintf("Once Kubelet is restarted, verify the pod (%v/%v) is deleted by kubelet", pod.Namespace, pod.Name))
gomega.Eventually(ctx, func(ctx context.Context) error {
return checkMirrorPodDisappear(ctx, f.ClientSet, pod.Name, pod.Namespace)
}, f.Timeouts.PodDelete, f.Timeouts.Poll).Should(gomega.BeNil())
})
})

})