Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

kubelet: fix create sandbox delete pod race #98933

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 3 additions & 1 deletion pkg/kubelet/kuberuntime/fake_kuberuntime_manager.go
Expand Up @@ -101,7 +101,9 @@ func newFakeKubeRuntimeManager(runtimeService internalapi.RuntimeService, imageS
return nil, err
}

kubeRuntimeManager.containerGC = newContainerGC(runtimeService, newFakePodStateProvider(), kubeRuntimeManager)
podStateProvider := newFakePodStateProvider()
kubeRuntimeManager.containerGC = newContainerGC(runtimeService, podStateProvider, kubeRuntimeManager)
kubeRuntimeManager.podStateProvider = podStateProvider
kubeRuntimeManager.runtimeName = typedVersion.RuntimeName
kubeRuntimeManager.imagePuller = images.NewImageManager(
kubecontainer.FilterEventRecorder(recorder),
Expand Down
12 changes: 12 additions & 0 deletions pkg/kubelet/kuberuntime/kuberuntime_manager.go
Expand Up @@ -137,6 +137,9 @@ type kubeGenericRuntimeManager struct {

// Cache last per-container error message to reduce log spam
logReduction *logreduction.LogReduction

// PodState provider instance
podStateProvider podStateProvider
}

// KubeGenericRuntime is a interface contains interfaces for container runtime and command.
Expand Down Expand Up @@ -248,6 +251,7 @@ func NewKubeGenericRuntimeManager(
imagePullBurst)
kubeRuntimeManager.runner = lifecycle.NewHandlerRunner(httpClient, kubeRuntimeManager, kubeRuntimeManager)
kubeRuntimeManager.containerGC = newContainerGC(runtimeService, podStateProvider, kubeRuntimeManager)
kubeRuntimeManager.podStateProvider = podStateProvider

kubeRuntimeManager.versionCache = cache.NewObjectCache(
func() (interface{}, error) {
Expand Down Expand Up @@ -751,6 +755,14 @@ func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, podStatus *kubecontaine
result.AddSyncResult(createSandboxResult)
podSandboxID, msg, err = m.createPodSandbox(pod, podContainerChanges.Attempt)
if err != nil {
// createPodSandbox can return an error from CNI, CSI,
// or CRI if the Pod has been deleted while the POD is
// being created. If the pod has been deleted then it's
// not a real error.
if m.podStateProvider.IsPodDeleted(pod.UID) {
klog.V(4).Infof("Pod %q was deleted and sandbox failed to be created: %v", format.Pod(pod), pod.UID)
return
}
createSandboxResult.Fail(kubecontainer.ErrCreatePodSandbox, msg)
klog.Errorf("createPodSandbox for pod %q failed: %v", format.Pod(pod), err)
ref, referr := ref.GetReference(legacyscheme.Scheme, pod)
Expand Down
35 changes: 35 additions & 0 deletions pkg/kubelet/kuberuntime/kuberuntime_manager_test.go
Expand Up @@ -1372,6 +1372,41 @@ func TestComputePodActionsWithInitAndEphemeralContainers(t *testing.T) {
}
}

func TestSyncPodWithSandboxAndDeletedPod(t *testing.T) {
fakeRuntime, _, m, err := createTestRuntimeManager()
assert.NoError(t, err)
fakeRuntime.ErrorOnSandboxCreate = true

containers := []v1.Container{
{
Name: "foo1",
Image: "busybox",
ImagePullPolicy: v1.PullIfNotPresent,
},
}
pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: "12345678",
Name: "foo",
Namespace: "new",
},
Spec: v1.PodSpec{
Containers: containers,
},
}

backOff := flowcontrol.NewBackOff(time.Second, time.Minute)

// GetPodStatus and the following SyncPod will not return errors in the
// case where the pod has been deleted. We are not adding any pods into
// the fakePodProvider so they are 'deleted'.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

podStatus, err := m.GetPodStatus(pod.UID, pod.Name, pod.Namespace)
assert.NoError(t, err)
result := m.SyncPod(pod, podStatus, []v1.Secret{}, backOff)
// This will return an error if the pod has _not_ been deleted.
assert.NoError(t, result.Error())
}

func makeBasePodAndStatusWithInitAndEphemeralContainers() (*v1.Pod, *kubecontainer.PodStatus) {
pod, status := makeBasePodAndStatus()
pod.Spec.InitContainers = []v1.Container{
Expand Down
Expand Up @@ -67,6 +67,8 @@ type FakeRuntimeService struct {
Containers map[string]*FakeContainer
Sandboxes map[string]*FakePodSandbox
FakeContainerStats map[string]*runtimeapi.ContainerStats

ErrorOnSandboxCreate bool
}

// GetContainerID returns the unique container ID from the FakeRuntimeService.
Expand Down Expand Up @@ -198,6 +200,10 @@ func (r *FakeRuntimeService) RunPodSandbox(config *runtimeapi.PodSandboxConfig,
return "", err
}

if r.ErrorOnSandboxCreate {
return "", fmt.Errorf("error on sandbox create")
}

// PodSandboxID should be randomized for real container runtime, but here just use
// fixed name from BuildSandboxName() for easily making fake sandboxes.
podSandboxID := BuildSandboxName(config.Metadata)
Expand Down