if a pod is taking too long to evict, move on to others
Signed-off-by: Olga Shestopalova <oshestopalova@hubspot.com>
Olga Shestopalova committed Mar 27, 2024
1 parent 227c2e7 commit a3e7fac
Showing 4 changed files with 165 additions and 3 deletions.
10 changes: 10 additions & 0 deletions pkg/kubelet/eviction/eviction_manager.go
@@ -102,6 +102,8 @@ type managerImpl struct {
thresholdsLastUpdated time.Time
// whether can support local storage capacity isolation
localStorageCapacityIsolation bool
// podsEvicted is the set of pods, keyed by "namespace/name", that we have already attempted to evict
podsEvicted map[string]bool
}

// ensure it implements the required interface
@@ -134,6 +136,7 @@ func NewManager(
splitContainerImageFs: nil,
thresholdNotifiers: []ThresholdNotifier{},
localStorageCapacityIsolation: localStorageCapacityIsolation,
podsEvicted: map[string]bool{},
}
return manager, manager
}
@@ -354,6 +357,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act

if len(thresholds) == 0 {
klog.V(3).InfoS("Eviction manager: no resources are starved")
m.podsEvicted = map[string]bool{}
return nil, nil
}

@@ -405,6 +409,12 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// we kill at most a single pod during each eviction interval
for i := range activePods {
pod := activePods[i]
podAndNamespace := pod.Namespace + "/" + pod.Name
if _, ok := m.podsEvicted[podAndNamespace]; ok {
klog.Warningf("Eviction manager: not evicting pod that we already attempted to evict: %v", podAndNamespace)
continue
}
m.podsEvicted[podAndNamespace] = true
gracePeriodOverride := int64(0)
if !isHardEvictionThreshold(thresholdToReclaim) {
gracePeriodOverride = m.config.MaxPodGracePeriodSeconds
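The eviction_manager.go change above keeps a per-pressure-episode record of pods whose eviction has already been requested, skips them on later synchronize passes while they are still shutting down, and clears the record once no thresholds are starved. Below is a minimal, self-contained sketch of that pattern; pressureManager, pod, and this synchronize are simplified stand-ins for illustration, not the kubelet types.

package main

import "fmt"

// pressureManager is a stripped-down stand-in for the eviction manager,
// showing only the "skip pods we already tried" bookkeeping.
type pressureManager struct {
	// attempted records pods (keyed by "namespace/name") whose eviction was
	// already requested during the current pressure episode.
	attempted map[string]bool
}

// pod is a minimal stand-in for v1.Pod.
type pod struct{ namespace, name string }

// synchronize picks at most one not-yet-attempted pod per pass.
// underPressure=false models "no resources are starved" and resets the record.
func (m *pressureManager) synchronize(underPressure bool, activePods []pod) *pod {
	if !underPressure {
		m.attempted = map[string]bool{}
		return nil
	}
	for i := range activePods {
		key := activePods[i].namespace + "/" + activePods[i].name
		if m.attempted[key] {
			continue // still terminating from a previous pass; move on
		}
		m.attempted[key] = true
		return &activePods[i]
	}
	return nil
}

func main() {
	m := &pressureManager{attempted: map[string]bool{}}
	pods := []pod{{"default", "a"}, {"default", "b"}}

	fmt.Println(m.synchronize(true, pods))  // &{default a}: first candidate
	fmt.Println(m.synchronize(true, pods))  // &{default b}: "a" is skipped while it drains
	fmt.Println(m.synchronize(false, pods)) // <nil>: pressure cleared, record reset
	fmt.Println(m.synchronize(true, pods))  // &{default a}: eligible again
}

Recording the attempt rather than its completion is what lets a later pass pick a different victim while the first one is still draining, which is the behavior the new test below exercises.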
143 changes: 141 additions & 2 deletions pkg/kubelet/eviction/eviction_manager_test.go
@@ -339,6 +339,7 @@ func TestMemoryPressure_VerifyPodStatus(t *testing.T) {
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

// synchronize to detect the memory pressure
@@ -441,6 +442,7 @@ func TestPIDPressure_VerifyPodStatus(t *testing.T) {
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

// synchronize to detect the PID pressure
@@ -624,6 +626,7 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) {
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

// synchronize
@@ -728,6 +731,7 @@ func TestMemoryPressure(t *testing.T) {
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

// create a best effort pod to test admission
@@ -999,6 +1003,7 @@ func TestPIDPressure(t *testing.T) {
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

// create a pod to test admission
@@ -1376,6 +1381,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

// create a best effort pod to test admission
@@ -1575,6 +1581,7 @@ func TestMinReclaim(t *testing.T) {
podStats[pod] = podStat
}
podToEvict := pods[4]
nextPodToEvict := pods[2]
activePodsFunc := func() []*v1.Pod {
return pods
}
@@ -1613,6 +1620,7 @@ func TestMinReclaim(t *testing.T) {
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

// synchronize
@@ -1664,8 +1672,8 @@ func TestMinReclaim(t *testing.T) {
}

// check the right pod was killed
if podKiller.pod != podToEvict {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
if podKiller.pod != nextPodToEvict {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, nextPodToEvict.Name)
}
observedGracePeriod = *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
@@ -1898,6 +1906,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

// synchronize
@@ -2355,6 +2364,7 @@ func TestInodePressureFsInodes(t *testing.T) {
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

// create a best effort pod to test admission
@@ -2589,6 +2599,7 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) {
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

fakeClock.Step(1 * time.Minute)
@@ -2720,6 +2731,7 @@ func TestAllocatableMemoryPressure(t *testing.T) {
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

// create a best effort pod to test admission
@@ -2889,6 +2901,7 @@ func TestUpdateMemcgThreshold(t *testing.T) {
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
thresholdNotifiers: []ThresholdNotifier{thresholdNotifier},
podsEvicted: map[string]bool{},
}

// The UpdateThreshold method should have been called once, since this is the first run.
@@ -2984,6 +2997,7 @@ func TestManagerWithLocalStorageCapacityIsolationOpen(t *testing.T) {
nodeRef: nodeRef,
localStorageCapacityIsolation: true,
dedicatedImageFs: diskInfoProvider.dedicatedImageFs,
podsEvicted: map[string]bool{},
}

activePodsFunc := func() []*v1.Pod {
@@ -3003,3 +3017,128 @@ func TestManagerWithLocalStorageCapacityIsolationOpen(t *testing.T) {
t.Fatalf("Unexpected evicted pod (-want,+got):\n%s", diff)
}
}

// TestSoftEvictOthersWhileWaitingForPodGracefulShutdown verifies that if a pod is taking too long to evict, we
// proceed to evict others while it shuts down
func TestSoftEvictOthersWhileWaitingForPodGracefulShutdown(t *testing.T) {
podMaker := makePodWithMemoryStats
summaryStatsMaker := makeMemoryStats
podsToMake := []podToMake{
{name: "first-to-go", priority: defaultPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), memoryWorkingSet: "1Gi"},
{name: "second-to-go", priority: defaultPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), memoryWorkingSet: "900Mi"},
{name: "this-one-stays", priority: defaultPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), memoryWorkingSet: "100Mi"},
}
pods := []*v1.Pod{}
podStats := map[*v1.Pod]statsapi.PodStats{}
for _, podToMake := range podsToMake {
pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.memoryWorkingSet)
pods = append(pods, pod)
podStats[pod] = podStat
}
podToEvict := pods[0]
nextPodToEvict := pods[1]
activePodsFunc := func() []*v1.Pod {
return pods
}

fakeClock := testingclock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)}
diskGC := &mockDiskGC{err: nil}
nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}

config := Config{
MaxPodGracePeriodSeconds: 5,
PressureTransitionPeriod: time.Minute * 5,
Thresholds: []evictionapi.Threshold{
{
Signal: evictionapi.SignalMemoryAvailable,
Operator: evictionapi.OpLessThan,
Value: evictionapi.ThresholdValue{
Quantity: quantityMustParse("1Gi"),
},
},
},
}
summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("500Mi", podStats)}
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: diskGC,
containerGC: diskGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

// synchronize
_, err := manager.synchronize(diskInfoProvider, activePodsFunc)
if err != nil {
t.Fatalf("Manager should not report any errors")
}

if podKiller.pod == nil {
t.Fatalf("Manager should have chosen to kill a pod, but did not")
}
if podKiller.pod != podToEvict {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod := *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
}

// pod is taking too long to shut down, we still have memory pressure
fakeClock.Step(6 * time.Minute)
podKiller.pod = nil
_, err = manager.synchronize(diskInfoProvider, activePodsFunc)
if err != nil {
t.Fatalf("Manager should not have an error %v", err)
}
if podKiller.pod == nil {
t.Fatalf("Manager should have chosen to kill a pod, but did not")
}
if podKiller.pod != nextPodToEvict {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, nextPodToEvict.Name)
}
observedGracePeriod = *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
}

// now we no longer have memory pressure, this will reset who has been evicted (in real life, the pods may have re-joined)
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("2Gi", podStats)
podKiller.pod = nil
_, err = manager.synchronize(diskInfoProvider, activePodsFunc)
if err != nil {
t.Fatalf("Manager should not have an error %v", err)
}
if podKiller.pod != nil {
t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name)
}

// under memory pressure again, should choose the first pod again
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("500Mi", podStats)
podKiller.pod = nil
_, err = manager.synchronize(diskInfoProvider, activePodsFunc)
if err != nil {
t.Fatalf("Manager should not have an error %v", err)
}
if podKiller.pod == nil {
t.Fatalf("Manager should have chosen to kill a pod, but did not")
}
if podKiller.pod != podToEvict {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod = *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
}
}
5 changes: 4 additions & 1 deletion pkg/kubelet/kubelet.go
@@ -203,6 +203,9 @@ const (

// instrumentationScope is the name of OpenTelemetry instrumentation scope
instrumentationScope = "k8s.io/kubernetes/pkg/kubelet"

// urgentPodKillTimeoutSeconds is the maximum time, in seconds, that an urgent pod kill waits for the pod to terminate before timing out
urgentPodKillTimeoutSeconds = 300
)

var (
@@ -856,7 +859,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,

// setup eviction manager
evictionManager, evictionAdmitHandler := eviction.NewManager(klet.resourceAnalyzer, evictionConfig,
killPodNow(klet.podWorkers, kubeDeps.Recorder), klet.imageManager, klet.containerGC, kubeDeps.Recorder, nodeRef, klet.clock, kubeCfg.LocalStorageCapacityIsolation)
killPodNowWithMaxTimeout(klet.podWorkers, kubeDeps.Recorder, urgentPodKillTimeoutSeconds), klet.imageManager, klet.containerGC, kubeDeps.Recorder, nodeRef, klet.clock, kubeCfg.LocalStorageCapacityIsolation)

klet.evictionManager = evictionManager
klet.admitHandlers.AddPodAdmitHandler(evictionAdmitHandler)
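Passing urgentPodKillTimeoutSeconds here only bounds how long the eviction manager's KillPodFunc blocks waiting for the pod worker to confirm termination; the pod is still asked to stop with its computed grace period and keeps shutting down in the background while synchronize moves on. A sketch of that bounded-wait shape, with a hypothetical requestStop callback standing in for the real podWorkers machinery:

package main

import (
	"errors"
	"fmt"
	"time"
)

// killWithTimeout asks a worker to stop a pod, then blocks until the worker
// reports back or the (possibly capped) timeout elapses. requestStop and its
// result channel are illustrative stand-ins, not the kubelet API.
func killWithTimeout(requestStop func(done chan<- error), timeout time.Duration) error {
	done := make(chan error, 1) // buffered so a late reply never blocks the worker
	go requestStop(done)
	select {
	case err := <-done:
		return err
	case <-time.After(timeout):
		return errors.New("timeout waiting to kill pod")
	}
}

func main() {
	slowShutdown := func(done chan<- error) {
		time.Sleep(200 * time.Millisecond) // the pod takes "too long" to stop
		done <- nil
	}

	// With a small cap the caller gets control back and can consider other
	// pods, while the slow pod keeps terminating in the background.
	fmt.Println(killWithTimeout(slowShutdown, 50*time.Millisecond)) // timeout waiting to kill pod
	fmt.Println(killWithTimeout(slowShutdown, time.Second))         // <nil>
}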
10 changes: 10 additions & 0 deletions pkg/kubelet/pod_workers.go
@@ -1645,6 +1645,13 @@ func (p *podWorkers) removeTerminatedWorker(uid types.UID, status *podSyncStatus
// killPodNow returns a KillPodFunc that can be used to kill a pod.
// It is intended to be injected into other modules that need to kill a pod.
func killPodNow(podWorkers PodWorkers, recorder record.EventRecorder) eviction.KillPodFunc {
return killPodNowWithMaxTimeout(podWorkers, recorder, -1)
}

// killPodNowWithMaxTimeout returns a KillPodFunc that can be used to kill a pod.
// It is intended to be injected into other modules that need to kill a pod.
// It accepts a maximum timeout, in seconds, to wait for the kill to complete before returning to the caller; -1 means no maximum.
func killPodNowWithMaxTimeout(podWorkers PodWorkers, recorder record.EventRecorder, maxTimeout int64) eviction.KillPodFunc {
return func(pod *v1.Pod, isEvicted bool, gracePeriodOverride *int64, statusFn func(*v1.PodStatus)) error {
// determine the grace period to use when killing the pod
gracePeriod := int64(0)
@@ -1661,6 +1668,9 @@ func killPodNow(podWorkers PodWorkers, recorder record.EventRecorder) eviction.K
if timeout < minTimeout {
timeout = minTimeout
}
if maxTimeout != -1 && timeout > maxTimeout {
timeout = maxTimeout
}
timeoutDuration := time.Duration(timeout) * time.Second

// open a channel we block against until we get a result
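To make the cap concrete: killPodNow derives its wait from the pod's grace period and floors it at a minimum, and the new clause then caps it at maxTimeout unless maxTimeout is -1. The numbers below are a worked example under the assumption that the wait is the grace period plus half again and the floor is 10 seconds; the exact derivation is elided from the hunk above.

package main

import "fmt"

// boundedWait clamps a grace-period-derived wait to [minTimeout, maxTimeout].
// maxTimeout == -1 means "no cap", which preserves the old killPodNow behavior.
// The gracePeriod*3/2 derivation and the 10s floor are assumptions for
// illustration, not necessarily the kubelet's exact formula.
func boundedWait(gracePeriodSeconds, maxTimeout int64) int64 {
	timeout := gracePeriodSeconds + gracePeriodSeconds/2
	const minTimeout = int64(10)
	if timeout < minTimeout {
		timeout = minTimeout
	}
	if maxTimeout != -1 && timeout > maxTimeout {
		timeout = maxTimeout
	}
	return timeout
}

func main() {
	const urgentPodKillTimeoutSeconds = 300

	fmt.Println(boundedWait(30, -1))                           // 45: uncapped, old behavior
	fmt.Println(boundedWait(600, -1))                          // 900: a 10-minute grace period could block eviction for 15 minutes
	fmt.Println(boundedWait(600, urgentPodKillTimeoutSeconds)) // 300: eviction gives up waiting after 5 minutes
}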
