if a pod is taking too long to evict, move on to others
Signed-off-by: Olga Shestopalova <oshestopalova@hubspot.com>
Olga Shestopalova committed Mar 27, 2024
1 parent 227c2e7 commit a3e7fac
Showing 4 changed files with 165 additions and 3 deletions.
10 changes: 10 additions & 0 deletions pkg/kubelet/eviction/eviction_manager.go
@@ -102,6 +102,8 @@ type managerImpl struct {
thresholdsLastUpdated time.Time
// whether can support local storage capacity isolation
localStorageCapacityIsolation bool
// podsEvicted is the set of pods, keyed by "namespace/name", that we have already attempted to evict
podsEvicted map[string]bool
}

// ensure it implements the required interface
@@ -134,6 +136,7 @@ func NewManager(
splitContainerImageFs: nil,
thresholdNotifiers: []ThresholdNotifier{},
localStorageCapacityIsolation: localStorageCapacityIsolation,
podsEvicted: map[string]bool{},
}
return manager, manager
}
@@ -354,6 +357,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act

if len(thresholds) == 0 {
klog.V(3).InfoS("Eviction manager: no resources are starved")
m.podsEvicted = map[string]bool{}
return nil, nil
}

@@ -405,6 +409,12 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// we kill at most a single pod during each eviction interval
for i := range activePods {
pod := activePods[i]
podAndNamespace := pod.Namespace + "/" + pod.Name
if _, ok := m.podsEvicted[podAndNamespace]; ok {
klog.Warningf("Eviction manager: not evicting pod that we already attempted to evict: %v", podAndNamespace)
continue
}
m.podsEvicted[podAndNamespace] = true
gracePeriodOverride := int64(0)
if !isHardEvictionThreshold(thresholdToReclaim) {
gracePeriodOverride = m.config.MaxPodGracePeriodSeconds
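The eviction_manager.go change above keeps a per-pressure-episode record of pods whose eviction has already been requested, skips them on later synchronize passes while they are still shutting down, and clears the record once no thresholds are starved. Below is a minimal, self-contained sketch of that pattern; pressureManager, pod, and this synchronize are simplified stand-ins for illustration, not the kubelet types.

package main

import "fmt"

// pressureManager is a stripped-down stand-in for the eviction manager,
// showing only the "skip pods we already tried" bookkeeping.
type pressureManager struct {
	// attempted records pods (keyed by "namespace/name") whose eviction was
	// already requested during the current pressure episode.
	attempted map[string]bool
}

// pod is a minimal stand-in for v1.Pod.
type pod struct{ namespace, name string }

// synchronize picks at most one not-yet-attempted pod per pass.
// underPressure=false models "no resources are starved" and resets the record.
func (m *pressureManager) synchronize(underPressure bool, activePods []pod) *pod {
	if !underPressure {
		m.attempted = map[string]bool{}
		return nil
	}
	for i := range activePods {
		key := activePods[i].namespace + "/" + activePods[i].name
		if m.attempted[key] {
			continue // still terminating from a previous pass; move on
		}
		m.attempted[key] = true
		return &activePods[i]
	}
	return nil
}

func main() {
	m := &pressureManager{attempted: map[string]bool{}}
	pods := []pod{{"default", "a"}, {"default", "b"}}

	fmt.Println(m.synchronize(true, pods))  // &{default a}: first candidate
	fmt.Println(m.synchronize(true, pods))  // &{default b}: "a" is skipped while it drains
	fmt.Println(m.synchronize(false, pods)) // <nil>: pressure cleared, record reset
	fmt.Println(m.synchronize(true, pods))  // &{default a}: eligible again
}

Recording the attempt rather than its completion is what lets a later pass pick a different victim while the first one is still draining, which is the behavior the new test below exercises.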
143 changes: 141 additions & 2 deletions pkg/kubelet/eviction/eviction_manager_test.go
@@ -339,6 +339,7 @@ func TestMemoryPressure_VerifyPodStatus(t *testing.T) {
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

// synchronize to detect the memory pressure
@@ -441,6 +442,7 @@ func TestPIDPressure_VerifyPodStatus(t *testing.T) {
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

// synchronize to detect the PID pressure
@@ -624,6 +626,7 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) {
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

// synchronize
@@ -728,6 +731,7 @@ func TestMemoryPressure(t *testing.T) {
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

// create a best effort pod to test admission
@@ -999,6 +1003,7 @@ func TestPIDPressure(t *testing.T) {
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

// create a pod to test admission
@@ -1376,6 +1381,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

// create a best effort pod to test admission
@@ -1575,6 +1581,7 @@ func TestMinReclaim(t *testing.T) {
podStats[pod] = podStat
}
podToEvict := pods[4]
nextPodToEvict := pods[2]
activePodsFunc := func() []*v1.Pod {
return pods
}
@@ -1613,6 +1620,7 @@ func TestMinReclaim(t *testing.T) {
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

// synchronize
@@ -1664,8 +1672,8 @@ func TestMinReclaim(t *testing.T) {
}

// check the right pod was killed
if podKiller.pod != podToEvict {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
if podKiller.pod != nextPodToEvict {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, nextPodToEvict.Name)
}
observedGracePeriod = *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
@@ -1898,6 +1906,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

// synchronize
@@ -2355,6 +2364,7 @@ func TestInodePressureFsInodes(t *testing.T) {
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

// create a best effort pod to test admission
@@ -2589,6 +2599,7 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) {
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

fakeClock.Step(1 * time.Minute)
@@ -2720,6 +2731,7 @@ func TestAllocatableMemoryPressure(t *testing.T) {
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

// create a best effort pod to test admission
@@ -2889,6 +2901,7 @@ func TestUpdateMemcgThreshold(t *testing.T) {
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
thresholdNotifiers: []ThresholdNotifier{thresholdNotifier},
podsEvicted: map[string]bool{},
}

// The UpdateThreshold method should have been called once, since this is the first run.
@@ -2984,6 +2997,7 @@ func TestManagerWithLocalStorageCapacityIsolationOpen(t *testing.T) {
nodeRef: nodeRef,
localStorageCapacityIsolation: true,
dedicatedImageFs: diskInfoProvider.dedicatedImageFs,
podsEvicted: map[string]bool{},
}

activePodsFunc := func() []*v1.Pod {
@@ -3003,3 +3017,128 @@ func TestManagerWithLocalStorageCapacityIsolationOpen(t *testing.T) {
t.Fatalf("Unexpected evicted pod (-want,+got):\n%s", diff)
}
}

// TestSoftEvictOthersWhileWaitingForPodGracefulShutdown verifies that if a pod is taking too long to evict, we
// proceed to evict others while it shuts down
func TestSoftEvictOthersWhileWaitingForPodGracefulShutdown(t *testing.T) {
podMaker := makePodWithMemoryStats
summaryStatsMaker := makeMemoryStats
podsToMake := []podToMake{
{name: "first-to-go", priority: defaultPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), memoryWorkingSet: "1Gi"},
{name: "second-to-go", priority: defaultPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), memoryWorkingSet: "900Mi"},
{name: "this-one-stays", priority: defaultPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), memoryWorkingSet: "100Mi"},
}
pods := []*v1.Pod{}
podStats := map[*v1.Pod]statsapi.PodStats{}
for _, podToMake := range podsToMake {
pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.memoryWorkingSet)
pods = append(pods, pod)
podStats[pod] = podStat
}
podToEvict := pods[0]
nextPodToEvict := pods[1]
activePodsFunc := func() []*v1.Pod {
return pods
}

fakeClock := testingclock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)}
diskGC := &mockDiskGC{err: nil}
nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}

config := Config{
MaxPodGracePeriodSeconds: 5,
PressureTransitionPeriod: time.Minute * 5,
Thresholds: []evictionapi.Threshold{
{
Signal: evictionapi.SignalMemoryAvailable,
Operator: evictionapi.OpLessThan,
Value: evictionapi.ThresholdValue{
Quantity: quantityMustParse("1Gi"),
},
},
},
}
summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("500Mi", podStats)}
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: diskGC,
containerGC: diskGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
podsEvicted: map[string]bool{},
}

// synchronize
_, err := manager.synchronize(diskInfoProvider, activePodsFunc)
if err != nil {
t.Fatalf("Manager should not report any errors")
}

if podKiller.pod == nil {
t.Fatalf("Manager should have chosen to kill a pod, but did not")
}
if podKiller.pod != podToEvict {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod := *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
}

// pod is taking too long to shut down, we still have memory pressure
fakeClock.Step(6 * time.Minute)
podKiller.pod = nil
_, err = manager.synchronize(diskInfoProvider, activePodsFunc)
if err != nil {
t.Fatalf("Manager should not have an error %v", err)
}
if podKiller.pod == nil {
t.Fatalf("Manager should have chosen to kill a pod, but did not")
}
if podKiller.pod != nextPodToEvict {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, nextPodToEvict.Name)
}
observedGracePeriod = *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
}

// now we no longer have memory pressure, this will reset who has been evicted (in real life, the pods may have re-joined)
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("2Gi", podStats)
podKiller.pod = nil
_, err = manager.synchronize(diskInfoProvider, activePodsFunc)
if err != nil {
t.Fatalf("Manager should not have an error %v", err)
}
if podKiller.pod != nil {
t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name)
}

// under memory pressure again, should choose the first pod again
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("500Mi", podStats)
podKiller.pod = nil
_, err = manager.synchronize(diskInfoProvider, activePodsFunc)
if err != nil {
t.Fatalf("Manager should not have an error %v", err)
}
if podKiller.pod == nil {
t.Fatalf("Manager should have chosen to kill a pod, but did not")
}
if podKiller.pod != podToEvict {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod = *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
}
}
5 changes: 4 additions & 1 deletion pkg/kubelet/kubelet.go
@@ -203,6 +203,9 @@ const (

// instrumentationScope is the name of OpenTelemetry instrumentation scope
instrumentationScope = "k8s.io/kubernetes/pkg/kubelet"

// urgentPodKillTimeoutSeconds is the maximum time, in seconds, that an urgent pod kill waits for the pod to terminate before timing out
urgentPodKillTimeoutSeconds = 300
)

var (
@@ -856,7 +859,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,

// setup eviction manager
evictionManager, evictionAdmitHandler := eviction.NewManager(klet.resourceAnalyzer, evictionConfig,
killPodNow(klet.podWorkers, kubeDeps.Recorder), klet.imageManager, klet.containerGC, kubeDeps.Recorder, nodeRef, klet.clock, kubeCfg.LocalStorageCapacityIsolation)
killPodNowWithMaxTimeout(klet.podWorkers, kubeDeps.Recorder, urgentPodKillTimeoutSeconds), klet.imageManager, klet.containerGC, kubeDeps.Recorder, nodeRef, klet.clock, kubeCfg.LocalStorageCapacityIsolation)

klet.evictionManager = evictionManager
klet.admitHandlers.AddPodAdmitHandler(evictionAdmitHandler)
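Passing urgentPodKillTimeoutSeconds here only bounds how long the eviction manager's KillPodFunc blocks waiting for the pod worker to confirm termination; the pod is still asked to stop with its computed grace period and keeps shutting down in the background while synchronize moves on. A sketch of that bounded-wait shape, with a hypothetical requestStop callback standing in for the real podWorkers machinery:

package main

import (
	"errors"
	"fmt"
	"time"
)

// killWithTimeout asks a worker to stop a pod, then blocks until the worker
// reports back or the (possibly capped) timeout elapses. requestStop and its
// result channel are illustrative stand-ins, not the kubelet API.
func killWithTimeout(requestStop func(done chan<- error), timeout time.Duration) error {
	done := make(chan error, 1) // buffered so a late reply never blocks the worker
	go requestStop(done)
	select {
	case err := <-done:
		return err
	case <-time.After(timeout):
		return errors.New("timeout waiting to kill pod")
	}
}

func main() {
	slowShutdown := func(done chan<- error) {
		time.Sleep(200 * time.Millisecond) // the pod takes "too long" to stop
		done <- nil
	}

	// With a small cap the caller gets control back and can consider other
	// pods, while the slow pod keeps terminating in the background.
	fmt.Println(killWithTimeout(slowShutdown, 50*time.Millisecond)) // timeout waiting to kill pod
	fmt.Println(killWithTimeout(slowShutdown, time.Second))         // <nil>
}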
10 changes: 10 additions & 0 deletions pkg/kubelet/pod_workers.go
@@ -1645,6 +1645,13 @@ func (p *podWorkers) removeTerminatedWorker(uid types.UID, status *podSyncStatus
// killPodNow returns a KillPodFunc that can be used to kill a pod.
// It is intended to be injected into other modules that need to kill a pod.
func killPodNow(podWorkers PodWorkers, recorder record.EventRecorder) eviction.KillPodFunc {
return killPodNowWithMaxTimeout(podWorkers, recorder, -1)
}

// killPodNowWithMaxTimeout returns a KillPodFunc that can be used to kill a pod.
// It is intended to be injected into other modules that need to kill a pod.
// It accepts a maximum timeout, in seconds, to wait for the kill to complete before returning to the caller; -1 means no maximum.
func killPodNowWithMaxTimeout(podWorkers PodWorkers, recorder record.EventRecorder, maxTimeout int64) eviction.KillPodFunc {
return func(pod *v1.Pod, isEvicted bool, gracePeriodOverride *int64, statusFn func(*v1.PodStatus)) error {
// determine the grace period to use when killing the pod
gracePeriod := int64(0)
@@ -1661,6 +1668,9 @@ func killPodNow(podWorkers PodWorkers, recorder record.EventRecorder) eviction.K
if timeout < minTimeout {
timeout = minTimeout
}
if maxTimeout != -1 && timeout > maxTimeout {
timeout = maxTimeout
}
timeoutDuration := time.Duration(timeout) * time.Second

// open a channel we block against until we get a result
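To make the cap concrete: killPodNow derives its wait from the pod's grace period and floors it at a minimum, and the new clause then caps it at maxTimeout unless maxTimeout is -1. The numbers below are a worked example under the assumption that the wait is the grace period plus half again and the floor is 10 seconds; the exact derivation is elided from the hunk above.

package main

import "fmt"

// boundedWait clamps a grace-period-derived wait to [minTimeout, maxTimeout].
// maxTimeout == -1 means "no cap", which preserves the old killPodNow behavior.
// The gracePeriod*3/2 derivation and the 10s floor are assumptions for
// illustration, not necessarily the kubelet's exact formula.
func boundedWait(gracePeriodSeconds, maxTimeout int64) int64 {
	timeout := gracePeriodSeconds + gracePeriodSeconds/2
	const minTimeout = int64(10)
	if timeout < minTimeout {
		timeout = minTimeout
	}
	if maxTimeout != -1 && timeout > maxTimeout {
		timeout = maxTimeout
	}
	return timeout
}

func main() {
	const urgentPodKillTimeoutSeconds = 300

	fmt.Println(boundedWait(30, -1))                           // 45: uncapped, old behavior
	fmt.Println(boundedWait(600, -1))                          // 900: a 10-minute grace period could block eviction for 15 minutes
	fmt.Println(boundedWait(600, urgentPodKillTimeoutSeconds)) // 300: eviction gives up waiting after 5 minutes
}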
