From 22079a79d45ffb2e478e275731b9e5ee2b6bf99d Mon Sep 17 00:00:00 2001
From: Shingo Omura
Date: Sat, 29 Dec 2018 23:24:51 +0900
Subject: [PATCH] change sort function of scheduling queue to avoid starvation when unschedulable pods are in the queue

When starvation happens:
- a lot of unschedulable pods exist at the head of the queue
- because condition.LastTransitionTime is updated only when condition.Status changes
- (this means that once a pod is marked unschedulable, the field is never updated until the pod is successfully scheduled.)

What was changed:
- condition.LastProbeTime is now updated every time a pod is determined unschedulable.
- the sort function now uses LastProbeTime, which avoids the starvation described above.

Consideration:
- This change increases load on the k8s API server because Pod.status is updated every time the scheduler determines a pod to be unschedulable.

Signed-off-by: Shingo Omura
---
(A simplified, self-contained sketch of the resulting queue ordering is appended after the diff.)

 .../internal/queue/scheduling_queue.go        |   5 +-
 .../internal/queue/scheduling_queue_test.go   | 101 +++++++++++++++++-
 pkg/scheduler/scheduler.go                    |   9 +-
 3 files changed, 106 insertions(+), 9 deletions(-)

diff --git a/pkg/scheduler/internal/queue/scheduling_queue.go b/pkg/scheduler/internal/queue/scheduling_queue.go
index a72aced91edc..aefc0e0cc9f1 100644
--- a/pkg/scheduler/internal/queue/scheduling_queue.go
+++ b/pkg/scheduler/internal/queue/scheduling_queue.go
@@ -230,7 +230,10 @@ func podTimestamp(pod *v1.Pod) *metav1.Time {
 	if condition == nil {
 		return &pod.CreationTimestamp
 	}
-	return &condition.LastTransitionTime
+	if condition.LastProbeTime.IsZero() {
+		return &condition.LastTransitionTime
+	}
+	return &condition.LastProbeTime
 }
 
 // activeQComp is the function used by the activeQ heap algorithm to sort pods.
diff --git a/pkg/scheduler/internal/queue/scheduling_queue_test.go b/pkg/scheduler/internal/queue/scheduling_queue_test.go
index 7ac5fc1fbd5a..6fca021f92f6 100644
--- a/pkg/scheduler/internal/queue/scheduling_queue_test.go
+++ b/pkg/scheduler/internal/queue/scheduling_queue_test.go
@@ -572,10 +572,11 @@ func TestRecentlyTriedPodsGoBack(t *testing.T) {
 	}
 	// Update pod condition to unschedulable.
 	podutil.UpdatePodCondition(&p1.Status, &v1.PodCondition{
-		Type:    v1.PodScheduled,
-		Status:  v1.ConditionFalse,
-		Reason:  v1.PodReasonUnschedulable,
-		Message: "fake scheduling failure",
+		Type:          v1.PodScheduled,
+		Status:        v1.ConditionFalse,
+		Reason:        v1.PodReasonUnschedulable,
+		Message:       "fake scheduling failure",
+		LastProbeTime: metav1.Now(),
 	})
 	// Put in the unschedulable queue.
 	q.AddUnschedulableIfNotPresent(p1)
@@ -594,6 +595,98 @@ func TestRecentlyTriedPodsGoBack(t *testing.T) {
 	}
 }
 
+// TestPodFailedSchedulingMultipleTimesDoesNotBlockNewerPod tests
+// that a pod determined as unschedulable multiple times doesn't block any newer pod.
+// This behavior ensures that an unschedulable pod does not block the head of the queue
+// when there are frequent events that move pods to the active queue.
+func TestPodFailedSchedulingMultipleTimesDoesNotBlockNewerPod(t *testing.T) {
+	q := NewPriorityQueue(nil)
+
+	// Add an unschedulable pod to a priority queue.
+	// This simulates a pod that has already been tried for
+	// scheduling and determined unschedulable.
+	unschedulablePod := v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "test-pod-unscheduled",
+			Namespace: "ns1",
+			UID:       "tp001",
+		},
+		Spec: v1.PodSpec{
+			Priority: &highPriority,
+		},
+		Status: v1.PodStatus{
+			NominatedNodeName: "node1",
+		},
+	}
+	// Update pod condition to unschedulable.
+	podutil.UpdatePodCondition(&unschedulablePod.Status, &v1.PodCondition{
+		Type:          v1.PodScheduled,
+		Status:        v1.ConditionFalse,
+		Reason:        v1.PodReasonUnschedulable,
+		Message:       "fake scheduling failure",
+		LastProbeTime: metav1.Now(),
+	})
+	// Put in the unschedulable queue.
+	q.AddUnschedulableIfNotPresent(&unschedulablePod)
+	// Clear its backoff to simulate its backoff expiration.
+	q.clearPodBackoff(&unschedulablePod)
+	// Move all unschedulable pods to the active queue.
+	q.MoveAllToActiveQueue()
+
+	// Simulate a pod being popped by the scheduler.
+	// At this time, the unschedulable pod should be popped.
+	p1, err := q.Pop()
+	if err != nil {
+		t.Errorf("Error while popping the head of the queue: %v", err)
+	}
+	if p1 != &unschedulablePod {
+		t.Errorf("Expected that test-pod-unscheduled was popped, got %v", p1.Name)
+	}
+
+	// Assume a newer pod was added just after the unschedulable pod
+	// was popped and before it was pushed back to the queue.
+	newerPod := v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:              "test-newer-pod",
+			Namespace:         "ns1",
+			UID:               "tp002",
+			CreationTimestamp: metav1.Now(),
+		},
+		Spec: v1.PodSpec{
+			Priority: &highPriority,
+		},
+		Status: v1.PodStatus{
+			NominatedNodeName: "node1",
+		},
+	}
+	q.Add(&newerPod)
+
+	// And then unschedulablePod is determined unschedulable AGAIN.
+	podutil.UpdatePodCondition(&unschedulablePod.Status, &v1.PodCondition{
+		Type:          v1.PodScheduled,
+		Status:        v1.ConditionFalse,
+		Reason:        v1.PodReasonUnschedulable,
+		Message:       "fake scheduling failure",
+		LastProbeTime: metav1.Now(),
+	})
+	// And then, put the unschedulable pod back into the unschedulable queue.
+	q.AddUnschedulableIfNotPresent(&unschedulablePod)
+	// Clear its backoff to simulate its backoff expiration.
+	q.clearPodBackoff(&unschedulablePod)
+	// Move all unschedulable pods to the active queue.
+	q.MoveAllToActiveQueue()
+
+	// At this time, newerPod should be popped because its creation
+	// time is older than the unschedulable pod's fresh LastProbeTime.
+	p2, err2 := q.Pop()
+	if err2 != nil {
+		t.Errorf("Error while popping the head of the queue: %v", err2)
+	}
+	if p2 != &newerPod {
+		t.Errorf("Expected that test-newer-pod was popped, got %v", p2.Name)
+	}
+}
+
 // TestHighPriorityBackoff tests that a high priority pod does not block
 // other pods if it is unschedulable
 func TestHighProirotyBackoff(t *testing.T) {
diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go
index 0992ffe203ee..59bbd439f1d9 100644
--- a/pkg/scheduler/scheduler.go
+++ b/pkg/scheduler/scheduler.go
@@ -282,10 +282,11 @@ func (sched *Scheduler) recordSchedulingFailure(pod *v1.Pod, err error, reason s
 	sched.config.Error(pod, err)
 	sched.config.Recorder.Event(pod, v1.EventTypeWarning, "FailedScheduling", message)
 	sched.config.PodConditionUpdater.Update(pod, &v1.PodCondition{
-		Type:    v1.PodScheduled,
-		Status:  v1.ConditionFalse,
-		Reason:  reason,
-		Message: err.Error(),
+		Type:          v1.PodScheduled,
+		Status:        v1.ConditionFalse,
+		LastProbeTime: metav1.Now(),
+		Reason:        reason,
+		Message:       err.Error(),
 	})
 }
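
For reviewers: below is a minimal, self-contained sketch of the queue ordering this patch produces. It is not scheduler code; it models podTimestamp and activeQComp with plain structs and time.Time instead of v1.Pod and metav1.Time, so every name in it is illustrative only. It shows why stamping LastProbeTime on each failed attempt lets a newer pod overtake a repeatedly retried one.

package main

import (
	"fmt"
	"time"
)

// pod is a stripped-down stand-in for v1.Pod holding only the fields
// the comparator needs; none of these field names are the real API.
type pod struct {
	name               string
	priority           int32
	creationTimestamp  time.Time
	lastTransitionTime time.Time // moves only when the PodScheduled condition flips
	lastProbeTime      time.Time // refreshed on every failed attempt (this patch)
}

// timestamp mirrors podTimestamp in scheduling_queue.go: prefer the
// per-attempt probe time, fall back to transition time, then creation time.
func timestamp(p *pod) time.Time {
	if !p.lastProbeTime.IsZero() {
		return p.lastProbeTime
	}
	if !p.lastTransitionTime.IsZero() {
		return p.lastTransitionTime
	}
	return p.creationTimestamp
}

// less mirrors activeQComp: higher priority first; on equal priority,
// the pod with the older timestamp is tried first.
func less(a, b *pod) bool {
	if a.priority != b.priority {
		return a.priority > b.priority
	}
	return timestamp(a).Before(timestamp(b))
}

func main() {
	t0 := time.Now()
	retried := &pod{
		name:               "retried",
		priority:           1,
		creationTimestamp:  t0.Add(-time.Minute),
		lastTransitionTime: t0.Add(-time.Minute), // frozen: Status stays ConditionFalse
	}
	newer := &pod{name: "newer", priority: 1, creationTimestamp: t0}

	// Before the fix, only lastTransitionTime is consulted, and it never
	// moves on repeated failures, so retried stays at the head forever.
	fmt.Println(less(retried, newer)) // true: retried sorts first

	// After the fix, each failed attempt stamps lastProbeTime, pushing
	// the retried pod behind pods it would otherwise starve.
	retried.lastProbeTime = t0.Add(time.Second)
	fmt.Println(less(retried, newer)) // false: newer sorts first
}

The flip from true to false happens only because lastProbeTime moves on every attempt; LastTransitionTime stays fixed while Status remains ConditionFalse, which is exactly the starvation described in the commit message.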