From 22079a79d45ffb2e478e275731b9e5ee2b6bf99d Mon Sep 17 00:00:00 2001
From: Shingo Omura
Date: Sat, 29 Dec 2018 23:24:51 +0900
Subject: [PATCH] change sort function of scheduling queue to avoid starvation when unschedulable pods are in the queue

When starvation happens:
- a lot of unschedulable pods exist at the head of the queue
- because condition.LastTransitionTime is updated only when condition.Status changes
- (this means that once a pod is marked unschedulable, the field is never updated until the pod is successfully scheduled.)

What was changed:
- condition.LastProbeTime is now updated every time a pod is determined unschedulable.
- the sort function now uses LastProbeTime, which avoids the starvation described above.

Consideration:
- This change increases load on the k8s API server because Pod.status is updated every time the scheduler determines a pod to be unschedulable.

Signed-off-by: Shingo Omura
---
(A simplified, self-contained sketch of the resulting queue ordering is appended after the diff.)

 .../internal/queue/scheduling_queue.go        |   5 +-
 .../internal/queue/scheduling_queue_test.go   | 101 +++++++++++++++++-
 pkg/scheduler/scheduler.go                    |   9 +-
 3 files changed, 106 insertions(+), 9 deletions(-)

diff --git a/pkg/scheduler/internal/queue/scheduling_queue.go b/pkg/scheduler/internal/queue/scheduling_queue.go
index a72aced91edc..aefc0e0cc9f1 100644
--- a/pkg/scheduler/internal/queue/scheduling_queue.go
+++ b/pkg/scheduler/internal/queue/scheduling_queue.go
@@ -230,7 +230,10 @@ func podTimestamp(pod *v1.Pod) *metav1.Time {
 	if condition == nil {
 		return &pod.CreationTimestamp
 	}
-	return &condition.LastTransitionTime
+	if condition.LastProbeTime.IsZero() {
+		return &condition.LastTransitionTime
+	}
+	return &condition.LastProbeTime
 }
 
 // activeQComp is the function used by the activeQ heap algorithm to sort pods.
diff --git a/pkg/scheduler/internal/queue/scheduling_queue_test.go b/pkg/scheduler/internal/queue/scheduling_queue_test.go
index 7ac5fc1fbd5a..6fca021f92f6 100644
--- a/pkg/scheduler/internal/queue/scheduling_queue_test.go
+++ b/pkg/scheduler/internal/queue/scheduling_queue_test.go
@@ -572,10 +572,11 @@ func TestRecentlyTriedPodsGoBack(t *testing.T) {
 	}
 	// Update pod condition to unschedulable.
 	podutil.UpdatePodCondition(&p1.Status, &v1.PodCondition{
-		Type:    v1.PodScheduled,
-		Status:  v1.ConditionFalse,
-		Reason:  v1.PodReasonUnschedulable,
-		Message: "fake scheduling failure",
+		Type:          v1.PodScheduled,
+		Status:        v1.ConditionFalse,
+		Reason:        v1.PodReasonUnschedulable,
+		Message:       "fake scheduling failure",
+		LastProbeTime: metav1.Now(),
 	})
 	// Put in the unschedulable queue.
 	q.AddUnschedulableIfNotPresent(p1)
@@ -594,6 +595,98 @@ func TestRecentlyTriedPodsGoBack(t *testing.T) {
 	}
 }
 
+// TestPodFailedSchedulingMultipleTimesDoesNotBlockNewerPod tests
+// that a pod determined as unschedulable multiple times doesn't block any newer pod.
+// This behavior ensures that an unschedulable pod does not block the head of the queue
+// when there are frequent events that move pods to the active queue.
+func TestPodFailedSchedulingMultipleTimesDoesNotBlockNewerPod(t *testing.T) {
+	q := NewPriorityQueue(nil)
+
+	// Add an unschedulable pod to a priority queue.
+	// This simulates a pod that has already been tried for
+	// scheduling and determined unschedulable.
+	unschedulablePod := v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "test-pod-unscheduled",
+			Namespace: "ns1",
+			UID:       "tp001",
+		},
+		Spec: v1.PodSpec{
+			Priority: &highPriority,
+		},
+		Status: v1.PodStatus{
+			NominatedNodeName: "node1",
+		},
+	}
+	// Update pod condition to unschedulable.
+	podutil.UpdatePodCondition(&unschedulablePod.Status, &v1.PodCondition{
+		Type:          v1.PodScheduled,
+		Status:        v1.ConditionFalse,
+		Reason:        v1.PodReasonUnschedulable,
+		Message:       "fake scheduling failure",
+		LastProbeTime: metav1.Now(),
+	})
+	// Put in the unschedulable queue.
+	q.AddUnschedulableIfNotPresent(&unschedulablePod)
+	// Clear its backoff to simulate its backoff expiration.
+	q.clearPodBackoff(&unschedulablePod)
+	// Move all unschedulable pods to the active queue.
+	q.MoveAllToActiveQueue()
+
+	// Simulate a pod being popped by the scheduler.
+	// At this time, the unschedulable pod should be popped.
+	p1, err := q.Pop()
+	if err != nil {
+		t.Errorf("Error while popping the head of the queue: %v", err)
+	}
+	if p1 != &unschedulablePod {
+		t.Errorf("Expected that test-pod-unscheduled was popped, got %v", p1.Name)
+	}
+
+	// Assume a newer pod was added just after the unschedulable pod
+	// was popped and before it was pushed back to the queue.
+	newerPod := v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:              "test-newer-pod",
+			Namespace:         "ns1",
+			UID:               "tp002",
+			CreationTimestamp: metav1.Now(),
+		},
+		Spec: v1.PodSpec{
+			Priority: &highPriority,
+		},
+		Status: v1.PodStatus{
+			NominatedNodeName: "node1",
+		},
+	}
+	q.Add(&newerPod)
+
+	// And then unschedulablePod is determined unschedulable AGAIN.
+	podutil.UpdatePodCondition(&unschedulablePod.Status, &v1.PodCondition{
+		Type:          v1.PodScheduled,
+		Status:        v1.ConditionFalse,
+		Reason:        v1.PodReasonUnschedulable,
+		Message:       "fake scheduling failure",
+		LastProbeTime: metav1.Now(),
+	})
+	// And then, put the unschedulable pod back into the unschedulable queue.
+	q.AddUnschedulableIfNotPresent(&unschedulablePod)
+	// Clear its backoff to simulate its backoff expiration.
+	q.clearPodBackoff(&unschedulablePod)
+	// Move all unschedulable pods to the active queue.
+	q.MoveAllToActiveQueue()
+
+	// At this time, newerPod should be popped because its creation
+	// time is older than the unschedulable pod's fresh LastProbeTime.
+	p2, err2 := q.Pop()
+	if err2 != nil {
+		t.Errorf("Error while popping the head of the queue: %v", err2)
+	}
+	if p2 != &newerPod {
+		t.Errorf("Expected that test-newer-pod was popped, got %v", p2.Name)
+	}
+}
+
 // TestHighPriorityBackoff tests that a high priority pod does not block
 // other pods if it is unschedulable
 func TestHighProirotyBackoff(t *testing.T) {
diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go
index 0992ffe203ee..59bbd439f1d9 100644
--- a/pkg/scheduler/scheduler.go
+++ b/pkg/scheduler/scheduler.go
@@ -282,10 +282,11 @@ func (sched *Scheduler) recordSchedulingFailure(pod *v1.Pod, err error, reason s
 	sched.config.Error(pod, err)
 	sched.config.Recorder.Event(pod, v1.EventTypeWarning, "FailedScheduling", message)
 	sched.config.PodConditionUpdater.Update(pod, &v1.PodCondition{
-		Type:    v1.PodScheduled,
-		Status:  v1.ConditionFalse,
-		Reason:  reason,
-		Message: err.Error(),
+		Type:          v1.PodScheduled,
+		Status:        v1.ConditionFalse,
+		LastProbeTime: metav1.Now(),
+		Reason:        reason,
+		Message:       err.Error(),
 	})
 }
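
For reviewers: below is a minimal, self-contained sketch of the queue ordering this patch produces. It is not scheduler code; it models podTimestamp and activeQComp with plain structs and time.Time instead of v1.Pod and metav1.Time, so every name in it is illustrative only. It shows why stamping LastProbeTime on each failed attempt lets a newer pod overtake a repeatedly retried one.

package main

import (
	"fmt"
	"time"
)

// pod is a stripped-down stand-in for v1.Pod holding only the fields
// the comparator needs; none of these field names are the real API.
type pod struct {
	name               string
	priority           int32
	creationTimestamp  time.Time
	lastTransitionTime time.Time // moves only when the PodScheduled condition flips
	lastProbeTime      time.Time // refreshed on every failed attempt (this patch)
}

// timestamp mirrors podTimestamp in scheduling_queue.go: prefer the
// per-attempt probe time, fall back to transition time, then creation time.
func timestamp(p *pod) time.Time {
	if !p.lastProbeTime.IsZero() {
		return p.lastProbeTime
	}
	if !p.lastTransitionTime.IsZero() {
		return p.lastTransitionTime
	}
	return p.creationTimestamp
}

// less mirrors activeQComp: higher priority first; on equal priority,
// the pod with the older timestamp is tried first.
func less(a, b *pod) bool {
	if a.priority != b.priority {
		return a.priority > b.priority
	}
	return timestamp(a).Before(timestamp(b))
}

func main() {
	t0 := time.Now()
	retried := &pod{
		name:               "retried",
		priority:           1,
		creationTimestamp:  t0.Add(-time.Minute),
		lastTransitionTime: t0.Add(-time.Minute), // frozen: Status stays ConditionFalse
	}
	newer := &pod{name: "newer", priority: 1, creationTimestamp: t0}

	// Before the fix, only lastTransitionTime is consulted, and it never
	// moves on repeated failures, so retried stays at the head forever.
	fmt.Println(less(retried, newer)) // true: retried sorts first

	// After the fix, each failed attempt stamps lastProbeTime, pushing
	// the retried pod behind pods it would otherwise starve.
	retried.lastProbeTime = t0.Add(time.Second)
	fmt.Println(less(retried, newer)) // false: newer sorts first
}

The flip from true to false happens only because lastProbeTime moves on every attempt; LastTransitionTime stays fixed while Status remains ConditionFalse, which is exactly the starvation described in the commit message.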