[KEP-3521] Part 3: Bug fixes, integration & E2E Test #113442

Merged
merged 3 commits on Nov 8, 2022
3 changes: 3 additions & 0 deletions pkg/scheduler/internal/queue/scheduling_queue.go
@@ -441,6 +441,9 @@ func (p *PriorityQueue) activate(pod *v1.Pod) bool {
// isPodBackingoff returns true if a pod is still waiting for its backoff timer.
// If this returns true, the pod should not be re-tried.
func (p *PriorityQueue) isPodBackingoff(podInfo *framework.QueuedPodInfo) bool {
if podInfo.Gated {
return false
Member

This is good to have for sure, but I assume that the podInfo for a gated pod should never have its Timestamp set beyond the first time we observed the pod (on Add(...)), because we should never have attempted to schedule the pod (i.e., Attempts should always be zero), and so Timestamp shouldn't be reset.

Member Author

Yes, Attempts is always zero, so duration := p.calculateBackoffDuration(podInfo) would always return podInitialBackoffDuration. In other words, if the time between when the pod is added and when it is updated is less than podInitialBackoffDuration, isPodBackingoff would return true.

This bug can be observed by running hack/local-up-cluster.sh with the following diff:

diff --git a/hack/local-up-cluster.sh b/hack/local-up-cluster.sh
index 6ec7f1ded28..e2c67dd0e34 100755
--- a/hack/local-up-cluster.sh
+++ b/hack/local-up-cluster.sh
@@ -869,6 +869,8 @@ clientConnection:
   kubeconfig: ${CERT_DIR}/scheduler.kubeconfig
 leaderElection:
   leaderElect: false
+podInitialBackoffSeconds: 120
+podMaxBackoffSeconds: 200
 EOF
     ${CONTROLPLANE_SUDO} "${GO_OUT}/kube-scheduler" \
       --v="${LOG_LEVEL}" \

}
boTime := p.getBackoffTime(podInfo)
return boTime.After(p.clock.Now())
}
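To make the exchange above concrete, here is a minimal, self-contained Go sketch of the behaviour being described. It is not the scheduler's actual code: queuedPodInfo, calculateBackoff, and the two isPodBackingoff variants are simplified stand-ins for QueuedPodInfo, calculateBackoffDuration, and getBackoffTime, and the 120-second value mirrors the podInitialBackoffSeconds override from the diff in the comment.

package main

import (
	"fmt"
	"time"
)

// queuedPodInfo is a simplified stand-in for the scheduler's QueuedPodInfo;
// only the fields relevant to the discussion above are kept.
type queuedPodInfo struct {
	timestamp time.Time // when the pod was first added to the queue
	attempts  int       // scheduling attempts so far; always 0 for a gated pod
	gated     bool      // whether the pod still carries scheduling gates
}

const initialBackoff = 120 * time.Second // mirrors podInitialBackoffSeconds: 120 above

// calculateBackoff sketches the per-attempt exponential backoff: with zero
// attempts the loop never runs, so the result is just the initial backoff.
func calculateBackoff(p queuedPodInfo) time.Duration {
	d := initialBackoff
	for i := 1; i < p.attempts; i++ {
		d *= 2
	}
	return d
}

// isPodBackingoffWithoutFix shows the pre-fix behaviour: a gated pod that was
// never attempted still looks like it is backing off for the whole
// initial-backoff window after it was added.
func isPodBackingoffWithoutFix(p queuedPodInfo, now time.Time) bool {
	boTime := p.timestamp.Add(calculateBackoff(p))
	return boTime.After(now)
}

// isPodBackingoffWithFix mirrors the short-circuit added in this PR:
// gated pods are never considered to be backing off.
func isPodBackingoffWithFix(p queuedPodInfo, now time.Time) bool {
	if p.gated {
		return false
	}
	return isPodBackingoffWithoutFix(p, now)
}

func main() {
	added := time.Now()
	gated := queuedPodInfo{timestamp: added, attempts: 0, gated: true}

	// An update arriving 30s after the pod was added, i.e. inside the 120s window:
	now := added.Add(30 * time.Second)
	fmt.Println(isPodBackingoffWithoutFix(gated, now)) // true: the pod is wrongly treated as backing off
	fmt.Println(isPodBackingoffWithFix(gated, now))    // false: gated pods skip the backoff check
}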
14 changes: 12 additions & 2 deletions pkg/scheduler/internal/queue/scheduling_queue_test.go
@@ -512,14 +512,24 @@ func TestPriorityQueue_addToActiveQ(t *testing.T) {
defer cancel()

m := map[string][]framework.PreEnqueuePlugin{"": tt.plugins}
q := NewTestQueueWithObjects(ctx, newDefaultQueueSort(), []runtime.Object{tt.pod}, WithPreEnqueuePluginMap(m))
got, _ := q.addToActiveQ(newQueuedPodInfoForLookup(tt.pod))
q := NewTestQueueWithObjects(ctx, newDefaultQueueSort(), []runtime.Object{tt.pod}, WithPreEnqueuePluginMap(m),
WithPodInitialBackoffDuration(time.Second*30), WithPodMaxBackoffDuration(time.Second*60))
got, _ := q.addToActiveQ(q.newQueuedPodInfo(tt.pod))
if got != tt.wantSuccess {
t.Errorf("Unexpected result: want %v, but got %v", tt.wantSuccess, got)
}
if tt.wantUnschedulablePods != len(q.unschedulablePods.podInfoMap) {
t.Errorf("Unexpected unschedulablePods: want %v, but got %v", tt.wantUnschedulablePods, len(q.unschedulablePods.podInfoMap))
}

// Simulate an update event.
clone := tt.pod.DeepCopy()
metav1.SetMetaDataAnnotation(&clone.ObjectMeta, "foo", "")
q.Update(tt.pod, clone)
// Ensure the pod is still located in unschedulablePods.
if tt.wantUnschedulablePods != len(q.unschedulablePods.podInfoMap) {
t.Errorf("Unexpected unschedulablePods: want %v, but got %v", tt.wantUnschedulablePods, len(q.unschedulablePods.podInfoMap))
}
})
}
}
77 changes: 75 additions & 2 deletions test/e2e/framework/pod/wait.go
@@ -21,6 +21,7 @@ import (
"context"
"errors"
"fmt"
"reflect"
"text/tabwriter"
"time"

@@ -253,7 +254,7 @@ func WaitForPodsRunningReady(c clientset.Interface, ns string, minPods, allowedN
framework.Logf("Pod %s is Failed, but it's not controlled by a controller", pod.ObjectMeta.Name)
badPods = append(badPods, pod)
}
//ignore failed pods that are controlled by some controller
// ignore failed pods that are controlled by some controller
}
}

@@ -326,7 +327,7 @@ func WaitForPodCondition(c clientset.Interface, ns, podName, conditionDesc strin
return maybeTimeoutError(err, "waiting for pod %s to be %s", podIdentifier(ns, podName), conditionDesc)
}

// WaitForPodsCondition waits for the listed pods to match the given condition.
// WaitForAllPodsCondition waits for the listed pods to match the given condition.
// To succeed, at least minPods must be listed, and all listed pods must match the condition.
func WaitForAllPodsCondition(c clientset.Interface, ns string, opts metav1.ListOptions, minPods int, conditionDesc string, timeout time.Duration, condition podCondition) (*v1.PodList, error) {
framework.Logf("Waiting up to %v for at least %d pods in namespace %s to be %s", timeout, minPods, ns, conditionDesc)
@@ -362,6 +363,78 @@ func WaitForAllPodsCondition(c clientset.Interface, ns string, opts metav1.ListO
return pods, maybeTimeoutError(err, "waiting for at least %d pods to be %s (matched %d)", minPods, conditionDesc, matched)
}

// WaitForPodsRunning waits up to `timeout` for `num` pods in namespace `ns` to be running and ready.
func WaitForPodsRunning(c clientset.Interface, ns string, num int, timeout time.Duration) error {
matched := 0
err := wait.PollImmediate(poll, timeout, func() (done bool, err error) {
pods, err := c.CoreV1().Pods(ns).List(context.TODO(), metav1.ListOptions{})
if err != nil {
return handleWaitingAPIError(err, true, "listing pods")
}
matched = 0
for _, pod := range pods.Items {
if ready, _ := testutils.PodRunningReady(&pod); ready {
matched++
}
}
if matched == num {
return true, nil
}
framework.Logf("expect %d pods are running, but got %v", num, matched)
return false, nil
})
return maybeTimeoutError(err, "waiting for pods to be running (want %v, matched %d)", num, matched)
}

// WaitForPodsSchedulingGated waits up to `timeout` for `num` pods in namespace `ns` to stay in the scheduling gated state.
func WaitForPodsSchedulingGated(c clientset.Interface, ns string, num int, timeout time.Duration) error {
matched := 0
err := wait.PollImmediate(poll, timeout, func() (done bool, err error) {
pods, err := c.CoreV1().Pods(ns).List(context.TODO(), metav1.ListOptions{})
if err != nil {
return handleWaitingAPIError(err, true, "listing pods")
}
matched = 0
for _, pod := range pods.Items {
for _, condition := range pod.Status.Conditions {
if condition.Type == v1.PodScheduled && condition.Reason == v1.PodReasonSchedulingGated {
matched++
}
}
}
if matched == num {
return true, nil
}
framework.Logf("expect %d pods in scheduling gated state, but got %v", num, matched)
return false, nil
})
return maybeTimeoutError(err, "waiting for pods to be scheduling gated (want %d, matched %d)", num, matched)
}

// WaitForPodsWithSchedulingGates waits up to `timeout` for `num` pods in namespace `ns` to
// carry the given `schedulingGates`.
func WaitForPodsWithSchedulingGates(c clientset.Interface, ns string, num int, timeout time.Duration, schedulingGates []v1.PodSchedulingGate) error {
matched := 0
err := wait.PollImmediate(poll, timeout, func() (done bool, err error) {
pods, err := c.CoreV1().Pods(ns).List(context.TODO(), metav1.ListOptions{})
if err != nil {
return handleWaitingAPIError(err, true, "listing pods")
}
matched = 0
for _, pod := range pods.Items {
if reflect.DeepEqual(pod.Spec.SchedulingGates, schedulingGates) {
matched++
}
}
if matched == num {
return true, nil
}
framework.Logf("expect %d pods carry the expected scheduling gates, but got %v", num, matched)
return false, nil
})
return maybeTimeoutError(err, "waiting for pods to carry the expected scheduling gates (want %d, matched %d)", num, matched)
}

// WaitForPodTerminatedInNamespace returns an error if it takes too long for the pod to terminate,
// if the pod Get api returns an error (IsNotFound or other), or if the pod failed (and thus did not
// terminate) with an unexpected reason. Typically called to test that the passed-in pod is fully
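For orientation, here is a short usage sketch of how the three helpers added above are intended to be combined from a test. The clientset cs, namespace ns, replica count, and gate name are placeholder assumptions, and the imports (v1, clientset, framework, e2epod, time) are the same ones used by the e2e spec in test/e2e/scheduling/predicates.go below, which follows this exact pattern against real pods.

// Sketch only: cs, ns, replicas, and the gate name are placeholder assumptions.
func waitForGatedPodsThenRunning(cs clientset.Interface, ns string, replicas int) {
	gates := []v1.PodSchedulingGate{{Name: "example.com/gate"}}

	// Pods created with non-empty schedulingGates should stay gated and carry the expected gates.
	framework.ExpectNoError(e2epod.WaitForPodsSchedulingGated(cs, ns, replicas, time.Minute))
	framework.ExpectNoError(e2epod.WaitForPodsWithSchedulingGates(cs, ns, replicas, time.Minute, gates))

	// Once the gates are removed (e.g. by patching the pods), they should start running.
	framework.ExpectNoError(e2epod.WaitForPodsRunning(cs, ns, replicas, time.Minute))
}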
72 changes: 72 additions & 0 deletions test/e2e/scheduling/predicates.go
@@ -18,15 +18,18 @@ package scheduling

import (
"context"
"encoding/json"
"fmt"
"time"

v1 "k8s.io/api/core/v1"
nodev1 "k8s.io/api/node/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/strategicpatch"
"k8s.io/apimachinery/pkg/util/uuid"
utilversion "k8s.io/apimachinery/pkg/util/version"
clientset "k8s.io/client-go/kubernetes"
@@ -70,6 +73,7 @@ type pausePodConfig struct {
PriorityClassName string
DeletionGracePeriodSeconds *int64
TopologySpreadConstraints []v1.TopologySpreadConstraint
SchedulingGates []v1.PodSchedulingGate
}

var _ = SIGDescribe("SchedulerPredicates [Serial]", func() {
@@ -799,8 +803,75 @@ var _ = SIGDescribe("SchedulerPredicates [Serial]", func() {
framework.ExpectEqual(numInNode2, expected, fmt.Sprintf("Pods are not distributed as expected on node %q", nodeNames[1]))
})
})

ginkgo.It("validates Pods with non-empty schedulingGates are blocked on scheduling [Feature:PodSchedulingReadiness] [alpha]", func() {
podLabel := "e2e-scheduling-gates"
replicas := 3
ginkgo.By(fmt.Sprintf("Creating a ReplicaSet with replicas=%v, carrying scheduling gates [foo bar]", replicas))
rsConfig := pauseRSConfig{
Replicas: int32(replicas),
PodConfig: pausePodConfig{
Name: podLabel,
Namespace: ns,
Labels: map[string]string{podLabel: ""},
SchedulingGates: []v1.PodSchedulingGate{
{Name: "foo"},
{Name: "bar"},
},
},
}
createPauseRS(f, rsConfig)

ginkgo.By("Expect all pods stay in pending state")
podList, err := e2epod.WaitForNumberOfPods(cs, ns, replicas, time.Minute)
framework.ExpectNoError(err)
framework.ExpectNoError(e2epod.WaitForPodsSchedulingGated(cs, ns, replicas, time.Minute))

ginkgo.By("Remove one scheduling gate")
want := []v1.PodSchedulingGate{{Name: "bar"}}
var pods []*v1.Pod
for _, pod := range podList.Items {
clone := pod.DeepCopy()
clone.Spec.SchedulingGates = want
live, err := patchPod(cs, &pod, clone)
framework.ExpectNoError(err)
pods = append(pods, live)
}

ginkgo.By("Expect all pods carry one scheduling gate and are still in pending state")
framework.ExpectNoError(e2epod.WaitForPodsWithSchedulingGates(cs, ns, replicas, time.Minute, want))
framework.ExpectNoError(e2epod.WaitForPodsSchedulingGated(cs, ns, replicas, time.Minute))

ginkgo.By("Remove the remaining scheduling gates")
for _, pod := range pods {
clone := pod.DeepCopy()
clone.Spec.SchedulingGates = nil
_, err := patchPod(cs, pod, clone)
framework.ExpectNoError(err)
}

ginkgo.By("Expect all pods are scheduled and running")
framework.ExpectNoError(e2epod.WaitForPodsRunning(cs, ns, replicas, time.Minute))
})
})

func patchPod(cs clientset.Interface, old, new *v1.Pod) (*v1.Pod, error) {
oldData, err := json.Marshal(old)
if err != nil {
return nil, err
}

newData, err := json.Marshal(new)
if err != nil {
return nil, err
}
patchBytes, err := strategicpatch.CreateTwoWayMergePatch(oldData, newData, &v1.Pod{})
if err != nil {
return nil, fmt.Errorf("failed to create merge patch for Pod %q: %v", old.Name, err)
}
return cs.CoreV1().Pods(new.Namespace).Patch(context.TODO(), new.Name, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{})
}

// printAllPodsOnNode outputs status of all kubelet pods into log.
func printAllPodsOnNode(c clientset.Interface, nodeName string) {
podList, err := c.CoreV1().Pods(metav1.NamespaceAll).List(context.TODO(), metav1.ListOptions{FieldSelector: "spec.nodeName=" + nodeName})
@@ -844,6 +915,7 @@ func initPausePod(f *framework.Framework, conf pausePodConfig) *v1.Pod {
Tolerations: conf.Tolerations,
PriorityClassName: conf.PriorityClassName,
TerminationGracePeriodSeconds: &gracePeriod,
SchedulingGates: conf.SchedulingGates,
},
}
for key, value := range conf.Labels {
91 changes: 91 additions & 0 deletions test/integration/scheduler/plugins/plugins_test.go
@@ -31,9 +31,12 @@ import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apiserver/pkg/util/feature"
clientset "k8s.io/client-go/kubernetes"
listersv1 "k8s.io/client-go/listers/core/v1"
featuregatetesting "k8s.io/component-base/featuregate/testing"
configv1 "k8s.io/kube-scheduler/config/v1"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/scheduler"
schedulerconfig "k8s.io/kubernetes/pkg/scheduler/apis/config"
configtesting "k8s.io/kubernetes/pkg/scheduler/apis/config/testing"
@@ -57,9 +60,15 @@ var (
podSchedulingError = testutils.PodSchedulingError
createAndWaitForNodesInCache = testutils.CreateAndWaitForNodesInCache
waitForPodUnschedulable = testutils.WaitForPodUnschedulable
waitForPodSchedulingGated = testutils.WaitForPodSchedulingGated
waitForPodToScheduleWithTimeout = testutils.WaitForPodToScheduleWithTimeout
)

type PreEnqueuePlugin struct {
called int32
admit bool
}

type PreFilterPlugin struct {
numPreFilterCalled int
failPreFilter bool
@@ -146,6 +155,7 @@ type PermitPlugin struct {
}

const (
enqueuePluginName = "enqueue-plugin"
prefilterPluginName = "prefilter-plugin"
postfilterPluginName = "postfilter-plugin"
scorePluginName = "score-plugin"
@@ -158,6 +168,7 @@ const (
permitPluginName = "permit-plugin"
)

var _ framework.PreEnqueuePlugin = &PreEnqueuePlugin{}
var _ framework.PreFilterPlugin = &PreFilterPlugin{}
var _ framework.PostFilterPlugin = &PostFilterPlugin{}
var _ framework.ScorePlugin = &ScorePlugin{}
@@ -184,6 +195,18 @@ func newPlugin(plugin framework.Plugin) frameworkruntime.PluginFactory {
}
}

func (ep *PreEnqueuePlugin) Name() string {
return enqueuePluginName
}

func (ep *PreEnqueuePlugin) PreEnqueue(ctx context.Context, p *v1.Pod) *framework.Status {
ep.called++
if ep.admit {
return nil
}
return framework.NewStatus(framework.UnschedulableAndUnresolvable, "not ready for scheduling")
}

// Name returns name of the score plugin.
func (sp *ScorePlugin) Name() string {
return scorePluginName
@@ -2089,6 +2112,72 @@ func TestPreScorePlugin(t *testing.T) {
}
}

// TestPreEnqueuePlugin tests invocation of enqueue plugins.
func TestPreEnqueuePlugin(t *testing.T) {
defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.PodSchedulingReadiness, true)()

// Create a plugin registry for testing. Register a PreEnqueue plugin.
enqueuePlugin := &PreEnqueuePlugin{}
// Plumb a preFilterPlugin to verify if it's called or not.
preFilterPlugin := &PreFilterPlugin{}
registry, prof := initRegistryAndConfig(t, enqueuePlugin, preFilterPlugin)

// Create the API server and the scheduler with the test plugin set.
testCtx := initTestSchedulerForFrameworkTest(t, testutils.InitTestAPIServer(t, "enqueue-plugin", nil), 1,
scheduler.WithProfiles(prof),
scheduler.WithFrameworkOutOfTreeRegistry(registry))
defer testutils.CleanupTest(t, testCtx)

tests := []struct {
name string
pod *v1.Pod
admitEnqueue bool
}{
{
name: "pod is admitted to enqueue",
pod: st.MakePod().Name("p").Namespace(testCtx.NS.Name).Container("pause").Obj(),
admitEnqueue: true,
},
{
name: "pod is not admitted to enqueue",
pod: st.MakePod().Name("p").Namespace(testCtx.NS.Name).SchedulingGates([]string{"foo"}).Container("pause").Obj(),
Member

Is it a problem that both test cases use a pod with the same name?

Member Author

It's fine: in an integration test, each sub-test's context/env is destroyed and is supposed to run statelessly, and the pod is cleaned up at the end of each sub-test:

testutils.CleanupPods(testCtx.ClientSet, t, []*v1.Pod{pod})

admitEnqueue: false,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
enqueuePlugin.admit = tt.admitEnqueue
// Create a best effort pod.
pod, err := createPausePod(testCtx.ClientSet, tt.pod)
if err != nil {
t.Errorf("Error while creating a test pod: %v", err)
}

if tt.admitEnqueue {
if err := waitForPodToScheduleWithTimeout(testCtx.ClientSet, pod, 10*time.Second); err != nil {
t.Errorf("Expected the pod to be schedulable, but got: %v", err)
}
// Also verify enqueuePlugin is called.
if enqueuePlugin.called == 0 {
t.Errorf("Expected the enqueuePlugin plugin to be called at least once, but got 0")
}
} else {
if err := waitForPodSchedulingGated(testCtx.ClientSet, pod, 10*time.Second); err != nil {
t.Errorf("Expected the pod to be scheduling waiting, but got: %v", err)
}
// Also verify preFilterPlugin is not called.
if preFilterPlugin.numPreFilterCalled != 0 {
t.Errorf("Expected the preFilter plugin not to be called, but got %v", preFilterPlugin.numPreFilterCalled)
}
}

preFilterPlugin.reset()
testutils.CleanupPods(testCtx.ClientSet, t, []*v1.Pod{pod})
})
}
}

// TestPreemptWithPermitPlugin tests preempt with permit plugins.
// It verifies how waitingPods behave in different scenarios:
// - when waitingPods get preempted
@@ -2450,6 +2539,8 @@ func initRegistryAndConfig(t *testing.T, plugins ...framework.Plugin) (framework
plugin := configv1.Plugin{Name: p.Name()}

switch p.(type) {
case *PreEnqueuePlugin:
pls.PreEnqueue.Enabled = append(pls.PreEnqueue.Enabled, plugin)
case *PreFilterPlugin:
pls.PreFilter.Enabled = append(pls.PreFilter.Enabled, plugin)
case *FilterPlugin: