Merge pull request #66733 from bsalamat/subset_nodes
Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

Add a feature to the scheduler to score fewer than all nodes in every scheduling cycle

**What this PR does / why we need it**:
Today, the scheduler scores all the nodes in the cluster in every scheduling cycle (every time a pod is attempted). This feature implements a mechanism in the scheduler that allows scoring fewer than all nodes in the cluster. The scheduler stops searching for more nodes once the configured number of feasible nodes is found. This can help improve the scheduler's performance in large clusters (several hundred nodes and larger).
This PR also adds a new structure to the scheduler's cache, called NodeTree, that allows the scheduler to iterate over nodes spread across the zones of the cluster. This is needed to avoid scoring the same set of nodes in every scheduling cycle.
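
To make the mechanism concrete, here is a minimal, self-contained sketch of the idea. Names such as `numFeasibleNodesToFind` and `findFeasibleNodes`, and the lower bound of 100, are illustrative assumptions rather than the exact code in this PR: the configured percentage is converted into an absolute node count, and the feasibility search stops as soon as that many nodes have been found.

```go
package main

import "fmt"

// Assumed lower bound: the scheduler always tries to find at least this many
// feasible nodes, regardless of the configured percentage.
const minFeasibleNodesToFind = 100

// numFeasibleNodesToFind converts the configured percentage (assumed to be in
// (0, 100]; a zero value is defaulted to 50 elsewhere) into the number of
// feasible nodes at which the search stops.
func numFeasibleNodesToFind(percentageOfNodesToScore, numAllNodes int32) int32 {
	if numAllNodes <= minFeasibleNodesToFind || percentageOfNodesToScore >= 100 {
		return numAllNodes
	}
	numNodes := numAllNodes * percentageOfNodesToScore / 100
	if numNodes < minFeasibleNodesToFind {
		return minFeasibleNodesToFind
	}
	return numNodes
}

// findFeasibleNodes runs the predicate over nodes in iteration order and stops
// as soon as enough feasible nodes have been collected.
func findFeasibleNodes(nodeNames []string, percentage int32, feasible func(string) bool) []string {
	needed := numFeasibleNodesToFind(percentage, int32(len(nodeNames)))
	var found []string
	for _, name := range nodeNames {
		if int32(len(found)) >= needed {
			break
		}
		if feasible(name) {
			found = append(found, name)
		}
	}
	return found
}

func main() {
	// With 500 nodes and a 30% setting, the search stops after 150 feasible nodes.
	fmt.Println(numFeasibleNodesToFind(30, 500))
}
```

Only the nodes returned by this bounded search are then scored, which is where the performance win on large clusters comes from.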

**Which issue(s) this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close the issue(s) when PR gets merged)*:
Fixes #66627 

**Special notes for your reviewer**:
This is a large PR, but it is broken into a few logical commits. Reviewing commit by commit will be easier.

**Release note**:

```release-note
Add a feature to the scheduler to score fewer than all nodes in every scheduling cycle. This can improve performance of the scheduler in large clusters.
```
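
As a small, hedged illustration of the new configuration knob mentioned in this note (the field name, its type, and the 0 → 50 defaulting come from the diff below; the snippet itself is not part of the PR):

```go
package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/apis/componentconfig"
)

func main() {
	// Illustrative only: a value of 0 is replaced with the default of 50 by
	// SetDefaults_KubeSchedulerConfiguration (v1alpha1), and a value of 100
	// keeps the previous behavior of considering every node.
	cfg := componentconfig.KubeSchedulerConfiguration{
		SchedulerName:            "default-scheduler",
		PercentageOfNodesToScore: 70, // stop once 70% of the cluster's nodes are found feasible
	}
	fmt.Println(cfg.PercentageOfNodesToScore)
}
```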
Kubernetes Submit Queue committed Aug 18, 2018
2 parents 8b52ca1 + 2860743 commit 8c1bfeb
Showing 27 changed files with 907 additions and 154 deletions.
1 change: 0 additions & 1 deletion cmd/kube-scheduler/app/options/deprecated.go
@@ -18,7 +18,6 @@ package options

import (
"fmt"

"github.com/spf13/pflag"

"k8s.io/kubernetes/pkg/apis/componentconfig"
2 changes: 2 additions & 0 deletions cmd/kube-scheduler/app/options/options_test.go
@@ -175,6 +175,7 @@ users:
Burst: 100,
ContentType: "application/vnd.kubernetes.protobuf",
},
PercentageOfNodesToScore: 50,
},
},
{
@@ -211,6 +212,7 @@ users:
Burst: 100,
ContentType: "application/vnd.kubernetes.protobuf",
},
PercentageOfNodesToScore: 50,
},
},
{
35 changes: 18 additions & 17 deletions cmd/kube-scheduler/app/server.go
@@ -287,23 +287,24 @@ func NewSchedulerConfig(s schedulerserverconfig.CompletedConfig) (*scheduler.Con
}

// Set up the configurator which can create schedulers from configs.
configurator := factory.NewConfigFactory(
s.ComponentConfig.SchedulerName,
s.Client,
s.InformerFactory.Core().V1().Nodes(),
s.PodInformer,
s.InformerFactory.Core().V1().PersistentVolumes(),
s.InformerFactory.Core().V1().PersistentVolumeClaims(),
s.InformerFactory.Core().V1().ReplicationControllers(),
s.InformerFactory.Apps().V1().ReplicaSets(),
s.InformerFactory.Apps().V1().StatefulSets(),
s.InformerFactory.Core().V1().Services(),
s.InformerFactory.Policy().V1beta1().PodDisruptionBudgets(),
storageClassInformer,
s.ComponentConfig.HardPodAffinitySymmetricWeight,
utilfeature.DefaultFeatureGate.Enabled(features.EnableEquivalenceClassCache),
s.ComponentConfig.DisablePreemption,
)
configurator := factory.NewConfigFactory(&factory.ConfigFactoryArgs{
SchedulerName: s.ComponentConfig.SchedulerName,
Client: s.Client,
NodeInformer: s.InformerFactory.Core().V1().Nodes(),
PodInformer: s.PodInformer,
PvInformer: s.InformerFactory.Core().V1().PersistentVolumes(),
PvcInformer: s.InformerFactory.Core().V1().PersistentVolumeClaims(),
ReplicationControllerInformer: s.InformerFactory.Core().V1().ReplicationControllers(),
ReplicaSetInformer: s.InformerFactory.Apps().V1().ReplicaSets(),
StatefulSetInformer: s.InformerFactory.Apps().V1().StatefulSets(),
ServiceInformer: s.InformerFactory.Core().V1().Services(),
PdbInformer: s.InformerFactory.Policy().V1beta1().PodDisruptionBudgets(),
StorageClassInformer: storageClassInformer,
HardPodAffinitySymmetricWeight: s.ComponentConfig.HardPodAffinitySymmetricWeight,
EnableEquivalenceClassCache: utilfeature.DefaultFeatureGate.Enabled(features.EnableEquivalenceClassCache),
DisablePreemption: s.ComponentConfig.DisablePreemption,
PercentageOfNodesToScore: s.ComponentConfig.PercentageOfNodesToScore,
})

source := s.ComponentConfig.AlgorithmSource
var config *scheduler.Config
9 changes: 9 additions & 0 deletions pkg/apis/componentconfig/types.go
@@ -99,6 +99,15 @@ type KubeSchedulerConfiguration struct {

// DisablePreemption disables the pod preemption feature.
DisablePreemption bool

// PercentageOfNodesToScore is the percentage of all nodes that, once found feasible
// for running a pod, makes the scheduler stop searching for more feasible nodes in
// the cluster. This helps improve the scheduler's performance. The scheduler always tries
// to find at least "minFeasibleNodesToFind" feasible nodes, no matter what the value of
// this flag is. Example: if the cluster size is 500 nodes and the value of this flag is 30,
// the scheduler stops looking for further feasible nodes once it finds 150 feasible ones.
// When the value is 0, the default percentage (50%) of the nodes will be scored.
PercentageOfNodesToScore int32
}

// KubeSchedulerLeaderElectionConfiguration expands LeaderElectionConfiguration
5 changes: 5 additions & 0 deletions pkg/apis/componentconfig/v1alpha1/defaults.go
@@ -278,6 +278,11 @@ func SetDefaults_KubeSchedulerConfiguration(obj *KubeSchedulerConfiguration) {
obj.FailureDomains = kubeletapis.DefaultFailureDomains
}

if obj.PercentageOfNodesToScore == 0 {
// By default, stop searching for feasible nodes once the number of feasible nodes reaches 50% of the cluster's size.
obj.PercentageOfNodesToScore = 50
}

// Use the default ClientConnectionConfiguration and LeaderElectionConfiguration options
apimachineryconfigv1alpha1.RecommendedDefaultClientConnectionConfiguration(&obj.ClientConnection)
apiserverconfigv1alpha1.RecommendedDefaultLeaderElectionConfiguration(&obj.LeaderElection.LeaderElectionConfiguration)
10 changes: 10 additions & 0 deletions pkg/apis/componentconfig/v1alpha1/types.go
@@ -95,6 +95,16 @@ type KubeSchedulerConfiguration struct {

// DisablePreemption disables the pod preemption feature.
DisablePreemption bool `json:"disablePreemption"`

// PercentageOfNodesToScore specifies what percentage of all nodes should be scored in each
// scheduling cycle. This helps improve the scheduler's performance. The scheduler always tries
// to find at least "minFeasibleNodesToFind" feasible nodes, no matter what the value of this
// flag is. When this value is below 100, the scheduler stops looking for feasible nodes for
// running a pod once it has found that percentage of the whole cluster size. For example, if
// the cluster size is 500 nodes and the value of this flag is 30, the scheduler stops looking
// for feasible nodes once it finds 150 feasible nodes.
// When the value is 0, the default percentage (50%) of the nodes will be scored.
PercentageOfNodesToScore int32 `json:"percentageOfNodesToScore"`
}

// KubeSchedulerLeaderElectionConfiguration expands LeaderElectionConfiguration
2 changes: 2 additions & 0 deletions pkg/apis/componentconfig/v1alpha1/zz_generated.conversion.go

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pkg/scheduler/BUILD
@@ -15,6 +15,7 @@ go_test(
"//pkg/controller/volume/persistentvolume:go_default_library",
"//pkg/scheduler/algorithm:go_default_library",
"//pkg/scheduler/algorithm/predicates:go_default_library",
"//pkg/scheduler/api:go_default_library",
"//pkg/scheduler/cache:go_default_library",
"//pkg/scheduler/core:go_default_library",
"//pkg/scheduler/testing:go_default_library",
35 changes: 18 additions & 17 deletions pkg/scheduler/algorithmprovider/defaults/compatibility_test.go
@@ -849,23 +849,24 @@ func TestCompatibility_v1_Scheduler(t *testing.T) {
client := clientset.NewForConfigOrDie(&restclient.Config{Host: server.URL, ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
informerFactory := informers.NewSharedInformerFactory(client, 0)

if _, err := factory.NewConfigFactory(
"some-scheduler-name",
client,
informerFactory.Core().V1().Nodes(),
informerFactory.Core().V1().Pods(),
informerFactory.Core().V1().PersistentVolumes(),
informerFactory.Core().V1().PersistentVolumeClaims(),
informerFactory.Core().V1().ReplicationControllers(),
informerFactory.Apps().V1().ReplicaSets(),
informerFactory.Apps().V1().StatefulSets(),
informerFactory.Core().V1().Services(),
informerFactory.Policy().V1beta1().PodDisruptionBudgets(),
informerFactory.Storage().V1().StorageClasses(),
v1.DefaultHardPodAffinitySymmetricWeight,
enableEquivalenceCache,
false,
).CreateFromConfig(policy); err != nil {
if _, err := factory.NewConfigFactory(&factory.ConfigFactoryArgs{
SchedulerName: "some-scheduler-name",
Client: client,
NodeInformer: informerFactory.Core().V1().Nodes(),
PodInformer: informerFactory.Core().V1().Pods(),
PvInformer: informerFactory.Core().V1().PersistentVolumes(),
PvcInformer: informerFactory.Core().V1().PersistentVolumeClaims(),
ReplicationControllerInformer: informerFactory.Core().V1().ReplicationControllers(),
ReplicaSetInformer: informerFactory.Apps().V1().ReplicaSets(),
StatefulSetInformer: informerFactory.Apps().V1().StatefulSets(),
ServiceInformer: informerFactory.Core().V1().Services(),
PdbInformer: informerFactory.Policy().V1beta1().PodDisruptionBudgets(),
StorageClassInformer: informerFactory.Storage().V1().StorageClasses(),
HardPodAffinitySymmetricWeight: v1.DefaultHardPodAffinitySymmetricWeight,
EnableEquivalenceClassCache: enableEquivalenceCache,
DisablePreemption: false,
PercentageOfNodesToScore: schedulerapi.DefaultPercentageOfNodesToScore,
}).CreateFromConfig(policy); err != nil {
t.Errorf("%s: Error constructing: %v", v, err)
continue
}
3 changes: 3 additions & 0 deletions pkg/scheduler/api/types.go
@@ -36,6 +36,9 @@ const (
MaxPriority = 10
// MaxWeight defines the max weight value.
MaxWeight = MaxInt / MaxPriority
// DefaultPercentageOfNodesToScore defines the percentage of the cluster's nodes that,
// once found feasible, makes the scheduler stop looking for more feasible nodes.
DefaultPercentageOfNodesToScore = 50
)

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
4 changes: 4 additions & 0 deletions pkg/scheduler/cache/BUILD
@@ -6,6 +6,7 @@ go_library(
"cache.go",
"interface.go",
"node_info.go",
"node_tree.go",
"util.go",
],
importpath = "k8s.io/kubernetes/pkg/scheduler/cache",
@@ -15,6 +16,7 @@ go_library(
"//pkg/features:go_default_library",
"//pkg/scheduler/algorithm/priorities/util:go_default_library",
"//pkg/scheduler/util:go_default_library",
"//pkg/util/node:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
"//staging/src/k8s.io/api/policy/v1beta1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/resource:go_default_library",
@@ -31,11 +33,13 @@
srcs = [
"cache_test.go",
"node_info_test.go",
"node_tree_test.go",
"util_test.go",
],
embed = [":go_default_library"],
deps = [
"//pkg/features:go_default_library",
"//pkg/kubelet/apis:go_default_library",
"//pkg/scheduler/algorithm/priorities/util:go_default_library",
"//pkg/scheduler/util:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
9 changes: 9 additions & 0 deletions pkg/scheduler/cache/cache.go
@@ -59,6 +59,7 @@ type schedulerCache struct {
// a map from pod key to podState.
podStates map[string]*podState
nodes map[string]*NodeInfo
nodeTree *NodeTree
pdbs map[string]*policy.PodDisruptionBudget
// A map from image name to its imageState.
imageStates map[string]*imageState
@@ -102,6 +103,7 @@ func newSchedulerCache(ttl, period time.Duration, stop <-chan struct{}) *schedul
stop: stop,

nodes: make(map[string]*NodeInfo),
nodeTree: newNodeTree(nil),
assumedPods: make(map[string]bool),
podStates: make(map[string]*podState),
pdbs: make(map[string]*policy.PodDisruptionBudget),
@@ -426,6 +428,7 @@ func (cache *schedulerCache) AddNode(node *v1.Node) error {
cache.removeNodeImageStates(n.node)
}

cache.nodeTree.AddNode(node)
cache.addNodeImageStates(node, n)
return n.SetNode(node)
}
@@ -442,6 +445,7 @@ func (cache *schedulerCache) UpdateNode(oldNode, newNode *v1.Node) error {
cache.removeNodeImageStates(n.node)
}

cache.nodeTree.UpdateNode(oldNode, newNode)
cache.addNodeImageStates(newNode, n)
return n.SetNode(newNode)
}
@@ -462,6 +466,7 @@ func (cache *schedulerCache) RemoveNode(node *v1.Node) error {
delete(cache.nodes, node.Name)
}

cache.nodeTree.RemoveNode(node)
cache.removeNodeImageStates(node)
return nil
}
@@ -598,3 +603,7 @@ func (cache *schedulerCache) expirePod(key string, ps *podState) error {
delete(cache.podStates, key)
return nil
}

func (cache *schedulerCache) NodeTree() *NodeTree {
return cache.nodeTree
}
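
node_tree.go itself is not rendered in this excerpt. As a rough, illustrative sketch (not the implementation added by this PR), a zone-aware iterator with the surface relied on above and in the tests below (`NumNodes`, `Next()` returning `""` when empty) could look like this; `AddNode` is simplified to take the zone name directly, and locking and removal are omitted:

```go
package main

import "fmt"

// NodeTreeSketch is an illustrative, simplified stand-in for the cache's
// NodeTree: nodes are grouped by zone, and Next() walks the zones round-robin
// so that successive scheduling cycles do not keep examining the same nodes.
type NodeTreeSketch struct {
	zones     []string            // zone names, in insertion order
	tree      map[string][]string // zone -> node names
	zoneIndex int                 // which zone to serve from next
	nodeIndex map[string]int      // per-zone cursor
	NumNodes  int
}

func newNodeTreeSketch() *NodeTreeSketch {
	return &NodeTreeSketch{tree: map[string][]string{}, nodeIndex: map[string]int{}}
}

// AddNode registers a node under its zone (the real implementation derives the
// zone from the node's labels).
func (nt *NodeTreeSketch) AddNode(zone, name string) {
	if _, ok := nt.tree[zone]; !ok {
		nt.zones = append(nt.zones, zone)
	}
	nt.tree[zone] = append(nt.tree[zone], name)
	nt.NumNodes++
}

// Next returns the next node name, spreading consecutive calls across zones,
// or "" when the tree is empty.
func (nt *NodeTreeSketch) Next() string {
	if nt.NumNodes == 0 {
		return ""
	}
	for {
		zone := nt.zones[nt.zoneIndex%len(nt.zones)]
		nt.zoneIndex++
		nodes := nt.tree[zone]
		if len(nodes) == 0 {
			continue
		}
		i := nt.nodeIndex[zone] % len(nodes)
		nt.nodeIndex[zone] = i + 1
		return nodes[i]
	}
}

func main() {
	nt := newNodeTreeSketch()
	nt.AddNode("zone-a", "node-a1")
	nt.AddNode("zone-b", "node-b1")
	nt.AddNode("zone-a", "node-a2")
	fmt.Println(nt.Next(), nt.Next(), nt.Next()) // node-a1 node-b1 node-a2
}
```

Combined with the bounded feasibility search sketched earlier, this keeps the set of nodes examined in each cycle both smaller and spread across zones.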
12 changes: 12 additions & 0 deletions pkg/scheduler/cache/cache_test.go
@@ -1065,6 +1065,9 @@ func TestNodeOperators(t *testing.T) {
if !found {
t.Errorf("Failed to find node %v in schedulercache.", node.Name)
}
if cache.nodeTree.NumNodes != 1 || cache.nodeTree.Next() != node.Name {
t.Errorf("cache.nodeTree is not updated correctly after adding node: %v", node.Name)
}

// Generations are globally unique. We check in our unit tests that they are incremented correctly.
expected.generation = got.generation
@@ -1100,12 +1103,21 @@
if !reflect.DeepEqual(got, expected) {
t.Errorf("Failed to update node in schedulercache:\n got: %+v \nexpected: %+v", got, expected)
}
// Check nodeTree after update
if cache.nodeTree.NumNodes != 1 || cache.nodeTree.Next() != node.Name {
t.Errorf("unexpected cache.nodeTree after updating node: %v", node.Name)
}

// Case 4: the node can not be removed if pods is not empty.
cache.RemoveNode(node)
if _, found := cache.nodes[node.Name]; !found {
t.Errorf("The node %v should not be removed if pods is not empty.", node.Name)
}
// Check nodeTree after remove. The node should be removed from the nodeTree even if there are
// still pods on it.
if cache.nodeTree.NumNodes != 0 || cache.nodeTree.Next() != "" {
t.Errorf("unexpected cache.nodeTree after removing node: %v", node.Name)
}
}
}

3 changes: 3 additions & 0 deletions pkg/scheduler/cache/interface.go
@@ -125,6 +125,9 @@ type Cache interface {

// IsUpToDate returns true if the given NodeInfo matches the current data in the cache.
IsUpToDate(n *NodeInfo) bool

// NodeTree returns a node tree structure
NodeTree() *NodeTree
}

// Snapshot is a snapshot of cache state
