Merge pull request #66733 from bsalamat/subset_nodes
Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

Add a feature to the scheduler to score fewer than all nodes in every scheduling cycle

**What this PR does / why we need it**:
Today, the scheduler scores all the nodes in the cluster in every scheduling cycle (every time a pod is attempted). This feature implements a mechanism in the scheduler that allows scoring fewer than all nodes in the cluster. The scheduler stops searching for more nodes once the configured number of feasible nodes is found. This can help improve the scheduler's performance in large clusters (several hundred nodes and larger).
This PR also adds a new structure to the scheduler's cache, called NodeTree, that allows the scheduler to iterate over nodes spread across the zones of the cluster. This is needed to avoid scoring the same set of nodes in every scheduling cycle.
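
To make the mechanism concrete, here is a minimal, self-contained sketch of the idea. Names such as `numFeasibleNodesToFind` and `findFeasibleNodes`, and the lower bound of 100, are illustrative assumptions rather than the exact code in this PR: the configured percentage is converted into an absolute node count, and the feasibility search stops as soon as that many nodes have been found.

```go
package main

import "fmt"

// Assumed lower bound: the scheduler always tries to find at least this many
// feasible nodes, regardless of the configured percentage.
const minFeasibleNodesToFind = 100

// numFeasibleNodesToFind converts the configured percentage (assumed to be in
// (0, 100]; a zero value is defaulted to 50 elsewhere) into the number of
// feasible nodes at which the search stops.
func numFeasibleNodesToFind(percentageOfNodesToScore, numAllNodes int32) int32 {
	if numAllNodes <= minFeasibleNodesToFind || percentageOfNodesToScore >= 100 {
		return numAllNodes
	}
	numNodes := numAllNodes * percentageOfNodesToScore / 100
	if numNodes < minFeasibleNodesToFind {
		return minFeasibleNodesToFind
	}
	return numNodes
}

// findFeasibleNodes runs the predicate over nodes in iteration order and stops
// as soon as enough feasible nodes have been collected.
func findFeasibleNodes(nodeNames []string, percentage int32, feasible func(string) bool) []string {
	needed := numFeasibleNodesToFind(percentage, int32(len(nodeNames)))
	var found []string
	for _, name := range nodeNames {
		if int32(len(found)) >= needed {
			break
		}
		if feasible(name) {
			found = append(found, name)
		}
	}
	return found
}

func main() {
	// With 500 nodes and a 30% setting, the search stops after 150 feasible nodes.
	fmt.Println(numFeasibleNodesToFind(30, 500))
}
```

Only the nodes returned by this bounded search are then scored, which is where the performance win on large clusters comes from.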

**Which issue(s) this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close the issue(s) when PR gets merged)*:
Fixes #66627 

**Special notes for your reviewer**:
This is a large PR, but it is broken into a few logical commits. Reviewing commit by commit will be easier.

**Release note**:

```release-note
Add a feature to the scheduler to score fewer than all nodes in every scheduling cycle. This can improve performance of the scheduler in large clusters.
```
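
As a small, hedged illustration of the new configuration knob mentioned in this note (the field name, its type, and the 0 → 50 defaulting come from the diff below; the snippet itself is not part of the PR):

```go
package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/apis/componentconfig"
)

func main() {
	// Illustrative only: a value of 0 is replaced with the default of 50 by
	// SetDefaults_KubeSchedulerConfiguration (v1alpha1), and a value of 100
	// keeps the previous behavior of considering every node.
	cfg := componentconfig.KubeSchedulerConfiguration{
		SchedulerName:            "default-scheduler",
		PercentageOfNodesToScore: 70, // stop once 70% of the cluster's nodes are found feasible
	}
	fmt.Println(cfg.PercentageOfNodesToScore)
}
```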
Kubernetes Submit Queue committed Aug 18, 2018
2 parents 8b52ca1 + 2860743 commit 8c1bfeb
Showing 27 changed files with 907 additions and 154 deletions.
1 change: 0 additions & 1 deletion cmd/kube-scheduler/app/options/deprecated.go
@@ -18,7 +18,6 @@ package options

import (
"fmt"

"github.com/spf13/pflag"

"k8s.io/kubernetes/pkg/apis/componentconfig"
2 changes: 2 additions & 0 deletions cmd/kube-scheduler/app/options/options_test.go
@@ -175,6 +175,7 @@ users:
Burst: 100,
ContentType: "application/vnd.kubernetes.protobuf",
},
PercentageOfNodesToScore: 50,
},
},
{
@@ -211,6 +212,7 @@ users:
Burst: 100,
ContentType: "application/vnd.kubernetes.protobuf",
},
PercentageOfNodesToScore: 50,
},
},
{
35 changes: 18 additions & 17 deletions cmd/kube-scheduler/app/server.go
@@ -287,23 +287,24 @@ func NewSchedulerConfig(s schedulerserverconfig.CompletedConfig) (*scheduler.Con
}

// Set up the configurator which can create schedulers from configs.
configurator := factory.NewConfigFactory(
s.ComponentConfig.SchedulerName,
s.Client,
s.InformerFactory.Core().V1().Nodes(),
s.PodInformer,
s.InformerFactory.Core().V1().PersistentVolumes(),
s.InformerFactory.Core().V1().PersistentVolumeClaims(),
s.InformerFactory.Core().V1().ReplicationControllers(),
s.InformerFactory.Apps().V1().ReplicaSets(),
s.InformerFactory.Apps().V1().StatefulSets(),
s.InformerFactory.Core().V1().Services(),
s.InformerFactory.Policy().V1beta1().PodDisruptionBudgets(),
storageClassInformer,
s.ComponentConfig.HardPodAffinitySymmetricWeight,
utilfeature.DefaultFeatureGate.Enabled(features.EnableEquivalenceClassCache),
s.ComponentConfig.DisablePreemption,
)
configurator := factory.NewConfigFactory(&factory.ConfigFactoryArgs{
SchedulerName: s.ComponentConfig.SchedulerName,
Client: s.Client,
NodeInformer: s.InformerFactory.Core().V1().Nodes(),
PodInformer: s.PodInformer,
PvInformer: s.InformerFactory.Core().V1().PersistentVolumes(),
PvcInformer: s.InformerFactory.Core().V1().PersistentVolumeClaims(),
ReplicationControllerInformer: s.InformerFactory.Core().V1().ReplicationControllers(),
ReplicaSetInformer: s.InformerFactory.Apps().V1().ReplicaSets(),
StatefulSetInformer: s.InformerFactory.Apps().V1().StatefulSets(),
ServiceInformer: s.InformerFactory.Core().V1().Services(),
PdbInformer: s.InformerFactory.Policy().V1beta1().PodDisruptionBudgets(),
StorageClassInformer: storageClassInformer,
HardPodAffinitySymmetricWeight: s.ComponentConfig.HardPodAffinitySymmetricWeight,
EnableEquivalenceClassCache: utilfeature.DefaultFeatureGate.Enabled(features.EnableEquivalenceClassCache),
DisablePreemption: s.ComponentConfig.DisablePreemption,
PercentageOfNodesToScore: s.ComponentConfig.PercentageOfNodesToScore,
})

source := s.ComponentConfig.AlgorithmSource
var config *scheduler.Config
9 changes: 9 additions & 0 deletions pkg/apis/componentconfig/types.go
@@ -99,6 +99,15 @@ type KubeSchedulerConfiguration struct {

// DisablePreemption disables the pod preemption feature.
DisablePreemption bool

// PercentageOfNodesToScore is the percentage of all nodes that, once found feasible
// for running a pod, makes the scheduler stop searching for more feasible nodes in
// the cluster. This helps improve the scheduler's performance. The scheduler always tries
// to find at least "minFeasibleNodesToFind" feasible nodes, no matter what the value of
// this flag is. Example: if the cluster size is 500 nodes and the value of this flag is 30,
// the scheduler stops looking for further feasible nodes once it finds 150 feasible ones.
// When the value is 0, the default percentage (50%) of the nodes will be scored.
PercentageOfNodesToScore int32
}

// KubeSchedulerLeaderElectionConfiguration expands LeaderElectionConfiguration
5 changes: 5 additions & 0 deletions pkg/apis/componentconfig/v1alpha1/defaults.go
@@ -278,6 +278,11 @@ func SetDefaults_KubeSchedulerConfiguration(obj *KubeSchedulerConfiguration) {
obj.FailureDomains = kubeletapis.DefaultFailureDomains
}

if obj.PercentageOfNodesToScore == 0 {
// By default, stop searching for feasible nodes once the number of feasible nodes reaches 50% of the cluster's size.
obj.PercentageOfNodesToScore = 50
}

// Use the default ClientConnectionConfiguration and LeaderElectionConfiguration options
apimachineryconfigv1alpha1.RecommendedDefaultClientConnectionConfiguration(&obj.ClientConnection)
apiserverconfigv1alpha1.RecommendedDefaultLeaderElectionConfiguration(&obj.LeaderElection.LeaderElectionConfiguration)
10 changes: 10 additions & 0 deletions pkg/apis/componentconfig/v1alpha1/types.go
@@ -95,6 +95,16 @@ type KubeSchedulerConfiguration struct {

// DisablePreemption disables the pod preemption feature.
DisablePreemption bool `json:"disablePreemption"`

// PercentageOfNodesToScore specifies what percentage of all nodes should be scored in each
// scheduling cycle. This helps improve the scheduler's performance. The scheduler always tries
// to find at least "minFeasibleNodesToFind" feasible nodes, no matter what the value of this
// flag is. When this value is below 100, the scheduler stops looking for feasible nodes for
// running a pod once it has found that percentage of the whole cluster size. For example, if
// the cluster size is 500 nodes and the value of this flag is 30, the scheduler stops looking
// for feasible nodes once it finds 150 feasible nodes.
// When the value is 0, the default percentage (50%) of the nodes will be scored.
PercentageOfNodesToScore int32 `json:"percentageOfNodesToScore"`
}

// KubeSchedulerLeaderElectionConfiguration expands LeaderElectionConfiguration
2 changes: 2 additions & 0 deletions pkg/apis/componentconfig/v1alpha1/zz_generated.conversion.go

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pkg/scheduler/BUILD
@@ -15,6 +15,7 @@ go_test(
"//pkg/controller/volume/persistentvolume:go_default_library",
"//pkg/scheduler/algorithm:go_default_library",
"//pkg/scheduler/algorithm/predicates:go_default_library",
"//pkg/scheduler/api:go_default_library",
"//pkg/scheduler/cache:go_default_library",
"//pkg/scheduler/core:go_default_library",
"//pkg/scheduler/testing:go_default_library",
35 changes: 18 additions & 17 deletions pkg/scheduler/algorithmprovider/defaults/compatibility_test.go
@@ -849,23 +849,24 @@ func TestCompatibility_v1_Scheduler(t *testing.T) {
client := clientset.NewForConfigOrDie(&restclient.Config{Host: server.URL, ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
informerFactory := informers.NewSharedInformerFactory(client, 0)

if _, err := factory.NewConfigFactory(
"some-scheduler-name",
client,
informerFactory.Core().V1().Nodes(),
informerFactory.Core().V1().Pods(),
informerFactory.Core().V1().PersistentVolumes(),
informerFactory.Core().V1().PersistentVolumeClaims(),
informerFactory.Core().V1().ReplicationControllers(),
informerFactory.Apps().V1().ReplicaSets(),
informerFactory.Apps().V1().StatefulSets(),
informerFactory.Core().V1().Services(),
informerFactory.Policy().V1beta1().PodDisruptionBudgets(),
informerFactory.Storage().V1().StorageClasses(),
v1.DefaultHardPodAffinitySymmetricWeight,
enableEquivalenceCache,
false,
).CreateFromConfig(policy); err != nil {
if _, err := factory.NewConfigFactory(&factory.ConfigFactoryArgs{
SchedulerName: "some-scheduler-name",
Client: client,
NodeInformer: informerFactory.Core().V1().Nodes(),
PodInformer: informerFactory.Core().V1().Pods(),
PvInformer: informerFactory.Core().V1().PersistentVolumes(),
PvcInformer: informerFactory.Core().V1().PersistentVolumeClaims(),
ReplicationControllerInformer: informerFactory.Core().V1().ReplicationControllers(),
ReplicaSetInformer: informerFactory.Apps().V1().ReplicaSets(),
StatefulSetInformer: informerFactory.Apps().V1().StatefulSets(),
ServiceInformer: informerFactory.Core().V1().Services(),
PdbInformer: informerFactory.Policy().V1beta1().PodDisruptionBudgets(),
StorageClassInformer: informerFactory.Storage().V1().StorageClasses(),
HardPodAffinitySymmetricWeight: v1.DefaultHardPodAffinitySymmetricWeight,
EnableEquivalenceClassCache: enableEquivalenceCache,
DisablePreemption: false,
PercentageOfNodesToScore: schedulerapi.DefaultPercentageOfNodesToScore,
}).CreateFromConfig(policy); err != nil {
t.Errorf("%s: Error constructing: %v", v, err)
continue
}
3 changes: 3 additions & 0 deletions pkg/scheduler/api/types.go
@@ -36,6 +36,9 @@ const (
MaxPriority = 10
// MaxWeight defines the max weight value.
MaxWeight = MaxInt / MaxPriority
// DefaultPercentageOfNodesToScore defines the percentage of the cluster's nodes that,
// once found feasible, makes the scheduler stop looking for more feasible nodes.
DefaultPercentageOfNodesToScore = 50
)

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
4 changes: 4 additions & 0 deletions pkg/scheduler/cache/BUILD
@@ -6,6 +6,7 @@ go_library(
"cache.go",
"interface.go",
"node_info.go",
"node_tree.go",
"util.go",
],
importpath = "k8s.io/kubernetes/pkg/scheduler/cache",
@@ -15,6 +16,7 @@ go_library(
"//pkg/features:go_default_library",
"//pkg/scheduler/algorithm/priorities/util:go_default_library",
"//pkg/scheduler/util:go_default_library",
"//pkg/util/node:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
"//staging/src/k8s.io/api/policy/v1beta1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/resource:go_default_library",
@@ -31,11 +33,13 @@
srcs = [
"cache_test.go",
"node_info_test.go",
"node_tree_test.go",
"util_test.go",
],
embed = [":go_default_library"],
deps = [
"//pkg/features:go_default_library",
"//pkg/kubelet/apis:go_default_library",
"//pkg/scheduler/algorithm/priorities/util:go_default_library",
"//pkg/scheduler/util:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
9 changes: 9 additions & 0 deletions pkg/scheduler/cache/cache.go
@@ -59,6 +59,7 @@ type schedulerCache struct {
// a map from pod key to podState.
podStates map[string]*podState
nodes map[string]*NodeInfo
nodeTree *NodeTree
pdbs map[string]*policy.PodDisruptionBudget
// A map from image name to its imageState.
imageStates map[string]*imageState
@@ -102,6 +103,7 @@ func newSchedulerCache(ttl, period time.Duration, stop <-chan struct{}) *schedul
stop: stop,

nodes: make(map[string]*NodeInfo),
nodeTree: newNodeTree(nil),
assumedPods: make(map[string]bool),
podStates: make(map[string]*podState),
pdbs: make(map[string]*policy.PodDisruptionBudget),
@@ -426,6 +428,7 @@ func (cache *schedulerCache) AddNode(node *v1.Node) error {
cache.removeNodeImageStates(n.node)
}

cache.nodeTree.AddNode(node)
cache.addNodeImageStates(node, n)
return n.SetNode(node)
}
@@ -442,6 +445,7 @@ func (cache *schedulerCache) UpdateNode(oldNode, newNode *v1.Node) error {
cache.removeNodeImageStates(n.node)
}

cache.nodeTree.UpdateNode(oldNode, newNode)
cache.addNodeImageStates(newNode, n)
return n.SetNode(newNode)
}
@@ -462,6 +466,7 @@ func (cache *schedulerCache) RemoveNode(node *v1.Node) error {
delete(cache.nodes, node.Name)
}

cache.nodeTree.RemoveNode(node)
cache.removeNodeImageStates(node)
return nil
}
@@ -598,3 +603,7 @@ func (cache *schedulerCache) expirePod(key string, ps *podState) error {
delete(cache.podStates, key)
return nil
}

func (cache *schedulerCache) NodeTree() *NodeTree {
return cache.nodeTree
}
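
node_tree.go itself is not rendered in this excerpt. As a rough, illustrative sketch (not the implementation added by this PR), a zone-aware iterator with the surface relied on above and in the tests below (`NumNodes`, `Next()` returning `""` when empty) could look like this; `AddNode` is simplified to take the zone name directly, and locking and removal are omitted:

```go
package main

import "fmt"

// NodeTreeSketch is an illustrative, simplified stand-in for the cache's
// NodeTree: nodes are grouped by zone, and Next() walks the zones round-robin
// so that successive scheduling cycles do not keep examining the same nodes.
type NodeTreeSketch struct {
	zones     []string            // zone names, in insertion order
	tree      map[string][]string // zone -> node names
	zoneIndex int                 // which zone to serve from next
	nodeIndex map[string]int      // per-zone cursor
	NumNodes  int
}

func newNodeTreeSketch() *NodeTreeSketch {
	return &NodeTreeSketch{tree: map[string][]string{}, nodeIndex: map[string]int{}}
}

// AddNode registers a node under its zone (the real implementation derives the
// zone from the node's labels).
func (nt *NodeTreeSketch) AddNode(zone, name string) {
	if _, ok := nt.tree[zone]; !ok {
		nt.zones = append(nt.zones, zone)
	}
	nt.tree[zone] = append(nt.tree[zone], name)
	nt.NumNodes++
}

// Next returns the next node name, spreading consecutive calls across zones,
// or "" when the tree is empty.
func (nt *NodeTreeSketch) Next() string {
	if nt.NumNodes == 0 {
		return ""
	}
	for {
		zone := nt.zones[nt.zoneIndex%len(nt.zones)]
		nt.zoneIndex++
		nodes := nt.tree[zone]
		if len(nodes) == 0 {
			continue
		}
		i := nt.nodeIndex[zone] % len(nodes)
		nt.nodeIndex[zone] = i + 1
		return nodes[i]
	}
}

func main() {
	nt := newNodeTreeSketch()
	nt.AddNode("zone-a", "node-a1")
	nt.AddNode("zone-b", "node-b1")
	nt.AddNode("zone-a", "node-a2")
	fmt.Println(nt.Next(), nt.Next(), nt.Next()) // node-a1 node-b1 node-a2
}
```

Combined with the bounded feasibility search sketched earlier, this keeps the set of nodes examined in each cycle both smaller and spread across zones.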
12 changes: 12 additions & 0 deletions pkg/scheduler/cache/cache_test.go
@@ -1065,6 +1065,9 @@ func TestNodeOperators(t *testing.T) {
if !found {
t.Errorf("Failed to find node %v in schedulercache.", node.Name)
}
if cache.nodeTree.NumNodes != 1 || cache.nodeTree.Next() != node.Name {
t.Errorf("cache.nodeTree is not updated correctly after adding node: %v", node.Name)
}

// Generations are globally unique. We check in our unit tests that they are incremented correctly.
expected.generation = got.generation
@@ -1100,12 +1103,21 @@
if !reflect.DeepEqual(got, expected) {
t.Errorf("Failed to update node in schedulercache:\n got: %+v \nexpected: %+v", got, expected)
}
// Check nodeTree after update
if cache.nodeTree.NumNodes != 1 || cache.nodeTree.Next() != node.Name {
t.Errorf("unexpected cache.nodeTree after updating node: %v", node.Name)
}

// Case 4: the node can not be removed if pods is not empty.
cache.RemoveNode(node)
if _, found := cache.nodes[node.Name]; !found {
t.Errorf("The node %v should not be removed if pods is not empty.", node.Name)
}
// Check nodeTree after remove. The node should be removed from the nodeTree even if there are
// still pods on it.
if cache.nodeTree.NumNodes != 0 || cache.nodeTree.Next() != "" {
t.Errorf("unexpected cache.nodeTree after removing node: %v", node.Name)
}
}
}

3 changes: 3 additions & 0 deletions pkg/scheduler/cache/interface.go
@@ -125,6 +125,9 @@ type Cache interface {

// IsUpToDate returns true if the given NodeInfo matches the current data in the cache.
IsUpToDate(n *NodeInfo) bool

// NodeTree returns a node tree structure
NodeTree() *NodeTree
}

// Snapshot is a snapshot of cache state
