Make node tree order part of the snapshot #84014
File: pkg/scheduler/core/generic_scheduler.go
@@ -187,16 +187,15 @@ func (g *genericScheduler) Schedule(ctx context.Context, state *framework.CycleS
 	}
 	trace.Step("Running prefilter plugins done")
 
-	numNodes := g.cache.NodeTree().NumNodes()
-	if numNodes == 0 {
-		return result, ErrNoNodesAvailable
-	}
-
 	if err := g.snapshot(); err != nil {
 		return result, err
 	}
 	trace.Step("Snapshoting scheduler cache and node infos done")
 
+	if len(g.nodeInfoSnapshot.NodeInfoList) == 0 {
+		return result, ErrNoNodesAvailable
+	}
+
 	startPredicateEvalTime := time.Now()
 	filteredNodes, failedPredicateMap, filteredNodesStatuses, err := g.findNodesThatFit(ctx, state, pod)
 	if err != nil {
@@ -213,7 +212,7 @@ func (g *genericScheduler) Schedule(ctx context.Context, state *framework.CycleS
 	if len(filteredNodes) == 0 {
 		return result, &FitError{
 			Pod: pod,
-			NumAllNodes: numNodes,
+			NumAllNodes: len(g.nodeInfoSnapshot.NodeInfoList),
 			FailedPredicates: failedPredicateMap,
 			FilteredNodesStatuses: filteredNodesStatuses,
 		}
@@ -460,13 +459,13 @@ func (g *genericScheduler) findNodesThatFit(ctx context.Context, state *framewor
 	if len(g.predicates) == 0 && !g.framework.HasFilterPlugins() {
 		filtered = g.nodeInfoSnapshot.ListNodes()
 	} else {
-		allNodes := int32(g.cache.NodeTree().NumNodes())
+		allNodes := int32(len(g.nodeInfoSnapshot.NodeInfoList))
 		numNodesToFind := g.numFeasibleNodesToFind(allNodes)
 
 		// Create filtered list with enough space to avoid growing it
 		// and allow assigning.
 		filtered = make([]*v1.Node, numNodesToFind)
-		errs := errors.MessageCountMap{}
+		errCh := util.NewErrorChannel()
 		var (
 			predicateResultLock sync.Mutex
 			filteredLen int32
@@ -479,20 +478,17 @@ func (g *genericScheduler) findNodesThatFit(ctx context.Context, state *framewor
 		state.Write(migration.PredicatesStateKey, &migration.PredicatesStateData{Reference: meta})
 
 		checkNode := func(i int) {
-			nodeName := g.cache.NodeTree().Next()
-
+			nodeInfo := g.nodeInfoSnapshot.NodeInfoList[i]
 			fits, failedPredicates, status, err := g.podFitsOnNode(
 				ctx,
 				state,
 				pod,
 				meta,
-				g.nodeInfoSnapshot.NodeInfoMap[nodeName],
+				nodeInfo,
 				g.alwaysCheckAllPredicates,
 			)
 			if err != nil {
-				predicateResultLock.Lock()
-				errs[err.Error()]++
-				predicateResultLock.Unlock()
+				errCh.SendErrorWithCancel(err, cancel)
 				return
 			}
 			if fits {
@@ -501,15 +497,15 @@ func (g *genericScheduler) findNodesThatFit(ctx context.Context, state *framewor
 					cancel()
 					atomic.AddInt32(&filteredLen, -1)
 				} else {
-					filtered[length-1] = g.nodeInfoSnapshot.NodeInfoMap[nodeName].Node()
+					filtered[length-1] = nodeInfo.Node()
 				}
 			} else {
 				predicateResultLock.Lock()
 				if !status.IsSuccess() {
-					filteredNodesStatuses[nodeName] = status
+					filteredNodesStatuses[nodeInfo.Node().Name] = status
 				}
 				if len(failedPredicates) != 0 {
-					failedPredicateMap[nodeName] = failedPredicates
+					failedPredicateMap[nodeInfo.Node().Name] = failedPredicates
 				}
 				predicateResultLock.Unlock()
 			}
@@ -520,8 +516,8 @@ func (g *genericScheduler) findNodesThatFit(ctx context.Context, state *framewor
 		workqueue.ParallelizeUntil(ctx, 16, int(allNodes), checkNode)
 
 		filtered = filtered[:filteredLen]
-		if len(errs) > 0 {
-			return []*v1.Node{}, FailedPredicateMap{}, framework.NodeToStatusMap{}, errors.CreateAggregateFromMessageCountMap(errs)
+		if err := errCh.ReceiveError(); err != nil {
+			return []*v1.Node{}, FailedPredicateMap{}, framework.NodeToStatusMap{}, err
 		}
 	}

Review thread on the switch from aggregating errors to errCh.ReceiveError():

Review comment: This doesn't look right. We used to return all predicate errors to the user, but this changes it to a single error.

Reply: This was collecting errors from all nodes, not predicates. Why is it useful to keep examining all nodes and end up exiting anyway? Remember that this is an error, not a predicate failure, so it is likely that something internal went wrong and caused it; continuing to iterate over all the nodes does not seem useful to me.

Reply: Ah, I misread.
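For reference, the util.NewErrorChannel() used above follows a first-error-wins pattern: the first internal error is recorded and the shared context is cancelled so the remaining checkNode workers stop early, instead of aggregating per-node error counts. Below is a minimal sketch of that pattern (my own illustration, not necessarily the exact pkg/scheduler/util code):

```go
package util

import "context"

// ErrorChannel keeps only the first error sent and optionally cancels
// in-flight work. A sketch of the pattern, not the verbatim scheduler code.
type ErrorChannel struct {
	errCh chan error
}

func NewErrorChannel() *ErrorChannel {
	return &ErrorChannel{errCh: make(chan error, 1)}
}

// SendErrorWithCancel stores err if no error was stored yet, then cancels the
// surrounding context so other workers stop early.
func (e *ErrorChannel) SendErrorWithCancel(err error, cancel context.CancelFunc) {
	select {
	case e.errCh <- err:
	default: // an error is already buffered; drop this one
	}
	cancel()
}

// ReceiveError returns the stored error, or nil if none was sent.
func (e *ErrorChannel) ReceiveError() error {
	select {
	case err := <-e.errCh:
		return err
	default:
		return nil
	}
}
```

Because the channel is buffered with capacity one and sends are non-blocking, only the first error survives, which is where the thread above lands: once something internal fails on one node, scanning the remaining nodes adds no value.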
File: pkg/scheduler/internal/cache/cache.go
@@ -74,7 +74,7 @@ type schedulerCache struct {
 	// headNode points to the most recently updated NodeInfo in "nodes". It is the
 	// head of the linked list.
 	headNode *nodeInfoListItem
-	nodeTree *NodeTree
+	nodeTree *nodeTree
 	// A map from image name to its imageState.
 	imageStates map[string]*imageState
 }
@@ -238,6 +238,17 @@ func (cache *schedulerCache) UpdateNodeInfoSnapshot(nodeSnapshot *schedulernodei
 			}
 		}
 	}
 
+	// Take a snapshot of the nodes order in the tree
+	nodeSnapshot.NodeInfoList = make([]*schedulernodeinfo.NodeInfo, 0, cache.nodeTree.numNodes)
+	for i := 0; i < cache.nodeTree.numNodes; i++ {
+		nodeName := cache.nodeTree.next()
+		if n := nodeSnapshot.NodeInfoMap[nodeName]; n != nil {
+			nodeSnapshot.NodeInfoList = append(nodeSnapshot.NodeInfoList, n)
+		} else {
+			klog.Errorf("node %q exist in nodeTree but not in NodeInfoMap, this should not happen.", nodeName)
+		}
+	}
+
 	return nil
 }

Review thread on this loop:

Review comment: The loop seems to be both memory- and time-consuming; can we get a benchmark covering the whole scheduling cycle?

Reply: It is not, really. As described in the issue description, we gain about 3%. As for memory, this is just an array of pointers, so even for a cluster with 5k nodes the overhead is negligible. Note that we do something similar in the predicates metadata.
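A back-of-the-envelope check of the "just an array of pointers" claim (a standalone sketch with a stand-in type, not scheduler code): NodeInfoList adds one pointer per node on top of the NodeInfo objects the cache already stores.

```go
package main

import (
	"fmt"
	"unsafe"
)

// nodeInfo stands in for schedulernodeinfo.NodeInfo; the snapshot list only
// stores pointers to objects the cache already holds.
type nodeInfo struct{}

func main() {
	const numNodes = 5000
	list := make([]*nodeInfo, 0, numNodes)
	// 5,000 pointers at 8 bytes each on a 64-bit platform: roughly 40 KB.
	fmt.Printf("backing array: %d bytes\n", uintptr(cap(list))*unsafe.Sizeof((*nodeInfo)(nil)))
}
```

Even at 5,000 nodes that is on the order of 40 KB per snapshot, which supports the claim that the list itself is negligible next to the NodeInfo data the cache already keeps.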
@@ -516,7 +527,7 @@ func (cache *schedulerCache) AddNode(node *v1.Node) error {
 	}
 	cache.moveNodeInfoToHead(node.Name)
 
-	cache.nodeTree.AddNode(node)
+	cache.nodeTree.addNode(node)
 	cache.addNodeImageStates(node, n.info)
 	return n.info.SetNode(node)
 }
@@ -534,7 +545,7 @@ func (cache *schedulerCache) UpdateNode(oldNode, newNode *v1.Node) error {
 	}
 	cache.moveNodeInfoToHead(newNode.Name)
 
-	cache.nodeTree.UpdateNode(oldNode, newNode)
+	cache.nodeTree.updateNode(oldNode, newNode)
 	cache.addNodeImageStates(newNode, n.info)
 	return n.info.SetNode(newNode)
 }
@@ -560,7 +571,7 @@ func (cache *schedulerCache) RemoveNode(node *v1.Node) error {
 		cache.moveNodeInfoToHead(node.Name)
 	}
 
-	if err := cache.nodeTree.RemoveNode(node); err != nil {
+	if err := cache.nodeTree.removeNode(node); err != nil {
 		return err
 	}
 	cache.removeNodeImageStates(node)
@@ -688,10 +699,6 @@ func (cache *schedulerCache) expirePod(key string, ps *podState) error {
 	return nil
 }
 
-func (cache *schedulerCache) NodeTree() *NodeTree {
-	return cache.nodeTree
-}
-
 // GetNodeInfo returns cached data for the node name.
 func (cache *schedulerCache) GetNodeInfo(nodeName string) (*v1.Node, error) {
 	cache.mu.RLock()
Post-merge review comment:

This broke scalability tests: #84151. TL;DR: it breaks spreading of pods in large clusters.

What exactly happened:

1. In large enough clusters, we use the feature of finding only N feasible nodes and scoring only those (kubernetes/pkg/scheduler/core/generic_scheduler.go, line 464 at 9d17385). So assume you have 5k nodes, all of them feasible, and numFeasibleNodesToFind chooses 250.
2. With this PR, next() is called only in UpdateNodeInfoSnapshot (https://github.com/kubernetes/kubernetes/pull/84014/files#diff-f4a894ca5e905aa5f613269fc967fe2cR206), so if the set of nodes doesn't change, we will pretty much always generate the same set of candidate nodes.

This breaks the expectation that the scheduler schedules across the whole cluster. While it is not a documented feature per se, I don't think this is the right thing to do (see the sketch at the end of this thread).

I'm going to open a revert of this PR to fix the scalability tests (or half of them, because we seem to have two different regressions), but will wait for your explicit approval. We can discuss how to fix it later.
Follow-up: heh... it's no longer possible to autorevert it.
Follow-up: Fortunately the conflicts were trivial; opened #84222.
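To make the spreading regression concrete, here is a toy, self-contained sketch (not the scheduler's actual code; findFeasible and the 250-node cutoff are illustrative stand-ins for findNodesThatFit and numFeasibleNodesToFind): when every scheduling cycle walks the snapshot's node list from index 0 and stops once enough feasible nodes are found, an unchanged cluster produces the same candidate set every time, so later nodes never even reach scoring.

```go
package main

import "fmt"

// findFeasible mimics the post-PR behavior: it always starts at index 0 of a
// fixed-order node list and stops once numToFind nodes pass filtering.
func findFeasible(nodes []string, numToFind int) []string {
	feasible := make([]string, 0, numToFind)
	for _, n := range nodes { // no cursor persists across scheduling cycles
		feasible = append(feasible, n) // pretend every node fits
		if len(feasible) == numToFind {
			break
		}
	}
	return feasible
}

func main() {
	nodes := make([]string, 5000)
	for i := range nodes {
		nodes[i] = fmt.Sprintf("node-%d", i)
	}
	// Two consecutive "cycles" over an unchanged cluster return the identical
	// slice node-0..node-249; node-250 and beyond are never considered.
	fmt.Println(findFeasible(nodes, 250)[249], findFeasible(nodes, 250)[249])
}
```

Before this PR, the cursor lived in the cache's node tree and NodeTree().Next() advanced it across scheduling cycles, so successive pods started their search at different positions; the revert in #84222 restores that behavior.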