Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: scheduler framework: support simulating cluster changes during autoscaling #120936

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pkg/scheduler/framework/cycle_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ var (
type StateData interface {
// Clone is an interface to make a copy of StateData. For performance reasons,
// clone should make shallow copies for members (e.g., slices or maps) that are not
// impacted by PreFilter's optional AddPod/RemovePod methods.
// impacted by PreFilter's optional AddPod/RemovePod methods or by
// ClusterAutoScalerPlugin's SimulateBindPod.
Clone() StateData
}

Expand Down
56 changes: 56 additions & 0 deletions pkg/scheduler/framework/interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,59 @@ type Plugin interface {
Name() string
}

// ClusterAutoScalerPlugin is an interface that is used only by the cluster autoscaler.
// It enables plugins to store state across different scheduling cycles.
//
// The usual call sequence of a plugin when used in the scheduler is:
//   - at program startup:
//     - instantiate plugin
//     - EventsToRegister
//   - for each new pod:
//     - PreEnqueue
//   - for each pod that is ready to be scheduled, one pod at a time:
//     - PreFilter, Filter, etc.
//
// Cluster autoscaler works a bit differently. It identifies all pending pods,
// takes a snapshot of the current cluster state, and then simulates the effect
// of scheduling those pods with additional nodes added to the cluster. To
// determine whether a pod fits into one of these simulated nodes, it
// uses the same PreFilter and Filter plugins as the scheduler. Other extension
// points (Reserve, Bind) are not used. Plugins which modify the cluster state
// therefore need a different way of recording the result of scheduling
// a pod onto a node. This is done through ClusterAutoScalerPlugin.
//
// Cluster autoscaler will:
//   - at program startup:
//     - instantiate plugin, with real informer factory and no Kubernetes client
//     - start informers
//   - at the start of a simulation:
//     - call StartSimulation with a clean cycle state
//   - for each pending pod:
//     - call PreFilter and Filter with the same cycle state that
//       was passed to StartSimulation
//     - call SimulateBindPod with the same cycle state that
//       was passed to StartSimulation (i.e. *not* the one which was modified
//       by PreFilter or Filter) to indicate that a pod is being scheduled onto a node
//       as part of the simulation
//
// A plugin may:
//   - Take a snapshot of all relevant cluster state as part of StartSimulation
//     and store it in the cycle state. This signals to the other extension
//     points that the plugin is being used as part of the cluster autoscaler.
//   - In PreFilter and Filter use the cluster snapshot to make decisions
//     instead of the normal "live" cluster state.
//   - In SimulateBindPod update the snapshot in the cycle state.
type ClusterAutoScalerPlugin interface {
Plugin
// StartSimulation is called when the cluster autoscaler begins
// a simulation. The cluster autoscaler passes in a clean cycle state and
// reuses that state for the PreFilter, Filter and SimulateBindPod calls
// of the same simulation, so anything stored here is visible to them.
StartSimulation(ctx context.Context, state *CycleState) *Status
// SimulateBindPod is called when the cluster autoscaler decided to schedule
// a pod onto a certain node. The state is the cycle state that was passed
// to StartSimulation, not the one which was modified by PreFilter or
// Filter. The pod is the pod being placed and nodeInfo describes the node
// it is placed on as part of the simulation.
SimulateBindPod(ctx context.Context, state *CycleState, pod *v1.Pod, nodeInfo *NodeInfo) *Status
// TODO(?): ClusterAutoScalerPluginExtensions
}

// PreEnqueuePlugin is an interface that must be implemented by "PreEnqueue" plugins.
// These plugins are called prior to adding Pods to activeQ.
// Note: a preEnqueue plugin is expected to be lightweight and efficient, so it's not expected to
Expand Down Expand Up @@ -512,6 +565,9 @@ type BindPlugin interface {
type Framework interface {
Handle

// ClusterAutoscalerPlugins returns the registered ClusterAutoscaler plugins.
ClusterAutoscalerPlugins() []ClusterAutoScalerPlugin

// PreEnqueuePlugins returns the registered preEnqueue plugins.
PreEnqueuePlugins() []PreEnqueuePlugin

Expand Down
55 changes: 38 additions & 17 deletions pkg/scheduler/framework/runtime/framework.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,23 +47,24 @@ const (
// frameworkImpl is the component responsible for initializing and running scheduler
// plugins.
type frameworkImpl struct {
registry Registry
snapshotSharedLister framework.SharedLister
waitingPods *waitingPodsMap
scorePluginWeight map[string]int
preEnqueuePlugins []framework.PreEnqueuePlugin
enqueueExtensions []framework.EnqueueExtensions
queueSortPlugins []framework.QueueSortPlugin
preFilterPlugins []framework.PreFilterPlugin
filterPlugins []framework.FilterPlugin
postFilterPlugins []framework.PostFilterPlugin
preScorePlugins []framework.PreScorePlugin
scorePlugins []framework.ScorePlugin
reservePlugins []framework.ReservePlugin
preBindPlugins []framework.PreBindPlugin
bindPlugins []framework.BindPlugin
postBindPlugins []framework.PostBindPlugin
permitPlugins []framework.PermitPlugin
registry Registry
snapshotSharedLister framework.SharedLister
waitingPods *waitingPodsMap
scorePluginWeight map[string]int
clusterAutoScalerPlugins []framework.ClusterAutoScalerPlugin
preEnqueuePlugins []framework.PreEnqueuePlugin
enqueueExtensions []framework.EnqueueExtensions
queueSortPlugins []framework.QueueSortPlugin
preFilterPlugins []framework.PreFilterPlugin
filterPlugins []framework.FilterPlugin
postFilterPlugins []framework.PostFilterPlugin
preScorePlugins []framework.PreScorePlugin
scorePlugins []framework.ScorePlugin
reservePlugins []framework.ReservePlugin
preBindPlugins []framework.PreBindPlugin
bindPlugins []framework.BindPlugin
postBindPlugins []framework.PostBindPlugin
permitPlugins []framework.PermitPlugin

clientSet clientset.Interface
kubeConfig *restclient.Config
Expand Down Expand Up @@ -325,6 +326,21 @@ func NewFramework(ctx context.Context, r Registry, profile *config.KubeScheduler
}
}

// ClusterAutoscaler plugins are not configured separately. Any PreFilter or Filter plugin
// may implement that additional interface.
clusterAutoScalerPlugins := sets.New[framework.ClusterAutoScalerPlugin]()
for _, plugin := range f.preFilterPlugins {
if plugin, ok := plugin.(framework.ClusterAutoScalerPlugin); ok {
clusterAutoScalerPlugins.Insert(plugin)
}
}
for _, plugin := range f.filterPlugins {
if plugin, ok := plugin.(framework.ClusterAutoScalerPlugin); ok {
clusterAutoScalerPlugins.Insert(plugin)
}
}
f.clusterAutoScalerPlugins = clusterAutoScalerPlugins.UnsortedList()

if len(f.queueSortPlugins) != 1 {
return nil, fmt.Errorf("only one queue sort plugin required for profile with scheduler name %q, but got %d", profile.SchedulerName, len(f.queueSortPlugins))
}
Expand Down Expand Up @@ -598,6 +614,11 @@ func updatePluginList(pluginList interface{}, pluginSet config.PluginSet, plugin
return nil
}

// ClusterAutoscalerPlugins returns the registered ClusterAutoscaler plugins.
// The slice stored in the framework is returned as-is, without copying, so
// callers share its backing array with the framework.
func (f *frameworkImpl) ClusterAutoscalerPlugins() []framework.ClusterAutoScalerPlugin {
return f.clusterAutoScalerPlugins
}

// PreEnqueuePlugins returns the registered preEnqueue plugins.
func (f *frameworkImpl) PreEnqueuePlugins() []framework.PreEnqueuePlugin {
return f.preEnqueuePlugins
Expand Down