
Scheduler should use a shared informer, and fix broken watch behavior for cached watches #46223

Merged
2 commits merged on May 23, 2017
1 change: 1 addition & 0 deletions plugin/cmd/kube-scheduler/app/BUILD
@@ -24,6 +24,7 @@ go_library(
"//pkg/client/informers/informers_generated/externalversions/extensions/v1beta1:go_default_library",
"//pkg/client/leaderelection:go_default_library",
"//pkg/client/leaderelection/resourcelock:go_default_library",
"//pkg/controller:go_default_library",
"//pkg/util/configz:go_default_library",
"//plugin/cmd/kube-scheduler/app/options:go_default_library",
"//plugin/pkg/scheduler:go_default_library",
2 changes: 2 additions & 0 deletions plugin/cmd/kube-scheduler/app/configurator.go
@@ -77,6 +77,7 @@ func CreateScheduler(
s *options.SchedulerServer,
kubecli *clientset.Clientset,
nodeInformer coreinformers.NodeInformer,
podInformer coreinformers.PodInformer,
pvInformer coreinformers.PersistentVolumeInformer,
pvcInformer coreinformers.PersistentVolumeClaimInformer,
replicationControllerInformer coreinformers.ReplicationControllerInformer,
@@ -89,6 +90,7 @@ func CreateScheduler(
s.SchedulerName,
kubecli,
nodeInformer,
podInformer,
pvInformer,
pvcInformer,
replicationControllerInformer,
7 changes: 7 additions & 0 deletions plugin/cmd/kube-scheduler/app/server.go
@@ -31,9 +31,11 @@ import (
informers "k8s.io/kubernetes/pkg/client/informers/informers_generated/externalversions"
"k8s.io/kubernetes/pkg/client/leaderelection"
"k8s.io/kubernetes/pkg/client/leaderelection/resourcelock"
"k8s.io/kubernetes/pkg/controller"
"k8s.io/kubernetes/pkg/util/configz"
"k8s.io/kubernetes/plugin/cmd/kube-scheduler/app/options"
_ "k8s.io/kubernetes/plugin/pkg/scheduler/algorithmprovider"
"k8s.io/kubernetes/plugin/pkg/scheduler/factory"

"github.com/golang/glog"
"github.com/prometheus/client_golang/prometheus"
@@ -71,11 +73,14 @@ func Run(s *options.SchedulerServer) error {
recorder := createRecorder(kubecli, s)

informerFactory := informers.NewSharedInformerFactory(kubecli, 0)
// cache only non-terminal pods
podInformer := factory.NewPodInformer(kubecli, 0)

sched, err := CreateScheduler(
s,
kubecli,
informerFactory.Core().V1().Nodes(),
podInformer,
informerFactory.Core().V1().PersistentVolumes(),
informerFactory.Core().V1().PersistentVolumeClaims(),
informerFactory.Core().V1().ReplicationControllers(),
@@ -92,9 +97,11 @@ func Run(s *options.SchedulerServer) error {

stop := make(chan struct{})
defer close(stop)
go podInformer.Informer().Run(stop)
informerFactory.Start(stop)
// Wait for all caches to sync before scheduling.
informerFactory.WaitForCacheSync(stop)
controller.WaitForCacheSync("scheduler", stop, podInformer.Informer().HasSynced)

run := func(_ <-chan struct{}) {
sched.Run()
2 changes: 2 additions & 0 deletions plugin/pkg/scheduler/BUILD
@@ -41,6 +41,7 @@ go_library(
],
tags = ["automanaged"],
deps = [
"//pkg/api:go_default_library",
"//pkg/api/v1:go_default_library",
"//pkg/client/clientset_generated/clientset:go_default_library",
"//pkg/client/listers/core/v1:go_default_library",
@@ -52,6 +53,7 @@ go_library(
"//plugin/pkg/scheduler/util:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/runtime:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library",
"//vendor/k8s.io/client-go/tools/cache:go_default_library",
@@ -352,6 +352,7 @@ func TestCompatibility_v1_Scheduler(t *testing.T) {
"some-scheduler-name",
client,
informerFactory.Core().V1().Nodes(),
informerFactory.Core().V1().Pods(),
informerFactory.Core().V1().PersistentVolumes(),
informerFactory.Core().V1().PersistentVolumeClaims(),
informerFactory.Core().V1().ReplicationControllers(),
2 changes: 2 additions & 0 deletions plugin/pkg/scheduler/factory/BUILD
@@ -38,6 +38,8 @@ go_library(
"//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/fields:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/labels:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/runtime/schema:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/types:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/runtime:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
192 changes: 155 additions & 37 deletions plugin/pkg/scheduler/factory/factory.go
@@ -26,6 +26,8 @@ import (
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/sets"
@@ -82,7 +84,7 @@ type ConfigFactory struct {
// Close this to stop all reflectors
StopEverything chan struct{}

scheduledPodPopulator cache.Controller
scheduledPodsHasSynced cache.InformerSynced

schedulerCache schedulercache.Cache

@@ -105,6 +107,7 @@ func NewConfigFactory(
schedulerName string,
client clientset.Interface,
nodeInformer coreinformers.NodeInformer,
podInformer coreinformers.PodInformer,
pvInformer coreinformers.PersistentVolumeInformer,
pvcInformer coreinformers.PersistentVolumeClaimInformer,
replicationControllerInformer coreinformers.ReplicationControllerInformer,
@@ -132,23 +135,60 @@ func NewConfigFactory(
hardPodAffinitySymmetricWeight: hardPodAffinitySymmetricWeight,
}

// On add/delete to the scheduled pods, remove from the assumed pods.
// We construct this here instead of in CreateFromKeys because
// ScheduledPodLister is something we provide to plug in functions that
// they may need to call.
var scheduledPodIndexer cache.Indexer
scheduledPodIndexer, c.scheduledPodPopulator = cache.NewIndexerInformer(
c.createAssignedNonTerminatedPodLW(),
&v1.Pod{},
0,
cache.ResourceEventHandlerFuncs{
AddFunc: c.addPodToCache,
UpdateFunc: c.updatePodInCache,
DeleteFunc: c.deletePodFromCache,
c.scheduledPodsHasSynced = podInformer.Informer().HasSynced
// scheduled pod cache
podInformer.Informer().AddEventHandler(
cache.FilteringResourceEventHandler{
Member commented:

Will the informer cache all Pods locally in the scheduler and filter them there, or is the filtering handled in the apiserver?

Contributor Author replied:

That's the informer's job. The informer in this package is filtered to non-terminal pods, and then split in half into the pod cache (assigned) and the queue (unassigned). The global informer in the controller manager (which this PR does not use, but an extension scheduler or hyperkube could use) would be global, and this filters down to that set.
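
For illustration, a minimal sketch of that split, not taken from this PR's diff: it assumes current k8s.io/client-go and k8s.io/api import paths rather than the vendored paths in this tree, and newNonTerminalPodInformer / splitPodEvents are invented names. The field selector pushes the terminal-pod filter to the apiserver; the two FilteringResourceEventHandlers then split the single event stream client-side into assigned and unassigned pods.

package informersketch

import (
	"time"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/cache"
)

// newNonTerminalPodInformer builds a shared informer whose ListWatch asks the
// apiserver to drop Succeeded/Failed pods, so terminal pods never reach the
// scheduler at all (server-side filtering).
func newNonTerminalPodInformer(client kubernetes.Interface, resync time.Duration) cache.SharedIndexInformer {
	selector := fields.ParseSelectorOrDie(
		"status.phase!=" + string(v1.PodSucceeded) + ",status.phase!=" + string(v1.PodFailed))
	lw := cache.NewListWatchFromClient(client.CoreV1().RESTClient(), "pods", metav1.NamespaceAll, selector)
	return cache.NewSharedIndexInformer(lw, &v1.Pod{}, resync,
		cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc})
}

// splitPodEvents registers two client-side filters on the same informer:
// assigned pods feed one callback (the scheduler cache in the real code),
// unassigned pods feed the other (the scheduling queue in the real code).
// Delete handling is omitted here for brevity.
func splitPodEvents(informer cache.SharedIndexInformer, onAssigned, onUnassigned func(*v1.Pod)) {
	route := func(assigned bool, sink func(*v1.Pod)) cache.FilteringResourceEventHandler {
		return cache.FilteringResourceEventHandler{
			FilterFunc: func(obj interface{}) bool {
				pod, ok := obj.(*v1.Pod)
				return ok && (len(pod.Spec.NodeName) != 0) == assigned
			},
			Handler: cache.ResourceEventHandlerFuncs{
				AddFunc:    func(obj interface{}) { sink(obj.(*v1.Pod)) },
				UpdateFunc: func(_, newObj interface{}) { sink(newObj.(*v1.Pod)) },
			},
		}
	}
	informer.AddEventHandler(route(true, onAssigned))
	informer.AddEventHandler(route(false, onUnassigned))
}

The point of the split is that a single watch connection and a single store serve both consumers; each handler only filters the event stream, it does not re-list from the apiserver.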

FilterFunc: func(obj interface{}) bool {
switch t := obj.(type) {
case *v1.Pod:
return assignedNonTerminatedPod(t)
default:
runtime.HandleError(fmt.Errorf("unable to handle object in %T: %T", c, obj))
return false
}
},
Handler: cache.ResourceEventHandlerFuncs{
AddFunc: c.addPodToCache,
UpdateFunc: c.updatePodInCache,
DeleteFunc: c.deletePodFromCache,
},
},
)
// unscheduled pod queue
podInformer.Informer().AddEventHandler(
cache.FilteringResourceEventHandler{
FilterFunc: func(obj interface{}) bool {
switch t := obj.(type) {
case *v1.Pod:
return unassignedNonTerminatedPod(t)
default:
runtime.HandleError(fmt.Errorf("unable to handle object in %T: %T", c, obj))
return false
}
},
Handler: cache.ResourceEventHandlerFuncs{
AddFunc: func(obj interface{}) {
if err := c.podQueue.Add(obj); err != nil {
runtime.HandleError(fmt.Errorf("unable to queue %T: %v", obj, err))
}
},
UpdateFunc: func(oldObj, newObj interface{}) {
if err := c.podQueue.Update(newObj); err != nil {
runtime.HandleError(fmt.Errorf("unable to update %T: %v", newObj, err))
}
},
DeleteFunc: func(obj interface{}) {
if err := c.podQueue.Delete(obj); err != nil {
runtime.HandleError(fmt.Errorf("unable to dequeue %T: %v", obj, err))
}
},
},
},
cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc},
)
c.scheduledPodLister = corelisters.NewPodLister(scheduledPodIndexer)
// ScheduledPodLister is provided to plug-in functions that may need to call it.
c.scheduledPodLister = assignedPodLister{podInformer.Lister()}

// Only nodes in the "Ready" condition with status == "True" are schedulable
nodeInformer.Informer().AddEventHandlerWithResyncPeriod(
@@ -369,7 +409,6 @@ func (f *ConfigFactory) CreateFromKeys(predicateKeys, priorityKeys sets.String,
return nil, err
}

f.Run()
// TODO(resouer) use equivalence cache instead of nil here when #36238 get merged
algo := core.NewGenericScheduler(f.schedulerCache, nil, predicateFuncs, predicateMetaProducer, priorityConfigs, priorityMetaProducer, extenders)
podBackoff := util.CreateDefaultPodBackoff()
@@ -381,7 +420,7 @@ func (f *ConfigFactory) CreateFromKeys(predicateKeys, priorityKeys sets.String,
Binder: &binder{f.client},
PodConditionUpdater: &podConditionUpdater{f.client},
WaitForCacheSync: func() bool {
return cache.WaitForCacheSync(f.StopEverything, f.scheduledPodPopulator.HasSynced)
return cache.WaitForCacheSync(f.StopEverything, f.scheduledPodsHasSynced)
},
NextPod: func() *v1.Pod {
return f.getNextPod()
@@ -450,14 +489,6 @@ func (f *ConfigFactory) getPluginArgs() (*PluginFactoryArgs, error) {
}, nil
}

func (f *ConfigFactory) Run() {
// Watch and queue pods that need scheduling.
cache.NewReflector(f.createUnassignedNonTerminatedPodLW(), &v1.Pod{}, f.podQueue, 0).RunUntil(f.StopEverything)

// Begin populating scheduled pods.
go f.scheduledPodPopulator.Run(f.StopEverything)
}

func (f *ConfigFactory) getNextPod() *v1.Pod {
for {
pod := cache.Pop(f.podQueue).(*v1.Pod)
@@ -500,19 +531,106 @@ func getNodeConditionPredicate() corelisters.NodeConditionPredicate {
}
}

// Returns a cache.ListWatch that finds all pods that need to be
// scheduled.
func (factory *ConfigFactory) createUnassignedNonTerminatedPodLW() *cache.ListWatch {
selector := fields.ParseSelectorOrDie("spec.nodeName==" + "" + ",status.phase!=" + string(v1.PodSucceeded) + ",status.phase!=" + string(v1.PodFailed))
return cache.NewListWatchFromClient(factory.client.Core().RESTClient(), "pods", metav1.NamespaceAll, selector)
// unassignedNonTerminatedPod selects pods that are unassigned and non-terminal.
func unassignedNonTerminatedPod(pod *v1.Pod) bool {
if len(pod.Spec.NodeName) != 0 {
return false
}
if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
return false
}
return true
}

// Returns a cache.ListWatch that finds all pods that are
// already scheduled.
// TODO: return a ListerWatcher interface instead?
func (factory *ConfigFactory) createAssignedNonTerminatedPodLW() *cache.ListWatch {
selector := fields.ParseSelectorOrDie("spec.nodeName!=" + "" + ",status.phase!=" + string(v1.PodSucceeded) + ",status.phase!=" + string(v1.PodFailed))
return cache.NewListWatchFromClient(factory.client.Core().RESTClient(), "pods", metav1.NamespaceAll, selector)
// assignedNonTerminatedPod selects pods that are assigned and non-terminal (scheduled and running).
func assignedNonTerminatedPod(pod *v1.Pod) bool {
if len(pod.Spec.NodeName) == 0 {
return false
}
if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
return false
}
return true
}

// assignedPodLister filters the pods returned from a PodLister to
// only include those that have a node name set.
type assignedPodLister struct {
corelisters.PodLister
}

// List lists all pods in the indexer that are assigned to a node.
func (l assignedPodLister) List(selector labels.Selector) ([]*v1.Pod, error) {
list, err := l.PodLister.List(selector)
if err != nil {
return nil, err
}
filtered := make([]*v1.Pod, 0, len(list))
for _, pod := range list {
if len(pod.Spec.NodeName) > 0 {
filtered = append(filtered, pod)
}
}
return filtered, nil
}

// Pods returns a lister for pods in the given namespace that are assigned to a node.
func (l assignedPodLister) Pods(namespace string) corelisters.PodNamespaceLister {
return assignedPodNamespaceLister{l.PodLister.Pods(namespace)}
}

// assignedPodNamespaceLister filters the pods returned from a PodNamespaceLister to
// only include those that have a node name set.
type assignedPodNamespaceLister struct {
corelisters.PodNamespaceLister
}

// List lists all pods in the indexer for the given namespace that are assigned to a node.
func (l assignedPodNamespaceLister) List(selector labels.Selector) (ret []*v1.Pod, err error) {
list, err := l.PodNamespaceLister.List(selector)
if err != nil {
return nil, err
}
filtered := make([]*v1.Pod, 0, len(list))
for _, pod := range list {
if len(pod.Spec.NodeName) > 0 {
filtered = append(filtered, pod)
}
}
return filtered, nil
}

// Get retrieves the Pod from the indexer for a given namespace and name.
func (l assignedPodNamespaceLister) Get(name string) (*v1.Pod, error) {
pod, err := l.PodNamespaceLister.Get(name)
if err != nil {
return nil, err
}
if len(pod.Spec.NodeName) > 0 {
return pod, nil
}
return nil, errors.NewNotFound(schema.GroupResource{Resource: "pods"}, name)
}

type podInformer struct {
informer cache.SharedIndexInformer
}

func (i *podInformer) Informer() cache.SharedIndexInformer {
return i.informer
}

func (i *podInformer) Lister() corelisters.PodLister {
return corelisters.NewPodLister(i.informer.GetIndexer())
}

// NewPodInformer creates a shared index informer that returns only non-terminal pods.
func NewPodInformer(client clientset.Interface, resyncPeriod time.Duration) coreinformers.PodInformer {
selector := fields.ParseSelectorOrDie("status.phase!=" + string(v1.PodSucceeded) + ",status.phase!=" + string(v1.PodFailed))
lw := cache.NewListWatchFromClient(client.Core().RESTClient(), "pods", metav1.NamespaceAll, selector)
return &podInformer{
informer: cache.NewSharedIndexInformer(lw, &v1.Pod{}, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}),
}
}

func (factory *ConfigFactory) MakeDefaultErrorFunc(backoff *util.PodBackoff, podQueue *cache.FIFO) func(pod *v1.Pod, err error) {