New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Use endpoints informer for the endpoint controller #47731
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|
@@ -31,6 +31,7 @@ import ( | |
"k8s.io/apimachinery/pkg/util/wait" | ||
"k8s.io/client-go/tools/cache" | ||
"k8s.io/client-go/util/workqueue" | ||
"k8s.io/kubernetes/pkg/api" | ||
"k8s.io/kubernetes/pkg/api/v1/endpoints" | ||
podutil "k8s.io/kubernetes/pkg/api/v1/pod" | ||
"k8s.io/kubernetes/pkg/client/clientset_generated/clientset" | ||
|
@@ -69,13 +70,15 @@ var ( | |
) | ||
|
||
// NewEndpointController returns a new *EndpointController. | ||
func NewEndpointController(podInformer coreinformers.PodInformer, serviceInformer coreinformers.ServiceInformer, client clientset.Interface) *EndpointController { | ||
func NewEndpointController(podInformer coreinformers.PodInformer, serviceInformer coreinformers.ServiceInformer, | ||
endpointsInformer coreinformers.EndpointsInformer, client clientset.Interface) *EndpointController { | ||
if client != nil && client.Core().RESTClient().GetRateLimiter() != nil { | ||
metrics.RegisterMetricAndTrackRateLimiterUsage("endpoint_controller", client.Core().RESTClient().GetRateLimiter()) | ||
} | ||
e := &EndpointController{ | ||
client: client, | ||
queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "endpoint"), | ||
client: client, | ||
queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "endpoint"), | ||
workerLoopPeriod: time.Second, | ||
} | ||
|
||
serviceInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ | ||
|
@@ -96,6 +99,9 @@ func NewEndpointController(podInformer coreinformers.PodInformer, serviceInforme | |
e.podLister = podInformer.Lister() | ||
e.podsSynced = podInformer.Informer().HasSynced | ||
|
||
e.endpointsLister = endpointsInformer.Lister() | ||
e.endpointsSynced = endpointsInformer.Informer().HasSynced | ||
|
||
return e | ||
} | ||
|
||
|
@@ -117,12 +123,22 @@ type EndpointController struct { | |
// Added as a member to the struct to allow injection for testing. | ||
podsSynced cache.InformerSynced | ||
|
||
// endpointsLister is able to list/get endpoints and is populated by the shared informer passed to | ||
// NewEndpointController. | ||
endpointsLister corelisters.EndpointsLister | ||
// endpointsSynced returns true if the endpoints shared informer has been synced at least once. | ||
// Added as a member to the struct to allow injection for testing. | ||
endpointsSynced cache.InformerSynced | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, please wait for endpoint cache to sync. You don't want to make decisions on unintentionally blank data. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. +1 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yup, I thought I had done it, will fix. |
||
|
||
// Services that need to be updated. A channel is inappropriate here, | ||
// because it allows services with lots of pods to be serviced much | ||
// more often than services with few pods; it also would cause a | ||
// service that's inserted multiple times to be processed more than | ||
// necessary. | ||
queue workqueue.RateLimitingInterface | ||
|
||
// workerLoopPeriod is the time between worker runs. The workers process the queue of service and pod changes. | ||
workerLoopPeriod time.Duration | ||
} | ||
|
||
// Runs e; will not return until stopCh is closed. workers determines how many | ||
|
@@ -134,12 +150,12 @@ func (e *EndpointController) Run(workers int, stopCh <-chan struct{}) { | |
glog.Infof("Starting endpoint controller") | ||
defer glog.Infof("Shutting down endpoint controller") | ||
|
||
if !controller.WaitForCacheSync("endpoint", stopCh, e.podsSynced, e.servicesSynced) { | ||
if !controller.WaitForCacheSync("endpoint", stopCh, e.podsSynced, e.servicesSynced, e.endpointsSynced) { | ||
return | ||
} | ||
|
||
for i := 0; i < workers; i++ { | ||
go wait.Until(e.worker, time.Second, stopCh) | ||
go wait.Until(e.worker, e.workerLoopPeriod, stopCh) | ||
} | ||
|
||
go func() { | ||
|
@@ -413,7 +429,7 @@ func (e *EndpointController) syncService(key string) error { | |
subsets = endpoints.RepackSubsets(subsets) | ||
|
||
// See if there's actually an update here. | ||
currentEndpoints, err := e.client.Core().Endpoints(service.Namespace).Get(service.Name, metav1.GetOptions{}) | ||
currentEndpoints, err := e.endpointsLister.Endpoints(service.Namespace).Get(service.Name) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You want to deep-copy these below before you mutate them. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. +1. You need to do deep copy in line 432 (in case when you didn't enter |
||
if err != nil { | ||
if errors.IsNotFound(err) { | ||
currentEndpoints = &v1.Endpoints{ | ||
|
@@ -432,7 +448,11 @@ func (e *EndpointController) syncService(key string) error { | |
glog.V(5).Infof("endpoints are equal for %s/%s, skipping update", service.Namespace, service.Name) | ||
return nil | ||
} | ||
newEndpoints := currentEndpoints | ||
copy, err := api.Scheme.DeepCopy(currentEndpoints) | ||
if err != nil { | ||
return err | ||
} | ||
newEndpoints := copy.(*v1.Endpoints) | ||
newEndpoints.Subsets = subsets | ||
newEndpoints.Labels = service.Labels | ||
if newEndpoints.Annotations == nil { | ||
|
@@ -468,13 +488,12 @@ func (e *EndpointController) syncService(key string) error { | |
// some stragglers could have been left behind if the endpoint controller | ||
// reboots). | ||
func (e *EndpointController) checkLeftoverEndpoints() { | ||
list, err := e.client.Core().Endpoints(metav1.NamespaceAll).List(metav1.ListOptions{}) | ||
list, err := e.endpointsLister.List(labels.Everything()) | ||
if err != nil { | ||
utilruntime.HandleError(fmt.Errorf("Unable to list endpoints (%v); orphaned endpoints will not be cleaned up. (They're pretty harmless, but you can restart this component if you want another attempt made.)", err)) | ||
return | ||
} | ||
for i := range list.Items { | ||
ep := &list.Items[i] | ||
for _, ep := range list { | ||
if _, ok := ep.Annotations[resourcelock.LeaderElectionRecordAnnotationKey]; ok { | ||
// when there are multiple controller-manager instances, | ||
// we observe that it will delete leader-election endpoints after 5min | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I didn't realize we weren't watching endpoints before - given that this controller is expected to know and mutate every endpoint, I think this is exactly what we should be doing. It will have a memory impact, but will drop a significant chunk of QPS from the apiservers.
@wojtek-t @gmarek may change our kube-mark profiles, but I would expect it to be in a positive way.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
QPS drops substantially in my testing. I also don't see spurious updates I was seeing before (during a resync, about 20% of the endpoints would cause a PUT even though nothing changed - not sure why).