New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Predicate caching and cleanup #33763
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
/* | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jayunit100 - thanks this is exactly what I was asking for. As you wrote somewhere, this PR became pretty big. Would you be able to separate this change (I mean this metadata file together with all the changes to integrate it in factory etc.) to a separate PR? And then in this PR you would leave the changes related to service-affinity? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @wojtek-t i could break it up, but my first question would be - how should we separate the changes?
so its a pretty cohesive change set. Either way is ok, just let me know... but figured i would double-check that you think its necessary... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that what you can separate is all the changes to ServiceAffinity (which means all the changes in "checkServiceAffinity" and the utils and utils_test.go files). This would significantly reduce the size of this PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmmm...
however, if you want i can think more deeply about separating it. ill let you make the final call either way is ok with me, i know youre very busy, so anything i can do to help ill do :) |
||
Copyright 2016 The Kubernetes Authors. | ||
|
||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package predicates | ||
|
||
import ( | ||
"github.com/golang/glog" | ||
"k8s.io/kubernetes/pkg/api" | ||
"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm" | ||
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache" | ||
) | ||
|
||
type PredicateMetadataFactory struct { | ||
podLister algorithm.PodLister | ||
} | ||
|
||
func NewPredicateMetadataFactory(podLister algorithm.PodLister) algorithm.MetadataProducer { | ||
factory := &PredicateMetadataFactory{ | ||
podLister, | ||
} | ||
return factory.GetMetadata | ||
} | ||
|
||
// GetMetadata returns the predicateMetadata used which will be used by various predicates. | ||
func (pfactory *PredicateMetadataFactory) GetMetadata(pod *api.Pod, nodeNameToInfoMap map[string]*schedulercache.NodeInfo) interface{} { | ||
// If we cannot compute metadata, just return nil | ||
if pod == nil { | ||
return nil | ||
} | ||
matchingTerms, err := getMatchingAntiAffinityTerms(pod, nodeNameToInfoMap) | ||
if err != nil { | ||
return nil | ||
} | ||
predicateMetadata := &predicateMetadata{ | ||
pod: pod, | ||
podBestEffort: isPodBestEffort(pod), | ||
podRequest: GetResourceRequest(pod), | ||
podPorts: GetUsedPorts(pod), | ||
matchingAntiAffinityTerms: matchingTerms, | ||
} | ||
for predicateName, precomputeFunc := range predicatePrecomputations { | ||
glog.V(4).Info("Precompute: %v", predicateName) | ||
precomputeFunc(predicateMetadata) | ||
} | ||
return predicateMetadata | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,6 +36,19 @@ import ( | |
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache" | ||
) | ||
|
||
// predicatePrecomputations: Helper types/variables... | ||
type PredicateMetadataModifier func(pm *predicateMetadata) | ||
|
||
var predicatePrecomputeRegisterLock sync.Mutex | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: can you please move it above predicatePrecomputations? For me it's always easier to see the lock before the variable it's locking. |
||
var predicatePrecomputations map[string]PredicateMetadataModifier = make(map[string]PredicateMetadataModifier) | ||
|
||
func RegisterPredicatePrecomputation(predicateName string, precomp PredicateMetadataModifier) { | ||
predicatePrecomputeRegisterLock.Lock() | ||
defer predicatePrecomputeRegisterLock.Unlock() | ||
predicatePrecomputations[predicateName] = precomp | ||
} | ||
|
||
// Other types for predicate functions... | ||
type NodeInfo interface { | ||
GetNodeInfo(nodeID string) (*api.Node, error) | ||
} | ||
|
@@ -67,34 +80,21 @@ func (c *CachedNodeInfo) GetNodeInfo(id string) (*api.Node, error) { | |
return node.(*api.Node), nil | ||
} | ||
|
||
// predicateMetadata is a type that is passed as metadata for predicate functions | ||
type predicateMetadata struct { | ||
podBestEffort bool | ||
podRequest *schedulercache.Resource | ||
podPorts map[int]bool | ||
matchingAntiAffinityTerms []matchingPodAntiAffinityTerm | ||
} | ||
|
||
// Note that predicateMetdata and matchingPodAntiAffinityTerm need to be declared in the same file | ||
// due to the way declarations are processed in predicate declaration unit tests. | ||
type matchingPodAntiAffinityTerm struct { | ||
term *api.PodAffinityTerm | ||
node *api.Node | ||
} | ||
|
||
func PredicateMetadata(pod *api.Pod, nodeInfoMap map[string]*schedulercache.NodeInfo) interface{} { | ||
// If we cannot compute metadata, just return nil | ||
if pod == nil { | ||
return nil | ||
} | ||
matchingTerms, err := getMatchingAntiAffinityTerms(pod, nodeInfoMap) | ||
if err != nil { | ||
return nil | ||
} | ||
return &predicateMetadata{ | ||
podBestEffort: isPodBestEffort(pod), | ||
podRequest: GetResourceRequest(pod), | ||
podPorts: GetUsedPorts(pod), | ||
matchingAntiAffinityTerms: matchingTerms, | ||
} | ||
type predicateMetadata struct { | ||
pod *api.Pod | ||
podBestEffort bool | ||
podRequest *schedulercache.Resource | ||
podPorts map[int]bool | ||
matchingAntiAffinityTerms []matchingPodAntiAffinityTerm | ||
serviceAffinityMatchingPodList []*api.Pod | ||
serviceAffinityMatchingPodServices []*api.Service | ||
} | ||
|
||
func isVolumeConflict(volume api.Volume, pod *api.Pod) bool { | ||
|
@@ -627,20 +627,42 @@ type ServiceAffinity struct { | |
labels []string | ||
} | ||
|
||
func NewServiceAffinityPredicate(podLister algorithm.PodLister, serviceLister algorithm.ServiceLister, nodeInfo NodeInfo, labels []string) algorithm.FitPredicate { | ||
// serviceAffinityPrecomputation should be run once by the scheduler before looping through the Predicate. It is a helper function that | ||
// only should be referenced by NewServiceAffinityPredicate. | ||
func (s *ServiceAffinity) serviceAffinityPrecomputation(pm *predicateMetadata) { | ||
if pm.pod == nil { | ||
glog.Errorf("Cannot precompute service affinity, a pod is required to caluculate service affinity.") | ||
return | ||
} | ||
|
||
var errSvc, errList error | ||
// Store services which match the pod. | ||
pm.serviceAffinityMatchingPodServices, errSvc = s.serviceLister.GetPodServices(pm.pod) | ||
selector := CreateSelectorFromLabels(pm.pod.Labels) | ||
// consider only the pods that belong to the same namespace | ||
allMatches, errList := s.podLister.List(selector) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: remove empty line |
||
// In the future maybe we will return them as part of the function. | ||
if errSvc != nil || errList != nil { | ||
glog.Errorf("Some Error were found while precomputing svc affinity: \nservices:%v , \npods:%v", errSvc, errList) | ||
} | ||
pm.serviceAffinityMatchingPodList = FilterPodsByNamespace(allMatches, pm.pod.Namespace) | ||
} | ||
|
||
func NewServiceAffinityPredicate(podLister algorithm.PodLister, serviceLister algorithm.ServiceLister, nodeInfo NodeInfo, labels []string) (algorithm.FitPredicate, PredicateMetadataModifier) { | ||
affinity := &ServiceAffinity{ | ||
podLister: podLister, | ||
serviceLister: serviceLister, | ||
nodeInfo: nodeInfo, | ||
labels: labels, | ||
} | ||
return affinity.CheckServiceAffinity | ||
return affinity.checkServiceAffinity, affinity.serviceAffinityPrecomputation | ||
} | ||
|
||
// The checkServiceAffinity predicate matches nodes in such a way to force that | ||
// ServiceAffinity.labels are homogenous for pods added to a node. | ||
// (i.e. it returns true IFF this pod can be added to this node, such | ||
// that all other pods in the same service are running on nodes w/ | ||
// checkServiceAffinity is a predicate which matches nodes in such a way to force that | ||
// ServiceAffinity.labels are homogenous for pods that are scheduled to a node. | ||
// (i.e. it returns true IFF this pod can be added to this node such that all other pods in | ||
// the same service are running on nodes with | ||
// the exact same ServiceAffinity.label values). | ||
// | ||
// Details: | ||
|
@@ -650,46 +672,47 @@ func NewServiceAffinityPredicate(podLister algorithm.PodLister, serviceLister al | |
// the match. | ||
// Otherwise: | ||
// Create an "implicit selector" which guarantees pods will land on nodes with similar values | ||
// for the affinity labels. | ||
// for the affinity labels. | ||
// | ||
// To do this, we "reverse engineer" a selector by introspecting existing pods running under the same service+namespace. | ||
// These backfilled labels in the selector "L" are defined like so: | ||
// - L is a label that the ServiceAffinity object needs as a matching constraints. | ||
// - L is not defined in the pod itself already. | ||
// - and SOME pod, from a service, in the same namespace, ALREADY scheduled onto a node, has a matching value. | ||
func (s *ServiceAffinity) CheckServiceAffinity(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { | ||
// | ||
// WARNING: This Predicate is NOT guaranteed to work if some of the predicateMetadata data isn't precomputed... | ||
// For that reason it is not exported, i.e. it is highly coupled to the implementation of the FitPredicate construction. | ||
func (s *ServiceAffinity) checkServiceAffinity(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { | ||
var services []*api.Service | ||
var pods []*api.Pod | ||
if pm, ok := meta.(*predicateMetadata); ok && (pm.serviceAffinityMatchingPodList != nil || pm.serviceAffinityMatchingPodServices != nil) { | ||
services = pm.serviceAffinityMatchingPodServices | ||
pods = pm.serviceAffinityMatchingPodList | ||
} else { | ||
// Make the predicate resilient in case metadata is missing. | ||
pm = &predicateMetadata{pod: pod} | ||
s.serviceAffinityPrecomputation(pm) | ||
pods, services = pm.serviceAffinityMatchingPodList, pm.serviceAffinityMatchingPodServices | ||
} | ||
node := nodeInfo.Node() | ||
if node == nil { | ||
return false, nil, fmt.Errorf("node not found") | ||
} | ||
|
||
// check if the pod being scheduled has the affinity labels specified in its NodeSelector | ||
affinityLabels := FindLabelsInSet(s.labels, labels.Set(pod.Spec.NodeSelector)) | ||
|
||
// Introspect services IFF we didn't predefine all the affinity labels in the pod itself. | ||
// Step 1: If we don't have all constraints, introspect nodes to find the missing constraints. | ||
if len(s.labels) > len(affinityLabels) { | ||
services, err := s.serviceLister.GetPodServices(pod) | ||
if err == nil && len(services) > 0 { | ||
// just use the first service and get the other pods within the service | ||
// TODO: a separate predicate can be created that tries to handle all services for the pod | ||
selector := labels.SelectorFromSet(services[0].Spec.Selector) | ||
servicePods, err := s.podLister.List(selector) | ||
if err != nil { | ||
return false, nil, err | ||
} | ||
// consider only the pods that belong to the same namespace | ||
nsServicePods := FilterPodsByNamespace(servicePods, pod.Namespace) | ||
if len(nsServicePods) > 0 { | ||
// consider any service pod and fetch the node its hosted on | ||
otherNode, err := s.nodeInfo.GetNodeInfo(nsServicePods[0].Spec.NodeName) | ||
if len(services) > 0 { | ||
if len(pods) > 0 { | ||
nodeWithAffinityLabels, err := s.nodeInfo.GetNodeInfo(pods[0].Spec.NodeName) | ||
if err != nil { | ||
return false, nil, err | ||
} | ||
AddUnsetLabelsToMap(affinityLabels, s.labels, labels.Set(otherNode.Labels)) | ||
AddUnsetLabelsToMap(affinityLabels, s.labels, labels.Set(nodeWithAffinityLabels.Labels)) | ||
} | ||
} | ||
} | ||
|
||
// check if the node matches the selector | ||
// Step 2: Finally complete the affinity predicate based on whatever set of predicates we were able to find. | ||
if CreateSelectorFromLabels(affinityLabels).Matches(labels.Set(node.Labels)) { | ||
return true, nil, nil | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -49,7 +49,11 @@ func ExampleFindLabelsInSet() { | |
}, | ||
}, | ||
|
||
{}, // a third pod which will have no effect on anything. | ||
{ | ||
ObjectMeta: api.ObjectMeta{ | ||
Name: "pod3ThatWeWontSee", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This modification makes the unit test a little more robust for debugging, in terms of the expected output. |
||
}, | ||
}, | ||
} | ||
fmt.Println(FindLabelsInSet([]string{"label1", "label2", "label3"}, nsPods[0].ObjectMeta.Labels)["label3"]) | ||
AddUnsetLabelsToMap(labelSubset, []string{"label1", "label2", "label3"}, nsPods[0].ObjectMeta.Labels) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This makes the if err == nil check which woj suggested less noisy,and more importantly makes it so semantics conform to the other GetPodsServices tests.