Optimize priorities in scheduler #28769

Merged
merged 5 commits on Jul 11, 2016
8 changes: 4 additions & 4 deletions pkg/api/helpers.go
@@ -470,7 +470,7 @@ func GetTaintsFromNodeAnnotations(annotations map[string]string) ([]Taint, error
}

// TolerationToleratesTaint checks if the toleration tolerates the taint.
func TolerationToleratesTaint(toleration Toleration, taint Taint) bool {
func TolerationToleratesTaint(toleration *Toleration, taint *Taint) bool {
if len(toleration.Effect) != 0 && toleration.Effect != taint.Effect {
return false
}
@@ -490,10 +490,10 @@ func TolerationToleratesTaint(toleration Toleration, taint Taint) bool {
}

// TaintToleratedByTolerations checks if taint is tolerated by any of the tolerations.
func TaintToleratedByTolerations(taint Taint, tolerations []Toleration) bool {
func TaintToleratedByTolerations(taint *Taint, tolerations []Toleration) bool {
tolerated := false
for _, toleration := range tolerations {
if TolerationToleratesTaint(toleration, taint) {
for i := range tolerations {
if TolerationToleratesTaint(&tolerations[i], taint) {
tolerated = true
break
}
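The helpers.go hunk above is a pure copy-avoidance change: Toleration and Taint are now passed by pointer, and the caller ranges over the slice by index so elements are not copied on every iteration. A minimal sketch of the same pattern, with stand-in types and deliberately simplified matching rules (not the real api package):

```go
package main

import "fmt"

// Taint and Toleration are stand-ins for the api types; fields and matching
// rules are simplified for illustration.
type Taint struct {
	Key, Value, Effect string
}

type Toleration struct {
	Key, Value, Effect string
}

// toleratesTaint takes pointers, so no struct is copied per call.
func toleratesTaint(toleration *Toleration, taint *Taint) bool {
	if len(toleration.Effect) != 0 && toleration.Effect != taint.Effect {
		return false
	}
	return toleration.Key == taint.Key && toleration.Value == taint.Value
}

// taintTolerated ranges by index and passes &tolerations[i] instead of using
// `for _, t := range tolerations`, which would copy every element.
func taintTolerated(taint *Taint, tolerations []Toleration) bool {
	for i := range tolerations {
		if toleratesTaint(&tolerations[i], taint) {
			return true
		}
	}
	return false
}

func main() {
	taint := Taint{Key: "dedicated", Value: "gpu", Effect: "NoSchedule"}
	tolerations := []Toleration{{Key: "dedicated", Value: "gpu", Effect: "NoSchedule"}}
	fmt.Println(taintTolerated(&taint, tolerations)) // true
}
```

The observable behavior is unchanged; only the per-call copy cost goes away.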
60 changes: 34 additions & 26 deletions plugin/pkg/scheduler/algorithm/predicates/predicates.go
@@ -65,10 +65,10 @@ func (c *CachedNodeInfo) GetNodeInfo(id string) (*api.Node, error) {
return node.(*api.Node), nil
}

// podMetadata defines a type, that is an expected type that is passed
// as metadata for predicate functions
// podMetadata is a type that is passed as metadata for predicate functions
type predicateMetadata struct {
podBestEffort bool
podRequest *resourceRequest
}

func PredicateMetadata(pod *api.Pod) interface{} {
@@ -78,6 +78,7 @@ func PredicateMetadata(pod *api.Pod) interface{} {
}
return &predicateMetadata{
podBestEffort: isPodBestEffort(pod),
podRequest: getResourceRequest(pod),
}
}
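The PredicateMetadata change above precomputes the pod's resource request once per scheduling cycle so that PodFitsResources (further down) does not recompute it for every node. A rough sketch of the pattern with simplified stand-in types (Pod and resourceRequest here are not the scheduler's real types):

```go
package main

import "fmt"

type Pod struct{ CPUMilli, MemoryBytes int64 }

type resourceRequest struct{ milliCPU, memory int64 }

// getResourceRequest is the per-pod computation we only want to do once.
func getResourceRequest(pod *Pod) *resourceRequest {
	return &resourceRequest{milliCPU: pod.CPUMilli, memory: pod.MemoryBytes}
}

type predicateMetadata struct {
	podRequest *resourceRequest
}

// PredicateMetadata is computed once per pod and handed to every predicate
// call through the opaque meta argument.
func PredicateMetadata(pod *Pod) interface{} {
	return &predicateMetadata{podRequest: getResourceRequest(pod)}
}

// podFitsResources reuses the precomputed request when it can, and falls back
// to recomputing it when the metadata is missing or of an unexpected type.
func podFitsResources(pod *Pod, meta interface{}, freeCPU, freeMem int64) bool {
	var req *resourceRequest
	if m, ok := meta.(*predicateMetadata); ok {
		req = m.podRequest
	} else {
		req = getResourceRequest(pod) // metadata could not be parsed
	}
	return req.milliCPU <= freeCPU && req.memory <= freeMem
}

func main() {
	pod := &Pod{CPUMilli: 500, MemoryBytes: 1 << 20}
	meta := PredicateMetadata(pod)
	fmt.Println(podFitsResources(pod, meta, 1000, 1<<21)) // true
}
```

The type-assertion fallback matters because predicates can also be invoked with nil or foreign metadata, as the PodFitsResources hunk below handles explicitly.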

@@ -405,7 +406,7 @@ type resourceRequest struct {
nvidiaGPU int64
}

func getResourceRequest(pod *api.Pod) resourceRequest {
func getResourceRequest(pod *api.Pod) *resourceRequest {
result := resourceRequest{}
for _, container := range pod.Spec.Containers {
requests := container.Resources.Requests
@@ -423,7 +424,7 @@ func getResourceRequest(pod *api.Pod) resourceRequest {
result.milliCPU = cpu
}
}
return result
return &result
}

func CheckPodsExceedingFreeResources(pods []*api.Pod, allocatable api.ResourceList) (fitting []*api.Pod, notFittingCPU, notFittingMemory, notFittingNvidiaGPU []*api.Pod) {
@@ -471,17 +472,25 @@ func PodFitsResources(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.N
if node == nil {
return false, fmt.Errorf("node not found")
}
allocatable := node.Status.Allocatable
allowedPodNumber := allocatable.Pods().Value()
if int64(len(nodeInfo.Pods()))+1 > allowedPodNumber {
allowedPodNumber := nodeInfo.AllowedPodNumber()
if len(nodeInfo.Pods())+1 > allowedPodNumber {
return false,
newInsufficientResourceError(podCountResourceName, 1, int64(len(nodeInfo.Pods())), allowedPodNumber)
newInsufficientResourceError(podCountResourceName, 1, int64(len(nodeInfo.Pods())), int64(allowedPodNumber))
}

var podRequest *resourceRequest
predicateMeta, ok := meta.(*predicateMetadata)
if ok {
podRequest = predicateMeta.podRequest
} else {
// We couldn't parse metadata - fallback to computing it.
podRequest = getResourceRequest(pod)
}
podRequest := getResourceRequest(pod)
if podRequest.milliCPU == 0 && podRequest.memory == 0 && podRequest.nvidiaGPU == 0 {
return true, nil
}

allocatable := node.Status.Allocatable
totalMilliCPU := allocatable.Cpu().MilliValue()
totalMemory := allocatable.Memory().Value()
totalNvidiaGPU := allocatable.NvidiaGPU().Value()
@@ -498,8 +507,12 @@ func PodFitsResources(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.N
return false,
newInsufficientResourceError(nvidiaGpuResourceName, podRequest.nvidiaGPU, nodeInfo.RequestedResource().NvidiaGPU, totalNvidiaGPU)
}
glog.V(10).Infof("Schedule Pod %+v on Node %+v is allowed, Node is running only %v out of %v Pods.",
podName(pod), node.Name, len(nodeInfo.Pods()), allowedPodNumber)
if glog.V(10) {
// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
// not logged. There is visible performance gain from it.
glog.Infof("Schedule Pod %+v on Node %+v is allowed, Node is running only %v out of %v Pods.",
podName(pod), node.Name, len(nodeInfo.Pods()), allowedPodNumber)
}
return true, nil
}
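The logging change in PodFitsResources is about argument evaluation: glog.V(10).Infof(...) formats and evaluates its arguments even when verbosity is below 10, because Go always evaluates call arguments; only the write is suppressed. Wrapping the call in an explicit if glog.V(10) check skips building the arguments entirely. A small standalone sketch of the guard (expensiveSummary is a hypothetical stand-in for the real formatting work):

```go
package main

import (
	"flag"

	"github.com/golang/glog"
)

// expensiveSummary stands in for work such as walking nodeInfo.Pods() that we
// only want to pay for when the log line will actually be emitted.
func expensiveSummary() string {
	return "lots of detail"
}

func main() {
	flag.Parse() // glog registers its flags (-v, -logtostderr, ...) on the default flag set

	// Arguments are built unconditionally, even when -v is below 10:
	glog.V(10).Infof("details: %s", expensiveSummary())

	// Arguments are built only when verbosity is actually enabled:
	if glog.V(10) {
		glog.Infof("details: %s", expensiveSummary())
	}
	glog.Flush()
}
```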

@@ -758,8 +771,10 @@ func getUsedPorts(pods ...*api.Pod) map[int]bool {
// TODO: Aggregate it at the NodeInfo level.
ports := make(map[int]bool)
for _, pod := range pods {
for _, container := range pod.Spec.Containers {
for _, podPort := range container.Ports {
for j := range pod.Spec.Containers {
container := &pod.Spec.Containers[j]
for k := range container.Ports {
podPort := &container.Ports[k]
// "0" is explicitly ignored in PodFitsHostPorts,
// which is the only function that uses this value.
if podPort.HostPort != 0 {
@@ -999,19 +1014,11 @@ func (checker *PodAffinityChecker) NodeMatchPodAffinityAntiAffinity(pod *api.Pod
return true
}

type TolerationMatch struct {
info NodeInfo
}

func NewTolerationMatchPredicate(info NodeInfo) algorithm.FitPredicate {
tolerationMatch := &TolerationMatch{
info: info,
}
return tolerationMatch.PodToleratesNodeTaints
}

func (t *TolerationMatch) PodToleratesNodeTaints(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, error) {
func PodToleratesNodeTaints(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, error) {
node := nodeInfo.Node()
if node == nil {
return false, fmt.Errorf("node not found")
}

taints, err := api.GetTaintsFromNodeAnnotations(node.Annotations)
if err != nil {
@@ -1040,7 +1047,8 @@ func tolerationsToleratesTaints(tolerations []api.Toleration, taints []api.Taint
return false
}

for _, taint := range taints {
for i := range taints {
taint := &taints[i]
// skip taints that have effect PreferNoSchedule, since it is for priorities
if taint.Effect == api.TaintEffectPreferNoSchedule {
continue
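The same copy-avoidance idea drives the loop rewrites in getUsedPorts and tolerationsToleratesTaints: ranging by value copies each Container, Port, or Taint element, while ranging by index works on the slice element in place. A hedged benchmark sketch with made-up struct sizes (not the real api types) that could be dropped into a _test.go file to see the effect:

```go
package rangecopy

import "testing"

// container is a hypothetical struct, padded to be non-trivial in size,
// roughly in the spirit of api.Container.
type container struct {
	Name  string
	Ports [8]struct{ HostPort, ContainerPort int32 }
	Env   [16]struct{ Name, Value string }
}

var containers = make([]container, 64)

func BenchmarkRangeByValue(b *testing.B) {
	var sum int32
	for i := 0; i < b.N; i++ {
		for _, c := range containers { // copies each element into c
			sum += c.Ports[0].HostPort
		}
	}
	_ = sum
}

func BenchmarkRangeByIndex(b *testing.B) {
	var sum int32
	for i := 0; i < b.N; i++ {
		for j := range containers { // no element copy
			sum += containers[j].Ports[0].HostPort
		}
	}
	_ = sum
}
```

Running `go test -bench=RangeBy` compares the two; the gap grows with element size.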
3 changes: 1 addition & 2 deletions plugin/pkg/scheduler/algorithm/predicates/predicates_test.go
@@ -2724,10 +2724,9 @@ func TestPodToleratesTaints(t *testing.T) {
}

for _, test := range podTolerateTaintsTests {
tolerationMatch := TolerationMatch{FakeNodeInfo(test.node)}
nodeInfo := schedulercache.NewNodeInfo()
nodeInfo.SetNode(&test.node)
fits, err := tolerationMatch.PodToleratesNodeTaints(test.pod, PredicateMetadata(test.pod), nodeInfo)
fits, err := PodToleratesNodeTaints(test.pod, PredicateMetadata(test.pod), nodeInfo)
if fits == false && !reflect.DeepEqual(err, ErrTaintsTolerationsNotMatch) {
t.Errorf("%s, unexpected error: %v", test.test, err)
}
36 changes: 15 additions & 21 deletions plugin/pkg/scheduler/algorithm/priorities/node_affinity.go
@@ -25,31 +25,20 @@ import (
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)

type NodeAffinity struct {
nodeLister algorithm.NodeLister
}

func NewNodeAffinityPriority(nodeLister algorithm.NodeLister) algorithm.PriorityFunction {
nodeAffinity := &NodeAffinity{
nodeLister: nodeLister,
}
return nodeAffinity.CalculateNodeAffinityPriority
}

// CalculateNodeAffinityPriority prioritizes nodes according to the node affinity scheduling preferences
// indicated in PreferredDuringSchedulingIgnoredDuringExecution. Each time a node matches a preferredSchedulingTerm,
// it gets an addition of preferredSchedulingTerm.Weight. Thus, the more preferredSchedulingTerms
// the node satisfies, and the higher the weights of the satisfied terms, the higher the
// score the node gets.
func (s *NodeAffinity) CalculateNodeAffinityPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
var maxCount int
counts := map[string]int{}

func CalculateNodeAffinityPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
nodes, err := nodeLister.List()
if err != nil {
return nil, err
}

var maxCount float64
counts := make(map[string]float64, len(nodes.Items))

affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations)
if err != nil {
return nil, err
@@ -72,7 +61,7 @@ func (s *NodeAffinity) CalculateNodeAffinityPriority(pod *api.Pod, nodeNameToInf

for _, node := range nodes.Items {
if nodeSelector.Matches(labels.Set(node.Labels)) {
counts[node.Name] += int(preferredSchedulingTerm.Weight)
counts[node.Name] += float64(preferredSchedulingTerm.Weight)
}

if counts[node.Name] > maxCount {
@@ -82,15 +71,20 @@ func (s *NodeAffinity) CalculateNodeAffinityPriority(pod *api.Pod, nodeNameToInf
}
}

result := []schedulerapi.HostPriority{}
result := make(schedulerapi.HostPriorityList, 0, len(nodes.Items))
for i := range nodes.Items {
node := &nodes.Items[i]
fScore := float64(0)
if maxCount > 0 {
fScore = 10 * (float64(counts[node.Name]) / float64(maxCount))
fScore := 10 * (counts[node.Name] / maxCount)
result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
if glog.V(10) {
// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
// not logged. There is visible performance gain from it.
glog.Infof("%v -> %v: NodeAffinityPriority, Score: (%d)", pod.Name, node.Name, int(fScore))
}
} else {
result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: 0})
}
result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
glog.V(10).Infof("%v -> %v: NodeAffinityPriority, Score: (%d)", pod.Name, node.Name, int(fScore))
}
return result, nil
}
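To summarize the new node affinity scoring above: the per-node count is the sum of the weights of the preferred scheduling terms the node matches, and the final 0-10 score is that count scaled by the best node's count (or 0 when no node matches anything). A small sketch of just the normalization step, with made-up node names and weights:

```go
package main

import "fmt"

// affinityScores mirrors the normalization in the diff above: counts are the
// summed weights of matched preferred terms, scaled to 0-10 by the max count.
func affinityScores(counts map[string]float64, maxCount float64) map[string]int {
	scores := make(map[string]int, len(counts))
	for node, c := range counts {
		if maxCount > 0 {
			scores[node] = int(10 * (c / maxCount))
		} else {
			scores[node] = 0
		}
	}
	return scores
}

func main() {
	// node-a matches terms with weights 2+3, node-b only weight 2, node-c none.
	counts := map[string]float64{"node-a": 5, "node-b": 2, "node-c": 0}
	fmt.Println(affinityScores(counts, 5)) // map[node-a:10 node-b:4 node-c:0]
}
```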
@@ -156,8 +156,7 @@ func TestNodeAffinityPriority(t *testing.T) {
}

for _, test := range tests {
nodeAffinity := NodeAffinity{nodeLister: algorithm.FakeNodeLister(api.NodeList{Items: test.nodes})}
list, err := nodeAffinity.CalculateNodeAffinityPriority(test.pod, schedulercache.CreateNodeNameToInfoMap(nil), algorithm.FakeNodeLister(api.NodeList{Items: test.nodes}))
list, err := CalculateNodeAffinityPriority(test.pod, schedulercache.CreateNodeNameToInfoMap(nil), algorithm.FakeNodeLister(api.NodeList{Items: test.nodes}))
if err != nil {
t.Errorf("unexpected error: %v", err)
}
51 changes: 22 additions & 29 deletions plugin/pkg/scheduler/algorithm/priorities/taint_toleration.go
@@ -24,22 +24,10 @@ import (
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)

// NodeTaints hold the node lister
type TaintToleration struct {
nodeLister algorithm.NodeLister
}

// NewTaintTolerationPriority
func NewTaintTolerationPriority(nodeLister algorithm.NodeLister) algorithm.PriorityFunction {
taintToleration := &TaintToleration{
nodeLister: nodeLister,
}
return taintToleration.ComputeTaintTolerationPriority
}

// CountIntolerableTaintsPreferNoSchedule gives the count of intolerable taints of a pod with effect PreferNoSchedule
func countIntolerableTaintsPreferNoSchedule(taints []api.Taint, tolerations []api.Toleration) (intolerableTaints int) {
for _, taint := range taints {
func countIntolerableTaintsPreferNoSchedule(taints []api.Taint, tolerations []api.Toleration) (intolerableTaints float64) {
for i := range taints {
taint := &taints[i]
// check only on taints that have effect PreferNoSchedule
if taint.Effect != api.TaintEffectPreferNoSchedule {
continue
@@ -54,27 +42,27 @@ func countIntolerableTaintsPreferNoSchedule(taints []api.Taint, tolerations []ap

// getAllTolerationEffectPreferNoSchedule gets the list of all Toleration with Effect PreferNoSchedule
func getAllTolerationPreferNoSchedule(tolerations []api.Toleration) (tolerationList []api.Toleration) {
for _, toleration := range tolerations {
for i := range tolerations {
toleration := &tolerations[i]
if len(toleration.Effect) == 0 || toleration.Effect == api.TaintEffectPreferNoSchedule {
tolerationList = append(tolerationList, toleration)
tolerationList = append(tolerationList, *toleration)
}
}
return
}

// ComputeTaintTolerationPriority prepares the priority list for all the nodes based on the number of intolerable taints on the node
func (s *TaintToleration) ComputeTaintTolerationPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
// counts hold the count of intolerable taints of a pod for a given node
counts := make(map[string]int)

// the max value of counts
var maxCount int

func ComputeTaintTolerationPriority(pod *api.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
nodes, err := nodeLister.List()
if err != nil {
return nil, err
}

// the max value of counts
var maxCount float64
// counts hold the count of intolerable taints of a pod for a given node
counts := make(map[string]float64, len(nodes.Items))

tolerations, err := api.GetTolerationsFromPodAnnotations(pod.Annotations)
if err != nil {
return nil, err
@@ -99,14 +87,19 @@ func (s *TaintToleration) ComputeTaintTolerationPriority(pod *api.Pod, nodeNameT

// The maximum priority value to give to a node
// Priority values range from 0 - maxPriority
const maxPriority = 10
const maxPriority = float64(10)
result := make(schedulerapi.HostPriorityList, 0, len(nodes.Items))
for _, node := range nodes.Items {
fScore := float64(maxPriority)
for i := range nodes.Items {
node := &nodes.Items[i]
fScore := maxPriority
if maxCount > 0 {
fScore = (1.0 - float64(counts[node.Name])/float64(maxCount)) * 10
fScore = (1.0 - counts[node.Name]/maxCount) * 10
}
if glog.V(10) {
// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
// not logged. There is visible performance gain from it.
glog.Infof("%v -> %v: Taint Toleration Priority, Score: (%d)", pod.Name, node.Name, int(fScore))
}
glog.V(10).Infof("%v -> %v: Taint Toleration Priority, Score: (%d)", pod.Name, node.Name, int(fScore))

result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
}
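To make the taint toleration scoring concrete: with maxCount being the largest number of intolerable PreferNoSchedule taints on any node, the worst node scores (1 - maxCount/maxCount) * 10 = 0, a node with no intolerable taints scores 10, and, for example, one intolerable taint out of a maximum of three gives (1 - 1/3) * 10 ≈ 6.67, truncated to 6 by the int conversion. When no node has intolerable taints at all, every node keeps the default maxPriority of 10.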
@@ -212,8 +212,7 @@ func TestTaintAndToleration(t *testing.T) {
}
for _, test := range tests {
nodeNameToInfo := schedulercache.CreateNodeNameToInfoMap([]*api.Pod{{}})
taintToleration := TaintToleration{nodeLister: algorithm.FakeNodeLister(api.NodeList{Items: test.nodes})}
list, err := taintToleration.ComputeTaintTolerationPriority(
list, err := ComputeTaintTolerationPriority(
test.pod,
nodeNameToInfo,
algorithm.FakeNodeLister(api.NodeList{Items: test.nodes}))
27 changes: 3 additions & 24 deletions plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go
@@ -147,12 +147,7 @@ func defaultPredicates() sets.String {
factory.RegisterFitPredicate("GeneralPredicates", predicates.GeneralPredicates),

// Fit is determined based on whether a pod can tolerate all of the node's taints
factory.RegisterFitPredicateFactory(
"PodToleratesNodeTaints",
func(args factory.PluginFactoryArgs) algorithm.FitPredicate {
return predicates.NewTolerationMatchPredicate(args.NodeInfo)
},
),
factory.RegisterFitPredicate("PodToleratesNodeTaints", predicates.PodToleratesNodeTaints),

// Fit is determined by node memory pressure condition.
factory.RegisterFitPredicate("CheckNodeMemoryPressure", predicates.CheckNodeMemoryPressurePredicate),
@@ -175,23 +170,7 @@ func defaultPriorities() sets.String {
Weight: 1,
},
),
factory.RegisterPriorityConfigFactory(
"NodeAffinityPriority",
factory.PriorityConfigFactory{
Function: func(args factory.PluginFactoryArgs) algorithm.PriorityFunction {
return priorities.NewNodeAffinityPriority(args.NodeLister)
},
Weight: 1,
},
),
factory.RegisterPriorityConfigFactory(
"TaintTolerationPriority",
factory.PriorityConfigFactory{
Function: func(args factory.PluginFactoryArgs) algorithm.PriorityFunction {
return priorities.NewTaintTolerationPriority(args.NodeLister)
},
Weight: 1,
},
),
factory.RegisterPriorityFunction("NodeAffinityPriority", priorities.CalculateNodeAffinityPriority, 1),
factory.RegisterPriorityFunction("TaintTolerationPriority", priorities.ComputeTaintTolerationPriority, 1),
)
}
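Because PodToleratesNodeTaints, CalculateNodeAffinityPriority, and ComputeTaintTolerationPriority no longer capture a NodeInfo or NodeLister at construction time, defaults.go can register them directly with factory.RegisterFitPredicate and factory.RegisterPriorityFunction instead of going through per-plugin factory closures; the node data they need now arrives through the nodeInfo and nodeLister arguments at call time. The same simplification is what lets the tests above call these functions directly rather than constructing TolerationMatch, NodeAffinity, or TaintToleration wrappers.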
3 changes: 2 additions & 1 deletion plugin/pkg/scheduler/factory/factory.go
@@ -433,7 +433,8 @@ func (f *ConfigFactory) responsibleForPod(pod *api.Pod) bool {

func getNodeConditionPredicate() cache.NodeConditionPredicate {
return func(node *api.Node) bool {
for _, cond := range node.Status.Conditions {
for i := range node.Status.Conditions {
cond := &node.Status.Conditions[i]
// We consider the node for scheduling only when its:
// - NodeReady condition status is ConditionTrue,
// - NodeOutOfDisk condition status is ConditionFalse,