Merge pull request #35932 from jayunit100/sched_events_spam_reduce
Automatic merge from submit-queue

Reduce spam in Events from the scheduler by counter aggregation of failure reasons

Fixes #35842
Part of overall #35555
Kubernetes Submit Queue committed Nov 7, 2016
2 parents 9534c4f + 5d5bc67 commit f715b26
Showing 2 changed files with 62 additions and 34 deletions.
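
As context for the diffs below: FitError.Error() previously printed one "fit failure on node (...)" line for every node the pod failed to fit on, so a pod that fit nowhere in a large cluster generated hundreds of nearly identical event lines. With this change, identical failure reasons are counted across nodes and reported once. With hypothetical node names and reason strings, the difference looks roughly like this:

Before (one line per failing node):
pod (bar) failed to fit in any node
fit failure on node (machine0): Insufficient cpu, Insufficient memory
fit failure on node (machine1): Insufficient cpu, Insufficient memory
... repeated for every node ...

After (a single aggregated summary):
pod (bar) failed to fit in any node
fit failure summary on nodes : Insufficient cpu (100), Insufficient memory (100)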
plugin/pkg/scheduler/generic_scheduler.go (15 additions, 5 deletions)
@@ -49,14 +49,24 @@ var ErrNoNodesAvailable = fmt.Errorf("no nodes available to schedule pods")
 func (f *FitError) Error() string {
 	var buf bytes.Buffer
 	buf.WriteString(fmt.Sprintf("pod (%s) failed to fit in any node\n", f.Pod.Name))
-	for node, predicates := range f.FailedPredicates {
-		reasons := make([]string, 0)
+	reasons := make(map[string]int)
+	for _, predicates := range f.FailedPredicates {
 		for _, pred := range predicates {
-			reasons = append(reasons, pred.GetReason())
+			reasons[pred.GetReason()] += 1
 		}
-		reasonMsg := fmt.Sprintf("fit failure on node (%s): %s\n", node, strings.Join(reasons, ", "))
-		buf.WriteString(reasonMsg)
 	}
+
+	sortReasonsHistogram := func() []string {
+		reasonStrings := []string{}
+		for k, v := range reasons {
+			reasonStrings = append(reasonStrings, fmt.Sprintf("%v (%v)", k, v))
+		}
+		sort.Strings(reasonStrings)
+		return reasonStrings
+	}
+
+	reasonMsg := fmt.Sprintf("fit failure summary on nodes : %v", strings.Join(sortReasonsHistogram(), ", "))
+	buf.WriteString(reasonMsg)
 	return buf.String()
 }

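To see the aggregation in isolation, here is a minimal, self-contained Go sketch of the same counting idea outside the scheduler. The node names, reason strings, and the 100-node count are illustrative (mirroring the test below); this is not the scheduler's actual code path.

package main

import (
	"fmt"
	"sort"
	"strings"
)

func main() {
	// Hypothetical per-node predicate failures: 100 nodes, each rejecting the pod for the same two reasons.
	failedPredicates := map[string][]string{}
	for i := 0; i < 100; i++ {
		node := fmt.Sprintf("machine%v", i)
		failedPredicates[node] = []string{"Insufficient cpu", "Insufficient memory"}
	}

	// Count each distinct reason across all nodes instead of listing every node separately.
	reasons := map[string]int{}
	for _, preds := range failedPredicates {
		for _, reason := range preds {
			reasons[reason]++
		}
	}

	// Render the histogram as "reason (count)" entries, sorted so the output is deterministic.
	entries := []string{}
	for reason, count := range reasons {
		entries = append(entries, fmt.Sprintf("%v (%v)", reason, count))
	}
	sort.Strings(entries)

	fmt.Println("fit failure summary on nodes :", strings.Join(entries, ", "))
	// Prints: fit failure summary on nodes : Insufficient cpu (100), Insufficient memory (100)
}

The sort matters because Go randomizes map iteration order; sorting the rendered entries keeps the summary stable from run to run, which is also why the real change sorts its reason strings before joining them.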
plugin/pkg/scheduler/scheduler_test.go (47 additions, 29 deletions)
@@ -18,6 +18,7 @@ package scheduler

 import (
 	"errors"
+	"fmt"
 	"reflect"
 	"testing"
 	"time"
@@ -331,49 +332,66 @@ func TestSchedulerFailedSchedulingReasons(t *testing.T) {
 	defer close(stop)
 	queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc)
 	scache := schedulercache.New(10*time.Minute, stop)
-	node := api.Node{
-		ObjectMeta: api.ObjectMeta{Name: "machine1"},
-		Status: api.NodeStatus{
-			Capacity: api.ResourceList{
-				api.ResourceCPU:    *(resource.NewQuantity(2, resource.DecimalSI)),
-				api.ResourceMemory: *(resource.NewQuantity(100, resource.DecimalSI)),
-				api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
-			},
-			Allocatable: api.ResourceList{
-				api.ResourceCPU:    *(resource.NewQuantity(2, resource.DecimalSI)),
-				api.ResourceMemory: *(resource.NewQuantity(100, resource.DecimalSI)),
-				api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
-			}},
+
+	// Design the baseline for the pods, and we will make nodes that dont fit it later.
+	var cpu = int64(4)
+	var mem = int64(500)
+	podWithTooBigResourceRequests := podWithResources("bar", "", api.ResourceList{
+		api.ResourceCPU:    *(resource.NewQuantity(cpu, resource.DecimalSI)),
+		api.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)),
+	}, api.ResourceList{
+		api.ResourceCPU:    *(resource.NewQuantity(cpu, resource.DecimalSI)),
+		api.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)),
+	})
+
+	// create several nodes which cannot schedule the above pod
+	nodes := []*api.Node{}
+	for i := 0; i < 100; i++ {
+		node := api.Node{
+			ObjectMeta: api.ObjectMeta{Name: fmt.Sprintf("machine%v", i)},
+			Status: api.NodeStatus{
+				Capacity: api.ResourceList{
+					api.ResourceCPU:    *(resource.NewQuantity(cpu/2, resource.DecimalSI)),
+					api.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)),
+					api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
+				},
+				Allocatable: api.ResourceList{
+					api.ResourceCPU:    *(resource.NewQuantity(cpu/2, resource.DecimalSI)),
+					api.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)),
+					api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
+				}},
+		}
+		scache.AddNode(&node)
+		nodes = append(nodes, &node)
 	}
-	scache.AddNode(&node)
-	nodeLister := algorithm.FakeNodeLister([]*api.Node{&node})
+	nodeLister := algorithm.FakeNodeLister(nodes)
 	predicateMap := map[string]algorithm.FitPredicate{
 		"PodFitsResources": predicates.PodFitsResources,
 	}
-
+	// Create expected failure reasons for all the nodes. Hopefully they will get rolled up into a non-spammy summary.
+	failedPredicatesMap := FailedPredicateMap{}
+	for _, node := range nodes {
+		failedPredicatesMap[node.Name] = []algorithm.PredicateFailureReason{
+			predicates.NewInsufficientResourceError(api.ResourceCPU, 4000, 0, 2000),
+			predicates.NewInsufficientResourceError(api.ResourceMemory, 500, 0, 100),
+		}
+	}
 	scheduler, _, errChan := setupTestScheduler(queuedPodStore, scache, nodeLister, predicateMap)
 
-	podWithTooBigResourceRequests := podWithResources("bar", "", api.ResourceList{
-		api.ResourceCPU:    *(resource.NewQuantity(4, resource.DecimalSI)),
-		api.ResourceMemory: *(resource.NewQuantity(500, resource.DecimalSI)),
-	}, api.ResourceList{
-		api.ResourceCPU:    *(resource.NewQuantity(4, resource.DecimalSI)),
-		api.ResourceMemory: *(resource.NewQuantity(500, resource.DecimalSI)),
-	})
 	queuedPodStore.Add(podWithTooBigResourceRequests)
 	scheduler.scheduleOne()
 
 	select {
 	case err := <-errChan:
 		expectErr := &FitError{
-			Pod: podWithTooBigResourceRequests,
-			FailedPredicates: FailedPredicateMap{node.Name: []algorithm.PredicateFailureReason{
-				predicates.NewInsufficientResourceError(api.ResourceCPU, 4000, 0, 2000),
-				predicates.NewInsufficientResourceError(api.ResourceMemory, 500, 0, 100),
-			}},
+			Pod:              podWithTooBigResourceRequests,
+			FailedPredicates: failedPredicatesMap,
 		}
+		if len(fmt.Sprint(expectErr)) > 150 {
+			t.Errorf("message is too spammy ! %v ", len(fmt.Sprint(expectErr)))
+		}
 		if !reflect.DeepEqual(expectErr, err) {
-			t.Errorf("err want=%+v, get=%+v", expectErr, err)
+			t.Errorf("\n err \nWANT=%+v,\nGOT=%+v", expectErr, err)
 		}
 	case <-time.After(wait.ForeverTestTimeout):
 		t.Fatalf("timeout after %v", wait.ForeverTestTimeout)
