Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

daemonset: differentiate between cases in nodeShouldRun #38787

Merged
merged 2 commits into from Jan 11, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
108 changes: 90 additions & 18 deletions pkg/controller/daemon/daemoncontroller.go
Expand Up @@ -387,8 +387,11 @@ func (dsc *DaemonSetsController) addNode(obj interface{}) {
node := obj.(*v1.Node)
for i := range dsList.Items {
ds := &dsList.Items[i]
shouldEnqueue := dsc.nodeShouldRunDaemonPod(node, ds)
if shouldEnqueue {
_, shouldSchedule, _, err := dsc.nodeShouldRunDaemonPod(node, ds)
if err != nil {
continue
}
if shouldSchedule {
dsc.enqueueDaemonSet(ds)
}
}
Expand All @@ -406,14 +409,21 @@ func (dsc *DaemonSetsController) updateNode(old, cur interface{}) {
glog.V(4).Infof("Error enqueueing daemon sets: %v", err)
return
}
// TODO: it'd be nice to pass a hint with these enqueues, so that each ds would only examine the added node (unless it has other work to do, too).
for i := range dsList.Items {
ds := &dsList.Items[i]
shouldEnqueue := (dsc.nodeShouldRunDaemonPod(oldNode, ds) != dsc.nodeShouldRunDaemonPod(curNode, ds))
if shouldEnqueue {
_, oldShouldSchedule, oldShouldContinueRunning, err := dsc.nodeShouldRunDaemonPod(oldNode, ds)
if err != nil {
continue
}
_, currentShouldSchedule, currentShouldContinueRunning, err := dsc.nodeShouldRunDaemonPod(curNode, ds)
if err != nil {
continue
}
if (oldShouldSchedule != currentShouldSchedule) || (oldShouldContinueRunning != currentShouldContinueRunning) {
dsc.enqueueDaemonSet(ds)
}
}
// TODO: it'd be nice to pass a hint with these enqueues, so that each ds would only examine the added node (unless it has other work to do, too).
}

// getNodesToDaemonSetPods returns a map from nodes to daemon pods (corresponding to ds) running on the nodes.
Expand Down Expand Up @@ -451,22 +461,25 @@ func (dsc *DaemonSetsController) manage(ds *extensions.DaemonSet) error {
}
var nodesNeedingDaemonPods, podsToDelete []string
for _, node := range nodeList.Items {
shouldRun := dsc.nodeShouldRunDaemonPod(&node, ds)
_, shouldSchedule, shouldContinueRunning, err := dsc.nodeShouldRunDaemonPod(&node, ds)
if err != nil {
continue
}

daemonPods, isRunning := nodeToDaemonPods[node.Name]

switch {
case shouldRun && !isRunning:
case shouldSchedule && !isRunning:
// If daemon pod is supposed to be running on node, but isn't, create daemon pod.
nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, node.Name)
case shouldRun && len(daemonPods) > 1:
case shouldContinueRunning && len(daemonPods) > 1:
// If daemon pod is supposed to be running on node, but more than 1 daemon pod is running, delete the excess daemon pods.
// Sort the daemon pods by creation time, so the the oldest is preserved.
sort.Sort(podByCreationTimestamp(daemonPods))
for i := 1; i < len(daemonPods); i++ {
podsToDelete = append(podsToDelete, daemonPods[i].Name)
}
case !shouldRun && isRunning:
case !shouldContinueRunning && isRunning:
// If daemon pod isn't supposed to run on node, but it is, delete all daemon pods on node.
for i := range daemonPods {
podsToDelete = append(podsToDelete, daemonPods[i].Name)
Expand Down Expand Up @@ -588,11 +601,14 @@ func (dsc *DaemonSetsController) updateDaemonSetStatus(ds *extensions.DaemonSet)

var desiredNumberScheduled, currentNumberScheduled, numberMisscheduled, numberReady int
for _, node := range nodeList.Items {
shouldRun := dsc.nodeShouldRunDaemonPod(&node, ds)
wantToRun, _, _, err := dsc.nodeShouldRunDaemonPod(&node, ds)
if err != nil {
return err
}

scheduled := len(nodeToDaemonPods[node.Name]) > 0

if shouldRun {
if wantToRun {
desiredNumberScheduled++
if scheduled {
currentNumberScheduled++
Expand Down Expand Up @@ -658,16 +674,35 @@ func (dsc *DaemonSetsController) syncDaemonSet(key string) error {
return dsc.updateDaemonSetStatus(ds)
}

func (dsc *DaemonSetsController) nodeShouldRunDaemonPod(node *v1.Node, ds *extensions.DaemonSet) bool {
// nodeShouldRunDaemonPod checks a set of preconditions against a (node,daemonset) and returns a
// summary. Returned booleans are:
// * wantToRun:
// Returns true when a user would expect a pod to run on this node and ignores conditions
// such as OutOfDisk or insufficent resource that would cause a daemonset pod not to schedule.
// This is primarily used to populate daemonset status.
// * shouldSchedule:
// Returns true when a daemonset should be scheduled to a node if a daemonset pod is not already
// running on that node.
// * shouldContinueRunning:
// Returns true when a daemonset should continue running on a node if a daemonset pod is already
// running on that node.
func (dsc *DaemonSetsController) nodeShouldRunDaemonPod(node *v1.Node, ds *extensions.DaemonSet) (wantToRun, shouldSchedule, shouldContinueRunning bool, err error) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can I ask you to do a table test for this method that shows the outcomes?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do.

// Because these bools require an && of all their required conditions, we start
// with all bools set to true and set a bool to false if a condition is not met.
// A bool should probably not be set to true after this line.
wantToRun, shouldSchedule, shouldContinueRunning = true, true, true
// If the daemon set specifies a node name, check that it matches with node.Name.
if !(ds.Spec.Template.Spec.NodeName == "" || ds.Spec.Template.Spec.NodeName == node.Name) {
return false
return false, false, false, nil
}

// TODO: Move it to the predicates
for _, c := range node.Status.Conditions {
if c.Type == v1.NodeOutOfDisk && c.Status == v1.ConditionTrue {
return false
// the kubelet will evict this pod if it needs to. Let kubelet
// decide whether to continue running this pod so leave shouldContinueRunning
// set to true
shouldSchedule = false
}
}

Expand Down Expand Up @@ -695,22 +730,59 @@ func (dsc *DaemonSetsController) nodeShouldRunDaemonPod(node *v1.Node, ds *exten

nodeInfo := schedulercache.NewNodeInfo(pods...)
nodeInfo.SetNode(node)
fit, reasons, err := predicates.GeneralPredicates(newPod, nil, nodeInfo)
_, reasons, err := predicates.GeneralPredicates(newPod, nil, nodeInfo)
if err != nil {
glog.Warningf("GeneralPredicates failed on ds '%s/%s' due to unexpected error: %v", ds.ObjectMeta.Namespace, ds.ObjectMeta.Name, err)
return false, false, false, err
}
for _, r := range reasons {
glog.V(4).Infof("GeneralPredicates failed on ds '%s/%s' for reason: %v", ds.ObjectMeta.Namespace, ds.ObjectMeta.Name, r.GetReason())
switch reason := r.(type) {
case *predicates.InsufficientResourceError:
dsc.eventRecorder.Eventf(ds, v1.EventTypeNormal, "FailedPlacement", "failed to place pod on %q: %s", node.ObjectMeta.Name, reason.Error())
shouldSchedule = false
case *predicates.PredicateFailureError:
if reason == predicates.ErrPodNotFitsHostPorts {
dsc.eventRecorder.Eventf(ds, v1.EventTypeNormal, "FailedPlacement", "failed to place pod on %q: host port conflict", node.ObjectMeta.Name)
var emitEvent bool
// we try to partition predicates into two partitions here: intentional on the part of the operator and not.
switch reason {
// intentional
case
predicates.ErrNodeSelectorNotMatch,
predicates.ErrPodNotMatchHostName,
predicates.ErrNodeLabelPresenceViolated,
// this one is probably intentional since it's a workaround for not having
// pod hard anti affinity.
predicates.ErrPodNotFitsHostPorts:
wantToRun, shouldSchedule, shouldContinueRunning = false, false, false
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is node affinity something that doen't hold true for DaemonSets and works only for pods that are processed by the scheduler?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Kargakis node affinity bubbles up here as a predicates.ErrNodeSelectorNotMatch. The DaemonSet obeys "required during scheduling" node affinities.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, actually nevermind my question. Required/ignored during execution is yet to be implemented.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Basically "required during execution" only is not implemented yet. Here you have implemented "required during execution" for daemon sets. My real question from the begining was "Is node affinity going to affect daemon sets? Will I be able to specify predicates in the pod template of a daemon set?"

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the plan is to support node affinity in daemonsets. We will need to modify this method when "required during execution" is implemented.

// unintentional
case
predicates.ErrDiskConflict,
predicates.ErrVolumeZoneConflict,
predicates.ErrMaxVolumeCountExceeded,
predicates.ErrNodeUnderMemoryPressure,
predicates.ErrNodeUnderDiskPressure:
// wantToRun and shouldContinueRunning are likely true here. They are
// absolutely true at the time of writing the comment. See first comment
// of this method.
shouldSchedule = false
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you make it explicit that the other two booleans are true here?

emitEvent = true
// unexpected
case
predicates.ErrPodAffinityNotMatch,
predicates.ErrServiceAffinityViolated,
predicates.ErrTaintsTolerationsNotMatch:
return false, false, false, fmt.Errorf("unexpected reason: GeneralPredicates should not return reason %s", reason.GetReason())
default:
glog.V(4).Infof("unknownd predicate failure reason: %s", reason.GetReason())
wantToRun, shouldSchedule, shouldContinueRunning = false, false, false
emitEvent = true
}
if emitEvent {
dsc.eventRecorder.Eventf(ds, v1.EventTypeNormal, "FailedPlacement", "failed to place pod on %q: %s", node.ObjectMeta.Name, reason.GetReason())
}
}
}
return fit
return
}

// byCreationTimestamp sorts a list by creation timestamp, using their names as a tie breaker.
Expand Down
133 changes: 133 additions & 0 deletions pkg/controller/daemon/daemoncontroller_test.go
Expand Up @@ -267,6 +267,23 @@ func TestInsufficentCapacityNodeDaemonDoesNotLaunchPod(t *testing.T) {
syncAndValidateDaemonSets(t, manager, ds, podControl, 0, 0)
}

// DaemonSets should not unschedule a daemonset pod from a node with insufficient free resource
func TestInsufficentCapacityNodeDaemonDoesNotUnscheduleRunningPod(t *testing.T) {
podSpec := resourcePodSpec("too-much-mem", "75M", "75m")
podSpec.NodeName = "too-much-mem"
manager, podControl, _ := newTestController()
node := newNode("too-much-mem", nil)
node.Status.Allocatable = allocatableResources("100M", "200m")
manager.nodeStore.Add(node)
manager.podStore.Indexer.Add(&v1.Pod{
Spec: podSpec,
})
ds := newDaemonSet("foo")
ds.Spec.Template.Spec = podSpec
manager.dsStore.Add(ds)
syncAndValidateDaemonSets(t, manager, ds, podControl, 0, 0)
}

func TestSufficentCapacityWithTerminatedPodsDaemonLaunchesPod(t *testing.T) {
podSpec := resourcePodSpec("too-much-mem", "75M", "75m")
manager, podControl, _ := newTestController()
Expand Down Expand Up @@ -634,3 +651,119 @@ func TestObservedGeneration(t *testing.T) {
t.Errorf("Wrong ObservedGeneration for daemon %s in status. Expected %d, got %d", updated.Name, daemon.Generation, updated.Status.ObservedGeneration)
}
}

func TestNodeShouldRunDaemonPod(t *testing.T) {
cases := []struct {
podsOnNode []*v1.Pod
ds *extensions.DaemonSet
wantToRun, shouldSchedule, shouldContinueRunning bool
err error
}{
{
ds: &extensions.DaemonSet{
Spec: extensions.DaemonSetSpec{
Selector: &metav1.LabelSelector{MatchLabels: simpleDaemonSetLabel},
Template: v1.PodTemplateSpec{
ObjectMeta: v1.ObjectMeta{
Labels: simpleDaemonSetLabel,
},
Spec: resourcePodSpec("", "50M", "0.5"),
},
},
},
wantToRun: true,
shouldSchedule: true,
shouldContinueRunning: true,
},
{
ds: &extensions.DaemonSet{
Spec: extensions.DaemonSetSpec{
Selector: &metav1.LabelSelector{MatchLabels: simpleDaemonSetLabel},
Template: v1.PodTemplateSpec{
ObjectMeta: v1.ObjectMeta{
Labels: simpleDaemonSetLabel,
},
Spec: resourcePodSpec("", "200M", "0.5"),
},
},
},
wantToRun: true,
shouldSchedule: false,
shouldContinueRunning: true,
},
{
ds: &extensions.DaemonSet{
Spec: extensions.DaemonSetSpec{
Selector: &metav1.LabelSelector{MatchLabels: simpleDaemonSetLabel},
Template: v1.PodTemplateSpec{
ObjectMeta: v1.ObjectMeta{
Labels: simpleDaemonSetLabel,
},
Spec: resourcePodSpec("other-node", "50M", "0.5"),
},
},
},
wantToRun: false,
shouldSchedule: false,
shouldContinueRunning: false,
},
{
podsOnNode: []*v1.Pod{
{
Spec: v1.PodSpec{
Containers: []v1.Container{{
Ports: []v1.ContainerPort{{
HostPort: 666,
}},
}},
},
},
},
ds: &extensions.DaemonSet{
Spec: extensions.DaemonSetSpec{
Selector: &metav1.LabelSelector{MatchLabels: simpleDaemonSetLabel},
Template: v1.PodTemplateSpec{
ObjectMeta: v1.ObjectMeta{
Labels: simpleDaemonSetLabel,
},
Spec: v1.PodSpec{
Containers: []v1.Container{{
Ports: []v1.ContainerPort{{
HostPort: 666,
}},
}},
},
},
},
},
wantToRun: false,
shouldSchedule: false,
shouldContinueRunning: false,
},
}

for i, c := range cases {
node := newNode("test-node", nil)
node.Status.Allocatable = allocatableResources("100M", "1")
manager, _, _ := newTestController()
manager.nodeStore.Store.Add(node)
for _, p := range c.podsOnNode {
manager.podStore.Indexer.Add(p)
p.Spec.NodeName = "test-node"
}
wantToRun, shouldSchedule, shouldContinueRunning, err := manager.nodeShouldRunDaemonPod(node, c.ds)

if wantToRun != c.wantToRun {
t.Errorf("[%v] expected wantToRun: %v, got: %v", i, c.wantToRun, wantToRun)
}
if shouldSchedule != c.shouldSchedule {
t.Errorf("[%v] expected shouldSchedule: %v, got: %v", i, c.shouldSchedule, shouldSchedule)
}
if shouldContinueRunning != c.shouldContinueRunning {
t.Errorf("[%v] expected shouldContinueRunning: %v, got: %v", i, c.shouldContinueRunning, shouldContinueRunning)
}
if err != c.err {
t.Errorf("[%v] expected err: %v, got: %v", i, c.err, err)
}
}
}