daemonset: differentiate between cases in nodeShouldRun #38787
@@ -387,8 +387,11 @@ func (dsc *DaemonSetsController) addNode(obj interface{}) {
 	node := obj.(*v1.Node)
 	for i := range dsList.Items {
 		ds := &dsList.Items[i]
-		shouldEnqueue := dsc.nodeShouldRunDaemonPod(node, ds)
-		if shouldEnqueue {
+		_, shouldSchedule, _, err := dsc.nodeShouldRunDaemonPod(node, ds)
+		if err != nil {
+			continue
+		}
+		if shouldSchedule {
 			dsc.enqueueDaemonSet(ds)
 		}
 	}
@@ -406,14 +409,21 @@ func (dsc *DaemonSetsController) updateNode(old, cur interface{}) {
 		glog.V(4).Infof("Error enqueueing daemon sets: %v", err)
 		return
 	}
-	// TODO: it'd be nice to pass a hint with these enqueues, so that each ds would only examine the added node (unless it has other work to do, too).
 	for i := range dsList.Items {
 		ds := &dsList.Items[i]
-		shouldEnqueue := (dsc.nodeShouldRunDaemonPod(oldNode, ds) != dsc.nodeShouldRunDaemonPod(curNode, ds))
-		if shouldEnqueue {
+		_, oldShouldSchedule, oldShouldContinueRunning, err := dsc.nodeShouldRunDaemonPod(oldNode, ds)
+		if err != nil {
+			continue
+		}
+		_, currentShouldSchedule, currentShouldContinueRunning, err := dsc.nodeShouldRunDaemonPod(curNode, ds)
+		if err != nil {
+			continue
+		}
+		if (oldShouldSchedule != currentShouldSchedule) || (oldShouldContinueRunning != currentShouldContinueRunning) {
 			dsc.enqueueDaemonSet(ds)
 		}
 	}
+	// TODO: it'd be nice to pass a hint with these enqueues, so that each ds would only examine the added node (unless it has other work to do, too).
 }

 // getNodesToDaemonSetPods returns a map from nodes to daemon pods (corresponding to ds) running on the nodes.
@@ -451,22 +461,25 @@ func (dsc *DaemonSetsController) manage(ds *extensions.DaemonSet) error {
 	}
 	var nodesNeedingDaemonPods, podsToDelete []string
 	for _, node := range nodeList.Items {
-		shouldRun := dsc.nodeShouldRunDaemonPod(&node, ds)
+		_, shouldSchedule, shouldContinueRunning, err := dsc.nodeShouldRunDaemonPod(&node, ds)
+		if err != nil {
+			continue
+		}

 		daemonPods, isRunning := nodeToDaemonPods[node.Name]

 		switch {
-		case shouldRun && !isRunning:
+		case shouldSchedule && !isRunning:
 			// If daemon pod is supposed to be running on node, but isn't, create daemon pod.
 			nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, node.Name)
-		case shouldRun && len(daemonPods) > 1:
+		case shouldContinueRunning && len(daemonPods) > 1:
 			// If daemon pod is supposed to be running on node, but more than 1 daemon pod is running, delete the excess daemon pods.
 			// Sort the daemon pods by creation time, so the oldest is preserved.
 			sort.Sort(podByCreationTimestamp(daemonPods))
 			for i := 1; i < len(daemonPods); i++ {
 				podsToDelete = append(podsToDelete, daemonPods[i].Name)
 			}
-		case !shouldRun && isRunning:
+		case !shouldContinueRunning && isRunning:
 			// If daemon pod isn't supposed to run on node, but it is, delete all daemon pods on node.
 			for i := range daemonPods {
 				podsToDelete = append(podsToDelete, daemonPods[i].Name)
@@ -588,11 +601,14 @@ func (dsc *DaemonSetsController) updateDaemonSetStatus(ds *extensions.DaemonSet)

 	var desiredNumberScheduled, currentNumberScheduled, numberMisscheduled, numberReady int
 	for _, node := range nodeList.Items {
-		shouldRun := dsc.nodeShouldRunDaemonPod(&node, ds)
+		wantToRun, _, _, err := dsc.nodeShouldRunDaemonPod(&node, ds)
+		if err != nil {
+			return err
+		}

 		scheduled := len(nodeToDaemonPods[node.Name]) > 0

-		if shouldRun {
+		if wantToRun {
 			desiredNumberScheduled++
 			if scheduled {
 				currentNumberScheduled++
@@ -658,16 +674,35 @@ func (dsc *DaemonSetsController) syncDaemonSet(key string) error {
 	return dsc.updateDaemonSetStatus(ds)
 }

-func (dsc *DaemonSetsController) nodeShouldRunDaemonPod(node *v1.Node, ds *extensions.DaemonSet) bool {
+// nodeShouldRunDaemonPod checks a set of preconditions against a (node,daemonset) and returns a
+// summary. Returned booleans are:
+// * wantToRun:
+//     Returns true when a user would expect a pod to run on this node and ignores conditions
+//     such as OutOfDisk or insufficient resources that would cause a daemonset pod not to schedule.
+//     This is primarily used to populate daemonset status.
+// * shouldSchedule:
+//     Returns true when a daemonset should be scheduled to a node if a daemonset pod is not already
+//     running on that node.
+// * shouldContinueRunning:
+//     Returns true when a daemonset should continue running on a node if a daemonset pod is already
+//     running on that node.
+func (dsc *DaemonSetsController) nodeShouldRunDaemonPod(node *v1.Node, ds *extensions.DaemonSet) (wantToRun, shouldSchedule, shouldContinueRunning bool, err error) {
+	// Because these bools require an && of all their required conditions, we start
+	// with all bools set to true and set a bool to false if a condition is not met.
+	// A bool should probably not be set to true after this line.
+	wantToRun, shouldSchedule, shouldContinueRunning = true, true, true
 	// If the daemon set specifies a node name, check that it matches with node.Name.
 	if !(ds.Spec.Template.Spec.NodeName == "" || ds.Spec.Template.Spec.NodeName == node.Name) {
-		return false
+		return false, false, false, nil
 	}

 	// TODO: Move it to the predicates
 	for _, c := range node.Status.Conditions {
 		if c.Type == v1.NodeOutOfDisk && c.Status == v1.ConditionTrue {
-			return false
+			// the kubelet will evict this pod if it needs to. Let the kubelet
+			// decide whether to continue running this pod so leave shouldContinueRunning
+			// set to true
+			shouldSchedule = false
 		}
 	}
@@ -695,22 +730,59 @@ func (dsc *DaemonSetsController) nodeShouldRunDaemonPod(node *v1.Node, ds *exten

 	nodeInfo := schedulercache.NewNodeInfo(pods...)
 	nodeInfo.SetNode(node)
-	fit, reasons, err := predicates.GeneralPredicates(newPod, nil, nodeInfo)
+	_, reasons, err := predicates.GeneralPredicates(newPod, nil, nodeInfo)
 	if err != nil {
 		glog.Warningf("GeneralPredicates failed on ds '%s/%s' due to unexpected error: %v", ds.ObjectMeta.Namespace, ds.ObjectMeta.Name, err)
+		return false, false, false, err
 	}
 	for _, r := range reasons {
 		glog.V(4).Infof("GeneralPredicates failed on ds '%s/%s' for reason: %v", ds.ObjectMeta.Namespace, ds.ObjectMeta.Name, r.GetReason())
 		switch reason := r.(type) {
 		case *predicates.InsufficientResourceError:
 			dsc.eventRecorder.Eventf(ds, v1.EventTypeNormal, "FailedPlacement", "failed to place pod on %q: %s", node.ObjectMeta.Name, reason.Error())
+			shouldSchedule = false
 		case *predicates.PredicateFailureError:
-			if reason == predicates.ErrPodNotFitsHostPorts {
-				dsc.eventRecorder.Eventf(ds, v1.EventTypeNormal, "FailedPlacement", "failed to place pod on %q: host port conflict", node.ObjectMeta.Name)
-			}
+			var emitEvent bool
+			// we try to partition predicates into two partitions here: intentional on the part of the operator and not.
+			switch reason {
+			// intentional
+			case
+				predicates.ErrNodeSelectorNotMatch,
+				predicates.ErrPodNotMatchHostName,
+				predicates.ErrNodeLabelPresenceViolated,
+				// this one is probably intentional since it's a workaround for not having
+				// pod hard anti affinity.
+				predicates.ErrPodNotFitsHostPorts:
+				wantToRun, shouldSchedule, shouldContinueRunning = false, false, false
> Review thread on the "intentional" partition:
> Kargakis: Is node affinity something that doesn't hold true for DaemonSets and works only for pods that are processed by the scheduler?
> Author: @Kargakis node affinity bubbles up here as a predicates.ErrNodeSelectorNotMatch. The DaemonSet obeys "required during scheduling" node affinities.
> Kargakis: Yes, actually never mind my question. Required/ignored during execution is yet to be implemented.
> Kargakis: Basically "required during execution" only is not implemented yet. Here you have implemented "required during execution" for daemon sets. My real question from the beginning was: "Is node affinity going to affect daemon sets? Will I be able to specify predicates in the pod template of a daemon set?"
> Author: Yes, the plan is to support node affinity in daemonsets. We will need to modify this method when "required during execution" is implemented. (See the sketch after the diff.)
+			// unintentional
+			case
+				predicates.ErrDiskConflict,
+				predicates.ErrVolumeZoneConflict,
+				predicates.ErrMaxVolumeCountExceeded,
+				predicates.ErrNodeUnderMemoryPressure,
+				predicates.ErrNodeUnderDiskPressure:
+				// wantToRun and shouldContinueRunning are likely true here. They are
+				// absolutely true at the time of writing the comment. See first comment
+				// of this method.
+				shouldSchedule = false
> Review comment: Can you make it explicit that the other two booleans are true here?
+				emitEvent = true
+			// unexpected
+			case
+				predicates.ErrPodAffinityNotMatch,
+				predicates.ErrServiceAffinityViolated,
+				predicates.ErrTaintsTolerationsNotMatch:
+				return false, false, false, fmt.Errorf("unexpected reason: GeneralPredicates should not return reason %s", reason.GetReason())
+			default:
+				glog.V(4).Infof("unknown predicate failure reason: %s", reason.GetReason())
+				wantToRun, shouldSchedule, shouldContinueRunning = false, false, false
+				emitEvent = true
+			}
+			if emitEvent {
+				dsc.eventRecorder.Eventf(ds, v1.EventTypeNormal, "FailedPlacement", "failed to place pod on %q: %s", node.ObjectMeta.Name, reason.GetReason())
+			}
 		}
 	}
-	return fit
+	return
 }

 // byCreationTimestamp sorts a list by creation timestamp, using their names as a tie breaker.
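To make the node affinity thread above concrete, here is a sketch (not part of this PR) of a DaemonSet whose pod template restricts placement with a nodeSelector. Because the controller runs the pod template through GeneralPredicates, a node that lacks the label surfaces as predicates.ErrNodeSelectorNotMatch, which the "intentional" partition maps to (false, false, false, nil): the node is excluded from desiredNumberScheduled and any stray daemon pod on it is deleted. The import paths and field layout are assumptions based on the v1beta1 extensions types of this era.

package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/api/v1"
	extensions "k8s.io/kubernetes/pkg/apis/extensions/v1beta1"
)

func main() {
	// A daemonset intended only for nodes the operator has labeled
	// monitoring=enabled. On any other node, nodeShouldRunDaemonPod
	// should return all three booleans false.
	ds := &extensions.DaemonSet{
		ObjectMeta: v1.ObjectMeta{Name: "node-exporter", Namespace: "monitoring"},
		Spec: extensions.DaemonSetSpec{
			Template: v1.PodTemplateSpec{
				Spec: v1.PodSpec{
					// Placement restriction supplied by the operator; treated
					// as intentional by the partitioning above.
					NodeSelector: map[string]string{"monitoring": "enabled"},
					Containers: []v1.Container{
						{Name: "exporter", Image: "example.com/exporter:1.0"},
					},
				},
			},
		},
	}
	fmt.Printf("daemonset %s/%s selects nodes with %v\n",
		ds.ObjectMeta.Namespace, ds.ObjectMeta.Name, ds.Spec.Template.Spec.NodeSelector)
}

Treating a selector mismatch as intentional, rather than as a placement failure, is what keeps such nodes out of the daemonset's status counts and avoids spurious FailedPlacement events.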
> Review comment: Can I ask you to do a table test for this method that shows the outcomes?
> Author: Will do.
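A table test along the requested lines might look like the sketch below, living in daemoncontroller_test.go. The fixture helpers (newTestController, newNode, newDaemonSet) and the exact store-population calls are assumptions based on the existing tests in that file, not the test that eventually landed.

func TestNodeShouldRunDaemonPod(t *testing.T) {
	cases := []struct {
		podsOnNode                                       []*v1.Pod
		ds                                               *extensions.DaemonSet
		wantToRun, shouldSchedule, shouldContinueRunning bool
		err                                              error
	}{
		{
			// Plain daemonset, empty node: all three booleans come back true.
			ds:                    newDaemonSet("simple"),
			wantToRun:             true,
			shouldSchedule:        true,
			shouldContinueRunning: true,
		},
		// Further rows would pin down the partitions above, e.g. a node with
		// insufficient free resources (wantToRun true, shouldSchedule false,
		// shouldContinueRunning true) or a host-port conflict (all false).
	}

	for i, c := range cases {
		node := newNode("test-node", nil)
		manager, _ := newTestController()
		manager.nodeStore.Store.Add(node)
		for _, p := range c.podsOnNode {
			// Pin each fixture pod to the node under test before indexing it.
			p.Spec.NodeName = "test-node"
			manager.podStore.Indexer.Add(p)
		}

		wantToRun, shouldSchedule, shouldContinueRunning, err := manager.nodeShouldRunDaemonPod(node, c.ds)
		if wantToRun != c.wantToRun {
			t.Errorf("[%d] expected wantToRun: %v, got: %v", i, c.wantToRun, wantToRun)
		}
		if shouldSchedule != c.shouldSchedule {
			t.Errorf("[%d] expected shouldSchedule: %v, got: %v", i, c.shouldSchedule, shouldSchedule)
		}
		if shouldContinueRunning != c.shouldContinueRunning {
			t.Errorf("[%d] expected shouldContinueRunning: %v, got: %v", i, c.shouldContinueRunning, shouldContinueRunning)
		}
		if err != c.err {
			t.Errorf("[%d] expected err: %v, got: %v", i, c.err, err)
		}
	}
}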