
Schedule DaemonSet Pods in scheduler. #63223

Merged
merged 3 commits into from Jun 2, 2018

Changes from all commits
43 changes: 31 additions & 12 deletions pkg/controller/daemon/daemon_controller.go
@@ -762,7 +762,7 @@ func (dsc *DaemonSetsController) getDaemonPods(ds *apps.DaemonSet) ([]*v1.Pod, e
 	return cm.ClaimPods(pods)
 }

-// getNodesToDaemonPods returns a map from nodes to daemon pods (corresponding to ds) running on the nodes.
+// getNodesToDaemonPods returns a map from nodes to daemon pods (corresponding to ds) created for the nodes.
 // This also reconciles ControllerRef by adopting/orphaning.
 // Note that returned Pods are pointers to objects in the cache.
 // If you want to modify one, you need to deep-copy it first.
@@ -774,9 +774,16 @@ func (dsc *DaemonSetsController) getNodesToDaemonPods(ds *apps.DaemonSet) (map[s
 	// Group Pods by Node name.
 	nodeToDaemonPods := make(map[string][]*v1.Pod)
 	for _, pod := range claimedPods {
-		nodeName := pod.Spec.NodeName
+		nodeName, err := util.GetTargetNodeName(pod)
Member:
Here it seems to be changing the behavior when the feature is disabled, by returning an error if pod.Spec.NodeName is empty.

Member:
Why is it necessary to do this? Isn't nodeName filled once the pod gets scheduled?

Member:
A DaemonSet pod may never get scheduled (if the target node lacks resources, is tainted, is not ready, etc.). Even in cases where the pod eventually gets scheduled, between the creation of the unscheduled pod and its binding to the node, the DS controller can easily create lots of extra pods targeted at the node if this doesn't accurately return the ones it already created.

Member:
Thanks, that makes sense.
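
For illustration, a minimal, self-contained sketch (hypothetical names, simplified bookkeeping) of the duplicate-pod problem described above: when pending pods are keyed by pod.Spec.NodeName alone, their target nodes still look empty to the controller on the next sync.

package main

import "fmt"

// nodesNeedingPods mimics the manage() bookkeeping in simplified form:
// any node with no daemon pod attributed to it gets a new pod created.
func nodesNeedingPods(nodes []string, podsPerNode map[string]int) []string {
	var need []string
	for _, n := range nodes {
		if podsPerNode[n] == 0 {
			need = append(need, n)
		}
	}
	return need
}

func main() {
	nodes := []string{"node-1", "node-2"}

	// Keyed by pod.Spec.NodeName only: the pending pod targeted at node-2
	// is not bound yet, so it lands under the empty key.
	bySpecNodeName := map[string]int{"node-1": 1, "": 1}
	fmt.Println(nodesNeedingPods(nodes, bySpecNodeName)) // [node-2] -> a duplicate pod would be created

	// Keyed by the target node name (as GetTargetNodeName resolves it):
	// the pending pod counts against node-2 and no duplicate is created.
	byTargetNode := map[string]int{"node-1": 1, "node-2": 1}
	fmt.Println(nodesNeedingPods(nodes, byTargetNode)) // []
}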

+		if err != nil {
+			glog.Warningf("Failed to get target node name of Pod %v/%v in DaemonSet %v/%v",
+				pod.Namespace, pod.Name, ds.Namespace, ds.Name)
+			continue
+		}
+
 		nodeToDaemonPods[nodeName] = append(nodeToDaemonPods[nodeName], pod)
 	}

Member:
Because nodeToDaemonPods includes pods that are not scheduled yet (no nodeName set), in podsShouldBeOnNode, the pods without nodeName set should be killed first when excess daemon pods are found.

Member Author:
Done, sorted the pods by phase & creation timestamp.

Member Author:
Just reverted it; the pod without nodeName may be the new one we want to create.

Member:
> Just reverted it; the pod without nodeName may be the new one we want to create.

I'm not sure I understand why this is reverted. If a DaemonSet pod is already scheduled on this node, shouldn't other pending pods be deleted first?

 	return nodeToDaemonPods, nil
 }

@@ -850,7 +857,7 @@ func (dsc *DaemonSetsController) podsShouldBeOnNode(
 		// If daemon pod is supposed to be running on node, but more than 1 daemon pod is running, delete the excess daemon pods.
 		// Sort the daemon pods by creation time, so the oldest is preserved.
Member:
nit: the scheduled pod is preserved first; if more than one pod can be preserved, the oldest pod is preserved.

 		if len(daemonPodsRunning) > 1 {
-			sort.Sort(podByCreationTimestamp(daemonPodsRunning))
+			sort.Sort(podByCreationTimestampAndPhase(daemonPodsRunning))
 			for i := 1; i < len(daemonPodsRunning); i++ {
 				podsToDelete = append(podsToDelete, daemonPodsRunning[i].Name)
 			}
@@ -870,7 +877,7 @@
 // which nodes should not run a Pod of ds but currently running one, it calls function
 // syncNodes with a list of pods to remove and a list of nodes to run a Pod of ds.
 func (dsc *DaemonSetsController) manage(ds *apps.DaemonSet, hash string) error {
-	// Find out which nodes are running the daemon pods controlled by ds.
+	// Find out the pods which are created for the nodes by DaemonSet.
 	nodeToDaemonPods, err := dsc.getNodesToDaemonPods(ds)
 	if err != nil {
 		return fmt.Errorf("couldn't get node to daemon pod mapping for daemon set %q: %v", ds.Name, err)
@@ -962,9 +969,12 @@ func (dsc *DaemonSetsController) syncNodes(ds *apps.DaemonSet, podsToDelete, nod

 		podTemplate := &template

-		if false /*disabled for 1.10*/ && utilfeature.DefaultFeatureGate.Enabled(features.ScheduleDaemonSetPods) {
+		if utilfeature.DefaultFeatureGate.Enabled(features.ScheduleDaemonSetPods) {
Member:
We should enable this feature by default in 1.11.

Member:
We can't enable an alpha feature by default.

Member:
This cannot remain disabled in 1.11. Rescheduler has already been removed from the code base. If critical DaemonSets cannot be scheduled, preemption must create room for them, and the DS controller is incapable of performing preemption.

Member (@bsalamat), May 31, 2018:
Actually, when I thought about it again, I realized that my concern may not be valid. IIUC, Rescheduler could not help with scheduling critical DS pods anyway, because the DS controller did not create a DS pod before it found a node that could run the pod. So, Rescheduler was not even aware that such critical DS pods needed to be scheduled.
In other words, the DS controller never relied on Rescheduler to create room for DS pods. So, the fact that Rescheduler does not exist in 1.11 won't change anything here.

Member:
Thanks, Klaus. So, my initial concern is valid. The DS controller does not run a "resource check" for critical pods. This means that it creates critical DS Pods regardless of the resources available on the nodes, and it relies on "Rescheduler" to free up resources on the nodes if necessary. In the absence of Rescheduler, it is important to let the default scheduler schedule DS Pods. Otherwise, critical DS pods may never be scheduled when their corresponding nodes are out of resources.

Member Author:
Your concern is a good point :). I re-checked the code: critical pods (ExperimentalCriticalPodAnnotation) are still an alpha feature (e2e also passed in the PR that removed re-scheduler).

Let me also check whether it is specially enabled in test-infra :). If it is not enabled, I think it's safe for us to remove it; and we need to update the yaml files about critical pods, if any.

Member:
I arranged with @ravisantoshgudimetla to make Rescheduler aware of Pod priority and add it back to help create room for critical DS Pods. So, this PR can remain as is (no need to enable the feature in 1.11).

Member:
I have to add that it was @ravisantoshgudimetla's idea to add priority awareness and use Rescheduler in 1.11. It removes a blocker in moving priority and preemption to Beta.

 			podTemplate = template.DeepCopy()
-			podTemplate.Spec.Affinity = util.ReplaceDaemonSetPodHostnameNodeAffinity(
+			// The pod's NodeAffinity will be updated to make sure the Pod is bound
+			// to the target node by default scheduler. It is safe to do so because there
+			// should be no conflicting node affinity with the target node.
+			podTemplate.Spec.Affinity = util.ReplaceDaemonSetPodNodeNameNodeAffinity(
 				podTemplate.Spec.Affinity, nodesNeedingDaemonPods[ix])
 			podTemplate.Spec.Tolerations = util.AppendNoScheduleTolerationIfNotExist(podTemplate.Spec.Tolerations)
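
For illustration, a sketch of the node affinity such a pod ends up with. It assumes algorithm.NodeFieldSelectorKeyNodeName resolves to the "metadata.name" node field and uses a hypothetical node-1; only that one node can satisfy the matchFields term, so the default scheduler can only bind the pod there.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

func main() {
	// Roughly the affinity a DaemonSet pod targeted at "node-1" carries after
	// ReplaceDaemonSetPodNodeNameNodeAffinity (a sketch, not the controller's
	// actual output object).
	affinity := &v1.Affinity{
		NodeAffinity: &v1.NodeAffinity{
			RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
				NodeSelectorTerms: []v1.NodeSelectorTerm{{
					MatchFields: []v1.NodeSelectorRequirement{{
						Key:      "metadata.name", // algorithm.NodeFieldSelectorKeyNodeName
						Operator: v1.NodeSelectorOpIn,
						Values:   []string{"node-1"},
					}},
				}},
			},
		},
	}
	fmt.Printf("%+v\n", affinity)
}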

@@ -1098,7 +1108,7 @@ func (dsc *DaemonSetsController) updateDaemonSetStatus(ds *apps.DaemonSet, hash
 			currentNumberScheduled++
 			// Sort the daemon pods by creation time, so that the oldest is first.
 			daemonPods, _ := nodeToDaemonPods[node.Name]
-			sort.Sort(podByCreationTimestamp(daemonPods))
+			sort.Sort(podByCreationTimestampAndPhase(daemonPods))
 			pod := daemonPods[0]
 			if podutil.IsPodReady(pod) {
 				numberReady++
@@ -1414,7 +1424,7 @@ func Predicates(pod *v1.Pod, nodeInfo *schedulercache.NodeInfo) (bool, []algorit
 	var predicateFails []algorithm.PredicateFailureReason

 	// If ScheduleDaemonSetPods is enabled, only check nodeSelector and nodeAffinity.
-	if false /*disabled for 1.10*/ && utilfeature.DefaultFeatureGate.Enabled(features.ScheduleDaemonSetPods) {
+	if utilfeature.DefaultFeatureGate.Enabled(features.ScheduleDaemonSetPods) {
 		fit, reasons, err := nodeSelectionPredicates(pod, nil, nodeInfo)
 		if err != nil {
 			return false, predicateFails, err
@@ -1466,12 +1476,21 @@ func (o byCreationTimestamp) Less(i, j int) bool {
 	return o[i].CreationTimestamp.Before(&o[j].CreationTimestamp)
 }

-type podByCreationTimestamp []*v1.Pod
+type podByCreationTimestampAndPhase []*v1.Pod

-func (o podByCreationTimestamp) Len() int      { return len(o) }
-func (o podByCreationTimestamp) Swap(i, j int) { o[i], o[j] = o[j], o[i] }
+func (o podByCreationTimestampAndPhase) Len() int      { return len(o) }
+func (o podByCreationTimestampAndPhase) Swap(i, j int) { o[i], o[j] = o[j], o[i] }

-func (o podByCreationTimestamp) Less(i, j int) bool {
+func (o podByCreationTimestampAndPhase) Less(i, j int) bool {
+	// Scheduled Pod first
+	if len(o[i].Spec.NodeName) != 0 && len(o[j].Spec.NodeName) == 0 {
+		return true
+	}
+
+	if len(o[i].Spec.NodeName) == 0 && len(o[j].Spec.NodeName) != 0 {
+		return false
+	}
+
 	if o[i].CreationTimestamp.Equal(&o[j].CreationTimestamp) {
 		return o[i].Name < o[j].Name
 	}
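
For illustration, a self-contained mirror of this comparator's ordering (simplified pod struct, hypothetical data): scheduled pods sort ahead of pending ones, then by creation time, with the name as a tie-breaker, so podsShouldBeOnNode keeps a scheduled pod when trimming the excess.

package main

import (
	"fmt"
	"sort"
	"time"
)

type pod struct {
	name     string
	nodeName string // empty means not yet scheduled
	created  time.Time
}

func main() {
	t0 := time.Now()
	pods := []pod{
		{"pending-old", "", t0.Add(-time.Hour)},
		{"scheduled-new", "node-1", t0},
		{"scheduled-old", "node-1", t0.Add(-time.Hour)},
	}
	// Mirrors podByCreationTimestampAndPhase: scheduled pod first, then
	// creation timestamp, then name.
	sort.Slice(pods, func(i, j int) bool {
		if pods[i].nodeName != "" && pods[j].nodeName == "" {
			return true
		}
		if pods[i].nodeName == "" && pods[j].nodeName != "" {
			return false
		}
		if pods[i].created.Equal(pods[j].created) {
			return pods[i].name < pods[j].name
		}
		return pods[i].created.Before(pods[j].created)
	})
	for _, p := range pods {
		fmt.Println(p.name)
	}
	// Prints: scheduled-old, scheduled-new, pending-old.
	// Only the first pod is preserved; the rest are deleted as excess.
}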
4 changes: 3 additions & 1 deletion pkg/controller/daemon/util/BUILD
@@ -14,7 +14,6 @@ go_library(
"//pkg/api/v1/pod:go_default_library",
"//pkg/apis/core/v1/helper:go_default_library",
"//pkg/features:go_default_library",
"//pkg/kubelet/apis:go_default_library",
"//pkg/kubelet/types:go_default_library",
"//pkg/scheduler/algorithm:go_default_library",
"//vendor/k8s.io/api/apps/v1:go_default_library",
@@ -45,9 +44,12 @@ go_test(
embed = [":go_default_library"],
deps = [
"//pkg/api/testapi:go_default_library",
"//pkg/features:go_default_library",
"//pkg/kubelet/apis:go_default_library",
"//pkg/scheduler/algorithm:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/api/extensions/v1beta1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library",
],
)
87 changes: 55 additions & 32 deletions pkg/controller/daemon/util/daemonset_util.go
@@ -29,7 +29,6 @@ import (
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/features"
kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis"
kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/scheduler/algorithm"
)
@@ -136,19 +135,20 @@ func SplitByAvailablePods(minReadySeconds int32, pods []*v1.Pod) ([]*v1.Pod, []*
 	return availablePods, unavailablePods
 }

-// ReplaceDaemonSetPodHostnameNodeAffinity replaces the 'kubernetes.io/hostname' NodeAffinity term with
-// the given "nodeName" in the "affinity" terms.
-func ReplaceDaemonSetPodHostnameNodeAffinity(affinity *v1.Affinity, nodename string) *v1.Affinity {
+// ReplaceDaemonSetPodNodeNameNodeAffinity replaces the RequiredDuringSchedulingIgnoredDuringExecution
+// NodeAffinity of the given affinity with a new NodeAffinity that selects the given nodeName.
+// Note that this function assumes that no NodeAffinity conflicts with the selected nodeName.
+func ReplaceDaemonSetPodNodeNameNodeAffinity(affinity *v1.Affinity, nodename string) *v1.Affinity {
+	nodeSelReq := v1.NodeSelectorRequirement{
+		Key:      algorithm.NodeFieldSelectorKeyNodeName,
+		Operator: v1.NodeSelectorOpIn,
+		Values:   []string{nodename},
+	}
+
 	nodeSelector := &v1.NodeSelector{
 		NodeSelectorTerms: []v1.NodeSelectorTerm{
 			{
-				MatchExpressions: []v1.NodeSelectorRequirement{
-					{
-						Key:      kubeletapis.LabelHostname,
-						Operator: v1.NodeSelectorOpIn,
-						Values:   []string{nodename},
-					},
-				},
+				MatchFields: []v1.NodeSelectorRequirement{nodeSelReq},
 			},
 		},
 	}
@@ -175,28 +175,12 @@ func ReplaceDaemonSetPodHostnameNodeAffinity(affinity *v1.Affinity, nodename str
 		return affinity
 	}

-	nodeSelectorTerms := []v1.NodeSelectorTerm{}
-
-	// Removes hostname node selector, as only the target hostname will take effect.
-	for _, term := range nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms {
-		exps := []v1.NodeSelectorRequirement{}
-		for _, exp := range term.MatchExpressions {
-			if exp.Key != kubeletapis.LabelHostname {
-				exps = append(exps, exp)
-			}
-		}
-
-		if len(exps) > 0 {
-			term.MatchExpressions = exps
-			nodeSelectorTerms = append(nodeSelectorTerms, term)
-		}
-	}
-
-	// Adds target hostname NodeAffinity term.
-	nodeSelectorTerms = append(nodeSelectorTerms, nodeSelector.NodeSelectorTerms[0])
-
-	// Replace node selector with the new one.
-	nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms = nodeSelectorTerms
+	nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms = []v1.NodeSelectorTerm{
+		{
+			MatchFields: []v1.NodeSelectorRequirement{nodeSelReq},
+		},
+	}

 	return affinity
 }
@@ -225,3 +209,42 @@ func AppendNoScheduleTolerationIfNotExist(tolerations []v1.Tole

 	return tolerations
 }
+
+// GetTargetNodeName gets the target node name of DaemonSet pods. If `.spec.NodeName` is not empty,
+// return `.spec.NodeName`; otherwise, retrieve the node name of pending pods from NodeAffinity.
+// Return an error if the node name cannot be retrieved from either `.spec.NodeName` or NodeAffinity.
+func GetTargetNodeName(pod *v1.Pod) (string, error) {
+	if len(pod.Spec.NodeName) != 0 {
+		return pod.Spec.NodeName, nil
+	}
+
+	// If ScheduleDaemonSetPods was enabled before, retrieve the node name of unscheduled pods from NodeAffinity.
+	if pod.Spec.Affinity == nil ||
+		pod.Spec.Affinity.NodeAffinity == nil ||
+		pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil {
+		return "", fmt.Errorf("no spec.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution for pod %s/%s",
+			pod.Namespace, pod.Name)
+	}
+
+	terms := pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms
+	if len(terms) < 1 {
+		return "", fmt.Errorf("no nodeSelectorTerms in requiredDuringSchedulingIgnoredDuringExecution of pod %s/%s",
+			pod.Namespace, pod.Name)
+	}
+
+	for _, term := range terms {
+		for _, exp := range term.MatchFields {
+			if exp.Key == algorithm.NodeFieldSelectorKeyNodeName &&
+				exp.Operator == v1.NodeSelectorOpIn {
+				if len(exp.Values) != 1 {
+					return "", fmt.Errorf("the matchFields value of '%s' is not unique for pod %s/%s",
+						algorithm.NodeFieldSelectorKeyNodeName, pod.Namespace, pod.Name)
+				}
+
+				return exp.Values[0], nil
+			}
+		}
+	}
+
+	return "", fmt.Errorf("no node name found for pod %s/%s", pod.Namespace, pod.Name)
Member:
Seems like here we are changing the behavior when the feature is disabled, by returning this error, which was not checked before.

Member Author:
Updated to get .spec.nodeName first, then nodeAffinity.

Member:
If the ScheduleDaemonSetPods feature is enabled and then disabled, the DaemonSet could have created some pods that don't have NodeName specified yet and hit this error. Because this is the first step of manage, the DaemonSet will continue being blocked until those pods get scheduled or removed.

Member Author:
Good point :)

Member Author (@k82cn), May 16, 2018:
Updated to get .spec.nodeName first, then nodeAffinity. PTAL :)

+}
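
For illustration, a self-contained sketch of the fallback order this helper implements (spec.nodeName first, then the node-name matchFields term that ScheduleDaemonSetPods writes into unscheduled pods). It assumes algorithm.NodeFieldSelectorKeyNodeName is the "metadata.name" node field; the helper logic is re-stated locally so the example runs with only the k8s.io/api dependency.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// getTargetNodeName mirrors util.GetTargetNodeName: prefer spec.nodeName,
// then fall back to the "metadata.name" matchFields term.
func getTargetNodeName(pod *v1.Pod) (string, error) {
	if pod.Spec.NodeName != "" {
		return pod.Spec.NodeName, nil
	}
	if pod.Spec.Affinity == nil || pod.Spec.Affinity.NodeAffinity == nil ||
		pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil {
		return "", fmt.Errorf("no required node affinity on pod %s", pod.Name)
	}
	terms := pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms
	for _, term := range terms {
		for _, req := range term.MatchFields {
			if req.Key == "metadata.name" && req.Operator == v1.NodeSelectorOpIn && len(req.Values) == 1 {
				return req.Values[0], nil
			}
		}
	}
	return "", fmt.Errorf("no node name found for pod %s", pod.Name)
}

func main() {
	scheduled := &v1.Pod{Spec: v1.PodSpec{NodeName: "node-1"}}
	pending := &v1.Pod{Spec: v1.PodSpec{Affinity: &v1.Affinity{NodeAffinity: &v1.NodeAffinity{
		RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
			NodeSelectorTerms: []v1.NodeSelectorTerm{{MatchFields: []v1.NodeSelectorRequirement{{
				Key: "metadata.name", Operator: v1.NodeSelectorOpIn, Values: []string{"node-2"},
			}}}},
		},
	}}}}

	fmt.Println(getTargetNodeName(scheduled)) // node-1 <nil>
	fmt.Println(getTargetNodeName(pending))   // node-2 <nil>
}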