diff --git a/pkg/controller/daemon/daemon_controller.go b/pkg/controller/daemon/daemon_controller.go
index ce2d90ef417b..5eec47531443 100644
--- a/pkg/controller/daemon/daemon_controller.go
+++ b/pkg/controller/daemon/daemon_controller.go
@@ -1053,6 +1053,30 @@ func (dsc *DaemonSetsController) simulate(newPod *v1.Pod, node *v1.Node, ds *ext
 		Effect: v1.TaintEffectNoExecute,
 	})
 
+	// According to TaintNodesByCondition, all DaemonSet pods should tolerate
+	// the MemoryPressure and DiskPressure taints, and critical pods should
+	// additionally tolerate the OutOfDisk taint.
+	v1helper.AddOrUpdateTolerationInPod(newPod, &v1.Toleration{
+		Key:      algorithm.TaintNodeDiskPressure,
+		Operator: v1.TolerationOpExists,
+		Effect:   v1.TaintEffectNoSchedule,
+	})
+
+	v1helper.AddOrUpdateTolerationInPod(newPod, &v1.Toleration{
+		Key:      algorithm.TaintNodeMemoryPressure,
+		Operator: v1.TolerationOpExists,
+		Effect:   v1.TaintEffectNoSchedule,
+	})
+
+	if utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) &&
+		kubelettypes.IsCriticalPod(newPod) {
+		v1helper.AddOrUpdateTolerationInPod(newPod, &v1.Toleration{
+			Key:      algorithm.TaintNodeOutOfDisk,
+			Operator: v1.TolerationOpExists,
+			Effect:   v1.TaintEffectNoSchedule,
+		})
+	}
+
 	pods := []*v1.Pod{}
 
 	podList, err := dsc.podLister.List(labels.Everything())
@@ -1225,6 +1249,11 @@ func Predicates(pod *v1.Pod, nodeInfo *schedulercache.NodeInfo) (bool, []algorit
 func NodeConditionPredicates(nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason) {
	reasons := []algorithm.PredicateFailureReason{}
 
+	// If the TaintNodesByCondition feature is enabled, node conditions are covered by the PodToleratesNodeTaints predicate.
+	if utilfeature.DefaultFeatureGate.Enabled(features.TaintNodesByCondition) {
+		return true, nil
+	}
+
 	for _, c := range nodeInfo.Node().Status.Conditions {
 		// TODO: There are other node status that the DaemonSet should ideally respect too,
 		// e.g. MemoryPressure, and DiskPressure
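For context on the tolerations above: with TaintNodesByCondition, node conditions such as DiskPressure are surfaced as NoSchedule taints, so a DaemonSet pod only passes the taint predicate if it tolerates them. Below is a minimal, self-contained sketch of the Exists-operator matching these tolerations rely on; the struct names and key strings are illustrative stand-ins, not the real Kubernetes types.

    package main

    import "fmt"

    // Illustrative stand-ins for v1.Taint / v1.Toleration (not the real types).
    type taint struct{ key, effect string }
    type toleration struct{ key, operator, effect string }

    // tolerates mirrors the semantics the diff relies on: with Operator "Exists"
    // no value comparison is done; the key must match, and an empty toleration
    // effect matches any taint effect.
    func tolerates(tol toleration, t taint) bool {
        if tol.operator != "Exists" || tol.key != t.key {
            return false
        }
        return tol.effect == "" || tol.effect == t.effect
    }

    func main() {
        // Key string assumed for illustration; the diff uses algorithm.TaintNodeDiskPressure.
        dp := taint{key: "node.kubernetes.io/disk-pressure", effect: "NoSchedule"}
        tol := toleration{key: "node.kubernetes.io/disk-pressure", operator: "Exists", effect: "NoSchedule"}
        fmt.Println(tolerates(tol, dp)) // true: the DaemonSet pod can still schedule
    }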
diff --git a/pkg/controller/daemon/daemon_controller_test.go b/pkg/controller/daemon/daemon_controller_test.go
index 9e0b7d7d128e..e3387d9aae95 100644
--- a/pkg/controller/daemon/daemon_controller_test.go
+++ b/pkg/controller/daemon/daemon_controller_test.go
@@ -1251,6 +1251,68 @@ func TestOutOfDiskNodeDaemonLaunchesCriticalPod(t *testing.T) {
 	}
 }
 
+// DaemonSet should launch a critical pod even when the node has the OutOfDisk taint.
+func TestTaintOutOfDiskNodeDaemonLaunchesCriticalPod(t *testing.T) {
+	for _, strategy := range updateStrategies() {
+		ds := newDaemonSet("critical")
+		ds.Spec.UpdateStrategy = *strategy
+		setDaemonSetCritical(ds)
+		manager, podControl, _ := newTestController(ds)
+
+		node := newNode("not-enough-disk", nil)
+		node.Status.Conditions = []v1.NodeCondition{{Type: v1.NodeOutOfDisk, Status: v1.ConditionTrue}}
+		node.Spec.Taints = []v1.Taint{{Key: algorithm.TaintNodeOutOfDisk, Effect: v1.TaintEffectNoSchedule}}
+		manager.nodeStore.Add(node)
+
+		// NOTE: The tolerations are added to DaemonSet pods whether or not TaintNodesByCondition is enabled.
+
+		// Without the critical pod annotation feature gate enabled, the critical pod should not be created.
+		utilfeature.DefaultFeatureGate.Set("ExperimentalCriticalPodAnnotation=False")
+		utilfeature.DefaultFeatureGate.Set("TaintNodesByCondition=True")
+		manager.dsStore.Add(ds)
+		syncAndValidateDaemonSets(t, manager, ds, podControl, 0, 0, 0)
+
+		// With the critical pod annotation feature gate enabled, the critical pod should be created.
+		utilfeature.DefaultFeatureGate.Set("ExperimentalCriticalPodAnnotation=True")
+		utilfeature.DefaultFeatureGate.Set("TaintNodesByCondition=False")
+		manager.dsStore.Add(ds)
+		syncAndValidateDaemonSets(t, manager, ds, podControl, 1, 0, 0)
+
+		// Roll back the feature gates to false.
+		utilfeature.DefaultFeatureGate.Set("TaintNodesByCondition=False")
+		utilfeature.DefaultFeatureGate.Set("ExperimentalCriticalPodAnnotation=False")
+	}
+}
+
+// DaemonSet should launch a pod even when the node has the MemoryPressure/DiskPressure taints.
+func TestTaintPressureNodeDaemonLaunchesPod(t *testing.T) {
+	for _, strategy := range updateStrategies() {
+		ds := newDaemonSet("critical")
+		ds.Spec.UpdateStrategy = *strategy
+		setDaemonSetCritical(ds)
+		manager, podControl, _ := newTestController(ds)
+
+		node := newNode("resources-pressure", nil)
+		node.Status.Conditions = []v1.NodeCondition{
+			{Type: v1.NodeDiskPressure, Status: v1.ConditionTrue},
+			{Type: v1.NodeMemoryPressure, Status: v1.ConditionTrue},
+		}
+		node.Spec.Taints = []v1.Taint{
+			{Key: algorithm.TaintNodeDiskPressure, Effect: v1.TaintEffectNoSchedule},
+			{Key: algorithm.TaintNodeMemoryPressure, Effect: v1.TaintEffectNoSchedule},
+		}
+		manager.nodeStore.Add(node)
+
+		// With TaintNodesByCondition enabled, the pod tolerates the pressure taints and should be created.
+		utilfeature.DefaultFeatureGate.Set("TaintNodesByCondition=True")
+		manager.dsStore.Add(ds)
+		syncAndValidateDaemonSets(t, manager, ds, podControl, 1, 0, 0)
+
+		// Roll back the feature gate to false.
+		utilfeature.DefaultFeatureGate.Set("TaintNodesByCondition=False")
+	}
+}
+
 // DaemonSet should launch a critical pod even when the node has insufficient free resource.
 func TestInsufficientCapacityNodeDaemonLaunchesCriticalPod(t *testing.T) {
 	for _, strategy := range updateStrategies() {
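The tests above toggle gates through utilfeature.DefaultFeatureGate.Set("Name=Bool") and reset them by hand at the end of each loop iteration. A hypothetical helper (not part of this PR) could centralize that pattern; it assumes only the Set call the tests already use, plus fmt and testing from the test file:

    // withFeatureGates is a hypothetical test helper: it applies the given gate
    // settings, runs fn, and then resets every touched gate to false, mirroring
    // the manual "roll back" lines in the tests above.
    func withFeatureGates(t *testing.T, gates map[string]bool, fn func()) {
        for name, enabled := range gates {
            if err := utilfeature.DefaultFeatureGate.Set(fmt.Sprintf("%s=%t", name, enabled)); err != nil {
                t.Fatalf("failed to set feature gate %s: %v", name, err)
            }
        }
        defer func() {
            for name := range gates {
                utilfeature.DefaultFeatureGate.Set(name + "=False")
            }
        }()
        fn()
    }

A call site would then read withFeatureGates(t, map[string]bool{"TaintNodesByCondition": true}, func() { ... }), so a failing assertion cannot leave a gate enabled for later tests.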
diff --git a/pkg/controller/daemon/util/BUILD b/pkg/controller/daemon/util/BUILD
index a14a6b39c072..e74429a2bf31 100644
--- a/pkg/controller/daemon/util/BUILD
+++ b/pkg/controller/daemon/util/BUILD
@@ -15,11 +15,14 @@ go_library(
     deps = [
         "//pkg/api/v1/helper:go_default_library",
         "//pkg/api/v1/pod:go_default_library",
+        "//pkg/features:go_default_library",
+        "//pkg/kubelet/types:go_default_library",
         "//pkg/util/labels:go_default_library",
         "//plugin/pkg/scheduler/algorithm:go_default_library",
         "//vendor/k8s.io/api/core/v1:go_default_library",
         "//vendor/k8s.io/api/extensions/v1beta1:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
+        "//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library",
         "//vendor/k8s.io/client-go/kubernetes/scheme:go_default_library",
     ],
 )
diff --git a/pkg/controller/daemon/util/daemonset_util.go b/pkg/controller/daemon/util/daemonset_util.go
index 8c0bc1aaa320..ebb8f86a18bd 100644
--- a/pkg/controller/daemon/util/daemonset_util.go
+++ b/pkg/controller/daemon/util/daemonset_util.go
@@ -22,9 +22,12 @@ import (
 	"k8s.io/api/core/v1"
 	extensions "k8s.io/api/extensions/v1beta1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	utilfeature "k8s.io/apiserver/pkg/util/feature"
 	"k8s.io/client-go/kubernetes/scheme"
 	v1helper "k8s.io/kubernetes/pkg/api/v1/helper"
 	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
+	"k8s.io/kubernetes/pkg/features"
+	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
 	labelsutil "k8s.io/kubernetes/pkg/util/labels"
 	"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
 )
@@ -55,6 +58,30 @@ func CreatePodTemplate(template v1.PodTemplateSpec, generation int64, hash strin
 		Effect: v1.TaintEffectNoExecute,
 	})
 
+	// According to the TaintNodesByCondition feature, all DaemonSet pods should
+	// tolerate the MemoryPressure and DiskPressure taints, and critical pods
+	// should tolerate the OutOfDisk taint.
+	v1helper.AddOrUpdateTolerationInPodSpec(&newTemplate.Spec, &v1.Toleration{
+		Key:      algorithm.TaintNodeDiskPressure,
+		Operator: v1.TolerationOpExists,
+		Effect:   v1.TaintEffectNoSchedule,
+	})
+
+	v1helper.AddOrUpdateTolerationInPodSpec(&newTemplate.Spec, &v1.Toleration{
+		Key:      algorithm.TaintNodeMemoryPressure,
+		Operator: v1.TolerationOpExists,
+		Effect:   v1.TaintEffectNoSchedule,
+	})
+
+	if utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) &&
+		kubelettypes.IsCritical(newTemplate.Namespace, newTemplate.Annotations) {
+		v1helper.AddOrUpdateTolerationInPodSpec(&newTemplate.Spec, &v1.Toleration{
+			Key:      algorithm.TaintNodeOutOfDisk,
+			Operator: v1.TolerationOpExists,
+			Effect:   v1.TaintEffectNoExecute,
+		})
+	}
+
 	templateGenerationStr := fmt.Sprint(generation)
 	newTemplate.ObjectMeta.Labels = labelsutil.CloneAndAddLabel(
 		template.ObjectMeta.Labels,
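CreatePodTemplate runs on every DaemonSet sync, so the helper it leans on must be idempotent: AddOrUpdateTolerationInPodSpec replaces a matching toleration instead of appending a duplicate. A self-contained sketch of that add-or-update shape follows; the struct and the same-key match rule are assumptions for illustration, not the v1helper implementation, which may compare more fields.

    package main

    import "fmt"

    // toleration is an illustrative stand-in for v1.Toleration.
    type toleration struct{ key, operator, effect string }

    // addOrUpdate appends the toleration unless one with the same key already
    // exists, in which case it is replaced; repeated syncs therefore do not
    // grow the tolerations list.
    func addOrUpdate(tols []toleration, t toleration) []toleration {
        for i := range tols {
            if tols[i].key == t.key {
                tols[i] = t
                return tols
            }
        }
        return append(tols, t)
    }

    func main() {
        var tols []toleration
        for i := 0; i < 3; i++ { // simulate three sync passes over the same template
            tols = addOrUpdate(tols, toleration{"node.kubernetes.io/disk-pressure", "Exists", "NoSchedule"})
        }
        fmt.Println(len(tols)) // 1, not 3
    }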
diff --git a/pkg/kubelet/types/pod_update.go b/pkg/kubelet/types/pod_update.go
index 45a53a0145ef..2b95b3b18793 100644
--- a/pkg/kubelet/types/pod_update.go
+++ b/pkg/kubelet/types/pod_update.go
@@ -141,11 +141,17 @@ func (sp SyncPodType) String() string {
 // key. Both the rescheduler and the kubelet use this key to make admission
 // and scheduling decisions.
 func IsCriticalPod(pod *v1.Pod) bool {
+	return IsCritical(pod.Namespace, pod.Annotations)
+}
+
+// IsCritical returns true if the given namespace and annotations bear the critical
+// pod annotation key. The DaemonSetController uses this to make scheduling decisions.
+func IsCritical(ns string, annotations map[string]string) bool {
 	// Critical pods are restricted to "kube-system" namespace as of now.
-	if pod.Namespace != kubeapi.NamespaceSystem {
+	if ns != kubeapi.NamespaceSystem {
 		return false
 	}
-	val, ok := pod.Annotations[CriticalPodAnnotationKey]
+	val, ok := annotations[CriticalPodAnnotationKey]
 	if ok && val == "" {
 		return true
 	}
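The refactor exists because the DaemonSet controller works with a v1.PodTemplateSpec rather than a built *v1.Pod, so IsCriticalPod's pod-shaped signature could not be reused; both callers now share one check. A sketch of the annotation that flips the result, assuming the key string is the historical value of CriticalPodAnnotationKey (shown literally here for illustration):

    tpl := v1.PodTemplateSpec{
        ObjectMeta: metav1.ObjectMeta{
            Namespace: "kube-system", // critical pods are restricted to kube-system
            Annotations: map[string]string{
                // Assumed annotation key; present with an empty value marks the pod critical.
                "scheduler.alpha.kubernetes.io/critical-pod": "",
            },
        },
    }
    // The controller path checks the template directly, no *v1.Pod needed:
    critical := kubelettypes.IsCritical(tpl.Namespace, tpl.Annotations) // true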