
feat: check only controller ref to decide if a pod is replicated #5507

Merged
1 change: 1 addition & 0 deletions cluster-autoscaler/FAQ.md
@@ -785,6 +785,7 @@ The following startup parameters are supported for cluster autoscaler:
| `aws-use-static-instance-list` | Should CA fetch instance types in runtime or use a static list. AWS only | false
| `skip-nodes-with-system-pods` | If true cluster autoscaler will never delete nodes with pods from kube-system (except for DaemonSet or mirror pods) | true
| `skip-nodes-with-local-storage`| If true cluster autoscaler will never delete nodes with pods with local storage, e.g. EmptyDir or HostPath | true
| `skip-nodes-with-custom-controller-pods` | If true cluster autoscaler will never delete nodes with pods owned by custom controllers | true
| `min-replica-count` | Minimum number or replicas that a replica set or replication controller should have to allow their pods deletion in scale down | 0
| `daemonset-eviction-for-empty-nodes` | Whether DaemonSet pods will be gracefully terminated from empty nodes | false
| `daemonset-eviction-for-occupied-nodes` | Whether DaemonSet pods will be gracefully terminated from non-empty nodes | true
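To make the new flag's semantics concrete, here is a minimal, self-contained sketch (illustrative only, not code from this PR; the `isReplicated` helper and the Rollout owner are made up) of the relaxed check that applies when `skip-nodes-with-custom-controller-pods` is set to `false`: a pod counts as replicated as soon as it has a controller OwnerReference, regardless of the controller's Kind.

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// isReplicated sketches the controller-ref-only check: any controller
// OwnerReference makes the pod count as replicated, so pods managed by
// custom controllers no longer block node deletion on their own.
func isReplicated(pod *corev1.Pod) bool {
	return metav1.GetControllerOf(pod) != nil
}

func main() {
	isController := true
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "demo",
			Namespace: "default",
			OwnerReferences: []metav1.OwnerReference{{
				APIVersion: "argoproj.io/v1alpha1", // a custom (non built-in) controller
				Kind:       "Rollout",
				Name:       "demo-rollout",
				Controller: &isController,
			}},
		},
	}
	fmt.Println(isReplicated(pod)) // true
}
```

With the flag left at its default of `true`, the autoscaler keeps the older per-Kind checks (ReplicationController, DaemonSet, Job, ReplicaSet, StatefulSet) shown further down in this diff.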
2 changes: 2 additions & 0 deletions cluster-autoscaler/config/autoscaling_options.go
@@ -240,6 +240,8 @@ type AutoscalingOptions struct {
SkipNodesWithSystemPods bool
// SkipNodesWithLocalStorage tells if nodes with pods with local storage, e.g. EmptyDir or HostPath, should be deleted
SkipNodesWithLocalStorage bool
// SkipNodesWithCustomControllerPods tells if nodes with custom-controller owned pods should be skipped from deletion (skip if 'true')
SkipNodesWithCustomControllerPods bool
// MinReplicaCount controls the minimum number of replicas that a replica set or replication controller should have
// to allow their pods deletion in scale down
MinReplicaCount int
7 changes: 4 additions & 3 deletions cluster-autoscaler/core/static_autoscaler.go
@@ -169,9 +169,10 @@ func NewStaticAutoscaler(
processors.ScaleDownCandidatesNotifier.Register(clusterStateRegistry)

deleteOptions := simulator.NodeDeleteOptions{
SkipNodesWithSystemPods: opts.SkipNodesWithSystemPods,
SkipNodesWithLocalStorage: opts.SkipNodesWithLocalStorage,
MinReplicaCount: opts.MinReplicaCount,
SkipNodesWithSystemPods: opts.SkipNodesWithSystemPods,
SkipNodesWithLocalStorage: opts.SkipNodesWithLocalStorage,
MinReplicaCount: opts.MinReplicaCount,
SkipNodesWithCustomControllerPods: opts.SkipNodesWithCustomControllerPods,
}

// TODO: Populate the ScaleDownActuator/Planner fields in AutoscalingContext
2 changes: 2 additions & 0 deletions cluster-autoscaler/main.go
@@ -216,6 +216,7 @@ var (
maxNodeGroupBinpackingDuration = flag.Duration("max-nodegroup-binpacking-duration", 10*time.Second, "Maximum time that will be spent in binpacking simulation for each NodeGroup.")
skipNodesWithSystemPods = flag.Bool("skip-nodes-with-system-pods", true, "If true cluster autoscaler will never delete nodes with pods from kube-system (except for DaemonSet or mirror pods)")
skipNodesWithLocalStorage = flag.Bool("skip-nodes-with-local-storage", true, "If true cluster autoscaler will never delete nodes with pods with local storage, e.g. EmptyDir or HostPath")
skipNodesWithCustomControllerPods = flag.Bool("skip-nodes-with-custom-controller-pods", true, "If true cluster autoscaler will never delete nodes with pods owned by custom controllers")
minReplicaCount = flag.Int("min-replica-count", 0, "Minimum number or replicas that a replica set or replication controller should have to allow their pods deletion in scale down")
nodeDeleteDelayAfterTaint = flag.Duration("node-delete-delay-after-taint", 5*time.Second, "How long to wait before deleting a node after tainting it")
scaleDownSimulationTimeout = flag.Duration("scale-down-simulation-timeout", 30*time.Second, "How long should we run scale down simulation.")
@@ -328,6 +329,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
NodeDeleteDelayAfterTaint: *nodeDeleteDelayAfterTaint,
ScaleDownSimulationTimeout: *scaleDownSimulationTimeout,
ParallelDrain: *parallelDrain,
SkipNodesWithCustomControllerPods: *skipNodesWithCustomControllerPods,
NodeGroupSetRatios: config.NodeGroupDifferenceRatios{
MaxCapacityMemoryDifferenceRatio: *maxCapacityMemoryDifferenceRatio,
MaxAllocatableDifferenceRatio: *maxAllocatableDifferenceRatio,
3 changes: 3 additions & 0 deletions cluster-autoscaler/simulator/drain.go
@@ -35,6 +35,8 @@ type NodeDeleteOptions struct {
SkipNodesWithSystemPods bool
// SkipNodesWithLocalStorage tells if nodes with pods with local storage, e.g. EmptyDir or HostPath, should be deleted
SkipNodesWithLocalStorage bool
// SkipNodesWithCustomControllerPods tells if nodes with custom-controller owned pods should be skipped from deletion (skip if 'true')
SkipNodesWithCustomControllerPods bool
// MinReplicaCount controls the minimum number of replicas that a replica set or replication controller should have
// to allow their pods deletion in scale down
MinReplicaCount int
@@ -57,6 +59,7 @@ func GetPodsToMove(nodeInfo *schedulerframework.NodeInfo, deleteOptions NodeDele
pdbs,
deleteOptions.SkipNodesWithSystemPods,
deleteOptions.SkipNodesWithLocalStorage,
deleteOptions.SkipNodesWithCustomControllerPods,
listers,
int32(deleteOptions.MinReplicaCount),
timestamp)
7 changes: 4 additions & 3 deletions cluster-autoscaler/simulator/drain_test.go
@@ -42,9 +42,10 @@ func TestGetPodsToMove(t *testing.T) {
},
}
deleteOptions := NodeDeleteOptions{
SkipNodesWithSystemPods: true,
SkipNodesWithLocalStorage: true,
MinReplicaCount: 0,
SkipNodesWithSystemPods: true,
SkipNodesWithLocalStorage: true,
MinReplicaCount: 0,
SkipNodesWithCustomControllerPods: true,
Member Author commented:

I have set `SkipNodesWithCustomControllerPods: true` to preserve the current behavior of these tests.

}
_, _, blockingPod, err := GetPodsToMove(schedulerframework.NewNodeInfo(pod1), deleteOptions, nil, nil, testTime)
assert.Error(t, err)
197 changes: 110 additions & 87 deletions cluster-autoscaler/utils/drain/drain.go
@@ -78,13 +78,13 @@ func GetPodsForDeletionOnNodeDrain(
pdbs []*policyv1.PodDisruptionBudget,
skipNodesWithSystemPods bool,
skipNodesWithLocalStorage bool,
+ skipNodesWithCustomControllerPods bool,
listers kube_util.ListerRegistry,
minReplica int32,
currentTime time.Time) (pods []*apiv1.Pod, daemonSetPods []*apiv1.Pod, blockingPod *BlockingPod, err error) {

pods = []*apiv1.Pod{}
daemonSetPods = []*apiv1.Pod{}
- checkReferences := listers != nil
// filter kube-system PDBs to avoid doing it for every kube-system pod
kubeSystemPDBs := make([]*policyv1.PodDisruptionBudget, 0)
for _, pdb := range pdbs {
@@ -93,6 +93,7 @@
}
}

+ isDaemonSetPod := false
for _, pod := range podList {
if pod_util.IsMirrorPod(pod) {
continue
@@ -106,101 +107,25 @@
continue
}

- isDaemonSetPod := false
replicated := false
safeToEvict := hasSafeToEvictAnnotation(pod)
terminal := isPodTerminal(pod)

- controllerRef := ControllerRef(pod)
- refKind := ""
- if controllerRef != nil {
- refKind = controllerRef.Kind
- }
-
- // For now, owner controller must be in the same namespace as the pod
- // so OwnerReference doesn't have its own Namespace field
- controllerNamespace := pod.Namespace
-
- if refKind == "ReplicationController" {
- if checkReferences {
- rc, err := listers.ReplicationControllerLister().ReplicationControllers(controllerNamespace).Get(controllerRef.Name)
- // Assume a reason for an error is because the RC is either
- // gone/missing or that the rc has too few replicas configured.
- // TODO: replace the minReplica check with pod disruption budget.
- if err == nil && rc != nil {
- if rc.Spec.Replicas != nil && *rc.Spec.Replicas < minReplica {
- return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: MinReplicasReached}, fmt.Errorf("replication controller for %s/%s has too few replicas spec: %d min: %d",
- pod.Namespace, pod.Name, rc.Spec.Replicas, minReplica)
- }
- replicated = true
- } else {
- return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("replication controller for %s/%s is not available, err: %v", pod.Namespace, pod.Name, err)
- }
- } else {
- replicated = true
+ if skipNodesWithCustomControllerPods {
+ // TODO(vadasambar): remove this when we get rid of skipNodesWithCustomControllerPods
+ replicated, isDaemonSetPod, blockingPod, err = legacyCheckForReplicatedPods(listers, pod, minReplica)
+ if err != nil {
+ return []*apiv1.Pod{}, []*apiv1.Pod{}, blockingPod, err
}
- } else if pod_util.IsDaemonSetPod(pod) {
- isDaemonSetPod = true
- // don't have listener for other DaemonSet kind
- // TODO: we should use a generic client for checking the reference.
- if checkReferences && refKind == "DaemonSet" {
- _, err := listers.DaemonSetLister().DaemonSets(controllerNamespace).Get(controllerRef.Name)
- if apierrors.IsNotFound(err) {
- return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("daemonset for %s/%s is not present, err: %v", pod.Namespace, pod.Name, err)
- } else if err != nil {
- return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: UnexpectedError}, fmt.Errorf("error when trying to get daemonset for %s/%s , err: %v", pod.Namespace, pod.Name, err)
- }
- }
- } else if refKind == "Job" {
- if checkReferences {
- job, err := listers.JobLister().Jobs(controllerNamespace).Get(controllerRef.Name)
-
- // Assume the only reason for an error is because the Job is
- // gone/missing, not for any other cause. TODO(mml): something more
- // sophisticated than this
- if err == nil && job != nil {
- replicated = true
- } else {
- return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("job for %s/%s is not available: err: %v", pod.Namespace, pod.Name, err)
- }
- } else {
+ } else {
+ if ControllerRef(pod) != nil {
replicated = true
}
- } else if refKind == "ReplicaSet" {
- if checkReferences {
- rs, err := listers.ReplicaSetLister().ReplicaSets(controllerNamespace).Get(controllerRef.Name)
-
- // Assume the only reason for an error is because the RS is
- // gone/missing, not for any other cause. TODO(mml): something more
- // sophisticated than this
- if err == nil && rs != nil {
- if rs.Spec.Replicas != nil && *rs.Spec.Replicas < minReplica {
- return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: MinReplicasReached}, fmt.Errorf("replication controller for %s/%s has too few replicas spec: %d min: %d",
- pod.Namespace, pod.Name, rs.Spec.Replicas, minReplica)
- }
- replicated = true
- } else {
- return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("replication controller for %s/%s is not available, err: %v", pod.Namespace, pod.Name, err)
- }
- } else {
- replicated = true
- }
- } else if refKind == "StatefulSet" {
- if checkReferences {
- ss, err := listers.StatefulSetLister().StatefulSets(controllerNamespace).Get(controllerRef.Name)
-
- // Assume the only reason for an error is because the StatefulSet is
- // gone/missing, not for any other cause. TODO(mml): something more
- // sophisticated than this
- if err == nil && ss != nil {
- replicated = true
- } else {
- return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("statefulset for %s/%s is not available: err: %v", pod.Namespace, pod.Name, err)
- }
- } else {
- replicated = true
+ if pod_util.IsDaemonSetPod(pod) {
+ isDaemonSetPod = true
}
}

if isDaemonSetPod {
daemonSetPods = append(daemonSetPods, pod)
continue
@@ -231,6 +156,104 @@
return pods, daemonSetPods, nil, nil
}

func legacyCheckForReplicatedPods(listers kube_util.ListerRegistry, pod *apiv1.Pod, minReplica int32) (replicated bool, isDaemonSetPod bool, blockingPod *BlockingPod, err error) {
Member Author commented:

The name `legacyCheckForReplicatedPods` doesn't account for checking for DaemonSet pods or blocking pods, but `legacyCheckForReplicatedAndDaemonSetAndBlockingPods` feels too long, so I have kept it as is for now.

@vadasambar (Member, Author) commented on Mar 15, 2023:

Note that the return values `replicated bool, isDaemonSetPod bool, blockingPod *BlockingPod, err error` are named. I did this because I think it improves readability when you hover over the function call in an IDE (see the sketch below).

This might not be the best approach, since it can confuse someone looking at lines 162 and 165 below (the variables are already defined because they are named return values). Open to other ideas here.
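As an aside, a tiny, self-contained sketch (the `divide` function is hypothetical and unrelated to this PR) of the trade-off described above: named return values make an IDE hover show `(quotient, remainder int)` instead of a bare `(int, int)`, but the names are also pre-declared variables inside the function body, which is the potential source of confusion.

```go
package main

import "fmt"

// divide uses named return values purely for call-site readability:
// hovering over a call to divide shows "(quotient, remainder int)".
// Inside the body, quotient and remainder already exist (zero-valued),
// so they are assigned rather than declared with :=.
func divide(a, b int) (quotient, remainder int) {
	quotient = a / b
	remainder = a % b
	return quotient, remainder
}

func main() {
	q, r := divide(7, 3)
	fmt.Println(q, r) // prints: 2 1
}
```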

replicated = false
refKind := ""
checkReferences := listers != nil
isDaemonSetPod = false

controllerRef := ControllerRef(pod)
@vadasambar (Member, Author) commented on Mar 15, 2023:

This is the same code as our current code here, abstracted into a function, with some minor changes:

  1. Changed the return values to include `replicated` and `isDaemonSetPod`, and removed `[]*apiv1.Pod` since we don't need to return it from this function; returning the pod list is done in the else part here.
  2. Don't append to the `daemonSetPods` slice here. Instead, just return `isDaemonSetPod` and let the calling function do the appending, because I thought it was cleaner and we need the appending even if we remove this function in the future (see the sketch after this list).
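A rough, self-contained sketch of the calling pattern described in point 2 (the names and the "ds-" prefix rule are made up; this is not the autoscaler's code): the classification helper only reports a flag, and the caller owns the `daemonSetPods` slice and does the appending.

```go
package main

import (
	"fmt"
	"strings"
)

// isDaemonSetPod is a stand-in for the real ownership check; here a pod
// counts as a DaemonSet pod simply when its name starts with "ds-".
func isDaemonSetPod(name string) bool {
	return strings.HasPrefix(name, "ds-")
}

// splitPods keeps the appending in the caller: the helper returns a flag,
// and this function maintains the daemonSetPods slice, mirroring how the
// calling function appends after the helper returns.
func splitPods(names []string) (daemonSetPods, others []string) {
	for _, n := range names {
		if isDaemonSetPod(n) {
			daemonSetPods = append(daemonSetPods, n)
			continue
		}
		others = append(others, n)
	}
	return daemonSetPods, others
}

func main() {
	ds, rest := splitPods([]string{"ds-node-exporter", "web-1", "ds-fluentd"})
	fmt.Println(ds)   // [ds-node-exporter ds-fluentd]
	fmt.Println(rest) // [web-1]
}
```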

if controllerRef != nil {
refKind = controllerRef.Kind
}

// For now, owner controller must be in the same namespace as the pod
// so OwnerReference doesn't have its own Namespace field
controllerNamespace := pod.Namespace
if refKind == "ReplicationController" {
if checkReferences {
rc, err := listers.ReplicationControllerLister().ReplicationControllers(controllerNamespace).Get(controllerRef.Name)
// Assume a reason for an error is because the RC is either
// gone/missing or that the rc has too few replicas configured.
// TODO: replace the minReplica check with pod disruption budget.
if err == nil && rc != nil {
if rc.Spec.Replicas != nil && *rc.Spec.Replicas < minReplica {
return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: MinReplicasReached}, fmt.Errorf("replication controller for %s/%s has too few replicas spec: %d min: %d",
pod.Namespace, pod.Name, rc.Spec.Replicas, minReplica)
}
replicated = true
} else {
return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("replication controller for %s/%s is not available, err: %v", pod.Namespace, pod.Name, err)
}
} else {
replicated = true
}
} else if pod_util.IsDaemonSetPod(pod) {
isDaemonSetPod = true
// don't have listener for other DaemonSet kind
// TODO: we should use a generic client for checking the reference.
if checkReferences && refKind == "DaemonSet" {
_, err := listers.DaemonSetLister().DaemonSets(controllerNamespace).Get(controllerRef.Name)
if apierrors.IsNotFound(err) {
return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("daemonset for %s/%s is not present, err: %v", pod.Namespace, pod.Name, err)
} else if err != nil {
return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: UnexpectedError}, fmt.Errorf("error when trying to get daemonset for %s/%s , err: %v", pod.Namespace, pod.Name, err)
}
}
} else if refKind == "Job" {
if checkReferences {
job, err := listers.JobLister().Jobs(controllerNamespace).Get(controllerRef.Name)

// Assume the only reason for an error is because the Job is
// gone/missing, not for any other cause. TODO(mml): something more
// sophisticated than this
if err == nil && job != nil {
replicated = true
} else {
return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("job for %s/%s is not available: err: %v", pod.Namespace, pod.Name, err)
}
} else {
replicated = true
}
} else if refKind == "ReplicaSet" {
if checkReferences {
rs, err := listers.ReplicaSetLister().ReplicaSets(controllerNamespace).Get(controllerRef.Name)

// Assume the only reason for an error is because the RS is
// gone/missing, not for any other cause. TODO(mml): something more
// sophisticated than this
if err == nil && rs != nil {
if rs.Spec.Replicas != nil && *rs.Spec.Replicas < minReplica {
return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: MinReplicasReached}, fmt.Errorf("replication controller for %s/%s has too few replicas spec: %d min: %d",
pod.Namespace, pod.Name, rs.Spec.Replicas, minReplica)
}
replicated = true
} else {
return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("replication controller for %s/%s is not available, err: %v", pod.Namespace, pod.Name, err)
}
} else {
replicated = true
}
} else if refKind == "StatefulSet" {
if checkReferences {
ss, err := listers.StatefulSetLister().StatefulSets(controllerNamespace).Get(controllerRef.Name)

// Assume the only reason for an error is because the StatefulSet is
// gone/missing, not for any other cause. TODO(mml): something more
// sophisticated than this
if err == nil && ss != nil {
replicated = true
} else {
return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("statefulset for %s/%s is not available: err: %v", pod.Namespace, pod.Name, err)
}
} else {
replicated = true
}
}

return replicated, isDaemonSetPod, &BlockingPod{}, nil
}

// ControllerRef returns the OwnerReference to pod's controller.
func ControllerRef(pod *apiv1.Pod) *metav1.OwnerReference {
return metav1.GetControllerOf(pod)