
feat: check only controller ref to decide if a pod is replicated #5507

Merged
1 change: 1 addition & 0 deletions cluster-autoscaler/FAQ.md
@@ -785,6 +785,7 @@ The following startup parameters are supported for cluster autoscaler:
| `aws-use-static-instance-list` | Should CA fetch instance types in runtime or use a static list. AWS only | false
| `skip-nodes-with-system-pods` | If true cluster autoscaler will never delete nodes with pods from kube-system (except for DaemonSet or mirror pods) | true
| `skip-nodes-with-local-storage`| If true cluster autoscaler will never delete nodes with pods with local storage, e.g. EmptyDir or HostPath | true
| `skip-nodes-with-custom-controller-pods` | If true cluster autoscaler will never delete nodes with pods owned by custom controllers | true
| `min-replica-count` | Minimum number or replicas that a replica set or replication controller should have to allow their pods deletion in scale down | 0
| `daemonset-eviction-for-empty-nodes` | Whether DaemonSet pods will be gracefully terminated from empty nodes | false
| `daemonset-eviction-for-occupied-nodes` | Whether DaemonSet pods will be gracefully terminated from non-empty nodes | true
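To make the new flag's semantics concrete, here is a minimal, self-contained sketch (illustrative only, not code from this PR; the `isReplicated` helper and the Rollout owner are made up) of the relaxed check that applies when `skip-nodes-with-custom-controller-pods` is set to `false`: a pod counts as replicated as soon as it has a controller OwnerReference, regardless of the controller's Kind.

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// isReplicated sketches the controller-ref-only check: any controller
// OwnerReference makes the pod count as replicated, so pods managed by
// custom controllers no longer block node deletion on their own.
func isReplicated(pod *corev1.Pod) bool {
	return metav1.GetControllerOf(pod) != nil
}

func main() {
	isController := true
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "demo",
			Namespace: "default",
			OwnerReferences: []metav1.OwnerReference{{
				APIVersion: "argoproj.io/v1alpha1", // a custom (non built-in) controller
				Kind:       "Rollout",
				Name:       "demo-rollout",
				Controller: &isController,
			}},
		},
	}
	fmt.Println(isReplicated(pod)) // true
}
```

With the flag left at its default of `true`, the autoscaler keeps the older per-Kind checks (ReplicationController, DaemonSet, Job, ReplicaSet, StatefulSet) shown further down in this diff.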
2 changes: 2 additions & 0 deletions cluster-autoscaler/config/autoscaling_options.go
@@ -240,6 +240,8 @@ type AutoscalingOptions struct {
SkipNodesWithSystemPods bool
// SkipNodesWithLocalStorage tells if nodes with pods with local storage, e.g. EmptyDir or HostPath, should be deleted
SkipNodesWithLocalStorage bool
// SkipNodesWithCustomControllerPods tells if nodes with custom-controller owned pods should be skipped from deletion (skip if 'true')
SkipNodesWithCustomControllerPods bool
// MinReplicaCount controls the minimum number of replicas that a replica set or replication controller should have
// to allow their pods deletion in scale down
MinReplicaCount int
7 changes: 4 additions & 3 deletions cluster-autoscaler/core/static_autoscaler.go
@@ -169,9 +169,10 @@ func NewStaticAutoscaler(
processors.ScaleDownCandidatesNotifier.Register(clusterStateRegistry)

deleteOptions := simulator.NodeDeleteOptions{
SkipNodesWithSystemPods: opts.SkipNodesWithSystemPods,
SkipNodesWithLocalStorage: opts.SkipNodesWithLocalStorage,
MinReplicaCount: opts.MinReplicaCount,
SkipNodesWithSystemPods: opts.SkipNodesWithSystemPods,
SkipNodesWithLocalStorage: opts.SkipNodesWithLocalStorage,
MinReplicaCount: opts.MinReplicaCount,
SkipNodesWithCustomControllerPods: opts.SkipNodesWithCustomControllerPods,
}

// TODO: Populate the ScaleDownActuator/Planner fields in AutoscalingContext
2 changes: 2 additions & 0 deletions cluster-autoscaler/main.go
@@ -216,6 +216,7 @@ var (
maxNodeGroupBinpackingDuration = flag.Duration("max-nodegroup-binpacking-duration", 10*time.Second, "Maximum time that will be spent in binpacking simulation for each NodeGroup.")
skipNodesWithSystemPods = flag.Bool("skip-nodes-with-system-pods", true, "If true cluster autoscaler will never delete nodes with pods from kube-system (except for DaemonSet or mirror pods)")
skipNodesWithLocalStorage = flag.Bool("skip-nodes-with-local-storage", true, "If true cluster autoscaler will never delete nodes with pods with local storage, e.g. EmptyDir or HostPath")
skipNodesWithCustomControllerPods = flag.Bool("skip-nodes-with-custom-controller-pods", true, "If true cluster autoscaler will never delete nodes with pods owned by custom controllers")
minReplicaCount = flag.Int("min-replica-count", 0, "Minimum number or replicas that a replica set or replication controller should have to allow their pods deletion in scale down")
nodeDeleteDelayAfterTaint = flag.Duration("node-delete-delay-after-taint", 5*time.Second, "How long to wait before deleting a node after tainting it")
scaleDownSimulationTimeout = flag.Duration("scale-down-simulation-timeout", 30*time.Second, "How long should we run scale down simulation.")
@@ -328,6 +329,7 @@ func createAutoscalingOptions() config.AutoscalingOptions {
NodeDeleteDelayAfterTaint: *nodeDeleteDelayAfterTaint,
ScaleDownSimulationTimeout: *scaleDownSimulationTimeout,
ParallelDrain: *parallelDrain,
SkipNodesWithCustomControllerPods: *skipNodesWithCustomControllerPods,
NodeGroupSetRatios: config.NodeGroupDifferenceRatios{
MaxCapacityMemoryDifferenceRatio: *maxCapacityMemoryDifferenceRatio,
MaxAllocatableDifferenceRatio: *maxAllocatableDifferenceRatio,
3 changes: 3 additions & 0 deletions cluster-autoscaler/simulator/drain.go
@@ -35,6 +35,8 @@ type NodeDeleteOptions struct {
SkipNodesWithSystemPods bool
// SkipNodesWithLocalStorage tells if nodes with pods with local storage, e.g. EmptyDir or HostPath, should be deleted
SkipNodesWithLocalStorage bool
// SkipNodesWithCustomControllerPods tells if nodes with custom-controller owned pods should be skipped from deletion (skip if 'true')
SkipNodesWithCustomControllerPods bool
// MinReplicaCount controls the minimum number of replicas that a replica set or replication controller should have
// to allow their pods deletion in scale down
MinReplicaCount int
@@ -57,6 +59,7 @@ func GetPodsToMove(nodeInfo *schedulerframework.NodeInfo, deleteOptions NodeDele
pdbs,
deleteOptions.SkipNodesWithSystemPods,
deleteOptions.SkipNodesWithLocalStorage,
deleteOptions.SkipNodesWithCustomControllerPods,
listers,
int32(deleteOptions.MinReplicaCount),
timestamp)
7 changes: 4 additions & 3 deletions cluster-autoscaler/simulator/drain_test.go
@@ -42,9 +42,10 @@ func TestGetPodsToMove(t *testing.T) {
},
}
deleteOptions := NodeDeleteOptions{
SkipNodesWithSystemPods: true,
SkipNodesWithLocalStorage: true,
MinReplicaCount: 0,
SkipNodesWithSystemPods: true,
SkipNodesWithLocalStorage: true,
MinReplicaCount: 0,
SkipNodesWithCustomControllerPods: true,
Member Author commented:

I have set `SkipNodesWithCustomControllerPods: true` to preserve the current behavior of these tests.

}
_, _, blockingPod, err := GetPodsToMove(schedulerframework.NewNodeInfo(pod1), deleteOptions, nil, nil, testTime)
assert.Error(t, err)
197 changes: 110 additions & 87 deletions cluster-autoscaler/utils/drain/drain.go
@@ -78,13 +78,13 @@ func GetPodsForDeletionOnNodeDrain(
pdbs []*policyv1.PodDisruptionBudget,
skipNodesWithSystemPods bool,
skipNodesWithLocalStorage bool,
+ skipNodesWithCustomControllerPods bool,
listers kube_util.ListerRegistry,
minReplica int32,
currentTime time.Time) (pods []*apiv1.Pod, daemonSetPods []*apiv1.Pod, blockingPod *BlockingPod, err error) {

pods = []*apiv1.Pod{}
daemonSetPods = []*apiv1.Pod{}
- checkReferences := listers != nil
// filter kube-system PDBs to avoid doing it for every kube-system pod
kubeSystemPDBs := make([]*policyv1.PodDisruptionBudget, 0)
for _, pdb := range pdbs {
@@ -93,6 +93,7 @@
}
}

+ isDaemonSetPod := false
for _, pod := range podList {
if pod_util.IsMirrorPod(pod) {
continue
@@ -106,101 +107,25 @@
continue
}

- isDaemonSetPod := false
replicated := false
safeToEvict := hasSafeToEvictAnnotation(pod)
terminal := isPodTerminal(pod)

- controllerRef := ControllerRef(pod)
- refKind := ""
- if controllerRef != nil {
- refKind = controllerRef.Kind
- }
-
- // For now, owner controller must be in the same namespace as the pod
- // so OwnerReference doesn't have its own Namespace field
- controllerNamespace := pod.Namespace
-
- if refKind == "ReplicationController" {
- if checkReferences {
- rc, err := listers.ReplicationControllerLister().ReplicationControllers(controllerNamespace).Get(controllerRef.Name)
- // Assume a reason for an error is because the RC is either
- // gone/missing or that the rc has too few replicas configured.
- // TODO: replace the minReplica check with pod disruption budget.
- if err == nil && rc != nil {
- if rc.Spec.Replicas != nil && *rc.Spec.Replicas < minReplica {
- return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: MinReplicasReached}, fmt.Errorf("replication controller for %s/%s has too few replicas spec: %d min: %d",
- pod.Namespace, pod.Name, rc.Spec.Replicas, minReplica)
- }
- replicated = true
- } else {
- return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("replication controller for %s/%s is not available, err: %v", pod.Namespace, pod.Name, err)
- }
- } else {
- replicated = true
+ if skipNodesWithCustomControllerPods {
+ // TODO(vadasambar): remove this when we get rid of skipNodesWithCustomControllerPods
+ replicated, isDaemonSetPod, blockingPod, err = legacyCheckForReplicatedPods(listers, pod, minReplica)
+ if err != nil {
+ return []*apiv1.Pod{}, []*apiv1.Pod{}, blockingPod, err
}
- } else if pod_util.IsDaemonSetPod(pod) {
- isDaemonSetPod = true
- // don't have listener for other DaemonSet kind
- // TODO: we should use a generic client for checking the reference.
- if checkReferences && refKind == "DaemonSet" {
- _, err := listers.DaemonSetLister().DaemonSets(controllerNamespace).Get(controllerRef.Name)
- if apierrors.IsNotFound(err) {
- return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("daemonset for %s/%s is not present, err: %v", pod.Namespace, pod.Name, err)
- } else if err != nil {
- return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: UnexpectedError}, fmt.Errorf("error when trying to get daemonset for %s/%s , err: %v", pod.Namespace, pod.Name, err)
- }
- }
- } else if refKind == "Job" {
- if checkReferences {
- job, err := listers.JobLister().Jobs(controllerNamespace).Get(controllerRef.Name)
-
- // Assume the only reason for an error is because the Job is
- // gone/missing, not for any other cause. TODO(mml): something more
- // sophisticated than this
- if err == nil && job != nil {
- replicated = true
- } else {
- return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("job for %s/%s is not available: err: %v", pod.Namespace, pod.Name, err)
- }
- } else {
+ } else {
+ if ControllerRef(pod) != nil {
replicated = true
}
- } else if refKind == "ReplicaSet" {
- if checkReferences {
- rs, err := listers.ReplicaSetLister().ReplicaSets(controllerNamespace).Get(controllerRef.Name)
-
- // Assume the only reason for an error is because the RS is
- // gone/missing, not for any other cause. TODO(mml): something more
- // sophisticated than this
- if err == nil && rs != nil {
- if rs.Spec.Replicas != nil && *rs.Spec.Replicas < minReplica {
- return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: MinReplicasReached}, fmt.Errorf("replication controller for %s/%s has too few replicas spec: %d min: %d",
- pod.Namespace, pod.Name, rs.Spec.Replicas, minReplica)
- }
- replicated = true
- } else {
- return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("replication controller for %s/%s is not available, err: %v", pod.Namespace, pod.Name, err)
- }
- } else {
- replicated = true
- }
- } else if refKind == "StatefulSet" {
- if checkReferences {
- ss, err := listers.StatefulSetLister().StatefulSets(controllerNamespace).Get(controllerRef.Name)
-
- // Assume the only reason for an error is because the StatefulSet is
- // gone/missing, not for any other cause. TODO(mml): something more
- // sophisticated than this
- if err == nil && ss != nil {
- replicated = true
- } else {
- return []*apiv1.Pod{}, []*apiv1.Pod{}, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("statefulset for %s/%s is not available: err: %v", pod.Namespace, pod.Name, err)
- }
- } else {
- replicated = true
+ if pod_util.IsDaemonSetPod(pod) {
+ isDaemonSetPod = true
}
}

if isDaemonSetPod {
daemonSetPods = append(daemonSetPods, pod)
continue
@@ -231,6 +156,104 @@
return pods, daemonSetPods, nil, nil
}

func legacyCheckForReplicatedPods(listers kube_util.ListerRegistry, pod *apiv1.Pod, minReplica int32) (replicated bool, isDaemonSetPod bool, blockingPod *BlockingPod, err error) {
Member Author commented:

The name `legacyCheckForReplicatedPods` doesn't account for checking for DaemonSet pods or blocking pods, but `legacyCheckForReplicatedAndDaemonSetAndBlockingPods` feels too long, so I have kept it as is for now.

@vadasambar (Member, Author) commented on Mar 15, 2023:

Note that the return values `replicated bool, isDaemonSetPod bool, blockingPod *BlockingPod, err error` are named. I did this because I think it improves readability when you hover over the function call in an IDE (see the sketch below).

This might not be the best approach, since it can confuse someone looking at lines 162 and 165 below (the variables are already defined because they are named return values). Open to other ideas here.
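As an aside, a tiny, self-contained sketch (the `divide` function is hypothetical and unrelated to this PR) of the trade-off described above: named return values make an IDE hover show `(quotient, remainder int)` instead of a bare `(int, int)`, but the names are also pre-declared variables inside the function body, which is the potential source of confusion.

```go
package main

import "fmt"

// divide uses named return values purely for call-site readability:
// hovering over a call to divide shows "(quotient, remainder int)".
// Inside the body, quotient and remainder already exist (zero-valued),
// so they are assigned rather than declared with :=.
func divide(a, b int) (quotient, remainder int) {
	quotient = a / b
	remainder = a % b
	return quotient, remainder
}

func main() {
	q, r := divide(7, 3)
	fmt.Println(q, r) // prints: 2 1
}
```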

replicated = false
refKind := ""
checkReferences := listers != nil
isDaemonSetPod = false

controllerRef := ControllerRef(pod)
@vadasambar (Member, Author) commented on Mar 15, 2023:

This is the same code as our current code here, abstracted into a function, with some minor changes:

  1. Changed the return values to include `replicated` and `isDaemonSetPod`, and removed `[]*apiv1.Pod` since we don't need to return it from this function; returning the pod list is done in the else part here.
  2. Don't append to the `daemonSetPods` slice here. Instead, just return `isDaemonSetPod` and let the calling function do the appending, because I thought it was cleaner and we need the appending even if we remove this function in the future (see the sketch after this list).
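A rough, self-contained sketch of the calling pattern described in point 2 (the names and the "ds-" prefix rule are made up; this is not the autoscaler's code): the classification helper only reports a flag, and the caller owns the `daemonSetPods` slice and does the appending.

```go
package main

import (
	"fmt"
	"strings"
)

// isDaemonSetPod is a stand-in for the real ownership check; here a pod
// counts as a DaemonSet pod simply when its name starts with "ds-".
func isDaemonSetPod(name string) bool {
	return strings.HasPrefix(name, "ds-")
}

// splitPods keeps the appending in the caller: the helper returns a flag,
// and this function maintains the daemonSetPods slice, mirroring how the
// calling function appends after the helper returns.
func splitPods(names []string) (daemonSetPods, others []string) {
	for _, n := range names {
		if isDaemonSetPod(n) {
			daemonSetPods = append(daemonSetPods, n)
			continue
		}
		others = append(others, n)
	}
	return daemonSetPods, others
}

func main() {
	ds, rest := splitPods([]string{"ds-node-exporter", "web-1", "ds-fluentd"})
	fmt.Println(ds)   // [ds-node-exporter ds-fluentd]
	fmt.Println(rest) // [web-1]
}
```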

if controllerRef != nil {
refKind = controllerRef.Kind
}

// For now, owner controller must be in the same namespace as the pod
// so OwnerReference doesn't have its own Namespace field
controllerNamespace := pod.Namespace
if refKind == "ReplicationController" {
if checkReferences {
rc, err := listers.ReplicationControllerLister().ReplicationControllers(controllerNamespace).Get(controllerRef.Name)
// Assume a reason for an error is because the RC is either
// gone/missing or that the rc has too few replicas configured.
// TODO: replace the minReplica check with pod disruption budget.
if err == nil && rc != nil {
if rc.Spec.Replicas != nil && *rc.Spec.Replicas < minReplica {
return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: MinReplicasReached}, fmt.Errorf("replication controller for %s/%s has too few replicas spec: %d min: %d",
pod.Namespace, pod.Name, rc.Spec.Replicas, minReplica)
}
replicated = true
} else {
return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("replication controller for %s/%s is not available, err: %v", pod.Namespace, pod.Name, err)
}
} else {
replicated = true
}
} else if pod_util.IsDaemonSetPod(pod) {
isDaemonSetPod = true
// don't have listener for other DaemonSet kind
// TODO: we should use a generic client for checking the reference.
if checkReferences && refKind == "DaemonSet" {
_, err := listers.DaemonSetLister().DaemonSets(controllerNamespace).Get(controllerRef.Name)
if apierrors.IsNotFound(err) {
return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("daemonset for %s/%s is not present, err: %v", pod.Namespace, pod.Name, err)
} else if err != nil {
return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: UnexpectedError}, fmt.Errorf("error when trying to get daemonset for %s/%s , err: %v", pod.Namespace, pod.Name, err)
}
}
} else if refKind == "Job" {
if checkReferences {
job, err := listers.JobLister().Jobs(controllerNamespace).Get(controllerRef.Name)

// Assume the only reason for an error is because the Job is
// gone/missing, not for any other cause. TODO(mml): something more
// sophisticated than this
if err == nil && job != nil {
replicated = true
} else {
return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("job for %s/%s is not available: err: %v", pod.Namespace, pod.Name, err)
}
} else {
replicated = true
}
} else if refKind == "ReplicaSet" {
if checkReferences {
rs, err := listers.ReplicaSetLister().ReplicaSets(controllerNamespace).Get(controllerRef.Name)

// Assume the only reason for an error is because the RS is
// gone/missing, not for any other cause. TODO(mml): something more
// sophisticated than this
if err == nil && rs != nil {
if rs.Spec.Replicas != nil && *rs.Spec.Replicas < minReplica {
return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: MinReplicasReached}, fmt.Errorf("replication controller for %s/%s has too few replicas spec: %d min: %d",
pod.Namespace, pod.Name, rs.Spec.Replicas, minReplica)
}
replicated = true
} else {
return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("replication controller for %s/%s is not available, err: %v", pod.Namespace, pod.Name, err)
}
} else {
replicated = true
}
} else if refKind == "StatefulSet" {
if checkReferences {
ss, err := listers.StatefulSetLister().StatefulSets(controllerNamespace).Get(controllerRef.Name)

// Assume the only reason for an error is because the StatefulSet is
// gone/missing, not for any other cause. TODO(mml): something more
// sophisticated than this
if err == nil && ss != nil {
replicated = true
} else {
return replicated, isDaemonSetPod, &BlockingPod{Pod: pod, Reason: ControllerNotFound}, fmt.Errorf("statefulset for %s/%s is not available: err: %v", pod.Namespace, pod.Name, err)
}
} else {
replicated = true
}
}

return replicated, isDaemonSetPod, &BlockingPod{}, nil
}

// ControllerRef returns the OwnerReference to pod's controller.
func ControllerRef(pod *apiv1.Pod) *metav1.OwnerReference {
return metav1.GetControllerOf(pod)