-
Notifications
You must be signed in to change notification settings - Fork 38.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix ReplicaSet deletion rank for multiple colocation nodes #102949
Conversation
@FillZpp: Adding the "do-not-merge/release-note-label-needed" label because no release-note block was detected, please follow our release note process to remove it. Instructions for interacting with me using PR comments are available here. If you have questions or suggestions related to my behavior, please file an issue against the kubernetes/test-infra repository. |
@FillZpp: This issue is currently awaiting triage. If a SIG or subproject determines this is a relevant issue, they will accept it by applying the `triage/accepted` label. Instructions for interacting with me using PR comments are available here. If you have questions or suggestions related to my behavior, please file an issue against the kubernetes/test-infra repository. |
Hi @FillZpp. Thanks for your PR. I'm waiting for a kubernetes member to verify that this patch is reasonable to test. If it is, they should reply with `/ok-to-test`. Once the patch is verified, the new status will be reflected by the `ok-to-test` label. I understand the commands that are listed here. Instructions for interacting with me using PR comments are available here. If you have questions or suggestions related to my behavior, please file an issue against the kubernetes/test-infra repository. |
/ok-to-test lgtm Hope folks at sig-apps can approve this. https://kubernetes.io/docs/concepts/workloads/controllers/replicaset/#scaling-a-replicaset |
@@ -819,7 +819,9 @@ func getPodsRankedByRelatedPodsOnSameNode(podsToRank, relatedPods []*v1.Pod) con | |||
} | |||
ranks := make([]int, len(podsToRank)) | |||
for i, pod := range podsToRank { | |||
ranks[i] = podsOnNode[pod.Spec.NodeName] | |||
if ranks[i] = podsOnNode[pod.Spec.NodeName]; ranks[i] > 0 { | |||
podsOnNode[pod.Spec.NodeName]-- |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is removing the usefulness of other criteria, like the time the pods have been running.
Can you think of an alternative to not lose that?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
cc @damemi who has been looking into this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, it would probably be better to try to tie this into the sort
function that is already called on these podsWithRanks, as done in getPodsToDelete()
. See
kubernetes/pkg/controller/controller_utils.go
Lines 820 to 894 in d8aad79
// Less reports whether the pod at index i sorts before the pod at index j.
// Per the surrounding discussion, pods that sort earlier are the ones chosen
// first for deletion when the ReplicaSet controller scales down (this sort is
// invoked from getPodsToDelete). The numbered criteria below are applied in
// strict precedence order: a later criterion is consulted only when every
// earlier one compares the two pods as equal. When all criteria tie, Less
// returns false, leaving the relative order of the two pods unchanged.
// NOTE(review): the trailing "| |" markers are scrape artifacts of the web
// page this code was quoted on, not part of the original Go source.
func (s ActivePodsWithRanks) Less(i, j int) bool { | |
// 1. Unassigned < assigned | |
// If only one of the pods is unassigned, the unassigned one is smaller | |
if s.Pods[i].Spec.NodeName != s.Pods[j].Spec.NodeName && (len(s.Pods[i].Spec.NodeName) == 0 || len(s.Pods[j].Spec.NodeName) == 0) { | |
return len(s.Pods[i].Spec.NodeName) == 0 | |
} | |
// 2. PodPending < PodUnknown < PodRunning | |
if podPhaseToOrdinal[s.Pods[i].Status.Phase] != podPhaseToOrdinal[s.Pods[j].Status.Phase] { | |
return podPhaseToOrdinal[s.Pods[i].Status.Phase] < podPhaseToOrdinal[s.Pods[j].Status.Phase] | |
} | |
// 3. Not ready < ready | |
// If only one of the pods is not ready, the not ready one is smaller | |
if podutil.IsPodReady(s.Pods[i]) != podutil.IsPodReady(s.Pods[j]) { | |
return !podutil.IsPodReady(s.Pods[i]) | |
} | |
// 4. higher pod-deletion-cost < lower pod-deletion cost | |
if utilfeature.DefaultFeatureGate.Enabled(features.PodDeletionCost) { | |
pi, _ := helper.GetDeletionCostFromPodAnnotations(s.Pods[i].Annotations) | |
pj, _ := helper.GetDeletionCostFromPodAnnotations(s.Pods[j].Annotations) | |
if pi != pj { | |
return pi < pj | |
} | |
} | |
// 5. Doubled up < not doubled up | |
// If one of the two pods is on the same node as one or more additional | |
// ready pods that belong to the same replicaset, whichever pod has more | |
// colocated ready pods is less | |
if s.Rank[i] != s.Rank[j] { | |
return s.Rank[i] > s.Rank[j] | |
} | |
// TODO: take availability into account when we push minReadySeconds information from deployment into pods, | |
// see https://github.com/kubernetes/kubernetes/issues/22065 | |
// 6. Been ready for empty time < less time < more time | |
// If both pods are ready, the latest ready one is smaller | |
if podutil.IsPodReady(s.Pods[i]) && podutil.IsPodReady(s.Pods[j]) { | |
readyTime1 := podReadyTime(s.Pods[i]) | |
readyTime2 := podReadyTime(s.Pods[j]) | |
if !readyTime1.Equal(readyTime2) { | |
if !utilfeature.DefaultFeatureGate.Enabled(features.LogarithmicScaleDown) { | |
return afterOrZero(readyTime1, readyTime2) | |
} else { | |
// Fall back to the plain comparison when any timestamp is zero; | |
// logarithmicRankDiff cannot bucket a zero time meaningfully. | |
if s.Now.IsZero() || readyTime1.IsZero() || readyTime2.IsZero() { | |
return afterOrZero(readyTime1, readyTime2) | |
} | |
rankDiff := logarithmicRankDiff(*readyTime1, *readyTime2, s.Now) | |
// Equal logarithmic buckets: break the tie deterministically by UID. | |
if rankDiff == 0 { | |
return s.Pods[i].UID < s.Pods[j].UID | |
} | |
return rankDiff < 0 | |
} | |
} | |
} | |
// 7. Pods with containers with higher restart counts < lower restart counts | |
if maxContainerRestarts(s.Pods[i]) != maxContainerRestarts(s.Pods[j]) { | |
return maxContainerRestarts(s.Pods[i]) > maxContainerRestarts(s.Pods[j]) | |
} | |
// 8. Empty creation time pods < newer pods < older pods | |
if !s.Pods[i].CreationTimestamp.Equal(&s.Pods[j].CreationTimestamp) { | |
if !utilfeature.DefaultFeatureGate.Enabled(features.LogarithmicScaleDown) { | |
return afterOrZero(&s.Pods[i].CreationTimestamp, &s.Pods[j].CreationTimestamp) | |
} else { | |
// Same zero-time fallback as criterion 6, applied to creation time. | |
if s.Now.IsZero() || s.Pods[i].CreationTimestamp.IsZero() || s.Pods[j].CreationTimestamp.IsZero() { | |
return afterOrZero(&s.Pods[i].CreationTimestamp, &s.Pods[j].CreationTimestamp) | |
} | |
rankDiff := logarithmicRankDiff(s.Pods[i].CreationTimestamp, s.Pods[j].CreationTimestamp, s.Now) | |
if rankDiff == 0 { | |
return s.Pods[i].UID < s.Pods[j].UID | |
} | |
return rankDiff < 0 | |
} | |
} | |
// All criteria tied: report "not less" so the pods keep their order. | |
return false | |
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One alternative I can think of is to do the sorting without ranks first, and then run the loop to add the ranks and finally do a second stable sort.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Or... we can simply remove the ranks. Now that we have randomized removals https://github.com/kubernetes/enhancements/tree/master/keps/sig-apps/2185-random-pod-select-on-replicaset-downscale, the nodes with higher number of pods are more likely to get pods removed :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think these two pieces of logic don't conflict. With this PR, pods with the same rank can still be sorted randomly.
For example, there are 5 nodes and each of them has 2 pods on it. So, the rank of the 5 pods on the 5 different nodes will be 1 and the rank of the other 5 pods will be 0. The random algorithm will then sort the former 5 pods randomly and the latter 5 pods randomly.
@alculquicondor @damemi What do you think?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But there is no comparison between a-0 and a-1. You are directly assigning a preference without checking the timestamp.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You are directly assigning a preference without checking the timestamp.
Yes, that might require sorting the pods on the same node by timestamp and then giving them the right ranks?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Correct. I think it's only possible through two consecutive sort operations, as I proposed above. However, I think if we get rid of the rank entirely, we get the same result, probabilistically.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
WDYT?
309b1ca
to
2c0d5a1
Compare
@alculquicondor Hey, sorry for this late reply, have gone through a busy time.. 😢 I just updated the code so that pods on the same node will have the right ranks. Would you please take a look again? |
2c0d5a1
to
62b321e
Compare
/retest |
} | ||
} | ||
for _, podsOnNode := range podsOnNodes { | ||
// sort pods on the same node with active and timestamp | ||
sort.Sort(controller.ActivePods(podsOnNode)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This function doesn't sort with logarithmic comparisons
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In my opinion, there is no need to sort pods on the same node with logarithmic comparisons, which are designed to solve the sort problem across different topologies (such as multiple failure domains).
The pods on different nodes that have the same rank will be sorted with logarithmic comparison later in ActivePodsWithRanks.
You might be able to build some sort of unit test that plays the scenario you want to solve and compare your PR with what I'm proposing (removing the ranks) |
62b321e
to
c8b326c
Compare
/remove-lifecycle stale |
/assign @soltysh /retest |
1d46db6
to
b5bc898
Compare
b5bc898
to
b8064ad
Compare
[APPROVALNOTIFIER] This PR is NOT APPROVED This pull-request has been approved by: FillZpp The full list of commands accepted by this bot can be found here.
Needs approval from an approver in each of these files:
Approvers can indicate their approval by writing |
b8064ad
to
4b8d762
Compare
/retest |
The Kubernetes project currently lacks enough contributors to adequately respond to all issues and PRs. This bot triages issues and PRs according to the following rules:
You can:
Please send feedback to sig-contributor-experience at kubernetes/community. /lifecycle stale |
The Kubernetes project currently lacks enough active contributors to adequately respond to all issues and PRs. This bot triages issues and PRs according to the following rules:
You can:
Please send feedback to sig-contributor-experience at kubernetes/community. /lifecycle rotten |
Signed-off-by: FillZpp <FillZpp.pub@gmail.com>
4b8d762
to
7ce7e19
Compare
/remove-lifecycle rotten |
The Kubernetes project currently lacks enough contributors to adequately respond to all issues and PRs. This bot triages issues and PRs according to the following rules:
You can:
Please send feedback to sig-contributor-experience at kubernetes/community. /lifecycle stale |
The Kubernetes project currently lacks enough active contributors to adequately respond to all issues and PRs. This bot triages issues and PRs according to the following rules:
You can:
Please send feedback to sig-contributor-experience at kubernetes/community. /lifecycle rotten |
The Kubernetes project currently lacks enough active contributors to adequately respond to all issues and PRs. This bot triages PRs according to the following rules:
You can:
Please send feedback to sig-contributor-experience at kubernetes/community. /close |
@k8s-triage-robot: Closed this PR. In response to this:
Instructions for interacting with me using PR comments are available here. If you have questions or suggestions related to my behavior, please file an issue against the kubernetes/test-infra repository. |
Signed-off-by: FillZpp FillZpp.pub@gmail.com
What type of PR is this?
/kind bug
What this PR does / why we need it:
Fix ReplicaSet deletion rank for multiple colocation nodes.
Which issue(s) this PR fixes:
Fixes #102948
Special notes for your reviewer:
Now for a ReplicaSet or Deployment which has five Pods on two Nodes:
When replicas changed from 5 to 2, the deletion rank that the controller gets for the Pods means ReplicaSet will choose all three Pods on node01 to delete. This PR fixes the rank so that the controller will delete Pods more evenly across nodes.
Does this PR introduce a user-facing change?
No user-facing change.
/sig apps