Ignore failed pods so they don't block DaemonSet rolling updates #78170

Merged: 1 commit, Jul 24, 2019
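Summary (reconstructed from the diff): podsShouldBeOnNode previously counted failed daemon pods into failedPodsObserved, and manage returned an error whenever that count was non-zero, so the workqueue rate limiter would throttle the kill-recreate loop. Because that error failed the whole sync, a single crash-looping pod on one node could block the rolling update of the entire DaemonSet. This change removes the failedPodsObserved plumbing: failed pods are still deleted under the existing per-node backoff (failedPodsBackoffKey), but they no longer abort the sync.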
16 changes: 3 additions & 13 deletions pkg/controller/daemon/daemon_controller.go
@@ -862,13 +862,12 @@ func (dsc *DaemonSetsController) resolveControllerRef(namespace string, controll
 // podsShouldBeOnNode figures out the DaemonSet pods to be created and deleted on the given node:
 // - nodesNeedingDaemonPods: the pods need to start on the node
 // - podsToDelete: the Pods need to be deleted on the node
-// - failedPodsObserved: the number of failed pods on node
 // - err: unexpected error
 func (dsc *DaemonSetsController) podsShouldBeOnNode(
 	node *v1.Node,
 	nodeToDaemonPods map[string][]*v1.Pod,
 	ds *apps.DaemonSet,
-) (nodesNeedingDaemonPods, podsToDelete []string, failedPodsObserved int, err error) {
+) (nodesNeedingDaemonPods, podsToDelete []string, err error) {

 	wantToRun, shouldSchedule, shouldContinueRunning, err := dsc.nodeShouldRunDaemonPod(node, ds)
 	if err != nil {
@@ -896,8 +895,6 @@ func (dsc *DaemonSetsController) podsShouldBeOnNode(
 			continue
 		}
 		if pod.Status.Phase == v1.PodFailed {
-			failedPodsObserved++
-
 			// This is a critical place where DS is often fighting with kubelet that rejects pods.
 			// We need to avoid hot looping and backoff.
 			backoffKey := failedPodsBackoffKey(ds, node.Name)
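For context, the backoff the comment above refers to follows the per-key pattern from k8s.io/client-go/util/flowcontrol. Below is a minimal, self-contained sketch of that pattern; the 1s/15m durations and the key format are assumptions for illustration, not values taken from this diff.

```go
package main

import (
	"fmt"
	"time"

	"k8s.io/client-go/util/flowcontrol"
)

func main() {
	// One backoff entry per key; here the key identifies a DaemonSet/node
	// pair, mirroring failedPodsBackoffKey in the diff (format is assumed).
	backoff := flowcontrol.NewBackOff(1*time.Second, 15*time.Minute)
	backoffKey := "default/my-daemonset/node-1"

	for i := 0; i < 3; i++ {
		now := backoff.Clock.Now()
		if backoff.IsInBackOffSinceUpdate(backoffKey, now) {
			// Still inside the window: skip the delete and retry later,
			// instead of hot-looping against a kubelet that rejects pods.
			fmt.Printf("in backoff, retry after %v\n", backoff.Get(backoffKey))
			continue
		}
		// Window expired: record this attempt (doubling the next delay)
		// and proceed to delete the failed pod so it can be recreated.
		backoff.Next(backoffKey, now)
		fmt.Println("deleting failed pod")
	}
}
```

The throttling lives next to the failed pod itself, one key per DaemonSet/node pair, which is why the global failedPodsObserved error path removed below becomes redundant.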
@@ -938,7 +935,7 @@ func (dsc *DaemonSetsController) podsShouldBeOnNode(
 		}
 	}

-	return nodesNeedingDaemonPods, podsToDelete, failedPodsObserved, nil
+	return nodesNeedingDaemonPods, podsToDelete, nil
 }

 // manage manages the scheduling and running of Pods of ds on nodes.
@@ -955,9 +952,8 @@ func (dsc *DaemonSetsController) manage(ds *apps.DaemonSet, nodeList []*v1.Node,
 	// For each node, if the node is running the daemon pod but isn't supposed to, kill the daemon
 	// pod. If the node is supposed to run the daemon pod, but isn't, create the daemon pod on the node.
 	var nodesNeedingDaemonPods, podsToDelete []string
-	var failedPodsObserved int
 	for _, node := range nodeList {
-		nodesNeedingDaemonPodsOnNode, podsToDeleteOnNode, failedPodsObservedOnNode, err := dsc.podsShouldBeOnNode(
+		nodesNeedingDaemonPodsOnNode, podsToDeleteOnNode, err := dsc.podsShouldBeOnNode(
 			node, nodeToDaemonPods, ds)

 		if err != nil {
@@ -966,7 +962,6 @@

 		nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, nodesNeedingDaemonPodsOnNode...)
 		podsToDelete = append(podsToDelete, podsToDeleteOnNode...)
-		failedPodsObserved += failedPodsObservedOnNode
 	}

 	// Remove unscheduled pods assigned to not existing nodes when daemonset pods are scheduled by scheduler.
@@ -980,11 +975,6 @@
 		return err
 	}

-	// Throw an error when the daemon pods fail, to use ratelimiter to prevent kill-recreate hot loop
-	if failedPodsObserved > 0 {
-		return fmt.Errorf("deleted %d failed pods of DaemonSet %s/%s", failedPodsObserved, ds.Namespace, ds.Name)
-	}
-
 	return nil
 }
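The deleted block above leaned on a generic controller convention: a non-nil error from the sync handler makes the worker requeue the key through the workqueue's rate limiter. A minimal sketch of that convention, assuming hypothetical names processNextWorkItem and syncHandler:

```go
package worker

import (
	"k8s.io/client-go/util/workqueue"
)

// processNextWorkItem is the generic worker-loop shape (illustrative, not the
// controller's actual code). syncHandler stands in for syncDaemonSet.
func processNextWorkItem(queue workqueue.RateLimitingInterface, syncHandler func(key string) error) bool {
	key, quit := queue.Get()
	if quit {
		return false
	}
	defer queue.Done(key)

	if err := syncHandler(key.(string)); err != nil {
		// The removed "return fmt.Errorf(...)" surfaced here: the key is
		// requeued with an increasing, rate-limited delay, throttling the
		// kill-recreate loop at the cost of failing the entire sync.
		queue.AddRateLimited(key)
		return true
	}
	// Success: reset the rate limiter's failure count for this key.
	queue.Forget(key)
	return true
}
```

With the failedPodsObserved error gone, throttling of failed pods moves entirely to the per-node failedPodsBackoff, so one crash-looping pod no longer drags the whole DaemonSet sync down this error path.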
