From 166147d5c464a125da6813ee32bcd4017d404f7d Mon Sep 17 00:00:00 2001
From: daihao
Date: Tue, 21 May 2019 17:38:53 +0800
Subject: [PATCH] ignore failed pods so DaemonSet rolling updates do not get stuck

---
 pkg/controller/daemon/daemon_controller.go | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/pkg/controller/daemon/daemon_controller.go b/pkg/controller/daemon/daemon_controller.go
index 11fc71eb5989..6fef233b4efd 100644
--- a/pkg/controller/daemon/daemon_controller.go
+++ b/pkg/controller/daemon/daemon_controller.go
@@ -862,13 +862,12 @@ func (dsc *DaemonSetsController) resolveControllerRef(namespace string, controll
 // podsShouldBeOnNode figures out the DaemonSet pods to be created and deleted on the given node:
 //   - nodesNeedingDaemonPods: the pods need to start on the node
 //   - podsToDelete: the Pods need to be deleted on the node
-//   - failedPodsObserved: the number of failed pods on node
 //   - err: unexpected error
 func (dsc *DaemonSetsController) podsShouldBeOnNode(
 	node *v1.Node,
 	nodeToDaemonPods map[string][]*v1.Pod,
 	ds *apps.DaemonSet,
-) (nodesNeedingDaemonPods, podsToDelete []string, failedPodsObserved int, err error) {
+) (nodesNeedingDaemonPods, podsToDelete []string, err error) {
 
 	wantToRun, shouldSchedule, shouldContinueRunning, err := dsc.nodeShouldRunDaemonPod(node, ds)
 	if err != nil {
@@ -896,8 +895,6 @@ func (dsc *DaemonSetsController) podsShouldBeOnNode(
 			continue
 		}
 		if pod.Status.Phase == v1.PodFailed {
-			failedPodsObserved++
-
 			// This is a critical place where DS is often fighting with kubelet that rejects pods.
 			// We need to avoid hot looping and backoff.
 			backoffKey := failedPodsBackoffKey(ds, node.Name)
@@ -938,7 +935,7 @@ func (dsc *DaemonSetsController) podsShouldBeOnNode(
 		}
 	}
 
-	return nodesNeedingDaemonPods, podsToDelete, failedPodsObserved, nil
+	return nodesNeedingDaemonPods, podsToDelete, nil
 }
 
 // manage manages the scheduling and running of Pods of ds on nodes.
@@ -955,9 +952,8 @@ func (dsc *DaemonSetsController) manage(ds *apps.DaemonSet, nodeList []*v1.Node,
 	// For each node, if the node is running the daemon pod but isn't supposed to, kill the daemon
 	// pod. If the node is supposed to run the daemon pod, but isn't, create the daemon pod on the node.
 	var nodesNeedingDaemonPods, podsToDelete []string
-	var failedPodsObserved int
 	for _, node := range nodeList {
-		nodesNeedingDaemonPodsOnNode, podsToDeleteOnNode, failedPodsObservedOnNode, err := dsc.podsShouldBeOnNode(
+		nodesNeedingDaemonPodsOnNode, podsToDeleteOnNode, err := dsc.podsShouldBeOnNode(
 			node, nodeToDaemonPods, ds)
 		if err != nil {
 			continue
@@ -966,7 +962,6 @@ func (dsc *DaemonSetsController) manage(ds *apps.DaemonSet, nodeList []*v1.Node,
 
 		nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, nodesNeedingDaemonPodsOnNode...)
 		podsToDelete = append(podsToDelete, podsToDeleteOnNode...)
-		failedPodsObserved += failedPodsObservedOnNode
 	}
 
 	// Remove unscheduled pods assigned to not existing nodes when daemonset pods are scheduled by scheduler.
@@ -980,11 +975,6 @@ func (dsc *DaemonSetsController) manage(ds *apps.DaemonSet, nodeList []*v1.Node,
 		return err
 	}
 
-	// Throw an error when the daemon pods fail, to use ratelimiter to prevent kill-recreate hot loop
-	if failedPodsObserved > 0 {
-		return fmt.Errorf("deleted %d failed pods of DaemonSet %s/%s", failedPodsObserved, ds.Namespace, ds.Name)
-	}
-
 	return nil
 }
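
Context for the change, with a minimal sketch. Before this patch, manage() returned an error whenever podsShouldBeOnNode observed failed pods, deliberately pushing the DaemonSet key back through the controller's rate limiter; the side effect was that a single permanently failing pod on one node made every sync fail, so a rolling update could never complete. After the patch, the per-node backoff already taken in podsShouldBeOnNode (via failedPodsBackoffKey) remains the guard against the kill-recreate hot loop. The standalone Go sketch below illustrates the old failure mode; syncDaemonSet and the fixed sleep are hypothetical stand-ins for the controller's syncHandler and its workqueue backoff, not the real controller code:

    package main

    import (
    	"fmt"
    	"time"
    )

    // syncDaemonSet mimics the old manage() behavior: any failed pod observed
    // on any node turns the whole sync into an error, even though the rolling
    // update could still make progress on the remaining nodes.
    func syncDaemonSet(failedPodsObserved int) error {
    	if failedPodsObserved > 0 {
    		return fmt.Errorf("deleted %d failed pods", failedPodsObserved)
    	}
    	// With this patch, failed pods no longer surface here as an error;
    	// the per-node backoff (failedPodsBackoffKey) handles hot loops instead.
    	return nil
    }

    func main() {
    	// One permanently failing pod makes every sync fail, so the workqueue
    	// keeps requeueing the DaemonSet key with backoff and the rollout never
    	// completes. The sleep is a stand-in for workqueue rate limiting.
    	for attempt := 1; attempt <= 3; attempt++ {
    		if err := syncDaemonSet(1); err != nil {
    			backoff := time.Duration(attempt) * 10 * time.Millisecond
    			fmt.Printf("sync attempt %d failed: %v; requeueing after %v\n", attempt, err, backoff)
    			time.Sleep(backoff)
    		}
    	}
    }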