From 79c2426069b239110bf056fac6981f3315a57fdf Mon Sep 17 00:00:00 2001
From: David Sauer
Date: Sat, 28 Nov 2020 22:52:08 +0100
Subject: [PATCH] taint node with PreferNoSchedule to avoid double draining of pods

---
 cmd/kured/main.go | 70 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/cmd/kured/main.go b/cmd/kured/main.go
index a9312884a..f3e15bf7b 100644
--- a/cmd/kured/main.go
+++ b/cmd/kured/main.go
@@ -39,6 +39,7 @@ var (
 	prometheusURL  string
 	alertFilter    *regexp.Regexp
 	rebootSentinel string
+	preferNoScheduleTaint string
 	slackHookURL   string
 	slackUsername  string
 	slackChannel   string
@@ -85,6 +86,8 @@ func main() {
 		"alert names to ignore when checking for active alerts")
 	rootCmd.PersistentFlags().StringVar(&rebootSentinel, "reboot-sentinel", "/var/run/reboot-required",
 		"path to file whose existence signals need to reboot")
+	rootCmd.PersistentFlags().StringVar(&preferNoScheduleTaint, "prefer-no-schedule-taint", "weave.works/kured-node-taint",
+		"taint to avoid double drained pods")
 
 	rootCmd.PersistentFlags().StringVar(&slackHookURL, "slack-hook-url", "",
 		"slack hook URL for reboot notfications")
@@ -279,6 +282,67 @@ func uncordon(client *kubernetes.Clientset, node *v1.Node) {
 	}
 }
 
+// preferNoSchedule makes the presence of kured's PreferNoSchedule taint on
+// node nodeID match rebootDesired: it adds the taint when a reboot is wanted
+// and removes it otherwise.
+//
+// The taint key comes from --prefer-no-schedule-taint; an empty key disables
+// the feature. Failing to read or update the node is fatal.
+func preferNoSchedule(client *kubernetes.Clientset, nodeID string, rebootDesired bool) {
+	if preferNoScheduleTaint == "" {
+		return
+	}
+
+	updatedNode, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
+	if err != nil {
+		log.Fatalf("Error reading node %s: %v", nodeID, err)
+	}
+	if updatedNode == nil {
+		log.Fatalf("Error reading node %s: no node returned", nodeID)
+	}
+
+	// Collect every taint that is not ours. Matching on key and effect, not
+	// key alone, leaves a same-keyed taint with a different effect untouched.
+	taintExists := false
+	taints := make([]v1.Taint, 0, len(updatedNode.Spec.Taints)+1)
+	for _, taint := range updatedNode.Spec.Taints {
+		if taint.Key == preferNoScheduleTaint && taint.Effect == v1.TaintEffectPreferNoSchedule {
+			taintExists = true
+			continue
+		}
+		taints = append(taints, taint)
+	}
+
+	// Nothing to do when the node is already in the desired state.
+	switch {
+	case taintExists && rebootDesired:
+		log.Debugf("Taint %v exists already for node %v.", preferNoScheduleTaint, nodeID)
+		return
+	case !taintExists && !rebootDesired:
+		log.Debugf("Taint %v already missing for node %v.", preferNoScheduleTaint, nodeID)
+		return
+	}
+
+	if rebootDesired {
+		taints = append(taints, v1.Taint{
+			Key:    preferNoScheduleTaint,
+			Value:  "until-reboot",
+			Effect: v1.TaintEffectPreferNoSchedule,
+		})
+	}
+	updatedNode.Spec.Taints = taints
+
+	// NOTE(review): Update writes back the whole object read above, so a
+	// concurrent change to the node can presumably make it fail with a conflict,
+	// which is fatal here; a Patch or retry-on-conflict would be more forgiving.
+	_, err = client.CoreV1().Nodes().Update(context.TODO(), updatedNode, metav1.UpdateOptions{})
+	if err != nil {
+		log.Fatalf("Error updating taint for node %s: %v", nodeID, err)
+	}
+
+	log.Infof("Successfully updated taint for node %v", nodeID)
+}
+
 func commandReboot(nodeID string) {
 	log.Infof("Commanding reboot for node: %s", nodeID)
 
@@ -336,6 +400,10 @@ func rebootAsRequired(nodeID string, window *timewindow.TimeWindow, TTL time.Dur
 		release(lock)
 	}
 
+	if !rebootRequired() {
+		preferNoSchedule(client, nodeID, false)
+	}
+
 	source := rand.NewSource(time.Now().UnixNano())
 	tick := delaytick.New(source, period)
 	for range tick {
@@ -357,6 +425,8 @@ func rebootAsRequired(nodeID string, window *timewindow.TimeWindow, TTL time.Dur
 			}
 			nodeMeta.Unschedulable = node.Spec.Unschedulable
 
+			preferNoSchedule(client, nodeID, true)
+
 			if !acquire(lock, &nodeMeta, TTL) {
 				continue
 			}