Skip to content

Commit

Permalink
taint node with PreferNoSchedule to avoid double draining of pods
Browse files Browse the repository at this point in the history
  • Loading branch information
damoon committed Nov 29, 2020
1 parent ad227f2 commit 523dbba
Showing 1 changed file with 107 additions and 0 deletions.
107 changes: 107 additions & 0 deletions cmd/kured/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package main

import (
"context"
"encoding/json"
"fmt"
"math/rand"
"net/http"
Expand All @@ -14,6 +15,7 @@ import (
"github.com/spf13/cobra"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
types "k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
kubectldrain "k8s.io/kubectl/pkg/drain"
Expand All @@ -39,6 +41,7 @@ var (
prometheusURL string
alertFilter *regexp.Regexp
rebootSentinel string
preferNoScheduleTaint string
slackHookURL string
slackUsername string
slackChannel string
Expand Down Expand Up @@ -85,6 +88,8 @@ func main() {
"alert names to ignore when checking for active alerts")
rootCmd.PersistentFlags().StringVar(&rebootSentinel, "reboot-sentinel", "/var/run/reboot-required",
"path to file whose existence signals need to reboot")
rootCmd.PersistentFlags().StringVar(&preferNoScheduleTaint, "prefer-no-schedule-taint", "weave.works/kured-node-reboot",
"taint to avoid double drained pods")

rootCmd.PersistentFlags().StringVar(&slackHookURL, "slack-hook-url", "",
"slack hook URL for reboot notfications")
Expand Down Expand Up @@ -279,6 +284,101 @@ func uncordon(client *kubernetes.Clientset, node *v1.Node) {
}
}

// preferNoSchedule reconciles the PreferNoSchedule taint named by the
// package-level preferNoScheduleTaint on the node nodeID.
//
// When rebootDesired is true the taint is added if it is missing; when it is
// false the taint is removed if it is present. If the node is already in the
// requested state the function only emits a debug log and returns.
//
// The change is sent as a JSON Patch. Any failure to read the node, encode
// the patch, or apply it terminates the process via log.Fatalf.
func preferNoSchedule(client *kubernetes.Clientset, nodeID string, rebootDesired bool) {
	updatedNode, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
	if err != nil || updatedNode == nil {
		log.Fatalf("Error reading node %s: %v", nodeID, err)
	}

	// Locate our taint. A taint is identified by key AND effect, so a taint
	// that merely shares our key but has another effect is not ours and must
	// not be selected for removal.
	taintExists := false
	offset := 0
	for i, existing := range updatedNode.Spec.Taints {
		if existing.Key == preferNoScheduleTaint && existing.Effect == v1.TaintEffectPreferNoSchedule {
			taintExists = true
			offset = i
			break
		}
	}

	if taintExists && rebootDesired {
		log.Debugf("Taint %v exists already for node %v.", preferNoScheduleTaint, nodeID)
		return
	}

	if !taintExists && !rebootDesired {
		log.Debugf("Taint %v already missing for node %v.", preferNoScheduleTaint, nodeID)
		return
	}

	// patchTaints is a single JSON Patch operation.
	type patchTaints struct {
		Op    string      `json:"op"`
		Path  string      `json:"path"`
		Value interface{} `json:"value,omitempty"`
	}

	taint := v1.Taint{
		Key:    preferNoScheduleTaint,
		Effect: v1.TaintEffectPreferNoSchedule,
	}

	// From here on exactly one of two cases holds: the taint is missing and
	// must be added, or it is present and must be removed.
	var patches []patchTaints

	switch {
	case len(updatedNode.Spec.Taints) == 0:
		// Add the first taint. The "test" on /spec guards against taints
		// having been added concurrently, which the following "add" of an
		// empty list would otherwise wipe out.
		patches = []patchTaints{
			{
				Op:    "test",
				Path:  "/spec",
				Value: updatedNode.Spec,
			},
			{
				Op:    "add",
				Path:  "/spec/taints",
				Value: []v1.Taint{},
			},
			{
				Op:    "add",
				Path:  "/spec/taints/-",
				Value: taint,
			},
		}
	case taintExists:
		// Remove the taint. The "test" guards against the list having been
		// reordered since we read it. It compares against the taint exactly
		// as observed (including any value/timeAdded it carries), so extra
		// fields on the stored taint do not make the test fail spuriously.
		taintPath := fmt.Sprintf("/spec/taints/%d", offset)
		patches = []patchTaints{
			{
				Op:    "test",
				Path:  taintPath,
				Value: updatedNode.Spec.Taints[offset],
			},
			{
				Op:   "remove",
				Path: taintPath,
			},
		}
	default:
		// Append the missing taint to the existing list.
		patches = []patchTaints{
			{
				Op:    "add",
				Path:  "/spec/taints/-",
				Value: taint,
			},
		}
	}

	patchBytes, err := json.Marshal(patches)
	if err != nil {
		log.Fatalf("Error encoding taint patcht for node %s: %v", nodeID, err)
	}

	_, err = client.CoreV1().Nodes().Patch(context.TODO(), nodeID, types.JSONPatchType, patchBytes, metav1.PatchOptions{})
	if err != nil {
		log.Fatalf("Error patching taint for node %s: %v", nodeID, err)
	}

	// Report what actually happened; this path is reached for removals too.
	if taintExists {
		log.Info("Removed taint from node")
	} else {
		log.Info("Tainted node")
	}
}

func commandReboot(nodeID string) {
log.Infof("Commanding reboot for node: %s", nodeID)

Expand Down Expand Up @@ -336,6 +436,10 @@ func rebootAsRequired(nodeID string, window *timewindow.TimeWindow, TTL time.Dur
release(lock)
}

if !rebootRequired() {
preferNoSchedule(client, nodeID, false)
}

source := rand.NewSource(time.Now().UnixNano())
tick := delaytick.New(source, period)
for range tick {
Expand All @@ -347,6 +451,8 @@ func rebootAsRequired(nodeID string, window *timewindow.TimeWindow, TTL time.Dur
continue
}

preferNoSchedule(client, nodeID, true)

if rebootBlocked(client, nodeID) {
continue
}
Expand Down Expand Up @@ -392,6 +498,7 @@ func root(cmd *cobra.Command, args []string) {
} else {
log.Info("Lock TTL not set, lock will remain until being released")
}
log.Infof("PreferNoSchedule taint: %s", preferNoScheduleTaint)
log.Infof("Reboot Sentinel: %s every %v", rebootSentinel, period)
log.Infof("Blocking Pod Selectors: %v", podSelectors)
log.Infof("Reboot on: %v", window)
Expand Down

0 comments on commit 523dbba

Please sign in to comment.