Skip to content

Commit

Permalink
taint node with PreferNoSchedule to avoid double draining of pods
Browse files Browse the repository at this point in the history
  • Loading branch information
damoon committed Nov 30, 2020
1 parent 29ca5eb commit 0f60dba
Show file tree
Hide file tree
Showing 2 changed files with 169 additions and 14 deletions.
44 changes: 30 additions & 14 deletions cmd/kured/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,27 +24,29 @@ import (
"github.com/weaveworks/kured/pkg/daemonsetlock"
"github.com/weaveworks/kured/pkg/delaytick"
"github.com/weaveworks/kured/pkg/notifications/slack"
"github.com/weaveworks/kured/pkg/taints"
"github.com/weaveworks/kured/pkg/timewindow"
)

var (
version = "unreleased"

// Command line flags
period time.Duration
dsNamespace string
dsName string
lockAnnotation string
lockTTL time.Duration
prometheusURL string
alertFilter *regexp.Regexp
rebootSentinel string
slackHookURL string
slackUsername string
slackChannel string
messageTemplateDrain string
messageTemplateReboot string
podSelectors []string
period time.Duration
dsNamespace string
dsName string
lockAnnotation string
lockTTL time.Duration
prometheusURL string
alertFilter *regexp.Regexp
rebootSentinel string
preferNoScheduleTaintName string
slackHookURL string
slackUsername string
slackChannel string
messageTemplateDrain string
messageTemplateReboot string
podSelectors []string

rebootDays []string
rebootStart string
Expand Down Expand Up @@ -85,6 +87,8 @@ func main() {
"alert names to ignore when checking for active alerts")
rootCmd.PersistentFlags().StringVar(&rebootSentinel, "reboot-sentinel", "/var/run/reboot-required",
"path to file whose existence signals need to reboot")
rootCmd.PersistentFlags().StringVar(&preferNoScheduleTaintName, "prefer-no-schedule-taint", "weave.works/kured-node-reboot",
"taint name used for marking nodes pending reboot (to prevent them from getting a pod when another nodes reboots)")

rootCmd.PersistentFlags().StringVar(&slackHookURL, "slack-hook-url", "",
"slack hook URL for reboot notfications")
Expand Down Expand Up @@ -336,10 +340,19 @@ func rebootAsRequired(nodeID string, window *timewindow.TimeWindow, TTL time.Dur
release(lock)
}

preferNoScheduleTaint := taints.New(client, nodeID, preferNoScheduleTaintName, v1.TaintEffectPreferNoSchedule)

// Remove taint immediately during startup to quickly allow scheduling again.
if !rebootRequired() {
preferNoScheduleTaint.Disable()
}

source := rand.NewSource(time.Now().UnixNano())
tick := delaytick.New(source, period)
for range tick {
if !window.Contains(time.Now()) {
// Remove taint outside the reboot time window to allow for normal operation.
preferNoScheduleTaint.Disable()
continue
}

Expand All @@ -348,6 +361,8 @@ func rebootAsRequired(nodeID string, window *timewindow.TimeWindow, TTL time.Dur
}

if rebootBlocked(client, nodeID) {
// Prefer to not schedule pods onto this node to avoid draing the same pod multiple times.
preferNoScheduleTaint.Enable()
continue
}

Expand Down Expand Up @@ -392,6 +407,7 @@ func root(cmd *cobra.Command, args []string) {
} else {
log.Info("Lock TTL not set, lock will remain until being released")
}
log.Infof("PreferNoSchedule taint: %s", preferNoScheduleTaintName)
log.Infof("Reboot Sentinel: %s every %v", rebootSentinel, period)
log.Infof("Blocking Pod Selectors: %v", podSelectors)
log.Infof("Reboot on: %v", window)
Expand Down
139 changes: 139 additions & 0 deletions pkg/taints/taints.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
package taints

import (
"context"
"encoding/json"
"fmt"

"github.com/prometheus/common/log"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
)

type taint struct {
client *kubernetes.Clientset
nodeID string
taintName string
effect v1.TaintEffect
}

// New provides a new taint. Taints allow to set soft and hard limitations for scheduling and executing pods on nodes.
func New(client *kubernetes.Clientset, nodeID, taintName string, effect v1.TaintEffect) *taint {
return &taint{
client: client,
nodeID: nodeID,
taintName: taintName,
effect: effect,
}
}

// Enable creates the taint for a node. Creating an existing taint is a noop.
func (t *taint) Enable() {
preferNoSchedule(t.client, t.nodeID, t.taintName, t.effect, true)
}

// Disable removes the taint for a node. Removing a missing taint is a noop.
func (t *taint) Disable() {
preferNoSchedule(t.client, t.nodeID, t.taintName, t.effect, false)
}

func preferNoSchedule(client *kubernetes.Clientset, nodeID, taintName string, effect v1.TaintEffect, taintShouldExists bool) {
updatedNode, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
if err != nil || updatedNode == nil {
log.Fatalf("Error reading node %s: %v", nodeID, err)
}

taintExists := false
offset := 0
for i, taint := range updatedNode.Spec.Taints {
if taint.Key == taintName {
taintExists = true
offset = i
break
}
}

if taintExists && taintShouldExists {
log.Debugf("Taint %v exists already for node %v.", taintName, nodeID)
return
}

if !taintExists && !taintShouldExists {
log.Debugf("Taint %v already missing for node %v.", taintName, nodeID)
return
}

type patchTaints struct {
Op string `json:"op"`
Path string `json:"path"`
Value interface{} `json:"value,omitempty"`
}

taint := v1.Taint{
Key: taintName,
Effect: effect,
}

var patches []patchTaints

if len(updatedNode.Spec.Taints) == 0 {
// add first taint and ensure to keep current taints
patches = []patchTaints{
{
Op: "test",
Path: "/spec",
Value: updatedNode.Spec,
},
{
Op: "add",
Path: "/spec/taints",
Value: []v1.Taint{},
},
{
Op: "add",
Path: "/spec/taints/-",
Value: taint,
},
}
} else if taintExists {
// remove taint and ensure to test against race conditions
patches = []patchTaints{
{
Op: "test",
Path: fmt.Sprintf("/spec/taints/%d", offset),
Value: taint,
},
{
Op: "remove",
Path: fmt.Sprintf("/spec/taints/%d", offset),
},
}
} else {
// add missing taint to exsting list
patches = []patchTaints{
{
Op: "add",
Path: "/spec/taints/-",
Value: taint,
},
}
}

patchBytes, err := json.Marshal(patches)
if err != nil {
log.Fatalf("Error encoding taint patcht for node %s: %v", nodeID, err)
}

_, err = client.CoreV1().Nodes().Patch(context.TODO(), nodeID, types.JSONPatchType, patchBytes, metav1.PatchOptions{})
if err != nil {
log.Fatalf("Error patching taint for node %s: %v", nodeID, err)
}

if taintShouldExists {
log.Info("Node taint added")
} else {
log.Info("Node taint removed")
}
}

0 comments on commit 0f60dba

Please sign in to comment.