Skip to content

Commit

Permalink
taint node with PreferNoSchedule to prevent receiving (and double dra…
Browse files Browse the repository at this point in the history
…ining) additional pods from other rebooting nodes
  • Loading branch information
damoon committed Dec 1, 2020
1 parent 29ca5eb commit 6caf951
Show file tree
Hide file tree
Showing 2 changed files with 170 additions and 14 deletions.
44 changes: 30 additions & 14 deletions cmd/kured/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,27 +24,29 @@ import (
"github.com/weaveworks/kured/pkg/daemonsetlock"
"github.com/weaveworks/kured/pkg/delaytick"
"github.com/weaveworks/kured/pkg/notifications/slack"
"github.com/weaveworks/kured/pkg/taints"
"github.com/weaveworks/kured/pkg/timewindow"
)

var (
version = "unreleased"

// Command line flags
period time.Duration
dsNamespace string
dsName string
lockAnnotation string
lockTTL time.Duration
prometheusURL string
alertFilter *regexp.Regexp
rebootSentinel string
slackHookURL string
slackUsername string
slackChannel string
messageTemplateDrain string
messageTemplateReboot string
podSelectors []string
period time.Duration
dsNamespace string
dsName string
lockAnnotation string
lockTTL time.Duration
prometheusURL string
alertFilter *regexp.Regexp
rebootSentinel string
preferNoScheduleTaintName string
slackHookURL string
slackUsername string
slackChannel string
messageTemplateDrain string
messageTemplateReboot string
podSelectors []string

rebootDays []string
rebootStart string
Expand Down Expand Up @@ -85,6 +87,8 @@ func main() {
"alert names to ignore when checking for active alerts")
rootCmd.PersistentFlags().StringVar(&rebootSentinel, "reboot-sentinel", "/var/run/reboot-required",
"path to file whose existence signals need to reboot")
rootCmd.PersistentFlags().StringVar(&preferNoScheduleTaintName, "prefer-no-schedule-taint", "weave.works/kured-node-reboot",
"taint name applied during pending node reboot (to prevent receiving additional pods from other rebooting nodes)")

rootCmd.PersistentFlags().StringVar(&slackHookURL, "slack-hook-url", "",
"slack hook URL for reboot notfications")
Expand Down Expand Up @@ -336,10 +340,19 @@ func rebootAsRequired(nodeID string, window *timewindow.TimeWindow, TTL time.Dur
release(lock)
}

preferNoScheduleTaint := taints.New(client, nodeID, preferNoScheduleTaintName, v1.TaintEffectPreferNoSchedule)

// Remove taint immediately during startup to quickly allow scheduling again.
if !rebootRequired() {
preferNoScheduleTaint.Disable()
}

source := rand.NewSource(time.Now().UnixNano())
tick := delaytick.New(source, period)
for range tick {
if !window.Contains(time.Now()) {
// Remove taint outside the reboot time window to allow for normal operation.
preferNoScheduleTaint.Disable()
continue
}

Expand All @@ -358,6 +371,8 @@ func rebootAsRequired(nodeID string, window *timewindow.TimeWindow, TTL time.Dur
nodeMeta.Unschedulable = node.Spec.Unschedulable

if !acquire(lock, &nodeMeta, TTL) {
// Prefer to not schedule pods onto this node to avoid draing the same pod multiple times.
preferNoScheduleTaint.Enable()
continue
}

Expand Down Expand Up @@ -392,6 +407,7 @@ func root(cmd *cobra.Command, args []string) {
} else {
log.Info("Lock TTL not set, lock will remain until being released")
}
log.Infof("PreferNoSchedule taint: %s", preferNoScheduleTaintName)
log.Infof("Reboot Sentinel: %s every %v", rebootSentinel, period)
log.Infof("Blocking Pod Selectors: %v", podSelectors)
log.Infof("Reboot on: %v", window)
Expand Down
140 changes: 140 additions & 0 deletions pkg/taints/taints.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
package taints

import (
"context"
"encoding/json"
"fmt"

log "github.com/sirupsen/logrus"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
)

// Taint allows to set soft and hard limitations for scheduling and executing pods on nodes.
type Taint struct {
client *kubernetes.Clientset
nodeID string
taintName string
effect v1.TaintEffect
}

// New provides a new taint.
func New(client *kubernetes.Clientset, nodeID, taintName string, effect v1.TaintEffect) *Taint {
return &Taint{
client: client,
nodeID: nodeID,
taintName: taintName,
effect: effect,
}
}

// Enable creates the taint for a node. Creating an existing taint is a noop.
func (t *Taint) Enable() {
preferNoSchedule(t.client, t.nodeID, t.taintName, t.effect, true)
}

// Disable removes the taint for a node. Removing a missing taint is a noop.
func (t *Taint) Disable() {
preferNoSchedule(t.client, t.nodeID, t.taintName, t.effect, false)
}

func preferNoSchedule(client *kubernetes.Clientset, nodeID, taintName string, effect v1.TaintEffect, taintShouldExists bool) {
updatedNode, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
if err != nil || updatedNode == nil {
log.Fatalf("Error reading node %s: %v", nodeID, err)
}

taintExists := false
offset := 0
for i, taint := range updatedNode.Spec.Taints {
if taint.Key == taintName {
taintExists = true
offset = i
break
}
}

if taintExists && taintShouldExists {
log.Debugf("Taint %v exists already for node %v.", taintName, nodeID)
return
}

if !taintExists && !taintShouldExists {
log.Debugf("Taint %v already missing for node %v.", taintName, nodeID)
return
}

type patchTaints struct {
Op string `json:"op"`
Path string `json:"path"`
Value interface{} `json:"value,omitempty"`
}

taint := v1.Taint{
Key: taintName,
Effect: effect,
}

var patches []patchTaints

if len(updatedNode.Spec.Taints) == 0 {
// add first taint and ensure to keep current taints
patches = []patchTaints{
{
Op: "test",
Path: "/spec",
Value: updatedNode.Spec,
},
{
Op: "add",
Path: "/spec/taints",
Value: []v1.Taint{},
},
{
Op: "add",
Path: "/spec/taints/-",
Value: taint,
},
}
} else if taintExists {
// remove taint and ensure to test against race conditions
patches = []patchTaints{
{
Op: "test",
Path: fmt.Sprintf("/spec/taints/%d", offset),
Value: taint,
},
{
Op: "remove",
Path: fmt.Sprintf("/spec/taints/%d", offset),
},
}
} else {
// add missing taint to exsting list
patches = []patchTaints{
{
Op: "add",
Path: "/spec/taints/-",
Value: taint,
},
}
}

patchBytes, err := json.Marshal(patches)
if err != nil {
log.Fatalf("Error encoding taint patcht for node %s: %v", nodeID, err)
}

_, err = client.CoreV1().Nodes().Patch(context.TODO(), nodeID, types.JSONPatchType, patchBytes, metav1.PatchOptions{})
if err != nil {
log.Fatalf("Error patching taint for node %s: %v", nodeID, err)
}

if taintShouldExists {
log.Info("Node taint added")
} else {
log.Info("Node taint removed")
}
}

0 comments on commit 6caf951

Please sign in to comment.