Skip to content

Commit

Permalink
Reap tainted nodes (#35)
Browse files Browse the repository at this point in the history
* Reap nodes with taint

* Update README.md

* refactor (coverage)
  • Loading branch information
eytan-avisror committed Jul 28, 2020
1 parent 9d02bed commit 84d1d6e
Show file tree
Hide file tree
Showing 6 changed files with 385 additions and 44 deletions.
2 changes: 1 addition & 1 deletion cmd/governor/app/nodereaper.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,5 +62,5 @@ func init() {
nodeReapCmd.Flags().Int32Var(&nodeReaperArgs.ReapUnjoinedThresholdMinutes, "reap-unjoined-threshold-minutes", 15, "Reap N minute old nodes")
nodeReapCmd.Flags().StringVar(&nodeReaperArgs.ReapUnjoinedKey, "reap-unjoined-tag-key", "", "BE CAREFUL! EC2 tag key that identfies a joining node")
nodeReapCmd.Flags().StringVar(&nodeReaperArgs.ReapUnjoinedValue, "reap-unjoined-tag-value", "", "BE CAREFUL! EC2 tag value that identfies a joining node")

nodeReapCmd.Flags().StringArrayVar(&nodeReaperArgs.ReapTainted, "reap-tainted", []string{}, "marks nodes with a given taint reapable, must be in format of comma separated taints key=value:effect, key:effect or key")
}
5 changes: 5 additions & 0 deletions pkg/reaper/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,11 @@ Ghost nodes are nodes which point to an instance-id which is invalid or already

Unjoined nodes are nodes which fail to join the cluster and remain unjoined while taking capacity from the scaling groups. By default this feature is not enabled, but it can be enabled by setting `--reap-unjoined=true`. You must also set `--reap-unjoined-threshold-minutes`, which is the number of minutes passed since EC2 launch time to consider a node unjoined (we recommend setting a relatively high number here, e.g. 15). `--reap-unjoined-tag-key` and `--reap-unjoined-tag-value` are also required in order to identify the instances which failed to join, and should match an EC2 tag on the cluster nodes. When this is enabled, node-reaper will actively look at all EC2 instances with the mentioned key/value tag and make sure they are joined in the cluster as nodes by looking at their `ProviderID`. If a matching node is not found and the EC2 instance has been up for more than the configured threshold, the instance will be terminated.

### Reaping Tainted Nodes

You can choose to mark nodes with certain taints as reapable by using the `--reap-tainted` flag and providing a comma separated list of taint strings.
For example, `--reap-tainted NodeWithImpairedVolumes=true:NoSchedule,MyTaint:NoSchedule` means that nodes having either one of these taints will be drained & terminated. You can use the following formats for describing a taint: `key=value:effect`, `key:effect`, or `key`.

### Example

```text
Expand Down
35 changes: 35 additions & 0 deletions pkg/reaper/nodereaper/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,46 @@ import (
"github.com/aws/aws-sdk-go/service/autoscaling/autoscalingiface"
"github.com/aws/aws-sdk-go/service/ec2"
"github.com/aws/aws-sdk-go/service/ec2/ec2iface"
"github.com/pkg/errors"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
)

// parseTaint converts a single taint string into a v1.Taint.
//
// Accepted formats are "key=value:effect", "key:effect" and "key". The
// returned bool is true only when parsing succeeded; on failure the zero
// v1.Taint, false and a descriptive error are returned.
//
// The key must be non-empty, and a value may only be given together with an
// effect separator ("key=value" on its own is rejected rather than being
// stored as a key literally named "key=value").
//
// TimeAdded is always set to the zero time so that the result can be compared
// against node taints whose TimeAdded has been normalised the same way.
func parseTaint(t string) (v1.Taint, bool, error) {
	var key, value string
	var effect v1.TaintEffect
	var taint v1.Taint

	parts := strings.Split(t, ":")

	switch len(parts) {
	case 1:
		// Key-only form: "=" is not part of any supported key-only syntax.
		if strings.Contains(parts[0], "=") {
			return taint, false, errors.Errorf("invalid taint %v provided, a value requires an effect", t)
		}
		key = parts[0]
	case 2:
		effect = v1.TaintEffect(parts[1])
		kv := strings.Split(parts[0], "=")

		if len(kv) > 2 {
			return taint, false, errors.Errorf("invalid taint %v provided", t)
		}

		key = kv[0]

		if len(kv) == 2 {
			value = kv[1]
		}
	default:
		return taint, false, errors.Errorf("invalid taint %v provided", t)
	}

	// A taint without a key can never match a node; fail loudly instead of
	// silently accepting a no-op configuration.
	if key == "" {
		return taint, false, errors.Errorf("invalid taint %q provided, key must not be empty", t)
	}

	taint.Key = key
	taint.Value = value
	taint.Effect = effect
	taint.TimeAdded = &metav1.Time{Time: time.Time{}}
	return taint, true, nil
}

func runCommand(call string, arg []string) (string, error) {
log.Infof("invoking >> %s %s", call, arg)
out, err := exec.Command(call, arg...).CombinedOutput()
Expand Down
74 changes: 65 additions & 9 deletions pkg/reaper/nodereaper/nodereaper.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,17 @@ limitations under the License.
package nodereaper

import (
"errors"
"fmt"
"os"
"reflect"
"time"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/autoscaling"
"github.com/aws/aws-sdk-go/service/ec2"
"github.com/keikoproj/governor/pkg/reaper/common"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -39,10 +40,10 @@ const (
terminatedStateName = "termination-issued"
drainingStateName = "draining"
reaperDisableLabelKey = "governor.keikoproj.io/node-reaper-disabled"
reapUnreadyDisabledLabelKey = "governor.keikoproj.io/reap-unready-disabled"
reapUnknownDisabledLabelKey = "governor.keikoproj.io/reap-unknown-disabled"
reapFlappyDisabledLabelKey = "governor.keikoproj.io/reap-flappy-disabled"
reapOldDisabledLabelKey = "governor.keikoproj.io/reap-old-disabled"
reapUnreadyDisabledLabelKey = "governor.keikoproj.io/reap-unready-disabled"
reapUnknownDisabledLabelKey = "governor.keikoproj.io/reap-unknown-disabled"
reapFlappyDisabledLabelKey = "governor.keikoproj.io/reap-flappy-disabled"
reapOldDisabledLabelKey = "governor.keikoproj.io/reap-old-disabled"
)

// Validate command line arguments
Expand All @@ -64,6 +65,7 @@ func (ctx *ReaperContext) validateArguments(args *Args) error {
ctx.NodeInstanceIDs = make(map[string]string)
ctx.AgeDrainReapableInstances = make([]AgeDrainReapableInstance, 0)
ctx.AgeKillOrder = make([]string, 0)
ctx.ReapTainted = make([]v1.Taint, 0)
ctx.EC2Region = args.EC2Region
ctx.ReapOld = args.ReapOld
ctx.MaxKill = args.MaxKill
Expand All @@ -75,6 +77,18 @@ func (ctx *ReaperContext) validateArguments(args *Args) error {
log.Infof("ASG Validation = %t", ctx.AsgValidation)
log.Infof("Post Reap Throttle = %v seconds", ctx.ReapThrottle)

for _, t := range args.ReapTainted {
var taint v1.Taint
var ok bool
var err error

if taint, ok, err = parseTaint(t); !ok {
return errors.Wrap(err, "failed to parse taint")
}

ctx.ReapTainted = append(ctx.ReapTainted, taint)
}

if ctx.MaxKill < 1 {
err := fmt.Errorf("--max-kill-nodes must be set to a number greater than or equal to 1")
log.Errorln(err)
Expand Down Expand Up @@ -245,7 +259,14 @@ func Run(args *Args) error {
log.Infoln("starting drain condition check for ghost nodes")
err = ctx.deriveGhostDrainReapableNodes(awsAuth)
if err != nil {
log.Errorf("failed to derive age drain-reapable nodes, %v", err)
log.Errorf("failed to derive ghost nodes, %v", err)
return err
}

log.Infoln("starting drain condition check for tainted nodes")
err = ctx.deriveTaintDrainReapableNodes()
if err != nil {
log.Errorf("failed to derive taint drain-reapable nodes, %v", err)
return err
}

Expand Down Expand Up @@ -281,6 +302,24 @@ func Run(args *Args) error {
return nil
}

// deriveTaintDrainReapableNodes marks every node in ctx.AllNodes that carries
// at least one of the taints configured in ctx.ReapTainted as both drainable
// and reapable.
//
// It does nothing when no taints are configured. Each node is registered at
// most once, even if it matches several configured taints. The function
// currently always returns nil.
func (ctx *ReaperContext) deriveTaintDrainReapableNodes() error {
	if len(ctx.ReapTainted) == 0 {
		return nil
	}

	log.Infoln("scanning for taint drain-reapable nodes")
	for _, node := range ctx.AllNodes {
		nodeInstanceID := getNodeInstanceID(&node)
		for _, t := range ctx.ReapTainted {
			if !nodeIsTainted(t, node) {
				continue
			}

			log.Infof("node %v is drain-reapable !! State = Tainted, Taint = %v", node.Name, t.Key)
			ctx.addDrainable(node.Name, nodeInstanceID)
			ctx.addReapable(node.Name, nodeInstanceID)
			// One match is enough; stop so the node is not added again for
			// every further taint it happens to match.
			break
		}
	}
	return nil
}

// Handle age-reapable nodes
func (ctx *ReaperContext) deriveAgeDrainReapableNodes() error {
log.Infoln("scanning for age drain-reapable nodes")
Expand All @@ -298,7 +337,7 @@ func (ctx *ReaperContext) deriveAgeDrainReapableNodes() error {

// Drain-Reap old nodes
if ctx.ReapOld {
if !nodeHasAnnotation(node, ageUnreapableAnnotationKey, "true") && !hasSkipLabel(node, reapOldDisabledLabelKey){
if !nodeHasAnnotation(node, ageUnreapableAnnotationKey, "true") && !hasSkipLabel(node, reapOldDisabledLabelKey) {
if nodeIsAgeReapable(nodeAgeMinutes, ageThreshold) {
log.Infof("node %v is drain-reapable !! State = OldAge, Diff = %v/%v", nodeName, nodeAgeMinutes, ageThreshold)
ctx.addAgeDrainReapable(nodeName, nodeInstanceID, nodeAgeMinutes)
Expand Down Expand Up @@ -326,7 +365,7 @@ func (ctx *ReaperContext) deriveFlappyDrainReapableNodes() error {

// Drain-Reap flappy nodes
if ctx.ReapFlappy {
if nodeIsFlappy(events, nodeName, countThreshold, "NodeReady") && !hasSkipLabel(node, reapFlappyDisabledLabelKey) {
if nodeIsFlappy(events, nodeName, countThreshold, "NodeReady") && !hasSkipLabel(node, reapFlappyDisabledLabelKey) {
log.Infof("node %v is drain-reapable !! State = ReadinessFlapping", nodeName)
ctx.addDrainable(nodeName, nodeInstanceID)
ctx.addReapable(nodeName, nodeInstanceID)
Expand Down Expand Up @@ -625,7 +664,7 @@ func (ctx *ReaperContext) scan(w ReaperAwsAuth) error {
log.Infof("found %v nodes, %v pods, and %v events", len(ctx.AllNodes), len(ctx.AllPods), len(ctx.AllEvents))
for _, node := range nodeList.Items {
ctx.NodeInstanceIDs[getNodeInstanceID(&node)] = node.Name
if (nodeStateIsNotReady(&node) || nodeStateIsUnknown(&node)) {
if nodeStateIsNotReady(&node) || nodeStateIsUnknown(&node) {
log.Infof("node %v is not ready", node.ObjectMeta.Name)
ctx.UnreadyNodes = append(ctx.UnreadyNodes, node)
}
Expand Down Expand Up @@ -720,6 +759,23 @@ func autoScalingGroupIsStable(w ReaperAwsAuth, instance string) (bool, error) {
return true, nil
}

// nodeIsTainted reports whether node carries a taint matching the given one.
//
// When the requested taint has neither a value nor an effect, a node taint
// matches on key alone. Otherwise the node taint must be deeply equal to the
// requested taint once its TimeAdded has been reset to the zero time.
func nodeIsTainted(taint v1.Taint, node v1.Node) bool {
	keyOnly := taint.Value == "" && taint.Effect == v1.TaintEffect("")

	for i := range node.Spec.Taints {
		// Work on a copy so the node's own taint list is left untouched.
		candidate := node.Spec.Taints[i]

		if keyOnly && candidate.Key == taint.Key {
			return true
		}

		// TimeAdded is irrelevant for matching; normalise it before comparing.
		candidate.TimeAdded = &metav1.Time{Time: time.Time{}}
		if reflect.DeepEqual(taint, candidate) {
			return true
		}
	}

	return false
}

func nodeIsFlappy(events []v1.Event, name string, threshold int32, reason string) bool {
totalFlapEvents := make(map[string]int32)
for _, event := range events {
Expand Down
Loading

0 comments on commit 84d1d6e

Please sign in to comment.