Skip to content

Commit

Permalink
Reconsider unreapable nodes after specified time
Browse files Browse the repository at this point in the history
- Add flag `--reconsider-unreapable-node` with default value 10 mins
- When a node drain failed, set the current time to `governor.keikoproj.io/age-unreapable` annotation
- Backward compatibility with previous value for the annotation

Fixes: #39
  • Loading branch information
viveksyngh committed Oct 27, 2020
1 parent 926911b commit 1d1751c
Show file tree
Hide file tree
Showing 6 changed files with 169 additions and 3 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ clean:
rm -rf _output

test:
go test -v ./... -coverprofile ./coverage.txt
CGO_ENABLED=0 go test -v ./... -coverprofile ./coverage.txt

vtest:
go test -v ./... -coverprofile ./coverage.txt --logging-enabled
Expand Down
1 change: 1 addition & 0 deletions cmd/governor/app/nodereaper.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,4 +63,5 @@ func init() {
nodeReapCmd.Flags().StringVar(&nodeReaperArgs.ReapUnjoinedKey, "reap-unjoined-tag-key", "", "BE CAREFUL! EC2 tag key that identfies a joining node")
nodeReapCmd.Flags().StringVar(&nodeReaperArgs.ReapUnjoinedValue, "reap-unjoined-tag-value", "", "BE CAREFUL! EC2 tag value that identfies a joining node")
nodeReapCmd.Flags().StringArrayVar(&nodeReaperArgs.ReapTainted, "reap-tainted", []string{}, "marks nodes with a given taint reapable, must be in format of comma separated taints key=value:effect, key:effect or key")
nodeReapCmd.Flags().Float64Var(&nodeReaperArgs.ReconsiderUnreapableAfter, "reconsider-unreapable-after", 10, "Time (in minutes) after which reconsider unreapable nodes")
}
6 changes: 5 additions & 1 deletion pkg/reaper/nodereaper/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ func (ctx *ReaperContext) drainNode(name string, dryRun bool) error {
ctx.publishEvent(ctx.SelfNamespace, event)
if err.Error() == "command execution timed out" {
log.Warnf("failed to drain node %v, drain command timed-out", name)
ctx.annotateNode(name, ageUnreapableAnnotationKey, "true")
ctx.annotateNode(name, ageUnreapableAnnotationKey, getUTCNowStr())
ctx.uncordonNode(name, dryRun)
return err
}
Expand Down Expand Up @@ -422,3 +422,7 @@ func getInstanceIDByPrivateDNS(instances []*ec2.Instance, dnsName string) string
}
return ""
}

func getUTCNowStr() string {
return time.Now().UTC().Format(time.RFC3339)
}
47 changes: 46 additions & 1 deletion pkg/reaper/nodereaper/nodereaper.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,14 @@ func (ctx *ReaperContext) validateArguments(args *Args) error {
}
ctx.TimeToReap = args.ReapAfter

if args.ReconsiderUnreapableAfter < 10 {
err := fmt.Errorf("--reconsider-unreapable-after must be set to a number greater than or equal to 10")
log.Errorln(err)
return err
}

ctx.ReconsiderUnreapableAfter = args.ReconsiderUnreapableAfter

if args.ReapUnjoined {
if args.ReapUnjoinedThresholdMinutes < 10 {
err := fmt.Errorf("--reap-unjoined-threshold-minutes must be set to a number greater than or equal to 10")
Expand Down Expand Up @@ -157,6 +165,7 @@ func (ctx *ReaperContext) validateArguments(args *Args) error {
log.Infof("Reap Unready = %t, threshold = %v minutes", ctx.ReapUnready, ctx.TimeToReap)
log.Infof("Reap Ghost = %t, threshold = immediate", ctx.ReapGhost)
log.Infof("Reap Unjoined = %t, threshold = %v minutes by tag %v=%v", ctx.ReapUnjoined, ctx.ReapUnjoinedThresholdMinutes, ctx.ReapUnjoinedKey, ctx.ReapUnjoinedValue)
log.Infof("Reconsider Unreapable after = %v minutes", ctx.ReconsiderUnreapableAfter)

if !ctx.SoftReap {
log.Warnf("--soft-reap is off !! will not consider pods when reaping")
Expand Down Expand Up @@ -337,7 +346,7 @@ func (ctx *ReaperContext) deriveAgeDrainReapableNodes() error {

// Drain-Reap old nodes
if ctx.ReapOld {
if !nodeHasAnnotation(node, ageUnreapableAnnotationKey, "true") && !hasSkipLabel(node, reapOldDisabledLabelKey) {
if reconsiderUnreapableNode(node, ctx.ReconsiderUnreapableAfter) && !hasSkipLabel(node, reapOldDisabledLabelKey) {
if nodeIsAgeReapable(nodeAgeMinutes, ageThreshold) {
log.Infof("node %v is drain-reapable !! State = OldAge, Diff = %v/%v", nodeName, nodeAgeMinutes, ageThreshold)
ctx.addAgeDrainReapable(nodeName, nodeInstanceID, nodeAgeMinutes)
Expand Down Expand Up @@ -801,3 +810,39 @@ func nodeIsFlappy(events []v1.Event, name string, threshold int32, reason string
func hasSkipLabel(node v1.Node, label string) bool {
return node.ObjectMeta.Labels[reaperDisableLabelKey] == "true" || node.ObjectMeta.Labels[label] == "true"
}

func reconsiderUnreapableNode(node v1.Node, reapableAfter float64) bool {
//For backward compatibilty
if nodeHasAnnotation(node, ageUnreapableAnnotationKey, "true") {
return true
}

lastUnreapableTimeStr := getAnnotationValue(node, ageUnreapableAnnotationKey)
if lastUnreapableTimeStr == "" {
return true
}

lastUnreapableTime, err := time.Parse(time.RFC3339, lastUnreapableTimeStr)

//invalid date time format
if err != nil {
log.Infof("failed to parse age unreapable annotation value: %s", err.Error())
return false
}

now := time.Now().UTC()
if now.Sub(lastUnreapableTime).Minutes() >= reapableAfter {
return true
}

return false
}

func getAnnotationValue(node v1.Node, annotationKey string) string {
for k, v := range node.ObjectMeta.Annotations {
if k == annotationKey {
return v
}
}
return ""
}
114 changes: 114 additions & 0 deletions pkg/reaper/nodereaper/nodereaper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1869,3 +1869,117 @@ func TestNodeIsTainted(t *testing.T) {
}
}
}

func TestReconsiderUnreapableNode(t *testing.T) {
testCases := []struct {
Name string
Node v1.Node
ReapableAfter float64
ReconsideUnreapable bool
}{
{
Name: "node without age unreapable annotation",
Node: v1.Node{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{},
},
},
ReapableAfter: 10,
ReconsideUnreapable: true,
},
{
Name: "node with age unreapable set to true",
Node: v1.Node{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
ageUnreapableAnnotationKey: "true",
},
},
},
ReapableAfter: 10,
ReconsideUnreapable: true,
},
{
Name: "node with over age unreapable date time",
Node: v1.Node{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
ageUnreapableAnnotationKey: time.Now().UTC().Add(-10 * time.Minute).Format(time.RFC3339),
},
},
},
ReapableAfter: 10,
ReconsideUnreapable: true,
},
{
Name: "node with under age unreapable date time",
Node: v1.Node{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
ageUnreapableAnnotationKey: time.Now().UTC().Add(-5 * time.Minute).Format(time.RFC3339),
},
},
},
ReapableAfter: 10,
ReconsideUnreapable: false,
},
{
Name: "node with invalid age unreapable date time format",
Node: v1.Node{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
ageUnreapableAnnotationKey: time.Now().UTC().Add(-5 * time.Minute).Format(time.RFC1123),
},
},
},
ReapableAfter: 10,
ReconsideUnreapable: false,
},
}

for _, tc := range testCases {
got := reconsiderUnreapableNode(tc.Node, tc.ReapableAfter)
if got != tc.ReconsideUnreapable {
t.Fatalf("test #%v: expected match: %t, got: %t", tc.Name, tc.ReconsideUnreapable, got)
}
}
}

func TestGetAnnotationValue(t *testing.T) {
testCases := []struct {
Name string
Node v1.Node
AnnotationKey string
ExpectedValue string
}{
{
Name: "invalid key",
Node: v1.Node{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{},
},
},
AnnotationKey: "key",
ExpectedValue: "",
},
{
Name: "with valid annotation key value",
Node: v1.Node{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
"key": "value",
},
},
},
AnnotationKey: "key",
ExpectedValue: "value",
},
}

for _, tc := range testCases {
got := getAnnotationValue(tc.Node, tc.AnnotationKey)
if got != tc.ExpectedValue {
t.Fatalf("test #%v: expected match: %s, got: %s", tc.Name, tc.ExpectedValue, got)
}
}
}
2 changes: 2 additions & 0 deletions pkg/reaper/nodereaper/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ type Args struct {
AgeReapThrottle int64
ReapAfter float64
ReapTainted []string
ReconsiderUnreapableAfter float64
}

// ReaperContext holds the context of the node-reaper and target cluster
Expand Down Expand Up @@ -87,6 +88,7 @@ type ReaperContext struct {
MaxKill int
TimeToReap float64
ReapTainted []v1.Taint
ReconsiderUnreapableAfter float64
// runtime
UnreadyNodes []v1.Node
AllNodes []v1.Node
Expand Down

0 comments on commit 1d1751c

Please sign in to comment.