Skip to content
This repository was archived by the owner on Apr 17, 2019. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions cluster-autoscaler/cluster_autoscaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,19 @@ func run(_ <-chan struct{}) {
}
}

// Check if there has been a constant difference between the number of nodes in k8s and
// the number of nodes on the cloud provider side.
// TODO: andrewskim - add protection for ready AWS nodes.
fixedSomething, err := fixNodeGroupSize(&autoscalingContext, time.Now())
if err != nil {
glog.Warningf("Failed to fix node group sizes: %v", err)
continue
}
if fixedSomething {
glog.V(0).Infof("Some node group target size was fixed, skipping the iteration")
continue
}

// TODO: remove once all of the unready node handling elements are in place.
if err := CheckGroupsAndNodes(readyNodes, autoscalingContext.CloudProvider); err != nil {
glog.Warningf("Cluster is not ready for autoscaling: %v", err)
Expand Down
28 changes: 28 additions & 0 deletions cluster-autoscaler/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -244,3 +244,31 @@ func removeOldUnregisteredNodes(unregisteredNodes []clusterstate.UnregisteredNod
}
return removedAny, nil
}

// Sets the target size of node groups to the current number of nodes in them
// if the difference was constant for a prolonged time. Returns true if managed
// to fix something.
func fixNodeGroupSize(contetxt *AutoscalingContext, currentTime time.Time) (bool, error) {
fixed := false
for _, nodeGroup := range contetxt.CloudProvider.NodeGroups() {
incorrectSize := contetxt.ClusterStateRegistry.GetIncorrectNodeGroupSize(nodeGroup.Id())
if incorrectSize == nil {
continue
}
if incorrectSize.FirstObserved.Add(contetxt.UnregisteredNodeRemovalTime).Before(currentTime) {

delta := incorrectSize.CurrentSize - incorrectSize.ExpectedSize
if delta < 0 {
glog.V(0).Infof("Decreasing size of %s, expected=%d current=%d delta=%d", nodeGroup.Id(),
incorrectSize.ExpectedSize,
incorrectSize.CurrentSize,
delta)
if err := nodeGroup.DecreaseTargetSize(delta); err != nil {
return fixed, fmt.Errorf("Failed to decrease %s: %v", nodeGroup.Id(), err)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If there is a one node group w/too large delata, the hole autoscaler algorithm for all groups will be stopped. Is it the intended behavior?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed offline

}
fixed = true
}
}
}
return fixed, nil
}
39 changes: 39 additions & 0 deletions cluster-autoscaler/utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,42 @@ func TestRemoveOldUnregisteredNodes(t *testing.T) {
deletedNode := getStringFromChan(deletedNodes)
assert.Equal(t, "ng1/ng1-2", deletedNode)
}

// TestRemoveFixNodeTargetSize checks that fixNodeGroupSize leaves a node group
// alone while the size mismatch is younger than UnregisteredNodeRemovalTime,
// and shrinks the target size once the mismatch has persisted long enough.
func TestRemoveFixNodeTargetSize(t *testing.T) {
	sizeChanges := make(chan string, 10)
	now := time.Now()

	node := BuildTestNode("ng1-1", 1000, 1000)
	node.Spec.ProviderID = "ng1-1"
	provider := testprovider.NewTestCloudProvider(func(nodeGroup string, delta int) error {
		sizeChanges <- fmt.Sprintf("%s/%d", nodeGroup, delta)
		return nil
	}, nil)
	// Target size 3, but only one registered node -> mismatch of -2.
	provider.AddNodeGroup("ng1", 1, 10, 3)
	provider.AddNode("ng1", node)

	clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{
		MaxTotalUnreadyPercentage: 10,
		OkTotalUnreadyCount:       1,
	})
	// Observe the mismatch an hour ago so its age is under our control below.
	err := clusterState.UpdateNodes([]*apiv1.Node{node}, now.Add(-time.Hour))
	assert.NoError(t, err)

	context := &AutoscalingContext{
		CloudProvider:               provider,
		ClusterStateRegistry:        clusterState,
		UnregisteredNodeRemovalTime: 45 * time.Minute,
	}

	// At "now - 50m" the mismatch is only 10 minutes old: nothing to fix.
	fixed, err := fixNodeGroupSize(context, now.Add(-50*time.Minute))
	assert.NoError(t, err)
	assert.False(t, fixed)

	// At "now" the mismatch is an hour old: the target size must be decreased.
	fixed, err = fixNodeGroupSize(context, now)
	assert.NoError(t, err)
	assert.True(t, fixed)
	assert.Equal(t, "ng1/-2", getStringFromChan(sizeChanges))
}