Skip to content

Commit

Permalink
cluster-autoscaler: Fix excessive calls to DescribeAutoScalingGroup
Browse files Browse the repository at this point in the history
By caching AWS refs for nodes/EC2 instances already known to be not in any of ASGs managed by cluster-autoscaler(CA).

Please beware of the edge case - this method is safe as long as users don't attach nodes by calling AttachInstances API after CA cached them. I believe, even if it was necessary, a warning in the documentation about the edge case is enough for now. If we really need to support the case, I will submit an another PR to invalidate the cache periodically so that CA can detect the formerly cached nodes are attached to ASG(s).

Resolves #45
  • Loading branch information
mumoshu committed Jun 8, 2017
1 parent 4101dea commit 2235046
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 32 deletions.
120 changes: 93 additions & 27 deletions cluster-autoscaler/cloudprovider/aws/aws_cloud_provider_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,21 +31,8 @@ type AutoScalingMock struct {
}

func (a *AutoScalingMock) DescribeAutoScalingGroups(i *autoscaling.DescribeAutoScalingGroupsInput) (*autoscaling.DescribeAutoScalingGroupsOutput, error) {
return &autoscaling.DescribeAutoScalingGroupsOutput{
AutoScalingGroups: []*autoscaling.Group{
{
DesiredCapacity: aws.Int64(2),
Instances: []*autoscaling.Instance{
{
InstanceId: aws.String("test-instance-id"),
},
{
InstanceId: aws.String("second-test-instance-id"),
},
},
},
},
}, nil
args := a.Called(i)
return args.Get(0).(*autoscaling.DescribeAutoScalingGroupsOutput), nil
}

func (a *AutoScalingMock) DescribeTags(i *autoscaling.DescribeTagsInput) (*autoscaling.DescribeTagsOutput, error) {
Expand Down Expand Up @@ -73,9 +60,27 @@ func (a *AutoScalingMock) TerminateInstanceInAutoScalingGroup(input *autoscaling
}

var testAwsManager = &AwsManager{
asgs: make([]*asgInformation, 0),
service: &AutoScalingMock{},
asgCache: make(map[AwsRef]*Asg),
asgs: make([]*asgInformation, 0),
service: &AutoScalingMock{},
asgCache: make(map[AwsRef]*Asg),
instancesNotInManagedAsg: make(map[AwsRef]struct{}),
}

func testDescribeAutoScalingGroupsOutput(desiredCap int64, instanceIds ...string) *autoscaling.DescribeAutoScalingGroupsOutput {
instances := []*autoscaling.Instance{}
for _, id := range instanceIds {
instances = append(instances, &autoscaling.Instance{
InstanceId: aws.String(id),
})
}
return &autoscaling.DescribeAutoScalingGroupsOutput{
AutoScalingGroups: []*autoscaling.Group{
{
DesiredCapacity: aws.Int64(desiredCap),
Instances: instances,
},
},
}
}

func testProvider(t *testing.T, m *AwsManager) *awsCloudProvider {
Expand Down Expand Up @@ -123,15 +128,29 @@ func TestNodeGroupForNode(t *testing.T) {
ProviderID: "aws:///us-east-1a/test-instance-id",
},
}
provider := testProvider(t, testAwsManager)
service := &AutoScalingMock{}
m := &AwsManager{
asgs: make([]*asgInformation, 0),
service: service,
asgCache: make(map[AwsRef]*Asg),
instancesNotInManagedAsg: make(map[AwsRef]struct{}),
}
provider := testProvider(t, m)
err := provider.addNodeGroup("1:5:test-asg")
assert.NoError(t, err)

service.On("DescribeAutoScalingGroups", &autoscaling.DescribeAutoScalingGroupsInput{
AutoScalingGroupNames: aws.StringSlice([]string{provider.asgs[0].Name}),
MaxRecords: aws.Int64(1),
}).Return(testDescribeAutoScalingGroupsOutput(1, "test-instance-id"))

group, err := provider.NodeGroupForNode(node)

assert.NoError(t, err)
assert.Equal(t, group.Id(), "test-asg")
assert.Equal(t, group.MinSize(), 1)
assert.Equal(t, group.MaxSize(), 5)
service.AssertNumberOfCalls(t, "DescribeAutoScalingGroups", 1)

// test node in cluster that is not in a group managed by cluster autoscaler
nodeNotInGroup := &apiv1.Node{
Expand All @@ -141,8 +160,10 @@ func TestNodeGroupForNode(t *testing.T) {
}

group, err = provider.NodeGroupForNode(nodeNotInGroup)

assert.NoError(t, err)
assert.Nil(t, group)
service.AssertNumberOfCalls(t, "DescribeAutoScalingGroups", 2)
}

func TestAwsRefFromProviderId(t *testing.T) {
Expand Down Expand Up @@ -173,20 +194,36 @@ func TestMinSize(t *testing.T) {
}

func TestTargetSize(t *testing.T) {
provider := testProvider(t, testAwsManager)
service := &AutoScalingMock{}
m := &AwsManager{
asgs: make([]*asgInformation, 0),
service: service,
asgCache: make(map[AwsRef]*Asg),
instancesNotInManagedAsg: make(map[AwsRef]struct{}),
}
provider := testProvider(t, m)
err := provider.addNodeGroup("1:5:test-asg")
assert.NoError(t, err)

service.On("DescribeAutoScalingGroups", &autoscaling.DescribeAutoScalingGroupsInput{
AutoScalingGroupNames: aws.StringSlice([]string{provider.asgs[0].Name}),
MaxRecords: aws.Int64(1),
}).Return(testDescribeAutoScalingGroupsOutput(2, "test-instance-id", "second-test-instance-id"))

targetSize, err := provider.asgs[0].TargetSize()
assert.Equal(t, targetSize, 2)
assert.NoError(t, err)

service.AssertNumberOfCalls(t, "DescribeAutoScalingGroups", 1)
}

func TestIncreaseSize(t *testing.T) {
service := &AutoScalingMock{}
m := &AwsManager{
asgs: make([]*asgInformation, 0),
service: service,
asgCache: make(map[AwsRef]*Asg),
asgs: make([]*asgInformation, 0),
service: service,
asgCache: make(map[AwsRef]*Asg),
instancesNotInManagedAsg: make(map[AwsRef]struct{}),
}
provider := testProvider(t, m)
err := provider.addNodeGroup("1:5:test-asg")
Expand All @@ -199,23 +236,42 @@ func TestIncreaseSize(t *testing.T) {
HonorCooldown: aws.Bool(false),
}).Return(&autoscaling.SetDesiredCapacityOutput{})

service.On("DescribeAutoScalingGroups", &autoscaling.DescribeAutoScalingGroupsInput{
AutoScalingGroupNames: aws.StringSlice([]string{provider.asgs[0].Name}),
MaxRecords: aws.Int64(1),
}).Return(testDescribeAutoScalingGroupsOutput(2, "test-instance-id", "second-test-instance-id"))

err = provider.asgs[0].IncreaseSize(1)
assert.NoError(t, err)
service.AssertNumberOfCalls(t, "SetDesiredCapacity", 1)
service.AssertNumberOfCalls(t, "DescribeAutoScalingGroups", 1)
}

func TestBelongs(t *testing.T) {
provider := testProvider(t, testAwsManager)
service := &AutoScalingMock{}
m := &AwsManager{
asgs: make([]*asgInformation, 0),
service: service,
asgCache: make(map[AwsRef]*Asg),
instancesNotInManagedAsg: make(map[AwsRef]struct{}),
}
provider := testProvider(t, m)
err := provider.addNodeGroup("1:5:test-asg")
assert.NoError(t, err)

service.On("DescribeAutoScalingGroups", &autoscaling.DescribeAutoScalingGroupsInput{
AutoScalingGroupNames: aws.StringSlice([]string{provider.asgs[0].Name}),
MaxRecords: aws.Int64(1),
}).Return(testDescribeAutoScalingGroupsOutput(1, "test-instance-id"))

invalidNode := &apiv1.Node{
Spec: apiv1.NodeSpec{
ProviderID: "aws:///us-east-1a/invalid-instance-id",
},
}
_, err = provider.asgs[0].Belongs(invalidNode)
assert.Error(t, err)
service.AssertNumberOfCalls(t, "DescribeAutoScalingGroups", 1)

validNode := &apiv1.Node{
Spec: apiv1.NodeSpec{
Expand All @@ -225,14 +281,18 @@ func TestBelongs(t *testing.T) {
belongs, err := provider.asgs[0].Belongs(validNode)
assert.Equal(t, belongs, true)
assert.NoError(t, err)
// As "test-instance-id" is already known to be managed by test-asg since the first `Belongs` call,
// No additional DescribAutoScalingGroup call is made
service.AssertNumberOfCalls(t, "DescribeAutoScalingGroups", 1)
}

func TestDeleteNodes(t *testing.T) {
service := &AutoScalingMock{}
m := &AwsManager{
asgs: make([]*asgInformation, 0),
service: service,
asgCache: make(map[AwsRef]*Asg),
asgs: make([]*asgInformation, 0),
service: service,
asgCache: make(map[AwsRef]*Asg),
instancesNotInManagedAsg: make(map[AwsRef]struct{}),
}

service.On("TerminateInstanceInAutoScalingGroup", &autoscaling.TerminateInstanceInAutoScalingGroupInput{
Expand All @@ -246,6 +306,11 @@ func TestDeleteNodes(t *testing.T) {
err := provider.addNodeGroup("1:5:test-asg")
assert.NoError(t, err)

service.On("DescribeAutoScalingGroups", &autoscaling.DescribeAutoScalingGroupsInput{
AutoScalingGroupNames: aws.StringSlice([]string{provider.asgs[0].Name}),
MaxRecords: aws.Int64(1),
}).Return(testDescribeAutoScalingGroupsOutput(2, "test-instance-id", "second-test-instance-id"))

node := &apiv1.Node{
Spec: apiv1.NodeSpec{
ProviderID: "aws:///us-east-1a/test-instance-id",
Expand All @@ -254,6 +319,7 @@ func TestDeleteNodes(t *testing.T) {
err = provider.asgs[0].DeleteNodes([]*apiv1.Node{node})
assert.NoError(t, err)
service.AssertNumberOfCalls(t, "TerminateInstanceInAutoScalingGroup", 1)
service.AssertNumberOfCalls(t, "DescribeAutoScalingGroups", 2)
}

func TestId(t *testing.T) {
Expand Down
20 changes: 15 additions & 5 deletions cluster-autoscaler/cloudprovider/aws/aws_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,9 @@ type autoScaling interface {

// AwsManager is handles aws communication and data caching.
type AwsManager struct {
asgs []*asgInformation
asgCache map[AwsRef]*Asg
asgs []*asgInformation
asgCache map[AwsRef]*Asg
instancesNotInManagedAsg map[AwsRef]struct{}

service autoScaling
cacheMutex sync.Mutex
Expand All @@ -72,9 +73,10 @@ func CreateAwsManager(configReader io.Reader) (*AwsManager, error) {

service := autoscaling.New(session.New())
manager := &AwsManager{
asgs: make([]*asgInformation, 0),
service: service,
asgCache: make(map[AwsRef]*Asg),
asgs: make([]*asgInformation, 0),
service: service,
asgCache: make(map[AwsRef]*Asg),
instancesNotInManagedAsg: make(map[AwsRef]struct{}),
}

go wait.Forever(func() {
Expand Down Expand Up @@ -173,13 +175,21 @@ func (m *AwsManager) GetAsgForInstance(instance *AwsRef) (*Asg, error) {
if config, found := m.asgCache[*instance]; found {
return config, nil
}
if _, found := m.instancesNotInManagedAsg[*instance]; found {
// The instance is already known to not belong to any configured ASG
// Skip regenerateCache so that we won't unnecessarily call DescribeAutoScalingGroups
// See https://github.com/kubernetes/contrib/issues/2541
return nil, nil
}
if err := m.regenerateCache(); err != nil {
return nil, fmt.Errorf("Error while looking for ASG for instance %+v, error: %v", *instance, err)
}
if config, found := m.asgCache[*instance]; found {
return config, nil
}
// instance does not belong to any configured ASG
glog.V(6).Infof("Instance %+v is not in any ASG managed by CA. CA is now memorizing the fact not to unnecessarily call AWS API afterwards trying to find the unexistent managed ASG for the instance", *instance)
m.instancesNotInManagedAsg[*instance] = struct{}{}
return nil, nil
}

Expand Down

0 comments on commit 2235046

Please sign in to comment.