From fa956e3909fac0731361e190e75336ae166bf296 Mon Sep 17 00:00:00 2001 From: Bronson Mirafuentes Date: Mon, 20 Dec 2021 10:08:05 -0800 Subject: [PATCH] add instance connection draining for NLBs --- upup/pkg/fi/cloudup/awsup/BUILD.bazel | 1 + upup/pkg/fi/cloudup/awsup/aws_cloud.go | 85 +++++++++++++++++++++++--- 2 files changed, 76 insertions(+), 10 deletions(-) diff --git a/upup/pkg/fi/cloudup/awsup/BUILD.bazel b/upup/pkg/fi/cloudup/awsup/BUILD.bazel index a3865fcc1ee0d..14cd59844a0fa 100644 --- a/upup/pkg/fi/cloudup/awsup/BUILD.bazel +++ b/upup/pkg/fi/cloudup/awsup/BUILD.bazel @@ -56,6 +56,7 @@ go_library( "//vendor/github.com/aws/aws-sdk-go/service/sqs:go_default_library", "//vendor/github.com/aws/aws-sdk-go/service/sqs/sqsiface:go_default_library", "//vendor/github.com/aws/aws-sdk-go/service/sts:go_default_library", + "//vendor/golang.org/x/sync/errgroup:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library", "//vendor/k8s.io/klog/v2:go_default_library", diff --git a/upup/pkg/fi/cloudup/awsup/aws_cloud.go b/upup/pkg/fi/cloudup/awsup/aws_cloud.go index 22fa10592be78..5beb32c963b6c 100644 --- a/upup/pkg/fi/cloudup/awsup/aws_cloud.go +++ b/upup/pkg/fi/cloudup/awsup/aws_cloud.go @@ -27,6 +27,7 @@ import ( "github.com/aws/aws-sdk-go/service/eventbridge/eventbridgeiface" "github.com/aws/aws-sdk-go/service/sqs" "github.com/aws/aws-sdk-go/service/sqs/sqsiface" + "golang.org/x/sync/errgroup" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/arn" @@ -547,7 +548,7 @@ func deleteInstance(c AWSCloud, i *cloudinstances.CloudInstance) error { } if i.CloudInstanceGroup.InstanceGroup.Spec.Manager != kops.InstanceManagerKarpenter { - err := deregisterInstanceFromClassicLoadBalancer(c, i) + err := deregisterInstance(c, i) if err != nil { return fmt.Errorf("failed to deregister instance from loadBalancer before terminating: %v", err) } @@ -570,9 +571,8 @@ func deleteInstance(c AWSCloud, i *cloudinstances.CloudInstance) error { return nil } -// deregisterInstanceFromClassicLoadBalancer ensures that connectionDraining completes for the associated loadBalancer to ensure no dropped connections. -// if instance is associated with an NLB, this method no-ops. -func deregisterInstanceFromClassicLoadBalancer(c AWSCloud, i *cloudinstances.CloudInstance) error { +// deregisterInstance ensures that the instance is fully drained/removed from all associated loadBalancers and targetGroups before termination. +func deregisterInstance(c AWSCloud, i *cloudinstances.CloudInstance) error { asg := i.CloudInstanceGroup.Raw.(*autoscaling.Group) asgDetails, err := c.Autoscaling().DescribeAutoScalingGroups(&autoscaling.DescribeAutoScalingGroupsInput{ @@ -587,17 +587,41 @@ func deregisterInstanceFromClassicLoadBalancer(c AWSCloud, i *cloudinstances.Clo } // there will always be only one ASG in the DescribeAutoScalingGroups response. - loadBalancerNames := asgDetails.AutoScalingGroups[0].LoadBalancerNames + loadBalancerNames := aws.StringValueSlice(asgDetails.AutoScalingGroups[0].LoadBalancerNames) + targetGroupArns := aws.StringValueSlice(asgDetails.AutoScalingGroups[0].TargetGroupARNs) - klog.Infof("Deregistering instance from classic loadBalancers: %v", aws.StringValueSlice(loadBalancerNames)) + eg, _ := errgroup.WithContext(aws.BackgroundContext()) + + if len(loadBalancerNames) != 0 { + eg.Go(func() error { + return deregisterInstanceFromClassicLoadBalancer(c, loadBalancerNames, i.ID) + }) + } + + if len(targetGroupArns) != 0 { + eg.Go(func() error { + return deregisterInstanceFromTargetGroups(c, targetGroupArns, i.ID) + }) + } + + if err := eg.Wait(); err != nil { + return fmt.Errorf("failed to deregister instance from load balancers: %v", err) + } + + return nil +} + +// deregisterInstanceFromClassicLoadBalancer ensures that connectionDraining completes for the associated classic loadBalancer to ensure no dropped connections. +func deregisterInstanceFromClassicLoadBalancer(c AWSCloud, loadBalancerNames []string, instanceId string) error { + klog.Infof("Deregistering instance from classic loadBalancers: %v", loadBalancerNames) for { instanceDraining := false for _, loadBalancerName := range loadBalancerNames { response, err := c.ELB().DescribeInstanceHealth(&elb.DescribeInstanceHealthInput{ - LoadBalancerName: loadBalancerName, + LoadBalancerName: aws.String(loadBalancerName), Instances: []*elb.Instance{{ - InstanceId: aws.String(i.ID), + InstanceId: aws.String(instanceId), }}, }) if err != nil { @@ -612,9 +636,9 @@ func deregisterInstanceFromClassicLoadBalancer(c AWSCloud, i *cloudinstances.Clo // there will be only one instance in the DescribeInstanceHealth response. if aws.StringValue(response.InstanceStates[0].State) == instanceInServiceState { c.ELB().DeregisterInstancesFromLoadBalancer(&elb.DeregisterInstancesFromLoadBalancerInput{ - LoadBalancerName: loadBalancerName, + LoadBalancerName: aws.String(loadBalancerName), Instances: []*elb.Instance{{ - InstanceId: aws.String(i.ID), + InstanceId: aws.String(instanceId), }}, }) instanceDraining = true @@ -627,7 +651,48 @@ func deregisterInstanceFromClassicLoadBalancer(c AWSCloud, i *cloudinstances.Clo time.Sleep(5 * time.Second) } + return nil +} +// deregisterInstanceFromTargetGroups ensures that instances are fully unused in the corresponding targetGroups before instance termination. +// this ensures that connections are fully drained from the instance before terminating. +func deregisterInstanceFromTargetGroups(c AWSCloud, targetGroupArns []string, instanceId string) error { + klog.Infof("Deregistering instance from targetGroups: %v", targetGroupArns) + + for { + instanceDraining := false + for _, targetGroupArn := range targetGroupArns { + response, err := c.ELBV2().DescribeTargetHealth(&elbv2.DescribeTargetHealthInput{ + TargetGroupArn: aws.String(targetGroupArn), + Targets: []*elbv2.TargetDescription{{ + Id: aws.String(instanceId), + }}, + }) + + if err != nil { + return fmt.Errorf("error describing target health: %v", err) + } + + // there will be only one target in the DescribeTargetHealth response. + // DescribeTargetHealth response will contain a target even if the targetId doesn't exist. + // all other states besides TargetHealthStateUnused means that the instance may still be serving traffic. + if aws.StringValue(response.TargetHealthDescriptions[0].TargetHealth.State) != elbv2.TargetHealthStateEnumUnused { + c.ELBV2().DeregisterTargets(&elbv2.DeregisterTargetsInput{ + TargetGroupArn: aws.String(targetGroupArn), + Targets: []*elbv2.TargetDescription{{ + Id: aws.String(instanceId), + }}, + }) + instanceDraining = true + } + } + + if !instanceDraining { + break + } + + time.Sleep(5 * time.Second) + } return nil }