From 25352eae70b3849c09b32bbc2b1e3dc061ddbb54 Mon Sep 17 00:00:00 2001 From: wojtekt Date: Thu, 16 Apr 2020 18:59:14 +0200 Subject: [PATCH 1/2] Lazy initialization of network urls for GCE provider --- .../k8s.io/legacy-cloud-providers/gce/gce.go | 98 ++++++++++++------- .../gce/gce_loadbalancer_internal.go | 2 +- 2 files changed, 66 insertions(+), 34 deletions(-) diff --git a/staging/src/k8s.io/legacy-cloud-providers/gce/gce.go b/staging/src/k8s.io/legacy-cloud-providers/gce/gce.go index c4c33e446ba1..cae825c12038 100644 --- a/staging/src/k8s.io/legacy-cloud-providers/gce/gce.go +++ b/staging/src/k8s.io/legacy-cloud-providers/gce/gce.go @@ -99,6 +99,13 @@ type Cloud struct { // for the cloudprovider to start watching the configmap. ClusterID ClusterID + // initializer is used for lazy initialization of subnetworkURL + // and isLegacyNetwork fields if they are not passed via the config. + // The reason is to avoid GCE API calls to initialize them if they + // will never be used. This is especially important when + // it is run from Kubelets, as there can be thousands of them. + subnetworkURLAndIsLegacyNetworkInitializer sync.Once + service *compute.Service serviceBeta *computebeta.Service serviceAlpha *computealpha.Service @@ -115,10 +122,14 @@ type Cloud struct { // managedZones will be set to the 1 zone if running a single zone cluster // it will be set to ALL zones in region for any multi-zone cluster // Use GetAllCurrentZones to get only zones that contain nodes - managedZones []string - networkURL string - isLegacyNetwork bool - subnetworkURL string + managedZones []string + networkURL string + // unsafeIsLegacyNetwork should be used only via IsLegacyNetwork() accessor, + // to ensure it was properly initialized. + unsafeIsLegacyNetwork bool + // unsafeSubnetworkURL should be used only via SubnetworkURL() accessor, + // to ensure it was properly initialized. + unsafeSubnetworkURL string secondaryRangeName string networkProjectID string onXPN bool @@ -465,32 +476,12 @@ func CreateGCECloud(config *CloudConfig) (*Cloud, error) { subnetURL = config.SubnetworkURL } else if config.SubnetworkName != "" { subnetURL = gceSubnetworkURL(config.APIEndpoint, netProjID, config.Region, config.SubnetworkName) - } else { - // Determine the type of network and attempt to discover the correct subnet for AUTO mode. - // Gracefully fail because kubelet calls CreateGCECloud without any config, and minions - // lack the proper credentials for API calls. - if networkName := lastComponent(networkURL); networkName != "" { - var n *compute.Network - if n, err = getNetwork(service, netProjID, networkName); err != nil { - klog.Warningf("Could not retrieve network %q; err: %v", networkName, err) - } else { - switch typeOfNetwork(n) { - case netTypeLegacy: - klog.Infof("Network %q is type legacy - no subnetwork", networkName) - isLegacyNetwork = true - case netTypeCustom: - klog.Warningf("Network %q is type custom - cannot auto select a subnetwork", networkName) - case netTypeAuto: - subnetURL, err = determineSubnetURL(service, netProjID, networkName, config.Region) - if err != nil { - klog.Warningf("Could not determine subnetwork for network %q and region %v; err: %v", networkName, config.Region, err) - } else { - klog.Infof("Auto selecting subnetwork %q", subnetURL) - } - } - } - } } + // If neither SubnetworkURL nor SubnetworkName are provided, defer to + // lazy initialization. Determining subnetURL and isLegacyNetwork requires + // GCE API call. Given that it's not used in many cases and the fact that + // the provider is initialized also for Kubelets (and there can be thousands + // of them) we defer to lazy initialization here. if len(config.ManagedZones) == 0 { config.ManagedZones, err = getZonesForRegion(service, config.ProjectID, config.Region) @@ -518,8 +509,8 @@ func CreateGCECloud(config *CloudConfig) (*Cloud, error) { localZone: config.Zone, managedZones: config.ManagedZones, networkURL: networkURL, - isLegacyNetwork: isLegacyNetwork, - subnetworkURL: subnetURL, + unsafeIsLegacyNetwork: isLegacyNetwork, + unsafeSubnetworkURL: subnetURL, secondaryRangeName: config.SecondaryRangeName, nodeTags: config.NodeTags, nodeInstancePrefix: config.NodeInstancePrefix, @@ -542,6 +533,45 @@ func CreateGCECloud(config *CloudConfig) (*Cloud, error) { return gce, nil } +// initializeNetworkConfig() is supposed to be called under sync.Once() +// for accessors to subnetworkURL and isLegacyNetwork fields. +func (g *Cloud) initializeSubnetworkURLAndIsLegacyNetwork() { + if g.unsafeSubnetworkURL != "" { + // This has already been initialized via the config. + return + } + + var subnetURL string + var isLegacyNetwork bool + + // Determine the type of network and attempt to discover the correct subnet for AUTO mode. + // Gracefully fail because kubelet calls CreateGCECloud without any config, and minions + // lack the proper credentials for API calls. + if networkName := lastComponent(g.NetworkURL()); networkName != "" { + if n, err := getNetwork(g.service, g.NetworkProjectID(), networkName); err != nil { + klog.Warningf("Could not retrieve network %q; err: %v", networkName, err) + } else { + switch typeOfNetwork(n) { + case netTypeLegacy: + klog.Infof("Network %q is type legacy - no subnetwork", networkName) + isLegacyNetwork = true + case netTypeCustom: + klog.Warningf("Network %q is type custom - cannot auto select a subnetwork", networkName) + case netTypeAuto: + subnetURL, err = determineSubnetURL(g.service, g.NetworkProjectID(), networkName, g.Region()) + if err != nil { + klog.Warningf("Could not determine subnetwork for network %q and region %v; err: %v", networkName, g.Region(), err) + } else { + klog.Infof("Auto selecting subnetwork %q", subnetURL) + } + } + } + } + + g.unsafeSubnetworkURL = subnetURL + g.unsafeIsLegacyNetwork = isLegacyNetwork +} + // SetRateLimiter adds a custom cloud.RateLimiter implementation. // WARNING: Calling this could have unexpected behavior if you have in-flight // requests. It is best to use this immediately after creating a Cloud. @@ -672,12 +702,14 @@ func (g *Cloud) NetworkURL() string { // SubnetworkURL returns the subnetwork url func (g *Cloud) SubnetworkURL() string { - return g.subnetworkURL + g.subnetworkURLAndIsLegacyNetworkInitializer.Do(g.initializeSubnetworkURLAndIsLegacyNetwork) + return g.unsafeSubnetworkURL } // IsLegacyNetwork returns true if the cluster is still running a legacy network configuration. func (g *Cloud) IsLegacyNetwork() bool { - return g.isLegacyNetwork + g.subnetworkURLAndIsLegacyNetworkInitializer.Do(g.initializeSubnetworkURLAndIsLegacyNetwork) + return g.unsafeIsLegacyNetwork } // SetInformers sets up the zone handlers we need watching for node changes. diff --git a/staging/src/k8s.io/legacy-cloud-providers/gce/gce_loadbalancer_internal.go b/staging/src/k8s.io/legacy-cloud-providers/gce/gce_loadbalancer_internal.go index 30546d324092..721a89c198b4 100644 --- a/staging/src/k8s.io/legacy-cloud-providers/gce/gce_loadbalancer_internal.go +++ b/staging/src/k8s.io/legacy-cloud-providers/gce/gce_loadbalancer_internal.go @@ -57,7 +57,7 @@ func (g *Cloud) ensureInternalLoadBalancer(clusterName, clusterID string, svc *v } scheme := cloud.SchemeInternal options := getILBOptions(svc) - if g.isLegacyNetwork { + if g.IsLegacyNetwork() { g.eventRecorder.Event(svc, v1.EventTypeWarning, "ILBOptionsIgnored", "Internal LoadBalancer options are not supported with Legacy Networks.") options = ILBOptions{} } From f8c68ef7b4ca62ec41ed1c7dd91c6c0720d5a7a4 Mon Sep 17 00:00:00 2001 From: wojtekt Date: Fri, 17 Apr 2020 15:52:47 +0200 Subject: [PATCH 2/2] Avoid unnecessary GCE API calls for IP-alias calls This is to avoid unnecessary GCE API calls done by getInstanceByName helper, which is iterating over all zones to find in which zone the VM exists. ProviderID already contains all the information - it's in the form: gce:// (VM URL contains project, zone, VM name). ProviderID is propagated by Kubelet on node registration and in case of bugs backfilled by node-controller. --- pkg/controller/nodeipam/ipam/adapter.go | 19 ++++++++++---- .../nodeipam/ipam/cloud_cidr_allocator.go | 5 +++- pkg/controller/nodeipam/ipam/sync/sync.go | 10 +++---- .../nodeipam/ipam/sync/sync_test.go | 8 +++--- .../gce/gce_instances.go | 26 +++++++++---------- 5 files changed, 40 insertions(+), 28 deletions(-) diff --git a/pkg/controller/nodeipam/ipam/adapter.go b/pkg/controller/nodeipam/ipam/adapter.go index bfbfb9e24169..10939a0d974f 100644 --- a/pkg/controller/nodeipam/ipam/adapter.go +++ b/pkg/controller/nodeipam/ipam/adapter.go @@ -21,6 +21,7 @@ package ipam import ( "context" "encoding/json" + "fmt" "net" "k8s.io/klog" @@ -60,8 +61,12 @@ func newAdapter(k8s clientset.Interface, cloud *gce.Cloud) *adapter { return ret } -func (a *adapter) Alias(ctx context.Context, nodeName string) (*net.IPNet, error) { - cidrs, err := a.cloud.AliasRanges(types.NodeName(nodeName)) +func (a *adapter) Alias(ctx context.Context, node *v1.Node) (*net.IPNet, error) { + if node.Spec.ProviderID == "" { + return nil, fmt.Errorf("node %s doesn't have providerID", node.Name) + } + + cidrs, err := a.cloud.AliasRangesByProviderID(node.Spec.ProviderID) if err != nil { return nil, err } @@ -72,7 +77,7 @@ func (a *adapter) Alias(ctx context.Context, nodeName string) (*net.IPNet, error case 1: break default: - klog.Warningf("Node %q has more than one alias assigned (%v), defaulting to the first", nodeName, cidrs) + klog.Warningf("Node %q has more than one alias assigned (%v), defaulting to the first", node.Name, cidrs) } _, cidrRange, err := net.ParseCIDR(cidrs[0]) @@ -83,8 +88,12 @@ func (a *adapter) Alias(ctx context.Context, nodeName string) (*net.IPNet, error return cidrRange, nil } -func (a *adapter) AddAlias(ctx context.Context, nodeName string, cidrRange *net.IPNet) error { - return a.cloud.AddAliasToInstance(types.NodeName(nodeName), cidrRange) +func (a *adapter) AddAlias(ctx context.Context, node *v1.Node, cidrRange *net.IPNet) error { + if node.Spec.ProviderID == "" { + return fmt.Errorf("node %s doesn't have providerID", node.Name) + } + + return a.cloud.AddAliasToInstanceByProviderID(node.Spec.ProviderID, cidrRange) } func (a *adapter) Node(ctx context.Context, name string) (*v1.Node, error) { diff --git a/pkg/controller/nodeipam/ipam/cloud_cidr_allocator.go b/pkg/controller/nodeipam/ipam/cloud_cidr_allocator.go index 19adb41026c4..63975dc76c10 100644 --- a/pkg/controller/nodeipam/ipam/cloud_cidr_allocator.go +++ b/pkg/controller/nodeipam/ipam/cloud_cidr_allocator.go @@ -249,8 +249,11 @@ func (ca *cloudCIDRAllocator) updateCIDRAllocation(nodeName string) error { klog.Errorf("Failed while getting node %v for updating Node.Spec.PodCIDR: %v", nodeName, err) return err } + if node.Spec.ProviderID == "" { + return fmt.Errorf("node %s doesn't have providerID", nodeName) + } - cidrs, err := ca.cloud.AliasRanges(types.NodeName(nodeName)) + cidrs, err := ca.cloud.AliasRangesByProviderID(node.Spec.ProviderID) if err != nil { nodeutil.RecordNodeStatusChange(ca.recorder, node, "CIDRNotAvailable") return fmt.Errorf("failed to allocate cidr: %v", err) diff --git a/pkg/controller/nodeipam/ipam/sync/sync.go b/pkg/controller/nodeipam/ipam/sync/sync.go index ee95392b8ff9..733f76c592cc 100644 --- a/pkg/controller/nodeipam/ipam/sync/sync.go +++ b/pkg/controller/nodeipam/ipam/sync/sync.go @@ -43,9 +43,9 @@ const ( // cloudAlias is the interface to the cloud platform APIs. type cloudAlias interface { // Alias returns the IP alias for the node. - Alias(ctx context.Context, nodeName string) (*net.IPNet, error) + Alias(ctx context.Context, node *v1.Node) (*net.IPNet, error) // AddAlias adds an alias to the node. - AddAlias(ctx context.Context, nodeName string, cidrRange *net.IPNet) error + AddAlias(ctx context.Context, node *v1.Node, cidrRange *net.IPNet) error } // kubeAPI is the interface to the Kubernetes APIs. @@ -204,7 +204,7 @@ func (op *updateOp) run(sync *NodeSync) error { op.node = node } - aliasRange, err := sync.cloudAlias.Alias(ctx, sync.nodeName) + aliasRange, err := sync.cloudAlias.Alias(ctx, op.node) if err != nil { klog.Errorf("Error getting cloud alias for node %q: %v", sync.nodeName, err) return err @@ -293,7 +293,7 @@ func (op *updateOp) updateAliasFromNode(ctx context.Context, sync *NodeSync, nod return err } - if err := sync.cloudAlias.AddAlias(ctx, node.Name, aliasRange); err != nil { + if err := sync.cloudAlias.AddAlias(ctx, node, aliasRange); err != nil { klog.Errorf("Could not add alias %v for node %q: %v", aliasRange, node.Name, err) return err } @@ -325,7 +325,7 @@ func (op *updateOp) allocateRange(ctx context.Context, sync *NodeSync, node *v1. // If addAlias returns a hard error, cidrRange will be leaked as there // is no durable record of the range. The missing space will be // recovered on the next restart of the controller. - if err := sync.cloudAlias.AddAlias(ctx, node.Name, cidrRange); err != nil { + if err := sync.cloudAlias.AddAlias(ctx, node, cidrRange); err != nil { klog.Errorf("Could not add alias %v for node %q: %v", cidrRange, node.Name, err) return err } diff --git a/pkg/controller/nodeipam/ipam/sync/sync_test.go b/pkg/controller/nodeipam/ipam/sync/sync_test.go index 8c80b2c6453b..a10da6e2f458 100644 --- a/pkg/controller/nodeipam/ipam/sync/sync_test.go +++ b/pkg/controller/nodeipam/ipam/sync/sync_test.go @@ -58,13 +58,13 @@ type fakeAPIs struct { results []error } -func (f *fakeAPIs) Alias(ctx context.Context, nodeName string) (*net.IPNet, error) { - f.calls = append(f.calls, fmt.Sprintf("alias %v", nodeName)) +func (f *fakeAPIs) Alias(ctx context.Context, node *v1.Node) (*net.IPNet, error) { + f.calls = append(f.calls, fmt.Sprintf("alias %v", node.Name)) return f.aliasRange, f.aliasErr } -func (f *fakeAPIs) AddAlias(ctx context.Context, nodeName string, cidrRange *net.IPNet) error { - f.calls = append(f.calls, fmt.Sprintf("addAlias %v %v", nodeName, cidrRange)) +func (f *fakeAPIs) AddAlias(ctx context.Context, node *v1.Node, cidrRange *net.IPNet) error { + f.calls = append(f.calls, fmt.Sprintf("addAlias %v %v", node.Name, cidrRange)) return f.addAliasErr } diff --git a/staging/src/k8s.io/legacy-cloud-providers/gce/gce_instances.go b/staging/src/k8s.io/legacy-cloud-providers/gce/gce_instances.go index 15d7b6f2e3f7..f0881f7f0977 100644 --- a/staging/src/k8s.io/legacy-cloud-providers/gce/gce_instances.go +++ b/staging/src/k8s.io/legacy-cloud-providers/gce/gce_instances.go @@ -361,21 +361,20 @@ func (g *Cloud) CurrentNodeName(ctx context.Context, hostname string) (types.Nod return types.NodeName(hostname), nil } -// AliasRanges returns a list of CIDR ranges that are assigned to the +// AliasRangesByProviderID returns a list of CIDR ranges that are assigned to the // `node` for allocation to pods. Returns a list of the form // "/". -func (g *Cloud) AliasRanges(nodeName types.NodeName) (cidrs []string, err error) { +func (g *Cloud) AliasRangesByProviderID(providerID string) (cidrs []string, err error) { ctx, cancel := cloud.ContextWithCallTimeout() defer cancel() - var instance *gceInstance - instance, err = g.getInstanceByName(mapNodeNameToInstanceName(nodeName)) + _, zone, name, err := splitProviderID(providerID) if err != nil { - return + return nil, err } var res *computebeta.Instance - res, err = g.c.BetaInstances().Get(ctx, meta.ZonalKey(instance.Name, lastComponent(instance.Zone))) + res, err = g.c.BetaInstances().Get(ctx, meta.ZonalKey(canonicalizeInstanceName(name), zone)) if err != nil { return } @@ -388,28 +387,29 @@ func (g *Cloud) AliasRanges(nodeName types.NodeName) (cidrs []string, err error) return } -// AddAliasToInstance adds an alias to the given instance from the named +// AddAliasToInstanceByProviderID adds an alias to the given instance from the named // secondary range. -func (g *Cloud) AddAliasToInstance(nodeName types.NodeName, alias *net.IPNet) error { +func (g *Cloud) AddAliasToInstanceByProviderID(providerID string, alias *net.IPNet) error { ctx, cancel := cloud.ContextWithCallTimeout() defer cancel() - v1instance, err := g.getInstanceByName(mapNodeNameToInstanceName(nodeName)) + _, zone, name, err := splitProviderID(providerID) if err != nil { return err } - instance, err := g.c.BetaInstances().Get(ctx, meta.ZonalKey(v1instance.Name, lastComponent(v1instance.Zone))) + + instance, err := g.c.BetaInstances().Get(ctx, meta.ZonalKey(canonicalizeInstanceName(name), zone)) if err != nil { return err } switch len(instance.NetworkInterfaces) { case 0: - return fmt.Errorf("instance %q has no network interfaces", nodeName) + return fmt.Errorf("instance %q has no network interfaces", providerID) case 1: default: klog.Warningf("Instance %q has more than one network interface, using only the first (%v)", - nodeName, instance.NetworkInterfaces) + providerID, instance.NetworkInterfaces) } iface := &computebeta.NetworkInterface{} @@ -420,7 +420,7 @@ func (g *Cloud) AddAliasToInstance(nodeName types.NodeName, alias *net.IPNet) er SubnetworkRangeName: g.secondaryRangeName, }) - mc := newInstancesMetricContext("add_alias", v1instance.Zone) + mc := newInstancesMetricContext("add_alias", zone) err = g.c.BetaInstances().UpdateNetworkInterface(ctx, meta.ZonalKey(instance.Name, lastComponent(instance.Zone)), iface.Name, iface) return mc.Observe(err) }