fix: decouple vmss with 0 instance from lb when deleting the service #2489

Merged · 2 commits · Oct 14, 2022
6 changes: 3 additions & 3 deletions hack/deploy-cluster-capz.sh
@@ -39,7 +39,7 @@ export AZURE_NODE_MACHINE_TYPE="${AZURE_NODE_MACHINE_TYPE:-Standard_D2s_v3}"
export AZURE_LOCATION="${AZURE_LOCATION:-westus2}"
export AZURE_CLOUD_CONTROLLER_MANAGER_IMG="${AZURE_CLOUD_CONTROLLER_MANAGER_IMG:-mcr.microsoft.com/oss/kubernetes/azure-cloud-controller-manager:v1.23.1}"
export AZURE_CLOUD_NODE_MANAGER_IMG="${AZURE_CLOUD_NODE_MANAGER_IMG:-mcr.microsoft.com/oss/kubernetes/azure-cloud-node-manager:v1.23.1}"
export KUBERNETES_VERSION="${KUBERNETES_VERSION:-v1.23.0}"
export KUBERNETES_VERSION="${KUBERNETES_VERSION:-v1.25.0}"
export USE_IN_TREE_CLOUD_PROVIDER="${USE_IN_TREE_CLOUD_PROVIDER:-false}"
export EXP_MACHINE_POOL=true
export EXP_CLUSTER_RESOURCE_SET=true
@@ -111,15 +111,15 @@ function create_workload_cluster() {
fi

echo "Waiting for the kubeconfig to become available"
timeout --foreground 300 bash -c "while ! kubectl get secrets | grep ${CLUSTER_NAME}-kubeconfig; do sleep 1; done"
timeout --foreground 1000 bash -c "while ! kubectl get secrets | grep ${CLUSTER_NAME}-kubeconfig; do sleep 1; done"
if [ "$?" == 124 ]; then
echo "Timeout waiting for the kubeconfig to become available, please check the logs of the capz controller to get the detailed error"
return 124
fi
echo "Get kubeconfig and store it locally."
kubectl --context=kind-"${MANAGEMENT_CLUSTER_NAME}" get secrets "${CLUSTER_NAME}"-kubeconfig -o json | jq -r .data.value | base64 --decode > ./"${CLUSTER_NAME}"-kubeconfig
echo "Waiting for the control plane nodes to show up"
timeout --foreground 600 bash -c "while ! kubectl --kubeconfig=./${CLUSTER_NAME}-kubeconfig get nodes | grep master; do sleep 1; done"
timeout --foreground 1000 bash -c "while ! kubectl --kubeconfig=./${CLUSTER_NAME}-kubeconfig get nodes | grep master; do sleep 1; done"
if [ "$?" == 124 ]; then
echo "Timeout waiting for the control plane nodes"
return 124
109 changes: 84 additions & 25 deletions pkg/provider/azure_vmss.go
@@ -1678,22 +1678,80 @@ func getScaleSetAndResourceGroupNameByIPConfigurationID(ipConfigurationID string
return scaleSetName, resourceGroup, nil
}

func (ss *ScaleSet) ensureBackendPoolDeletedFromVMSS(service *v1.Service, backendPoolID, vmSetName string, ipConfigurationIDs []string) error {
func (ss *ScaleSet) ensureBackendPoolDeletedFromVMSS(backendPoolID, vmSetName string) error {
vmssNamesMap := make(map[string]bool)

// the standard load balancer supports multiple vmss in its backend while the basic sku doesn't
if ss.useStandardLoadBalancer() {
for _, ipConfigurationID := range ipConfigurationIDs {
// in this scenario the vmSetName is an empty string and the name of vmss should be obtained from the provider IDs of nodes
vmssName, resourceGroupName, err := getScaleSetAndResourceGroupNameByIPConfigurationID(ipConfigurationID)
cachedUniform, err := ss.vmssCache.Get(consts.VMSSKey, azcache.CacheReadTypeDefault)
if err != nil {
klog.Errorf("ensureBackendPoolDeletedFromVMSS: failed to get vmss uniform from cache: %v", err)
return err
}
flexScaleSet := ss.flexScaleSet.(*FlexScaleSet)
cachedFlex, err := flexScaleSet.vmssFlexCache.Get(consts.VmssFlexKey, azcache.CacheReadTypeDefault)
if err != nil {
klog.Errorf("ensureBackendPoolDeletedFromVMSS: failed to get vmss flex from cache: %v", err)
return err
}
vmssUniformMap := cachedUniform.(*sync.Map)
vmssFlexMap := cachedFlex.(*sync.Map)

var errorList []error
walk := func(key, value interface{}) bool {
var vmss *compute.VirtualMachineScaleSet
if vmssEntry, ok := value.(*vmssEntry); ok {
vmss = vmssEntry.vmss
} else if v, ok := value.(*compute.VirtualMachineScaleSet); ok {
vmss = v
}

// When vmss is being deleted, CreateOrUpdate API would report "the vmss is being deleted" error.
// Since it is being deleted, we shouldn't send more CreateOrUpdate requests for it.
if vmss.ProvisioningState != nil && strings.EqualFold(*vmss.ProvisioningState, consts.VirtualMachineScaleSetsDeallocating) {
klog.V(3).Infof("ensureBackendPoolDeletedFromVMSS: found vmss %s being deleted, skipping", to.String(vmss.Name))
return true
}
if vmss.VirtualMachineProfile == nil {
klog.V(4).Infof("ensureBackendPoolDeletedFromVMSS: vmss %s has no VirtualMachineProfile, skipping", to.String(vmss.Name))
return true
}
if vmss.VirtualMachineProfile.NetworkProfile.NetworkInterfaceConfigurations == nil {
Review comment — zmyzheng (Contributor), Oct 13, 2022:
When using VMSS Flex, it is possible that the VMSS Flex does not have a VM profile. We can skip a VMSS Flex that does not have a VM profile.

Author reply: Done
klog.V(4).Infof("ensureBackendPoolDeletedFromVMSS: cannot obtain the primary network interface configuration, of vmss %s", to.String(vmss.Name))
return true
}
vmssNIC := *vmss.VirtualMachineProfile.NetworkProfile.NetworkInterfaceConfigurations
primaryNIC, err := getPrimaryNetworkInterfaceConfigurationForScaleSet(vmssNIC, to.String(vmss.Name))
if err != nil {
klog.V(4).Infof("ensureBackendPoolDeletedFromVMSS: found VMAS ipcConfigurationID %s, will skip checking and continue", ipConfigurationID)
continue
klog.Errorf("ensureBackendPoolDeletedFromVMSS: failed to get the primary network interface config of the VMSS %s: %v", to.String(vmss.Name), err)
errorList = append(errorList, err)
return true
}
// only vmsses in the resource group same as it's in azure config are included
if strings.EqualFold(resourceGroupName, ss.ResourceGroup) {
vmssNamesMap[vmssName] = true
primaryIPConfig, err := getPrimaryIPConfigFromVMSSNetworkConfig(primaryNIC)
if err != nil {
klog.Errorf("ensureBackendPoolDeletedFromVMSS: failed to the primary IP config from the VMSS %s's network config : %v", to.String(vmss.Name), err)
errorList = append(errorList, err)
return true
}
loadBalancerBackendAddressPools := make([]compute.SubResource, 0)
if primaryIPConfig.LoadBalancerBackendAddressPools != nil {
loadBalancerBackendAddressPools = *primaryIPConfig.LoadBalancerBackendAddressPools
}
for _, loadBalancerBackendAddressPool := range loadBalancerBackendAddressPools {
if strings.EqualFold(to.String(loadBalancerBackendAddressPool.ID), backendPoolID) {
klog.V(4).Infof("ensureBackendPoolDeletedFromVMSS: found vmss %s with backend pool %s, removing it", to.String(vmss.Name), backendPoolID)
vmssNamesMap[to.String(vmss.Name)] = true
}
}

return true
}

// Walk through all cached vmss, and find the vmss that contains the backendPoolID.
vmssUniformMap.Range(walk)
vmssFlexMap.Range(walk)
if len(errorList) > 0 {
return utilerrors.Flatten(utilerrors.NewAggregate(errorList))
}
} else {
vmssNamesMap[vmSetName] = true
@@ -1728,15 +1786,7 @@ func (ss *ScaleSet) ensureBackendPoolDeleted(service *v1.Service, backendPoolID,
}
}

// 1. Ensure the backendPoolID is deleted from the VMSS.
if deleteFromVMSet {
err := ss.ensureBackendPoolDeletedFromVMSS(service, backendPoolID, vmSetName, ipConfigurationIDs)
if err != nil {
return err
}
}

// 2. Ensure the backendPoolID is deleted from the VMSS VMs.
// Ensure the backendPoolID is deleted from the VMSS VMs.
hostUpdates := make([]func() error, 0, len(ipConfigurationIDs))
nodeUpdates := make(map[vmssMetaInfo]map[string]compute.VirtualMachineScaleSetVM)
allErrs := make([]error, 0)
@@ -1881,6 +1931,15 @@ func (ss *ScaleSet) EnsureBackendPoolDeleted(service *v1.Service, backendPoolID,
}
}

// make sure all vmss including uniform and flex are decoupled from
// the lb backend pool even if there are no ipConfigs in the backend pool.
if deleteFromVMSet {
err := ss.ensureBackendPoolDeletedFromVMSS(backendPoolID, vmSetName)
if err != nil {
return err
}
}

if len(vmssUniformBackendIPConfigurations) > 0 {
vmssUniformBackendPools := &[]network.BackendAddressPool{
{
@@ -1990,23 +2049,23 @@ func (ss *ScaleSet) EnsureBackendPoolDeletedFromVMSets(vmssNamesMap map[string]b
// When vmss is being deleted, CreateOrUpdate API would report "the vmss is being deleted" error.
// Since it is being deleted, we shouldn't send more CreateOrUpdate requests for it.
if vmss.ProvisioningState != nil && strings.EqualFold(*vmss.ProvisioningState, consts.VirtualMachineScaleSetsDeallocating) {
klog.V(3).Infof("ensureVMSSInPool: found vmss %s being deleted, skipping", vmssName)
klog.V(3).Infof("EnsureBackendPoolDeletedFromVMSets: found vmss %s being deleted, skipping", vmssName)
continue
}
if vmss.VirtualMachineProfile.NetworkProfile.NetworkInterfaceConfigurations == nil {
klog.V(4).Infof("EnsureHostInPool: cannot obtain the primary network interface configuration, of vmss %s", vmssName)
klog.V(4).Infof("EnsureBackendPoolDeletedFromVMSets: cannot obtain the primary network interface configuration, of vmss %s", vmssName)
continue
}
vmssNIC := *vmss.VirtualMachineProfile.NetworkProfile.NetworkInterfaceConfigurations
primaryNIC, err := getPrimaryNetworkInterfaceConfigurationForScaleSet(vmssNIC, vmssName)
if err != nil {
klog.Errorf("ensureBackendPoolDeletedFromVMSS: failed to get the primary network interface config of the VMSS %s: %v", vmssName, err)
klog.Errorf("EnsureBackendPoolDeletedFromVMSets: failed to get the primary network interface config of the VMSS %s: %v", vmssName, err)
errors = append(errors, err)
continue
}
primaryIPConfig, err := getPrimaryIPConfigFromVMSSNetworkConfig(primaryNIC)
if err != nil {
klog.Errorf("ensureBackendPoolDeletedFromVMSS: failed to the primary IP config from the VMSS %s's network config : %v", vmssName, err)
klog.Errorf("EnsureBackendPoolDeletedFromVMSets: failed to the primary IP config from the VMSS %s's network config : %v", vmssName, err)
errors = append(errors, err)
continue
}
@@ -2020,7 +2079,7 @@ func (ss *ScaleSet) EnsureBackendPoolDeletedFromVMSets(vmssNamesMap map[string]b
for i := len(loadBalancerBackendAddressPools) - 1; i >= 0; i-- {
curPool := loadBalancerBackendAddressPools[i]
if strings.EqualFold(backendPoolID, *curPool.ID) {
klog.V(10).Infof("ensureBackendPoolDeletedFromVMSS gets unwanted backend pool %q for VMSS %s", backendPoolID, vmssName)
klog.V(10).Infof("EnsureBackendPoolDeletedFromVMSets gets unwanted backend pool %q for VMSS %s", backendPoolID, vmssName)
found = true
newBackendPools = append(loadBalancerBackendAddressPools[:i], loadBalancerBackendAddressPools[i+1:]...)
}
@@ -2043,10 +2102,10 @@ func (ss *ScaleSet) EnsureBackendPoolDeletedFromVMSets(vmssNamesMap map[string]b
},
}

klog.V(2).Infof("ensureBackendPoolDeletedFromVMSS begins to update vmss(%s) with backendPoolID %s", vmssName, backendPoolID)
klog.V(2).Infof("EnsureBackendPoolDeletedFromVMSets begins to update vmss(%s) with backendPoolID %s", vmssName, backendPoolID)
rerr := ss.CreateOrUpdateVMSS(ss.ResourceGroup, vmssName, newVMSS)
if rerr != nil {
klog.Errorf("ensureBackendPoolDeletedFromVMSS CreateOrUpdateVMSS(%s) with new backendPoolID %s, err: %v", vmssName, backendPoolID, rerr)
klog.Errorf("EnsureBackendPoolDeletedFromVMSets CreateOrUpdateVMSS(%s) with new backendPoolID %s, err: %v", vmssName, backendPoolID, rerr)
return rerr.Error()
}

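For readers skimming the diff, here is a minimal, standalone sketch of the new walk pattern. The types and names (`scaleSet`, `ipConfig`, `findVMSSWithBackendPool`) are illustrative stand-ins invented for this note, not the SDK's `compute.VirtualMachineScaleSet` or the provider's real API, and the real code compares the provisioning state against a package constant rather than a literal:

```go
package main

import (
	"fmt"
	"strings"
	"sync"
)

// Illustrative stand-ins for the Azure SDK types used by the real code.
type ipConfig struct {
	backendPoolIDs []string
}

type scaleSet struct {
	name              string
	provisioningState string
	hasVMProfile      bool
	primaryIPConfig   *ipConfig
}

// findVMSSWithBackendPool mirrors the walk in ensureBackendPoolDeletedFromVMSS:
// visit every cached scale set (uniform and flex), skip entries that are being
// deleted or have no VM profile, and record the names of those whose primary
// IP configuration still references backendPoolID.
func findVMSSWithBackendPool(uniform, flex *sync.Map, backendPoolID string) map[string]bool {
	names := make(map[string]bool)
	walk := func(_, value interface{}) bool {
		vmss, ok := value.(*scaleSet)
		if !ok {
			return true // unknown cache entry type; keep walking
		}
		// A VMSS that is being deleted must not receive more CreateOrUpdate calls.
		if strings.EqualFold(vmss.provisioningState, "Deleting") {
			return true
		}
		// VMSS Flex may have no VM profile at all (the review point above); skip it.
		if !vmss.hasVMProfile || vmss.primaryIPConfig == nil {
			return true
		}
		for _, id := range vmss.primaryIPConfig.backendPoolIDs {
			if strings.EqualFold(id, backendPoolID) {
				names[vmss.name] = true
			}
		}
		return true // continue the Range walk
	}
	uniform.Range(walk)
	flex.Range(walk)
	return names
}

func main() {
	poolID := "/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Network/loadBalancers/lb/backendAddressPools/pool-0"
	uniform, flex := &sync.Map{}, &sync.Map{}
	uniform.Store("vmss-0", &scaleSet{name: "vmss-0", hasVMProfile: true,
		primaryIPConfig: &ipConfig{backendPoolIDs: []string{poolID}}})
	flex.Store("vmss-flex-0", &scaleSet{name: "vmss-flex-0", hasVMProfile: false}) // no VM profile: skipped
	fmt.Println(findVMSSWithBackendPool(uniform, flex, poolID)) // map[vmss-0:true]
}
```

The key behavioral change: because the walk starts from the VMSS caches rather than from the backend pool's ipConfigurationIDs, a scale set with zero instances (and therefore no IP configurations left in the pool) is still found and decoupled from the load balancer.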
2 changes: 1 addition & 1 deletion pkg/provider/azure_vmss_cache.go
@@ -66,7 +66,7 @@ const (

func (ss *ScaleSet) newVMSSCache() (*azcache.TimedCache, error) {
getter := func(key string) (interface{}, error) {
localCache := &sync.Map{} // [vmasName]*vmssEntry
localCache := &sync.Map{} // [vmssName]*vmssEntry

allResourceGroups, err := ss.GetResourceGroups()
if err != nil {
47 changes: 23 additions & 24 deletions pkg/provider/azure_vmss_test.go
@@ -2413,42 +2413,38 @@ func TestEnsureBackendPoolDeletedFromVMSS(t *testing.T) {
defer ctrl.Finish()

testCases := []struct {
description string
backendPoolID string
ipConfigurationIDs []string
isVMSSDeallocating bool
isVMSSNilNICConfig bool
expectedPutVMSS bool
vmssClientErr *retry.Error
expectedErr error
description string
backendPoolID string
ipConfigurationIDs []string
isVMSSDeallocating bool
isVMSSNilNICConfig bool
isVMSSNilVirtualMachineProfile bool
expectedPutVMSS bool
vmssClientErr *retry.Error
expectedErr error
}{
{
description: "ensureBackendPoolDeletedFromVMSS should skip the IP config if it's ID is invalid",
ipConfigurationIDs: []string{"invalid/id"},
},
{
description: "ensureBackendPoolDeletedFromVMSS should skip the VMSS if it's being deleting",
ipConfigurationIDs: []string{"/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/vmss/virtualMachines/vmss-vm-000000/networkInterfaces/nic"},
isVMSSDeallocating: true,
},
{
description: "ensureBackendPoolDeletedFromVMSS should skip the VMSS if it's NIC config is nil",
ipConfigurationIDs: []string{"/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/vmss/virtualMachines/vmss-vm-000000/networkInterfaces/nic"},
isVMSSNilNICConfig: true,
},
{
description: "ensureBackendPoolDeletedFromVMSS should delete the corresponding LB backendpool ID",
ipConfigurationIDs: []string{"/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/vmss/virtualMachines/vmss-vm-000000/networkInterfaces/nic"},
backendPoolID: testLBBackendpoolID0,
expectedPutVMSS: true,
description: "ensureBackendPoolDeletedFromVMSS should delete the corresponding LB backendpool ID",
backendPoolID: testLBBackendpoolID0,
expectedPutVMSS: true,
},
{
description: "ensureBackendPoolDeletedFromVMSS should skip the VMSS if there's no wanted LB backendpool ID",
ipConfigurationIDs: []string{"/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/vmss/virtualMachines/vmss-vm-000000/networkInterfaces/nic"},
backendPoolID: testLBBackendpoolID1,
description: "ensureBackendPoolDeletedFromVMSS should skip the VMSS if there's no wanted LB backendpool ID",
backendPoolID: testLBBackendpoolID1,
},
{
description: "ensureBackendPoolDeletedFromVMSS should skip the VMSS if there's no vm profile",
isVMSSNilVirtualMachineProfile: true,
},
{
description: "ensureBackendPoolDeletedFromVMSS should report the error that occurs during VMSS client's call",
ipConfigurationIDs: []string{"/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/vmss/virtualMachines/vmss-vm-000000/networkInterfaces/nic"},
backendPoolID: testLBBackendpoolID0,
expectedPutVMSS: true,
@@ -2470,8 +2466,11 @@
if test.isVMSSNilNICConfig {
expectedVMSS.VirtualMachineProfile.NetworkProfile.NetworkInterfaceConfigurations = nil
}
if test.isVMSSNilVirtualMachineProfile {
expectedVMSS.VirtualMachineProfile = nil
}
mockVMSSClient := ss.cloud.VirtualMachineScaleSetsClient.(*mockvmssclient.MockInterface)
mockVMSSClient.EXPECT().List(gomock.Any(), ss.ResourceGroup).Return([]compute.VirtualMachineScaleSet{expectedVMSS}, nil).AnyTimes()
mockVMSSClient.EXPECT().List(gomock.Any(), ss.ResourceGroup).Return([]compute.VirtualMachineScaleSet{expectedVMSS}, nil).Times(2)
vmssPutTimes := 0
if test.expectedPutVMSS {
vmssPutTimes = 1
Expand All @@ -2483,7 +2482,7 @@ func TestEnsureBackendPoolDeletedFromVMSS(t *testing.T) {
mockVMSSVMClient := ss.cloud.VirtualMachineScaleSetVMsClient.(*mockvmssvmclient.MockInterface)
mockVMSSVMClient.EXPECT().List(gomock.Any(), ss.ResourceGroup, testVMSSName, gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes()

err = ss.ensureBackendPoolDeletedFromVMSS(&v1.Service{}, test.backendPoolID, testVMSSName, test.ipConfigurationIDs)
err = ss.ensureBackendPoolDeletedFromVMSS(test.backendPoolID, testVMSSName)
if test.expectedErr != nil {
assert.EqualError(t, test.expectedErr, err.Error(), test.description+", but an error occurs")
}
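The switch from `.AnyTimes()` to `.Times(2)` pins down how often the VMSS client's `List` is expected to run — plausibly once when the uniform VMSS cache is refreshed and once for the flex cache — instead of tolerating any number of calls. A hand-rolled counting fake shows the idea without gomock; the `vmssLister` interface and its names are invented for illustration:

```go
package main

import "fmt"

// vmssLister is a hypothetical, minimal stand-in for the VMSS client interface.
type vmssLister interface {
	List(resourceGroup string) ([]string, error)
}

// countingClient counts List calls, mimicking what gomock's .Times(2) asserts.
type countingClient struct{ listCalls int }

func (c *countingClient) List(resourceGroup string) ([]string, error) {
	c.listCalls++
	return []string{"vmss"}, nil
}

func main() {
	var c countingClient
	var client vmssLister = &c
	client.List("rg") // refresh of the uniform VMSS cache
	client.List("rg") // refresh of the flex VMSS cache
	if c.listCalls != 2 {
		fmt.Printf("expected 2 List calls, got %d\n", c.listCalls)
		return
	}
	fmt.Println("List called exactly twice, as the tightened expectation requires")
}
```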
19 changes: 1 addition & 18 deletions pkg/provider/azure_vmssflex_test.go
@@ -1846,24 +1846,7 @@ func TestEnsureBackendPoolDeletedVmssFlex(t *testing.T) {
expectedErr: nil,
},
{
description: "EnsureBackendPoolDeleted should return error if vmss update fails",
service: &v1.Service{},
vmSetName: "vmssflex1",
backendPoolID: testBackendPoolID0,
backendAddressPools: testBackendPools,
deleteFromVMSet: true,
isStandardLB: true,
useMultipleSLBs: false,
testVMListWithoutInstanceView: testVMListWithoutInstanceView,
testVMListWithOnlyInstanceView: testVMListWithOnlyInstanceView,
vmListErr: nil,
nic: generateTestNic("testvm1-nic", false, network.ProvisioningStateSucceeded),
nicGetErr: nil,
vmssPutErr: &retry.Error{RawError: fmt.Errorf("failed to update nic")},
expectedErr: fmt.Errorf("Retriable: false, RetryAfter: 0s, HTTPStatusCode: 0, RawError: failed to update nic"),
},
{
description: "EnsureBackendPoolDeleted should return error if vmss update fails",
description: "EnsureBackendPoolDeleted should return error if nic update fails",
service: &v1.Service{},
vmSetName: "vmssflex1",
backendPoolID: testBackendPoolID0,
2 changes: 1 addition & 1 deletion site/content/en/development/deploy-cluster.md
@@ -38,7 +38,7 @@ Customizations are supported by environment variables:
| AZURE_LOCATION | false | region of the cluster resources | westus2 |
| AZURE_CLOUD_CONTROLLER_MANAGER_IMG | false | image of the cloud-controller-manager | mcr.microsoft.com/oss/kubernetes/azure-cloud-controller-manager:v1.23.1 |
| AZURE_CLOUD_NODE_MANAGER_IMG | false | image of the cloud-node-manager | mcr.microsoft.com/oss/kubernetes/azure-cloud-node-manager:v1.23.1 |
| KUBERNETES_VERSION | false | Kubernetes components version | v1.23.0 |
| KUBERNETES_VERSION | false | Kubernetes components version | v1.25.0 |
| AZURE_LOADBALANCER_SKU | false | LoadBalancer SKU, Standard or Basic | Standard |
| ENABLE_MULTI_SLB | false | Enable multiple standard LoadBalancers per cluster | false |
| LB_BACKEND_POOL_CONFIG_TYPE | false | LoadBalancer backend pool configuration type, nodeIPConfiguration, nodeIP or podIP | nodeIPConfiguration |
4 changes: 2 additions & 2 deletions tests/k8s-azure/manifest/cluster-api/vmss-multi-nodepool.yaml
@@ -476,10 +476,10 @@ data:
priorityClassName: system-node-critical
hostNetwork: true
nodeSelector:
node-role.kubernetes.io/master: ""
node-role.kubernetes.io/control-plane: ""
serviceAccountName: cloud-controller-manager
tolerations:
- key: node-role.kubernetes.io/master
- key: node-role.kubernetes.io/control-plane
effect: NoSchedule
containers:
- name: cloud-controller-manager