Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[release-1.23] fix: Skip VMSS VM update operations if the VMs are not in good state #2399

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
24 changes: 23 additions & 1 deletion pkg/azureclients/vmssvmclient/azure_vmssvmclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (

azclients "sigs.k8s.io/cloud-provider-azure/pkg/azureclients"
"sigs.k8s.io/cloud-provider-azure/pkg/azureclients/armclient"
"sigs.k8s.io/cloud-provider-azure/pkg/consts"
"sigs.k8s.io/cloud-provider-azure/pkg/metrics"
"sigs.k8s.io/cloud-provider-azure/pkg/retry"
)
Expand Down Expand Up @@ -503,6 +504,22 @@ func (c *Client) updateVMSSVMs(ctx context.Context, resourceGroupName string, VM
defer c.armClient.CloseResponse(ctx, resp.Response)
if resp.Error != nil {
klog.V(5).Infof("Received error in %s: resourceID: %s, error: %s", "vmssvm.put.request", resourceID, resp.Error.Error())

errMsg := resp.Error.Error().Error()
if strings.Contains(errMsg, consts.VmssVMNotActiveErrorMessage) {
klog.V(2).Infof("VMSS VM %s is not active, skip updating it.", resourceID)
continue
}
if strings.Contains(errMsg, consts.ParentResourceNotFoundMessageCode) {
klog.V(2).Info("The parent resource of VMSS VM %s is not found, skip updating it.", resourceID)
continue
}
if strings.Contains(errMsg, consts.CannotUpdateVMBeingDeletedMessagePrefix) &&
strings.Contains(errMsg, consts.CannotUpdateVMBeingDeletedMessageSuffix) {
klog.V(2).Infof("The VM %s is being deleted, skip updating it.", resourceID)
continue
}

errors = append(errors, resp.Error)
continue
}
Expand All @@ -521,7 +538,12 @@ func (c *Client) updateVMSSVMs(ctx context.Context, resourceGroupName string, VM
rerr := &retry.Error{}
errs := make([]error, 0)
for _, err := range errors {
if err.IsThrottled() && err.RetryAfter.After(err.RetryAfter) {
if !err.Retriable && strings.Contains(err.Error().Error(), consts.ConcurrentRequestConflictMessage) {
err.Retriable = true
err.RetryAfter = time.Now().Add(5 * time.Second)
}

if err.IsThrottled() && err.RetryAfter.After(rerr.RetryAfter) {
rerr.RetryAfter = err.RetryAfter
}
errs = append(errs, err.Error())
Expand Down
62 changes: 62 additions & 0 deletions pkg/azureclients/vmssvmclient/azure_vmssvmclient_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import (
azclients "sigs.k8s.io/cloud-provider-azure/pkg/azureclients"
"sigs.k8s.io/cloud-provider-azure/pkg/azureclients/armclient"
"sigs.k8s.io/cloud-provider-azure/pkg/azureclients/armclient/mockarmclient"
"sigs.k8s.io/cloud-provider-azure/pkg/consts"
"sigs.k8s.io/cloud-provider-azure/pkg/retry"
)

Expand Down Expand Up @@ -795,6 +796,67 @@ func TestUpdateVMsThrottle(t *testing.T) {
assert.EqualError(t, throttleErr.Error(), rerr.RawError.Error())
}

func TestUpdateVMsIgnoreError(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

vmssVM := getTestVMSSVM("vmss1", "1")
vmssVM2 := getTestVMSSVM("vmss1", "2")
vmssVM3 := getTestVMSSVM("vmss1", "3")
vmssVM4 := getTestVMSSVM("vmss1", "4")
instances := map[string]compute.VirtualMachineScaleSetVM{
"1": vmssVM,
"2": vmssVM2,
"3": vmssVM3,
"4": vmssVM4,
}
testvmssVMs := map[string]interface{}{
to.String(vmssVM.ID): vmssVM,
to.String(vmssVM2.ID): vmssVM2,
to.String(vmssVM3.ID): vmssVM3,
to.String(vmssVM4.ID): vmssVM4,
}
notActiveError := retry.Error{
RawError: fmt.Errorf(consts.VmssVMNotActiveErrorMessage),
Retriable: false,
}
parentResourceNotFoundError := retry.Error{
RawError: fmt.Errorf(consts.ParentResourceNotFoundMessageCode),
Retriable: false,
}
concurrentRequestConflictError := retry.Error{
RawError: fmt.Errorf(consts.ConcurrentRequestConflictMessage),
Retriable: false,
}
beingDeletedError := retry.Error{
RawError: fmt.Errorf("operation 'Put on Virtual Machine Scale Set VM Instance' is not allowed on Virtual Machine Scale Set 'aks-stg1pool1-17586529-vmss' since it is marked for deletion. For more information on your operations, please refer to https://aka.ms/activitylog"),
Retriable: false,
}
responses := map[string]*armclient.PutResourcesResponse{
to.String(vmssVM.ID): {
Error: &notActiveError,
},
to.String(vmssVM2.ID): {
Error: &parentResourceNotFoundError,
},
to.String(vmssVM3.ID): {
Error: &concurrentRequestConflictError,
},
to.String(vmssVM4.ID): {
Error: &beingDeletedError,
},
}

armClient := mockarmclient.NewMockInterface(ctrl)
armClient.EXPECT().PutResourcesInBatches(gomock.Any(), testvmssVMs, 0).Return(responses).Times(1)
armClient.EXPECT().CloseResponse(gomock.Any(), gomock.Any()).Times(len(instances))

vmssvmClient := getTestVMSSVMClient(armClient)
rerr := vmssvmClient.UpdateVMs(context.TODO(), "rg", "vmss1", instances, "test", 0)
assert.NotNil(t, rerr)
assert.Equal(t, rerr.Error().Error(), "Retriable: false, RetryAfter: 4s, HTTPStatusCode: 0, RawError: Retriable: true, RetryAfter: 4s, HTTPStatusCode: 0, RawError: The request failed due to conflict with a concurrent request.")
}

func getTestVMSSVM(vmssName, instanceID string) compute.VirtualMachineScaleSetVM {
resourceID := fmt.Sprintf("/subscriptions/subscriptionID/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/%s/virtualMachines/%s", vmssName, instanceID)
return compute.VirtualMachineScaleSetVM{
Expand Down
8 changes: 8 additions & 0 deletions pkg/consts/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,14 @@ const (
CannotDeletePublicIPErrorMessageCode = "PublicIPAddressCannotBeDeleted"
// ReferencedResourceNotProvisionedMessageCode means the referenced resource has not been provisioned
ReferencedResourceNotProvisionedMessageCode = "ReferencedResourceNotProvisioned"
// ParentResourceNotFoundMessageCode is the error code that the parent VMSS of the VM is not found.
ParentResourceNotFoundMessageCode = "ParentResourceNotFound"
// ConcurrentRequestConflictMessage is the error message that the request failed due to the conflict with another concurrent operation.
ConcurrentRequestConflictMessage = "The request failed due to conflict with a concurrent request."
// CannotUpdateVMBeingDeletedMessagePrefix is the prefix of the error message that the request failed due to delete a VM that is being deleted
CannotUpdateVMBeingDeletedMessagePrefix = "'Put on Virtual Machine Scale Set VM Instance' is not allowed on Virtual Machine Scale Set"
// CannotUpdateVMBeingDeletedMessageSuffix is the suffix of the error message that the request failed due to delete a VM that is being deleted
CannotUpdateVMBeingDeletedMessageSuffix = "since it is marked for deletion"
)

// node ipam controller
Expand Down
28 changes: 26 additions & 2 deletions pkg/provider/azure_vmss_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1792,11 +1792,13 @@ func TestEnsureHostInPool(t *testing.T) {
isBasicLB bool
isNilVMNetworkConfigs bool
useMultipleSLBs bool
isVMBeingDeleted bool
expectedNodeResourceGroup string
expectedVMSSName string
expectedInstanceID string
expectedVMSSVM *compute.VirtualMachineScaleSetVM
expectedErr error
vmssVMListError *retry.Error
}{
{
description: "EnsureHostInPool should skip the current node if the vmSetName is not equal to the node's vmss name and the basic LB is used",
Expand Down Expand Up @@ -1831,6 +1833,11 @@ func TestEnsureHostInPool(t *testing.T) {
backendPoolID: "/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Network/loadBalancers/lb1-internal/backendAddressPools/backendpool-1",
isBasicLB: false,
},
{
description: "EnsureHostInPool should skip the current node if it is being deleted",
nodeName: "vmss-vm-000000",
isVMBeingDeleted: true,
},
{
description: "EnsureHostInPool should add a new backend pool to the vm",
service: &v1.Service{Spec: v1.ServiceSpec{ClusterIP: "clusterIP"}},
Expand Down Expand Up @@ -1891,12 +1898,29 @@ func TestEnsureHostInPool(t *testing.T) {
mockVMSSClient := ss.cloud.VirtualMachineScaleSetsClient.(*mockvmssclient.MockInterface)
mockVMSSClient.EXPECT().List(gomock.Any(), ss.ResourceGroup).Return([]compute.VirtualMachineScaleSet{expectedVMSS}, nil).AnyTimes()

expectedVMSSVMs, _, _ := buildTestVirtualMachineEnv(ss.cloud, testVMSSName, "", 0, []string{string(test.nodeName)}, "", false)
provisionState := ""
if test.isVMBeingDeleted {
provisionState = "Deleting"
}
expectedVMSSVMs, _, _ := buildTestVirtualMachineEnv(
ss.cloud,
testVMSSName,
"",
0,
[]string{string(test.nodeName)},
provisionState,
false,
)
if test.isNilVMNetworkConfigs {
expectedVMSSVMs[0].NetworkProfileConfiguration.NetworkInterfaceConfigurations = nil
}
mockVMSSVMClient := ss.cloud.VirtualMachineScaleSetVMsClient.(*mockvmssvmclient.MockInterface)
mockVMSSVMClient.EXPECT().List(gomock.Any(), ss.ResourceGroup, testVMSSName, gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes()
mockVMSSVMClient.EXPECT().List(
gomock.Any(),
ss.ResourceGroup,
testVMSSName,
gomock.Any(),
).Return(expectedVMSSVMs, test.vmssVMListError).AnyTimes()

nodeResourceGroup, ssName, instanceID, vm, err := ss.EnsureHostInPool(test.service, test.nodeName, test.backendPoolID, test.vmSetName)
assert.Equal(t, test.expectedErr, err, test.description+", but an error occurs")
Expand Down