Skip to content

Commit

Permalink
fix: Skip VMSS VM update operations if the VMs are not in good state
Browse files Browse the repository at this point in the history
  • Loading branch information
nilo19 authored and k8s-infra-cherrypick-robot committed Sep 27, 2022
1 parent f21f354 commit 9bca61a
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 3 deletions.
24 changes: 23 additions & 1 deletion pkg/azureclients/vmssvmclient/azure_vmssvmclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (

azclients "sigs.k8s.io/cloud-provider-azure/pkg/azureclients"
"sigs.k8s.io/cloud-provider-azure/pkg/azureclients/armclient"
"sigs.k8s.io/cloud-provider-azure/pkg/consts"
"sigs.k8s.io/cloud-provider-azure/pkg/metrics"
"sigs.k8s.io/cloud-provider-azure/pkg/retry"
)
Expand Down Expand Up @@ -503,6 +504,22 @@ func (c *Client) updateVMSSVMs(ctx context.Context, resourceGroupName string, VM
defer c.armClient.CloseResponse(ctx, resp.Response)
if resp.Error != nil {
klog.V(5).Infof("Received error in %s: resourceID: %s, error: %s", "vmssvm.put.request", resourceID, resp.Error.Error())

errMsg := resp.Error.Error().Error()
if strings.Contains(errMsg, consts.VmssVMNotActiveErrorMessage) {
klog.V(2).Infof("VMSS VM %s is not active, skip updating it.", resourceID)
continue
}
if strings.Contains(errMsg, consts.ParentResourceNotFoundMessageCode) {
klog.V(2).Info("The parent resource of VMSS VM %s is not found, skip updating it.", resourceID)
continue
}
if strings.Contains(errMsg, consts.CannotUpdateVMBeingDeletedMessagePrefix) &&
strings.Contains(errMsg, consts.CannotUpdateVMBeingDeletedMessageSuffix) {
klog.V(2).Infof("The VM %s is being deleted, skip updating it.", resourceID)
continue
}

errors = append(errors, resp.Error)
continue
}
Expand All @@ -521,7 +538,12 @@ func (c *Client) updateVMSSVMs(ctx context.Context, resourceGroupName string, VM
rerr := &retry.Error{}
errs := make([]error, 0)
for _, err := range errors {
if err.IsThrottled() && err.RetryAfter.After(err.RetryAfter) {
if !err.Retriable && strings.Contains(err.Error().Error(), consts.ConcurrentRequestConflictMessage) {
err.Retriable = true
err.RetryAfter = time.Now().Add(5 * time.Second)
}

if err.IsThrottled() && err.RetryAfter.After(rerr.RetryAfter) {
rerr.RetryAfter = err.RetryAfter
}
errs = append(errs, err.Error())
Expand Down
62 changes: 62 additions & 0 deletions pkg/azureclients/vmssvmclient/azure_vmssvmclient_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import (
azclients "sigs.k8s.io/cloud-provider-azure/pkg/azureclients"
"sigs.k8s.io/cloud-provider-azure/pkg/azureclients/armclient"
"sigs.k8s.io/cloud-provider-azure/pkg/azureclients/armclient/mockarmclient"
"sigs.k8s.io/cloud-provider-azure/pkg/consts"
"sigs.k8s.io/cloud-provider-azure/pkg/retry"
)

Expand Down Expand Up @@ -795,6 +796,67 @@ func TestUpdateVMsThrottle(t *testing.T) {
assert.EqualError(t, throttleErr.Error(), rerr.RawError.Error())
}

func TestUpdateVMsIgnoreError(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

vmssVM := getTestVMSSVM("vmss1", "1")
vmssVM2 := getTestVMSSVM("vmss1", "2")
vmssVM3 := getTestVMSSVM("vmss1", "3")
vmssVM4 := getTestVMSSVM("vmss1", "4")
instances := map[string]compute.VirtualMachineScaleSetVM{
"1": vmssVM,
"2": vmssVM2,
"3": vmssVM3,
"4": vmssVM4,
}
testvmssVMs := map[string]interface{}{
to.String(vmssVM.ID): vmssVM,
to.String(vmssVM2.ID): vmssVM2,
to.String(vmssVM3.ID): vmssVM3,
to.String(vmssVM4.ID): vmssVM4,
}
notActiveError := retry.Error{
RawError: fmt.Errorf(consts.VmssVMNotActiveErrorMessage),
Retriable: false,
}
parentResourceNotFoundError := retry.Error{
RawError: fmt.Errorf(consts.ParentResourceNotFoundMessageCode),
Retriable: false,
}
concurrentRequestConflictError := retry.Error{
RawError: fmt.Errorf(consts.ConcurrentRequestConflictMessage),
Retriable: false,
}
beingDeletedError := retry.Error{
RawError: fmt.Errorf("operation 'Put on Virtual Machine Scale Set VM Instance' is not allowed on Virtual Machine Scale Set 'aks-stg1pool1-17586529-vmss' since it is marked for deletion. For more information on your operations, please refer to https://aka.ms/activitylog"),
Retriable: false,
}
responses := map[string]*armclient.PutResourcesResponse{
to.String(vmssVM.ID): {
Error: &notActiveError,
},
to.String(vmssVM2.ID): {
Error: &parentResourceNotFoundError,
},
to.String(vmssVM3.ID): {
Error: &concurrentRequestConflictError,
},
to.String(vmssVM4.ID): {
Error: &beingDeletedError,
},
}

armClient := mockarmclient.NewMockInterface(ctrl)
armClient.EXPECT().PutResourcesInBatches(gomock.Any(), testvmssVMs, 0).Return(responses).Times(1)
armClient.EXPECT().CloseResponse(gomock.Any(), gomock.Any()).Times(len(instances))

vmssvmClient := getTestVMSSVMClient(armClient)
rerr := vmssvmClient.UpdateVMs(context.TODO(), "rg", "vmss1", instances, "test", 0)
assert.NotNil(t, rerr)
assert.Equal(t, rerr.Error().Error(), "Retriable: false, RetryAfter: 4s, HTTPStatusCode: 0, RawError: Retriable: true, RetryAfter: 4s, HTTPStatusCode: 0, RawError: The request failed due to conflict with a concurrent request.")
}

func getTestVMSSVM(vmssName, instanceID string) compute.VirtualMachineScaleSetVM {
resourceID := fmt.Sprintf("/subscriptions/subscriptionID/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/%s/virtualMachines/%s", vmssName, instanceID)
return compute.VirtualMachineScaleSetVM{
Expand Down
8 changes: 8 additions & 0 deletions pkg/consts/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,14 @@ const (
CannotDeletePublicIPErrorMessageCode = "PublicIPAddressCannotBeDeleted"
// ReferencedResourceNotProvisionedMessageCode means the referenced resource has not been provisioned
ReferencedResourceNotProvisionedMessageCode = "ReferencedResourceNotProvisioned"
// ParentResourceNotFoundMessageCode is the error code that the parent VMSS of the VM is not found.
ParentResourceNotFoundMessageCode = "ParentResourceNotFound"
// ConcurrentRequestConflictMessage is the error message that the request failed due to the conflict with another concurrent operation.
ConcurrentRequestConflictMessage = "The request failed due to conflict with a concurrent request."
// CannotUpdateVMBeingDeletedMessagePrefix is the prefix of the error message that the request failed due to delete a VM that is being deleted
CannotUpdateVMBeingDeletedMessagePrefix = "'Put on Virtual Machine Scale Set VM Instance' is not allowed on Virtual Machine Scale Set"
// CannotUpdateVMBeingDeletedMessageSuffix is the suffix of the error message that the request failed due to delete a VM that is being deleted
CannotUpdateVMBeingDeletedMessageSuffix = "since it is marked for deletion"
)

// node ipam controller
Expand Down
28 changes: 26 additions & 2 deletions pkg/provider/azure_vmss_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1792,11 +1792,13 @@ func TestEnsureHostInPool(t *testing.T) {
isBasicLB bool
isNilVMNetworkConfigs bool
useMultipleSLBs bool
isVMBeingDeleted bool
expectedNodeResourceGroup string
expectedVMSSName string
expectedInstanceID string
expectedVMSSVM *compute.VirtualMachineScaleSetVM
expectedErr error
vmssVMListError *retry.Error
}{
{
description: "EnsureHostInPool should skip the current node if the vmSetName is not equal to the node's vmss name and the basic LB is used",
Expand Down Expand Up @@ -1831,6 +1833,11 @@ func TestEnsureHostInPool(t *testing.T) {
backendPoolID: "/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Network/loadBalancers/lb1-internal/backendAddressPools/backendpool-1",
isBasicLB: false,
},
{
description: "EnsureHostInPool should skip the current node if it is being deleted",
nodeName: "vmss-vm-000000",
isVMBeingDeleted: true,
},
{
description: "EnsureHostInPool should add a new backend pool to the vm",
service: &v1.Service{Spec: v1.ServiceSpec{ClusterIP: "clusterIP"}},
Expand Down Expand Up @@ -1891,12 +1898,29 @@ func TestEnsureHostInPool(t *testing.T) {
mockVMSSClient := ss.cloud.VirtualMachineScaleSetsClient.(*mockvmssclient.MockInterface)
mockVMSSClient.EXPECT().List(gomock.Any(), ss.ResourceGroup).Return([]compute.VirtualMachineScaleSet{expectedVMSS}, nil).AnyTimes()

expectedVMSSVMs, _, _ := buildTestVirtualMachineEnv(ss.cloud, testVMSSName, "", 0, []string{string(test.nodeName)}, "", false)
provisionState := ""
if test.isVMBeingDeleted {
provisionState = "Deleting"
}
expectedVMSSVMs, _, _ := buildTestVirtualMachineEnv(
ss.cloud,
testVMSSName,
"",
0,
[]string{string(test.nodeName)},
provisionState,
false,
)
if test.isNilVMNetworkConfigs {
expectedVMSSVMs[0].NetworkProfileConfiguration.NetworkInterfaceConfigurations = nil
}
mockVMSSVMClient := ss.cloud.VirtualMachineScaleSetVMsClient.(*mockvmssvmclient.MockInterface)
mockVMSSVMClient.EXPECT().List(gomock.Any(), ss.ResourceGroup, testVMSSName, gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes()
mockVMSSVMClient.EXPECT().List(
gomock.Any(),
ss.ResourceGroup,
testVMSSName,
gomock.Any(),
).Return(expectedVMSSVMs, test.vmssVMListError).AnyTimes()

nodeResourceGroup, ssName, instanceID, vm, err := ss.EnsureHostInPool(test.service, test.nodeName, test.backendPoolID, test.vmSetName, false)
assert.Equal(t, test.expectedErr, err, test.description+", but an error occurs")
Expand Down

0 comments on commit 9bca61a

Please sign in to comment.