
Commit

Merge pull request #8784 from kubevirt-bot/cherry-pick-8530-to-release-0.58

[release-0.58] Use exponential backoff for failing migrations
kubevirt-bot committed Nov 20, 2022
2 parents 6d4e574 + cc00896 commit 4db212c
Showing 5 changed files with 209 additions and 2 deletions.
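For context on the change: the new handleMigrationBackoff in pkg/virt-controller/watch/migration.go walks the VMI's migration history from newest to oldest, stops at the first successful migration, and turns each failed migration into a growing delay — 20 seconds after one failure, doubling for every further consecutive failure — measured from the most recent MigrationFailed transition timestamp. A migration created inside that window is re-enqueued instead of started and a MigrationBackoff warning event is recorded; functional tests can opt out via the kubevirt.io/func-test-ignore-migration-backoff annotation. The sketch below only illustrates that schedule; it is a standalone toy, and the helper name and layout are assumptions, not code from this commit.

package main

import (
	"fmt"
	"time"
)

// migrationBackoff mirrors the schedule implemented in this commit:
// 0 consecutive failures -> no backoff, 1 failure -> 20s, and each further
// consecutive failure doubles the previous delay (20s, 40s, 80s, ...).
func migrationBackoff(consecutiveFailures int) time.Duration {
	if consecutiveFailures == 0 {
		return 0
	}
	backoff := 20 * time.Second
	for i := 1; i < consecutiveFailures; i++ {
		backoff *= 2
	}
	return backoff
}

func main() {
	// The controller measures the window from the most recent failure's
	// MigrationFailed transition timestamp; here we pretend it was 30s ago.
	lastFailure := time.Now().Add(-30 * time.Second)
	for failures := 0; failures <= 4; failures++ {
		wait := migrationBackoff(failures)
		remaining := time.Until(lastFailure.Add(wait))
		fmt.Printf("failures=%d backoff=%v remaining=%v\n", failures, wait, remaining.Round(time.Second))
	}
}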
72 changes: 70 additions & 2 deletions pkg/virt-controller/watch/migration.go
@@ -22,6 +22,7 @@ package watch
import (
"context"
"encoding/json"
"errors"
"fmt"
"sort"
"strconv"
@@ -34,7 +35,7 @@ import (
k8sv1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
"k8s.io/apimachinery/pkg/api/equality"
"k8s.io/apimachinery/pkg/api/errors"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
@@ -86,6 +87,8 @@ const defaultFinalizedMigrationGarbageCollectionBuffer = 5
// cause the migration to fail when it could have reasonably succeeded.
const defaultCatchAllPendingTimeoutSeconds = int64(60 * 15)

var migrationBackoffError = errors.New(MigrationBackoffReason)

type MigrationController struct {
templateService services.TemplateService
clientset kubecli.KubevirtClient
@@ -654,6 +657,66 @@ func (c *MigrationController) expandPDB(pdb *policyv1.PodDisruptionBudget, vmi *
return nil
}

func (c *MigrationController) handleMigrationBackoff(key string, vmi *virtv1.VirtualMachineInstance, migration *virtv1.VirtualMachineInstanceMigration) error {
if _, exists := migration.Annotations[virtv1.FuncTestForceIgnoreMigrationBackoffAnnotation]; exists {
return nil
}

migrations, err := c.listMigrationsMatchingVMI(vmi.Namespace, vmi.Name)
if err != nil {
return err
}
if len(migrations) < 2 {
return nil
}

// Newest first
sort.Sort(sort.Reverse(vmimCollection(migrations)))
if migrations[0].UID != migration.UID {
return nil
}

backoff := time.Second * 0
for _, m := range migrations[1:] {
if m.Status.Phase == virtv1.MigrationSucceeded {
break
}
if m.DeletionTimestamp != nil {
continue
}

if m.Status.Phase == virtv1.MigrationFailed {
if backoff == 0 {
backoff = time.Second * 20
} else {
backoff = backoff * 2
}
}
}
if backoff == 0 {
return nil
}

getFailedTS := func(migration *virtv1.VirtualMachineInstanceMigration) metav1.Time {
for _, ts := range migration.Status.PhaseTransitionTimestamps {
if ts.Phase == virtv1.MigrationFailed {
return ts.PhaseTransitionTimestamp
}
}
return metav1.Time{}
}

outOffBackoffTS := getFailedTS(migrations[1]).Add(backoff)
backoff = outOffBackoffTS.Sub(time.Now())

if backoff > 0 {
log.Log.Object(vmi).V(2).Errorf("vmi in migration backoff, re-enqueueing after %v", backoff)
c.Queue.AddAfter(key, backoff)
return migrationBackoffError
}
return nil
}

func (c *MigrationController) handleMarkMigrationFailedOnVMI(migration *virtv1.VirtualMachineInstanceMigration, vmi *virtv1.VirtualMachineInstance) error {

// Mark Migration Done on VMI if virt handler never started it.
@@ -1079,6 +1142,11 @@ func (c *MigrationController) sync(key string, migration *virtv1.VirtualMachineI
if migration.DeletionTimestamp != nil {
return c.handlePreHandoffMigrationCancel(migration, vmi, pod)
}
if err = c.handleMigrationBackoff(key, vmi, migration); errors.Is(err, migrationBackoffError) {
warningMsg := fmt.Sprintf("backoff migrating vmi %s/%s", vmi.Namespace, vmi.Name)
c.recorder.Eventf(vmi, k8sv1.EventTypeWarning, err.Error(), warningMsg)
return nil
}

if !targetPodExists {
sourcePod, err := controller.CurrentVMIPod(vmi, c.podInformer)
@@ -1443,7 +1511,7 @@ func (c *MigrationController) garbageCollectFinalizedMigrations(vmi *virtv1.Virt

for i := 0; i < garbageCollectionCount; i++ {
err = c.clientset.VirtualMachineInstanceMigration(vmi.Namespace).Delete(finalizedMigrations[i], &v1.DeleteOptions{})
if err != nil && errors.IsNotFound(err) {
if err != nil && k8serrors.IsNotFound(err) {
// This is safe to ignore. It's possible in some
// scenarios that the migration we're trying to garbage
// collect has already disappeared. Let's log it as debug
51 changes: 51 additions & 0 deletions pkg/virt-controller/watch/migration_test.go
@@ -1664,6 +1664,56 @@ var _ = Describe("Migration watcher", func() {
shouldExpectMigrationFailedState(migration)
})
})

Context("Migration backoff", func() {
var vmi *virtv1.VirtualMachineInstance

It("should be applied after a migration fails", func() {
vmi = newVirtualMachine("testvmi", virtv1.Running)
failedMigration := newMigration("testmigration", vmi.Name, virtv1.MigrationFailed)
pendingMigration := newMigration("testmigration2", vmi.Name, virtv1.MigrationPending)

failedMigration.Status.PhaseTransitionTimestamps = []virtv1.VirtualMachineInstanceMigrationPhaseTransitionTimestamp{
{
Phase: virtv1.MigrationFailed,
PhaseTransitionTimestamp: failedMigration.CreationTimestamp,
},
}
pendingMigration.CreationTimestamp = metav1.NewTime(failedMigration.CreationTimestamp.Add(time.Second * 1))

_ = migrationInformer.GetStore().Add(failedMigration)
_ = vmiInformer.GetStore().Add(vmi)
addMigration(pendingMigration)

controller.Execute()
Expect(pendingMigration.Status.Phase).To(Equal(virtv1.MigrationPending))
testutils.ExpectEvent(recorder, "MigrationBackoff")
})

It("should be cleared when a migration succeeds", func() {
vmi = newVirtualMachine("testvmi", virtv1.Running)
failedMigration := newMigration("testmigration", vmi.Name, virtv1.MigrationFailed)
successfulMigration := newMigration("testmigration2", vmi.Name, virtv1.MigrationSucceeded)
pendingMigration := newMigration("testmigration3", vmi.Name, virtv1.MigrationPending)

failedMigration.Status.PhaseTransitionTimestamps = []virtv1.VirtualMachineInstanceMigrationPhaseTransitionTimestamp{
{
Phase: virtv1.MigrationFailed,
PhaseTransitionTimestamp: failedMigration.CreationTimestamp,
},
}
successfulMigration.CreationTimestamp = metav1.NewTime(failedMigration.CreationTimestamp.Add(time.Second * 1))
pendingMigration.CreationTimestamp = metav1.NewTime(successfulMigration.CreationTimestamp.Add(time.Second * 1))

_ = migrationInformer.GetStore().Add(failedMigration)
_ = migrationInformer.GetStore().Add(successfulMigration)
_ = vmiInformer.GetStore().Add(vmi)
addMigration(pendingMigration)

controller.Execute()
shouldExpectPodCreation(vmi.UID, pendingMigration.UID, 1, 0, 0)
})
})
})

func newPDB(name string, vmi *virtv1.VirtualMachineInstance, pods int) *policyv1.PodDisruptionBudget {
@@ -1698,6 +1748,7 @@ func newMigration(name string, vmiName string, phase virtv1.VirtualMachineInstan
virtv1.ControllerAPILatestVersionObservedAnnotation: virtv1.ApiLatestVersion,
virtv1.ControllerAPIStorageVersionObservedAnnotation: virtv1.ApiStorageVersion,
},
CreationTimestamp: metav1.Now(),
},
Spec: virtv1.VirtualMachineInstanceMigrationSpec{
VMIName: vmiName,
3 changes: 3 additions & 0 deletions pkg/virt-controller/watch/vmi.go
@@ -117,6 +117,9 @@ (
NoSuitableNodesForHostModelMigration = "NoSuitableNodesForHostModelMigration"
// FailedPodPatchReason is set when a pod patch error occurs during sync
FailedPodPatchReason = "FailedPodPatch"
// MigrationBackoffReason is set when an error has occurred while migrating
// and virt-controller is backing off before retrying.
MigrationBackoffReason = "MigrationBackoff"
)

const failedToRenderLaunchManifestErrFormat = "failed to render launch manifest: %v"
4 changes: 4 additions & 0 deletions staging/src/kubevirt.io/api/core/v1/types.go
@@ -723,6 +723,10 @@ (

// Used by functional tests to simulate virt-launcher crash looping
FuncTestLauncherFailFastAnnotation string = "kubevirt.io/func-test-virt-launcher-fail-fast"

// Used by functional tests to ignore backoff applied to migrations
FuncTestForceIgnoreMigrationBackoffAnnotation string = "kubevirt.io/func-test-ignore-migration-backoff"

// This label is used to match virtual machine instance IDs with pods.
// Similar to kubevirt.io/domain. Used on Pod.
// Internal use only.
81 changes: 81 additions & 0 deletions tests/migration_test.go
@@ -38,6 +38,7 @@ import (
"kubevirt.io/kubevirt/pkg/virt-handler/cgroup"

"kubevirt.io/kubevirt/pkg/util/hardware"
utiltypes "kubevirt.io/kubevirt/pkg/util/types"
"kubevirt.io/kubevirt/pkg/virt-controller/watch"

virtconfig "kubevirt.io/kubevirt/pkg/virt-config"
@@ -606,6 +607,19 @@ var _ = Describe("[rfe_id:393][crit:high][vendor:cnv-qe@redhat.com][level:system
}, 30*time.Second, 1*time.Second).Should(HaveLen(expectedEventsAmount))
}

expectEvent := func(eventListOpts metav1.ListOptions) {
// This function is dangerous to use from parallel tests as events might override each other.
// This can be removed in the future if these functions are used with great caution.
expectSerialRun()

Eventually(func() []k8sv1.Event {
events, err := virtClient.CoreV1().Events(util.NamespaceTestDefault).List(context.Background(), eventListOpts)
Expect(err).ToNot(HaveOccurred())

return events.Items
}, 30*time.Second, 1*time.Second).ShouldNot(BeEmpty())
}

deleteEvents := func(eventListOpts metav1.ListOptions) {
// See comment in expectEvents() for more info on why that's needed.
expectSerialRun()
@@ -2333,6 +2347,7 @@ var _ = Describe("[rfe_id:393][crit:high][vendor:cnv-qe@redhat.com][level:system
By("Starting the Migration")
migration := tests.NewRandomMigration(vmi.Name, vmi.Namespace)
migration.Name = fmt.Sprintf("%s-iter-%d", vmi.Name, i)
migration.Annotations = map[string]string{v1.FuncTestForceIgnoreMigrationBackoffAnnotation: ""}
migrationUID := runMigrationAndExpectFailure(migration, 180)

// check VMI, confirm migration state
@@ -4420,6 +4435,72 @@ var _ = Describe("[rfe_id:393][crit:high][vendor:cnv-qe@redhat.com][level:system
})

})

Context("when migrating fails", func() {
var vmi *v1.VirtualMachineInstance

BeforeEach(func() {
vmi = libvmi.NewCirros(
libvmi.WithInterface(libvmi.InterfaceDeviceWithMasqueradeBinding()),
libvmi.WithNetwork(v1.DefaultPodNetwork()),
libvmi.WithAnnotation(v1.FuncTestForceLauncherMigrationFailureAnnotation, ""),
)
})

It("[Serial] retrying immediately should be blocked by the migration backoff", func() {
By("Starting the VirtualMachineInstance")
vmi = tests.RunVMIAndExpectLaunch(vmi, 240)

By("Waiting for the migration to fail")
migration := tests.NewRandomMigration(vmi.Name, vmi.Namespace)
_ = runMigrationAndExpectFailure(migration, tests.MigrationWaitTime)

By("Try again")
migration = tests.NewRandomMigration(vmi.Name, vmi.Namespace)
_ = runMigrationAndExpectFailure(migration, tests.MigrationWaitTime)

By("Expecting for a MigrationBackoff event to be sent")
eventListOpts := metav1.ListOptions{
FieldSelector: fmt.Sprintf("type=%s,reason=%s", k8sv1.EventTypeWarning, watch.MigrationBackoffReason),
}
expectEvent(eventListOpts)
deleteEvents(eventListOpts)
})

It("[Serial] after a successful migration backoff should be cleared", func() {
By("Starting the VirtualMachineInstance")
vmi = tests.RunVMIAndExpectLaunch(vmi, 240)

By("Waiting for the migration to fail")
migration := tests.NewRandomMigration(vmi.Name, vmi.Namespace)
_ = runMigrationAndExpectFailure(migration, tests.MigrationWaitTime)

By("Patch VMI")
patch := []byte(fmt.Sprintf(`[{"op": "remove", "path": "/metadata/annotations/%s"}]`, utiltypes.EscapeJSONPointer(v1.FuncTestForceLauncherMigrationFailureAnnotation)))
_, err := virtClient.VirtualMachineInstance(vmi.Namespace).Patch(vmi.Name, types.JSONPatchType, patch, &metav1.PatchOptions{})
Expect(err).ToNot(HaveOccurred())

By("Try again with backoff")
migration = tests.NewRandomMigration(vmi.Name, vmi.Namespace)
_ = tests.RunMigrationAndExpectCompletion(virtClient, migration, tests.MigrationWaitTime)

eventListOpts := metav1.ListOptions{
FieldSelector: fmt.Sprintf("type=%s,reason=%s", k8sv1.EventTypeWarning, watch.MigrationBackoffReason),
}
deleteEvents(eventListOpts)

By("There should be no backoff now")
migration = tests.NewRandomMigration(vmi.Name, vmi.Namespace)
_ = tests.RunMigrationAndExpectCompletion(virtClient, migration, tests.MigrationWaitTime)

By("Checking that no backoff event occurred")
events, err := virtClient.CoreV1().Events(util.NamespaceTestDefault).List(context.Background(), metav1.ListOptions{})
Expect(err).ToNot(HaveOccurred())
for _, ev := range events.Items {
Expect(ev.Reason).ToNot(Equal(watch.MigrationBackoffReason))
}
})
})
})

func fedoraVMIWithEvictionStrategy() *v1.VirtualMachineInstance {
