migration: use exponential backoff for failing migrations
With this patch, when a VM migration fails, an exponentially increasing
backoff is applied before retrying.

Signed-off-by: Antonio Cardace <acardace@redhat.com>
acardace committed Oct 3, 2022
1 parent 3f0e744 commit efad57d
Showing 3 changed files with 85 additions and 4 deletions.
1 change: 1 addition & 0 deletions pkg/virt-controller/watch/BUILD.bazel
@@ -5,6 +5,7 @@ go_library(
srcs = [
"application.go",
"migration.go",
"migration-backoff.go",
"migrationpolicy.go",
"node.go",
"pool.go",
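The new pkg/virt-controller/watch/migration-backoff.go is registered in BUILD.bazel above but is not expanded in this view. Below is a minimal sketch of what such a handler could look like, inferred from how migration.go uses it further down. Only the method names (Get, IncreaseBackoff, UntilBackoffEnd, DeleteEntry, GC) come from the diff; the constants (initialBackoff, maxBackoff, entryTTL), the entry layout, and the simplified IncreaseBackoff signature (taking a failure timestamp instead of the migration object) are assumptions, not the actual implementation.

```go
// Standalone sketch of the new migration-backoff.go; values and layout are
// assumptions, only the method names mirror how migration.go uses the handler.
package watch

import (
	"errors"
	"sync"
	"time"
)

// errMigrationBackoff is the sentinel returned while a VMI is still inside its
// backoff window; Execute() detects it with errors.Is and skips the
// rate-limited re-enqueue (the key has already been re-added with AddAfter).
var errMigrationBackoff = errors.New("MigrationBackoff")

const (
	initialBackoff = 10 * time.Second // assumed first delay after a failure
	maxBackoff     = 5 * time.Minute  // assumed cap for the exponential growth
	entryTTL       = time.Hour        // assumed lifetime before GC drops an entry
)

type backoffEntry struct {
	backoff     time.Duration // current delay for this VMI
	lastFailure time.Time     // when the most recent failure was recorded
}

type migrationBackoffHandler struct {
	lock    sync.Mutex
	entries map[string]*backoffEntry // keyed by VMI UID
}

func NewMigrationBackoff() *migrationBackoffHandler {
	return &migrationBackoffHandler{entries: map[string]*backoffEntry{}}
}

// Get returns the current backoff for a VMI, or 0 if none is recorded.
func (h *migrationBackoffHandler) Get(id string) time.Duration {
	h.lock.Lock()
	defer h.lock.Unlock()
	if e, ok := h.entries[id]; ok {
		return e.backoff
	}
	return 0
}

// IncreaseBackoff records a failure: the first one sets initialBackoff, each
// later one doubles the delay up to maxBackoff. A failure that is not newer
// than the last recorded one returns an error so it is not counted twice.
func (h *migrationBackoffHandler) IncreaseBackoff(id string, failedAt time.Time) error {
	h.lock.Lock()
	defer h.lock.Unlock()
	e, ok := h.entries[id]
	if !ok {
		h.entries[id] = &backoffEntry{backoff: initialBackoff, lastFailure: failedAt}
		return nil
	}
	if !failedAt.After(e.lastFailure) {
		return errors.New("failure already accounted for")
	}
	e.backoff *= 2
	if e.backoff > maxBackoff {
		e.backoff = maxBackoff
	}
	e.lastFailure = failedAt
	return nil
}

// UntilBackoffEnd returns how long to wait before a new migration for this VMI
// should be processed; a value <= 0 means the backoff window has elapsed.
func (h *migrationBackoffHandler) UntilBackoffEnd(id string, now time.Time) time.Duration {
	h.lock.Lock()
	defer h.lock.Unlock()
	e, ok := h.entries[id]
	if !ok {
		return 0
	}
	return e.lastFailure.Add(e.backoff).Sub(now)
}

// DeleteEntry drops the backoff state, e.g. after a successful migration.
func (h *migrationBackoffHandler) DeleteEntry(id string) {
	h.lock.Lock()
	defer h.lock.Unlock()
	delete(h.entries, id)
}

// GC removes stale entries so the map does not grow without bound.
func (h *migrationBackoffHandler) GC() {
	h.lock.Lock()
	defer h.lock.Unlock()
	for id, e := range h.entries {
		if time.Since(e.lastFailure) > entryTTL {
			delete(h.entries, id)
		}
	}
}
```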
50 changes: 46 additions & 4 deletions pkg/virt-controller/watch/migration.go
@@ -22,6 +22,7 @@ package watch
import (
"context"
"encoding/json"
"errors"
"fmt"
"sort"
"strconv"
@@ -34,7 +35,7 @@ import (
k8sv1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
"k8s.io/apimachinery/pkg/api/equality"
"k8s.io/apimachinery/pkg/api/errors"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
@@ -110,6 +111,8 @@ type MigrationController struct {
handOffLock sync.Mutex
handOffMap map[string]struct{}

backoffHandler *migrationBackoffHandler

unschedulablePendingTimeoutSeconds int64
catchAllPendingTimeoutSeconds int64

@@ -148,6 +151,7 @@ func NewMigrationController(templateService services.TemplateService,
clusterConfig: clusterConfig,
statusUpdater: status.NewMigrationStatusUpdater(clientset),
handOffMap: make(map[string]struct{}),
backoffHandler: NewMigrationBackoff(),

unschedulablePendingTimeoutSeconds: defaultUnschedulablePendingTimeoutSeconds,
catchAllPendingTimeoutSeconds: defaultCatchAllPendingTimeoutSeconds,
@@ -211,10 +215,10 @@ func (c *MigrationController) Execute() bool {
defer c.Queue.Done(key)
err := c.execute(key.(string))

if err != nil {
if err != nil && !errors.Is(err, errMigrationBackoff) {
log.Log.Reason(err).Infof("reenqueuing Migration %v", key)
c.Queue.AddRateLimited(key)
} else {
} else if err == nil {
log.Log.V(4).Infof("processed Migration %v", key)
c.Queue.Forget(key)
}
@@ -276,6 +280,9 @@ func (c *MigrationController) execute(key string) error {
var vmi *virtv1.VirtualMachineInstance
var targetPods []*k8sv1.Pod

// Garbage collect backoff entries
c.backoffHandler.GC()

// Fetch the latest state from cache
obj, exists, err := c.migrationInformer.GetStore().GetByKey(key)
if err != nil {
@@ -687,6 +694,34 @@ func (c *MigrationController) expandPDB(pdb *policyv1.PodDisruptionBudget, vmi *
return nil
}

func (c *MigrationController) handleMigrationBackoff(key string, vmi *virtv1.VirtualMachineInstance, migration *virtv1.VirtualMachineInstanceMigration, targetPod *k8sv1.Pod) error {
id := string(vmi.UID)

switch migration.Status.Phase {
case virtv1.MigrationSucceeded:
c.backoffHandler.DeleteEntry(id)
log.Log.V(4).Infof("migration %s completed, deleting backoff entry", migration.Name)
case virtv1.MigrationPreparingTarget, virtv1.MigrationTargetReady:
if targetPodDisappeared(targetPod, vmi) {
if c.backoffHandler.IncreaseBackoff(id, migration) == nil {
log.Log.V(2).Errorf("migration %s failed (target pod disappeared), backoff now is %v", migration.Name, c.backoffHandler.Get(id))
}
}
case virtv1.MigrationFailed:
if c.backoffHandler.IncreaseBackoff(id, migration) == nil {
log.Log.V(2).Errorf("migration %s failed, backoff is now %v", migration.Name, c.backoffHandler.Get(id))
}
case virtv1.MigrationPending, virtv1.MigrationPhaseUnset:
if untilBackoffEnd := c.backoffHandler.UntilBackoffEnd(id, time.Now()); untilBackoffEnd > 0 {
log.Log.V(2).Warningf("migration %s in backoff re-enqueueing after %s", migration.Name, untilBackoffEnd)
c.Queue.AddAfter(key, untilBackoffEnd)
return errMigrationBackoff
}
}

return nil
}

func (c *MigrationController) handleMarkMigrationFailedOnVMI(migration *virtv1.VirtualMachineInstanceMigration, vmi *virtv1.VirtualMachineInstance) error {

// Mark Migration Done on VMI if virt handler never started it.
@@ -1116,6 +1151,13 @@ func (c *MigrationController) sync(key string, migration *virtv1.VirtualMachineI
return fmt.Errorf("vmi is inelgible for migration because another migration job is running")
}

if err = c.handleMigrationBackoff(key, vmi, migration, pod); err != nil {
warningMsg := fmt.Sprintf("backoff migrating vmi %s/%s", vmi.Namespace, vmi.Name)
c.recorder.Eventf(vmi, k8sv1.EventTypeWarning, err.Error(), warningMsg)
log.Log.Object(migration).Warning(warningMsg)
return err
}

switch migration.Status.Phase {
case virtv1.MigrationPending:
if migration.DeletionTimestamp != nil {
@@ -1479,7 +1521,7 @@ func (c *MigrationController) garbageCollectFinalizedMigrations(vmi *virtv1.Virt

for i := 0; i < garbageCollectionCount; i++ {
err = c.clientset.VirtualMachineInstanceMigration(vmi.Namespace).Delete(finalizedMigrations[i], &v1.DeleteOptions{})
if err != nil && errors.IsNotFound(err) {
if err != nil && k8serrors.IsNotFound(err) {
// This is safe to ignore. It's possible in some
// scenarios that the migration we're trying to garbage
// collect has already disappeared. Let's log it as debug
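For reference, a short usage example of the sketch added after the BUILD.bazel diff (same assumed package, plus a "fmt" import): it shows the exponential progression a VMI accumulates over repeated failures and how the remaining window feeds the AddAfter re-enqueue in handleMigrationBackoff. The delays use the assumed constants from the sketch, not values from the actual implementation.

```go
// Assumes the sketch above plus `import "fmt"`. Timestamps are passed
// explicitly; the real handler derives them from the migration object.
func exampleBackoffProgression() {
	h := NewMigrationBackoff()
	id := "6f1c2d3e" // stands in for the VMI UID

	start := time.Now()
	_ = h.IncreaseBackoff(id, start)                    // 1st failure -> 10s
	_ = h.IncreaseBackoff(id, start.Add(1*time.Minute)) // 2nd failure -> 20s
	_ = h.IncreaseBackoff(id, start.Add(2*time.Minute)) // 3rd failure -> 40s
	fmt.Println(h.Get(id)) // 40s with the assumed constants

	// While a follow-up migration is still Pending inside the window,
	// handleMigrationBackoff re-enqueues the key after the remaining time
	// and returns errMigrationBackoff instead of a rate-limited requeue.
	if wait := h.UntilBackoffEnd(id, time.Now()); wait > 0 {
		fmt.Printf("re-enqueue after %v\n", wait)
	}

	h.DeleteEntry(id)      // a succeeded migration clears the entry
	fmt.Println(h.Get(id)) // 0s
}
```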
38 changes: 38 additions & 0 deletions pkg/virt-controller/watch/migration_test.go
@@ -1807,6 +1807,43 @@ var _ = Describe("Migration watcher", func() {
shouldExpectMigrationFailedState(migration)
})
})

Context("Migration backoff", func() {
var vmi *virtv1.VirtualMachineInstance

It("should be applied after a migration fails", func() {
vmi = newVirtualMachine("testvmi", virtv1.Running)
migration := newMigration("testmigration", vmi.Name, virtv1.MigrationFailed)
id := string(vmi.UID)
addMigration(migration)
addVirtualMachineInstance(vmi)

controller.Execute()
Expect(controller.backoffHandler.Get(id)).To(Equal(initialBackoff))

migration = newMigration("testmigration2", vmi.Name, virtv1.MigrationPending)
addMigration(migration)
controller.Execute()
Expect(migration.Status.Phase).To(Equal(virtv1.MigrationPending))
testutils.ExpectEvent(recorder, errMigrationBackoff.Error())
})

It("should be cleared when a migration succeeds", func() {
vmi = newVirtualMachine("testvmi", virtv1.Running)
migration := newMigration("testmigration", vmi.Name, virtv1.MigrationFailed)
id := string(vmi.UID)
addMigration(migration)
addVirtualMachineInstance(vmi)

controller.Execute()
Expect(controller.backoffHandler.Get(id)).To(Equal(initialBackoff))

migration = newMigration("testmigration2", vmi.Name, virtv1.MigrationSucceeded)
addMigration(migration)
controller.Execute()
Expect(controller.backoffHandler.Get(id)).To(Equal(time.Duration(0)))
})
})
})

func newPDB(name string, vmi *virtv1.VirtualMachineInstance, pods int) *policyv1.PodDisruptionBudget {
@@ -1841,6 +1878,7 @@ func newMigration(name string, vmiName string, phase virtv1.VirtualMachineInstan
virtv1.ControllerAPILatestVersionObservedAnnotation: virtv1.ApiLatestVersion,
virtv1.ControllerAPIStorageVersionObservedAnnotation: virtv1.ApiStorageVersion,
},
CreationTimestamp: metav1.Now(),
},
Spec: virtv1.VirtualMachineInstanceMigrationSpec{
VMIName: vmiName,
