Skip to content

Commit

Permalink
worker/undertaker: don't ignore all errors
Browse files Browse the repository at this point in the history
The undertaker was ignoring all errors when attempting
to transition a Dying model to Dead. This is wrong; we
should only ignore a couple of specific errors relating
to the contents of the model. After those errors are
received, we'll retry. Other errors should cause the
worker to restart.

Possibly fixes https://bugs.launchpad.net/juju/+bug/1695894
  • Loading branch information
axw committed Aug 18, 2017
1 parent 7d513a8 commit bf0de30
Show file tree
Hide file tree
Showing 10 changed files with 135 additions and 49 deletions.
2 changes: 1 addition & 1 deletion api/controller/legacy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ func (s *legacySuite) TestDestroyController(c *gc.C) {
sysManager := s.OpenAPI(c)
defer sysManager.Close()
err := sysManager.DestroyController(false)
c.Assert(err, gc.ErrorMatches, `failed to destroy model: hosting 1 other models \(controller has hosted models\)`)
c.Assert(err, gc.ErrorMatches, `failed to destroy model: hosting 1 other model \(controller has hosted models\)`)
}

func (s *legacySuite) TestListBlockedModels(c *gc.C) {
Expand Down
4 changes: 4 additions & 0 deletions apiserver/common/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,8 @@ func ServerError(err error) *params.Error {
code = params.CodeHasAssignedUnits
case state.IsHasHostedModelsError(err):
code = params.CodeHasHostedModels
case state.IsModelNotEmptyError(err):
code = params.CodeModelNotEmpty
case isNoAddressSetError(err):
code = params.CodeNoAddressSet
case errors.IsNotProvisioned(err):
Expand Down Expand Up @@ -308,6 +310,8 @@ func RestoreError(err error) error {
return err
case params.IsCodeHasHostedModels(err):
return err
case params.IsCodeModelNotEmpty(err):
return err
case params.IsCodeNoAddressSet(err):
// TODO(ericsnow) Handle isNoAddressSetError here.
// ...by parsing msg?
Expand Down
6 changes: 4 additions & 2 deletions apiserver/common/modeldestroy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ func (s *destroyTwoModelsSuite) TestDestroyControllerAfterNonControllerIsDestroy
otherFactory.MakeMachineNested(c, m.Id(), nil)

err := common.DestroyModel(s.modelManager, s.State.ModelTag())
c.Assert(err, gc.ErrorMatches, "failed to destroy model: hosting 1 other models")
c.Assert(err, gc.ErrorMatches, "failed to destroy model: hosting 1 other model")

needsCleanup, err := s.State.NeedsCleanup()
c.Assert(err, jc.ErrorIsNil)
Expand All @@ -319,7 +319,7 @@ func (s *destroyTwoModelsSuite) TestDestroyControllerAfterNonControllerIsDestroy
// The hosted model is Dying, not Dead; we cannot destroy
// the controller model until all hosted models are Dead.
err = common.DestroyModel(s.modelManager, s.State.ModelTag())
c.Assert(err, gc.ErrorMatches, "failed to destroy model: hosting 1 other models")
c.Assert(err, gc.ErrorMatches, "failed to destroy model: hosting 1 other model")

// Continue to take the hosted model down so we can
// destroy the controller model.
Expand All @@ -333,6 +333,8 @@ func (s *destroyTwoModelsSuite) TestDestroyControllerAfterNonControllerIsDestroy
otherEnv, err := s.otherState.Model()
c.Assert(err, jc.ErrorIsNil)
c.Assert(otherEnv.Life(), gc.Equals, state.Dead)
err = s.otherState.RemoveAllModelDocs()
c.Assert(err, jc.ErrorIsNil)

env, err := s.State.Model()
c.Assert(err, jc.ErrorIsNil)
Expand Down
5 changes: 5 additions & 0 deletions apiserver/params/apierror.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ const (
CodeDead = "dead"
CodeHasAssignedUnits = "machine has assigned units"
CodeHasHostedModels = "controller has hosted models"
CodeModelNotEmpty = "model not empty"
CodeMachineHasAttachedStorage = "machine has attached storage"
CodeNotProvisioned = "not provisioned"
CodeNoAddressSet = "no address set"
Expand Down Expand Up @@ -184,6 +185,10 @@ func IsCodeHasHostedModels(err error) bool {
return ErrCode(err) == CodeHasHostedModels
}

func IsCodeModelNotEmpty(err error) bool {
return ErrCode(err) == CodeModelNotEmpty
}

func IsCodeMachineHasAttachedStorage(err error) bool {
return ErrCode(err) == CodeMachineHasAttachedStorage
}
Expand Down
2 changes: 1 addition & 1 deletion featuretests/api_undertaker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ func (s *undertakerSuite) TestStateProcessDyingEnviron(c *gc.C) {
c.Assert(env.Life(), gc.Equals, state.Dying)

err = undertakerClient.ProcessDyingModel()
c.Assert(err, gc.ErrorMatches, `model not empty, found 1 machine\(s\)`)
c.Assert(err, gc.ErrorMatches, `model not empty, found 1 machine \(model not empty\)`)
}

func (s *undertakerSuite) TestStateRemoveEnvironFails(c *gc.C) {
Expand Down
74 changes: 56 additions & 18 deletions state/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package state

import (
"fmt"
"strings"

"github.com/juju/errors"
jujutxn "github.com/juju/txn"
Expand Down Expand Up @@ -898,7 +899,11 @@ var errModelNotAlive = errors.New("model is no longer alive")
type hasHostedModelsError int

func (e hasHostedModelsError) Error() string {
return fmt.Sprintf("hosting %d other models", e)
s := ""
if e != 1 {
s = "s"
}
return fmt.Sprintf("hosting %d other model"+s, e)
}

func IsHasHostedModelsError(err error) bool {
Expand Down Expand Up @@ -1046,6 +1051,47 @@ func (m *Model) destroyOps(ensureNoHostedModels, ensureEmpty bool) ([]txn.Op, er
return append(prereqOps, ops...), nil
}

type modelNotEmptyError struct {
machines int
applications int
volumes int
filesystems int
}

// Error is part of the error interface.
func (e modelNotEmptyError) Error() string {
msg := "model not empty, found "
plural := func(n int, thing string) string {
s := fmt.Sprintf("%d %s", n, thing)
if n != 1 {
s += "s"
}
return s
}
var contains []string
if n := e.machines; n > 0 {
contains = append(contains, plural(n, "machine"))
}
if n := e.applications; n > 0 {
contains = append(contains, plural(n, "application"))
}
if n := e.volumes; n > 0 {
contains = append(contains, plural(n, "volume"))
}
if n := e.filesystems; n > 0 {
contains = append(contains, plural(n, "filesystem"))
}
return msg + strings.Join(contains, ", ")
}

// IsModelNotEmptyError reports whether or not the given error was caused
// due to an operation requiring a model to be empty, where the model is
// non-empty.
func IsModelNotEmptyError(err error) bool {
_, ok := errors.Cause(err).(modelNotEmptyError)
return ok
}

// checkEmpty checks that the machine is empty of any entities that may
// require external resource cleanup. If the model is not empty, then
// an error will be returned.
Expand All @@ -1060,25 +1106,17 @@ func (m *Model) checkEmpty() error {
}
return errors.Annotatef(err, "getting entity references for model %s", m.UUID())
}
// These errors could be potentially swallowed as we re-try to destroy model.
// Let's, at least, log them for observation.
if n := len(doc.Machines); n > 0 {
logger.Infof("model is still not empty, has machines: %v", doc.Machines)
return errors.Errorf("model not empty, found %d machine(s)", n)
}
if n := len(doc.Applications); n > 0 {
logger.Infof("model is still not empty, has applications: %v", doc.Applications)
return errors.Errorf("model not empty, found %d application(s)", n)
}
if n := len(doc.Volumes); n > 0 {
logger.Infof("model is still not empty, has volumes: %v", doc.Volumes)
return errors.Errorf("model not empty, found %d volume(s)", n)

err := modelNotEmptyError{
machines: len(doc.Machines),
applications: len(doc.Applications),
volumes: len(doc.Volumes),
filesystems: len(doc.Filesystems),
}
if n := len(doc.Filesystems); n > 0 {
logger.Infof("model is still not empty, has file systems: %v", doc.Filesystems)
return errors.Errorf("model not empty, found %d filesystem(s)", n)
if err == (modelNotEmptyError{}) {
return nil
}
return nil
return err
}

func addModelMachineRefOp(st *State, machineId string) txn.Op {
Expand Down
29 changes: 21 additions & 8 deletions state/model_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,7 @@ func (s *ModelSuite) TestDestroyControllerNonEmptyModelFails(c *gc.C) {

env, err := s.State.Model()
c.Assert(err, jc.ErrorIsNil)
c.Assert(env.Destroy(), gc.ErrorMatches, "failed to destroy model: hosting 1 other models")
c.Assert(env.Destroy(), gc.ErrorMatches, "failed to destroy model: hosting 1 other model")
}

func (s *ModelSuite) TestDestroyControllerEmptyModel(c *gc.C) {
Expand Down Expand Up @@ -465,6 +465,8 @@ func (s *ModelSuite) TestDestroyControllerAndHostedModels(c *gc.C) {

c.Assert(env2.Refresh(), jc.ErrorIsNil)
c.Assert(env2.Life(), gc.Equals, state.Dead)
err = st2.RemoveAllModelDocs()
c.Assert(err, jc.ErrorIsNil)

c.Assert(s.State.ProcessDyingModel(), jc.ErrorIsNil)
c.Assert(env.Refresh(), jc.ErrorIsNil)
Expand Down Expand Up @@ -513,7 +515,8 @@ func (s *ModelSuite) TestDestroyControllerAndHostedModelsWithResources(c *gc.C)
assertEnv(controllerEnv, s.State, state.Dying, 0, 0)

err = s.State.ProcessDyingModel()
c.Assert(err, gc.ErrorMatches, `one or more hosted models are not yet dead`)
c.Assert(err, jc.Satisfies, state.IsHasHostedModelsError)
c.Assert(err, gc.ErrorMatches, `hosting 1 other model`)

assertCleanupCount(c, otherSt, 3)
assertAllMachinesDeadAndRemove(c, otherSt)
Expand All @@ -523,6 +526,12 @@ func (s *ModelSuite) TestDestroyControllerAndHostedModelsWithResources(c *gc.C)
c.Assert(otherEnv.Refresh(), jc.ErrorIsNil)
c.Assert(otherEnv.Life(), gc.Equals, state.Dead)

// Until the model is removed, we can't mark the controller model Dead.
err = s.State.ProcessDyingModel()
c.Assert(err, gc.ErrorMatches, `hosting 1 other model`)

err = otherSt.RemoveAllModelDocs()
c.Assert(err, jc.ErrorIsNil)
c.Assert(s.State.ProcessDyingModel(), jc.ErrorIsNil)
c.Assert(controllerEnv.Refresh(), jc.ErrorIsNil)
c.Assert(controllerEnv.Life(), gc.Equals, state.Dead)
Expand Down Expand Up @@ -566,7 +575,7 @@ func (s *ModelSuite) TestDestroyControllerRemoveEmptyAddNonEmptyModel(c *gc.C) {

env, err := s.State.Model()
c.Assert(err, jc.ErrorIsNil)
c.Assert(env.Destroy(), gc.ErrorMatches, "failed to destroy model: hosting 1 other models")
c.Assert(env.Destroy(), gc.ErrorMatches, "failed to destroy model: hosting 1 other model")
}

func (s *ModelSuite) TestDestroyControllerNonEmptyModelRace(c *gc.C) {
Expand All @@ -580,7 +589,7 @@ func (s *ModelSuite) TestDestroyControllerNonEmptyModelRace(c *gc.C) {

env, err := s.State.Model()
c.Assert(err, jc.ErrorIsNil)
c.Assert(env.Destroy(), gc.ErrorMatches, "failed to destroy model: hosting 1 other models")
c.Assert(env.Destroy(), gc.ErrorMatches, "failed to destroy model: hosting 1 other model")
}

func (s *ModelSuite) TestDestroyControllerAlreadyDyingRaceNoOp(c *gc.C) {
Expand Down Expand Up @@ -735,7 +744,8 @@ func (s *ModelSuite) TestProcessDyingModelWithMachinesAndServicesNoOp(c *gc.C) {
defer state.SetAfterHooks(c, st, func() {
assertEnv(state.Dying, 1, 1)
err := st.ProcessDyingModel()
c.Assert(err, gc.ErrorMatches, `model not empty, found 1 machine\(s\)`)
c.Assert(err, jc.Satisfies, state.IsModelNotEmptyError)
c.Assert(err, gc.ErrorMatches, `model not empty, found 1 machine, 1 application`)
assertEnv(state.Dying, 1, 1)
}).Check()

Expand Down Expand Up @@ -781,7 +791,8 @@ func (s *ModelSuite) TestProcessDyingModelWithVolumeBackedFilesystems(c *gc.C) {
// The filesystem will be gone, but the volume is persistent and should
// not have been removed.
err = st.ProcessDyingModel()
c.Assert(err, gc.ErrorMatches, `model not empty, found 1 volume\(s\)`)
c.Assert(err, jc.Satisfies, state.IsModelNotEmptyError)
c.Assert(err, gc.ErrorMatches, `model not empty, found 1 volume`)
}

func (s *ModelSuite) TestProcessDyingModelWithVolumes(c *gc.C) {
Expand Down Expand Up @@ -818,7 +829,8 @@ func (s *ModelSuite) TestProcessDyingModelWithVolumes(c *gc.C) {
// The volume is persistent and should not have been removed along with
// the machine it was attached to.
err = st.ProcessDyingModel()
c.Assert(err, gc.ErrorMatches, `model not empty, found 1 volume\(s\)`)
c.Assert(err, jc.Satisfies, state.IsModelNotEmptyError)
c.Assert(err, gc.ErrorMatches, `model not empty, found 1 volume`)
}

func (s *ModelSuite) TestProcessDyingControllerEnvironWithHostedEnvsNoOp(c *gc.C) {
Expand All @@ -832,7 +844,8 @@ func (s *ModelSuite) TestProcessDyingControllerEnvironWithHostedEnvsNoOp(c *gc.C
c.Assert(controllerEnv.DestroyIncludingHosted(), jc.ErrorIsNil)

err = s.State.ProcessDyingModel()
c.Assert(err, gc.ErrorMatches, `one or more hosted models are not yet dead`)
c.Assert(err, jc.Satisfies, state.IsHasHostedModelsError)
c.Assert(err, gc.ErrorMatches, `hosting 1 other model`)

c.Assert(controllerEnv.Refresh(), jc.ErrorIsNil)
c.Assert(controllerEnv.Life(), gc.Equals, state.Dying)
Expand Down
18 changes: 12 additions & 6 deletions state/undertaker.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,13 @@ import (

var ErrModelNotDying = errors.New("model is not dying")

// ProcessDyingModel checks if there are any machines or services left in
// state. If there are none, the model's life is changed from dying to dead.
// ProcessDyingModel checks if the model is Dying and empty, and if so,
// transitions the model to Dead.
//
// If the model is non-empty because it is the controller model and still
// contains hosted models, an error satisfying IsHasHostedModelsError will
// be returned. If the model is otherwise non-empty, an error satisfying
// IsNonEmptyModelError will be returned.
func (st *State) ProcessDyingModel() (err error) {
buildTxn := func(attempt int) ([]txn.Op, error) {
model, err := st.Model()
Expand All @@ -25,14 +30,15 @@ func (st *State) ProcessDyingModel() (err error) {
}

if st.IsController() {
// We should not mark the controller model as Dead until
// all hosted models have been removed, otherwise the
// hosted model environs may not have beeen destroyed.
models, err := st.AllModels()
if err != nil {
return nil, errors.Trace(err)
}
for _, model := range models {
if model.UUID() != st.ModelUUID() && model.Life() != Dead {
return nil, errors.Errorf("one or more hosted models are not yet dead")
}
if n := len(models) - 1; n > 0 {
return nil, errors.Trace(hasHostedModelsError(n))
}
}

Expand Down
20 changes: 9 additions & 11 deletions worker/undertaker/undertaker.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,30 +154,28 @@ func (u *Undertaker) processDyingModel() error {
if err := u.catacomb.Add(watcher); err != nil {
return errors.Trace(err)
}
defer watcher.Kill() // The watcher is not needed once this func returns.
defer watcher.Kill()

attempt := 1
for {
select {
case <-u.catacomb.Dying():
return u.catacomb.ErrDying()
case <-watcher.Changes():
// TODO(fwereade): this is wrong. If there's a time
// it's ok to ignore an error, it's when we know
// exactly what an error is/means. If there's a
// specific code for "not done yet", *that* is what
// we should be ignoring. But unknown errors are
// *unknown*, and we can't just assume that it's
// safe to continue.
err := u.config.Facade.ProcessDyingModel()
if err == nil {
// ProcessDyingModel succeeded. We're free to
// destroy any remaining environ resources.
return nil
}
// Yes, we ignore the error. See comment above. But let's at least
// surface it in status.
u.setStatus(status.Destroying, fmt.Sprintf("attempt %d to destroy model failed (will retry): %v", attempt, err))
if !params.IsCodeModelNotEmpty(err) && !params.IsCodeHasHostedModels(err) {
return errors.Trace(err)
}
// Retry once there are changes to the model's resources.
u.setStatus(
status.Destroying,
fmt.Sprintf("attempt %d to destroy model failed (will retry): %v", attempt, err),
)
}
attempt++
}
Expand Down
24 changes: 22 additions & 2 deletions worker/undertaker/undertaker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,9 @@ func (s *UndertakerSuite) TestProcessDyingModelErrorRetried(c *gc.C) {
nil, // ModelInfo
nil, // SetStatus
nil, // WatchModelResources,
errors.New("meh, will retry"), // ProcessDyingModel,
&params.Error{Code: params.CodeHasHostedModels},
nil, // SetStatus
errors.New("will retry again"), // ProcessDyingModel,
&params.Error{Code: params.CodeModelNotEmpty},
nil, // SetStatus
nil, // ProcessDyingModel,
nil, // SetStatus
Expand All @@ -146,6 +146,26 @@ func (s *UndertakerSuite) TestProcessDyingModelErrorRetried(c *gc.C) {
)
}

func (s *UndertakerSuite) TestProcessDyingModelErrorFatal(c *gc.C) {
s.fix.errors = []error{
nil, // ModelInfo
nil, // SetStatus
nil, // WatchModelResources,
errors.New("nope"),
}
s.fix.dirty = true
stub := s.fix.run(c, func(w worker.Worker) {
err := workertest.CheckKilled(c, w)
c.Check(err, gc.ErrorMatches, "nope")
})
stub.CheckCallNames(c,
"ModelInfo",
"SetStatus",
"WatchModelResources",
"ProcessDyingModel",
)
}

func (s *UndertakerSuite) TestDestroyErrorFatal(c *gc.C) {
s.fix.errors = []error{nil, nil, errors.New("pow")}
s.fix.info.Result.Life = "dead"
Expand Down

0 comments on commit bf0de30

Please sign in to comment.