Merge pull request #3558 from axw/lp1444912-errterminateagent-nouninstall

cmd/jujud/agent: rework uninstall-gating logic

The fix for https://bugs.launchpad.net/juju-core/+bug/1464304
was too narrowly focused on the SIGABRT trigger, and so this
commit changes how we gate uninstall.

The termination worker was doing the uninstall-agent file
check, but that check really belongs at the top level of the
machine agent, because we attempt to uninstall whenever the
agent gets an ErrTerminateAgent. That error can arise in
various ways, such as SIGABRT, or simply because the agent made
a bad authorization attempt when opening the API connection.
So we move the check to the top level and remove it from the
termination worker.
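
To illustrate the gating pattern, here is a minimal,
self-contained sketch (the "uninstall-agent" file name matches
agent.UninstallAgentFile; the helper and the data directory used
in main are hypothetical):

    package main

    import (
        "fmt"
        "os"
        "path/filepath"
    )

    // shouldUninstall reports whether an ErrTerminateAgent should
    // escalate to a full uninstall: only when the uninstall-agent
    // marker file exists in the agent's data directory.
    func shouldUninstall(dataDir string) bool {
        _, err := os.Stat(filepath.Join(dataDir, "uninstall-agent"))
        return err == nil
    }

    func main() {
        dataDir := "/var/lib/juju" // hypothetical data directory
        if shouldUninstall(dataDir) {
            fmt.Println("terminating and uninstalling agent")
        } else {
            fmt.Println("terminating without uninstalling")
        }
    }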

For this to work and still uninstall when the machine is destroyed,
we must make deeper changes:
 - the machiner worker must inform the agent code when it has
   set the machine to Dead. Immediately after the machine is set
   to Dead, we write out the uninstall-agent file so the ensuing
   ErrTerminateAgent will cause an uninstall (see the sketch
   after this list)
 - if the machine is Dead in state when the agent connects, we
   also want to write out the uninstall-agent file. This means
   changing worker/apicaller to not error on Dead entities,
   leaving that check to the caller
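
A minimal sketch of the machiner change above (Config is trimmed
to the one relevant field; NotifyMachineDead matches the name in
the diff below, but everything else here is simplified):

    package main

    import (
        "fmt"
        "io/ioutil"
        "path/filepath"
    )

    // Config stands in for the relevant slice of machiner.Config:
    // the agent supplies NotifyMachineDead, and the machiner calls
    // it immediately after setting the machine entity to Dead.
    type Config struct {
        NotifyMachineDead func() error
    }

    // setMachineDead mimics the machiner's shutdown path: mark the
    // machine Dead in state, then notify the agent so the
    // uninstall-agent file is on disk before ErrTerminateAgent
    // propagates.
    func setMachineDead(cfg Config) error {
        // ... set the machine entity to Dead in state here ...
        if cfg.NotifyMachineDead != nil {
            return cfg.NotifyMachineDead()
        }
        return nil
    }

    func main() {
        dataDir, err := ioutil.TempDir("", "juju-data")
        if err != nil {
            panic(err)
        }
        cfg := Config{
            NotifyMachineDead: func() error {
                // Same idea as writeUninstallAgentFile in the diff.
                return ioutil.WriteFile(
                    filepath.Join(dataDir, "uninstall-agent"), nil, 0644)
            },
        }
        if err := setMachineDead(cfg); err != nil {
            fmt.Println("notify failed:", err)
        }
        fmt.Println("uninstall-agent written under", dataDir)
    }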

A test has been removed from the unit agent; it was in the wrong
place (it called worker/apicaller code directly and asserted on
its results). More importantly, this behaviour has changed: we
now rely on the worker/uniter code to return ErrTerminateAgent
when it encounters the Dead unit entity.
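
In sketch form, the check that worker/apicaller used to perform
now lives with the caller (params.Dead and worker.ErrTerminateAgent
are the real identifiers; the import paths assume the 2015-era
juju tree):

    package lifecheck

    import (
        "github.com/juju/juju/apiserver/params"
        "github.com/juju/juju/worker"
    )

    // checkLife is what callers now do for themselves: a Dead
    // entity translates to ErrTerminateAgent, rather than the API
    // opener failing early on the caller's behalf.
    func checkLife(life params.Life) error {
        if life == params.Dead {
            return worker.ErrTerminateAgent
        }
        return nil
    }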

Fixes https://bugs.launchpad.net/juju-core/+bug/1444912

(Review request: http://reviews.vapour.ws/r/2958/)
jujubot committed Oct 27, 2015
2 parents 05e94fe + b25b4cc commit 4e3972b
Showing 10 changed files with 364 additions and 199 deletions.
105 changes: 53 additions & 52 deletions cmd/jujud/agent/machine.go
@@ -6,6 +6,7 @@ package agent
import (
"fmt"
"io"
"io/ioutil"
"net"
"os"
"path/filepath"
@@ -34,7 +35,6 @@ import (

"github.com/juju/juju/agent"
"github.com/juju/juju/api"
apiagent "github.com/juju/juju/api/agent"
apideployer "github.com/juju/juju/api/deployer"
apilogsender "github.com/juju/juju/api/logsender"
"github.com/juju/juju/api/metricsmanager"
@@ -450,11 +450,7 @@ func (a *MachineAgent) Run(*cmd.Context) error {
a.runner.StartWorker("api", a.APIWorker)
a.runner.StartWorker("statestarter", a.newStateStarterWorker)
a.runner.StartWorker("termination", func() (worker.Worker, error) {
return startTerminationWorker(
agentConfig.DataDir(),
terminationworker.NewWorker,
os.Stat,
), nil
return terminationworker.NewWorker(), nil
})

// At this point, all workers will have been configured to start
@@ -475,44 +471,18 @@ func (a *MachineAgent) Run(*cmd.Context) error {
return err
}

// startTerminationWorker starts a new termination worker that will cause
// the machine agent to uninstall if the uninstall-agent file is present.
func startTerminationWorker(
dataDir string,
newTerminationWorker func(func() error) worker.Worker,
statFile func(string) (os.FileInfo, error),
) worker.Worker {
uninstallFile := filepath.Join(dataDir, agent.UninstallAgentFile)
terminationError := func() error {
// If the uninstall file exists, then the termination
// signal should cause the agent to uninstall; otherwise
// it should just restart the workers.
if _, err := statFile(uninstallFile); err == nil {
return worker.ErrTerminateAgent
}
logger.Debugf(
"uninstall file %q does not exist",
uninstallFile,
)
return &cmdutil.FatalError{fmt.Sprintf(
"%q signal received",
terminationworker.TerminationSignal,
)}
}
return newTerminationWorker(terminationError)
}

func (a *MachineAgent) executeRebootOrShutdown(action params.RebootAction) error {
agentCfg := a.CurrentConfig()
// At this stage, all API connections would have been closed
// We need to reopen the API to clear the reboot flag after
// scheduling the reboot. It may be cleaner to do this in the reboot
// worker, before returning the ErrRebootMachine.
st, _, err := apicaller.OpenAPIState(a)
st, err := apicaller.OpenAPIState(a)
if err != nil {
logger.Infof("Reboot: Error connecting to state")
return errors.Trace(err)
}

// block until all units/containers are ready, and reboot/shutdown
finalize, err := reboot.NewRebootWaiter(st, agentCfg)
if err != nil {
@@ -669,7 +639,7 @@ func (a *MachineAgent) stateStarter(stopch <-chan struct{}) error {
// APIWorker returns a Worker that connects to the API and starts any
// workers that need an API connection.
func (a *MachineAgent) APIWorker() (_ worker.Worker, err error) {
st, entity, err := apicaller.OpenAPIState(a)
st, err := apicaller.OpenAPIState(a)
if err != nil {
return nil, errors.Trace(err)
}
@@ -689,8 +659,21 @@ func (a *MachineAgent) APIWorker() (_ worker.Worker, err error) {
}
}()

machine, err := st.Agent().Entity(a.Tag())
if err != nil {
return nil, errors.Trace(err)
}

agentConfig := a.CurrentConfig()
for _, job := range entity.Jobs() {
if machine.Life() == params.Dead {
logger.Errorf("agent terminating - %s is dead", names.ReadableString(a.Tag()))
if err := writeUninstallAgentFile(agentConfig.DataDir()); err != nil {
return nil, errors.Annotate(err, "writing uninstall agent file")
}
return nil, worker.ErrTerminateAgent
}

for _, job := range machine.Jobs() {
if job.NeedsState() {
info, err := st.Agent().StateServingInfo()
if err != nil {
@@ -713,11 +696,11 @@ func (a *MachineAgent) APIWorker() (_ worker.Worker, err error) {
// Run the agent upgrader and the upgrade-steps worker without waiting for
// the upgrade steps to complete.
runner.StartWorker("upgrader", a.agentUpgraderWorkerStarter(st.Upgrader(), agentConfig))
runner.StartWorker("upgrade-steps", a.upgradeStepsWorkerStarter(st, entity.Jobs()))
runner.StartWorker("upgrade-steps", a.upgradeStepsWorkerStarter(st, machine.Jobs()))

// All other workers must wait for the upgrade steps to complete before starting.
a.startWorkerAfterUpgrade(runner, "api-post-upgrade", func() (worker.Worker, error) {
return a.postUpgradeAPIWorker(st, agentConfig, entity)
return a.postUpgradeAPIWorker(st, agentConfig, machine.Jobs())
})

return cmdutil.NewCloseWorker(logger, runner, st), nil // Note: a worker.Runner is itself a worker.Worker.
@@ -726,11 +709,11 @@ func (a *MachineAgent) APIWorker() (_ worker.Worker, err error) {
func (a *MachineAgent) postUpgradeAPIWorker(
st api.Connection,
agentConfig agent.Config,
entity *apiagent.Entity,
machineJobs []multiwatcher.MachineJob,
) (worker.Worker, error) {

var isEnvironManager bool
for _, job := range entity.Jobs() {
for _, job := range machineJobs {
if job == multiwatcher.JobManageEnviron {
isEnvironManager = true
break
@@ -773,7 +756,14 @@ func (a *MachineAgent) postUpgradeAPIWorker(
}
runner.StartWorker("machiner", func() (worker.Worker, error) {
accessor := machiner.APIMachineAccessor{st.Machiner()}
return newMachiner(accessor, agentConfig, ignoreMachineAddresses), nil
return newMachiner(machiner.Config{
MachineAccessor: accessor,
Tag: agentConfig.Tag().(names.MachineTag),
ClearMachineAddressesOnStart: ignoreMachineAddresses,
NotifyMachineDead: func() error {
return writeUninstallAgentFile(agentConfig.DataDir())
},
})
})
runner.StartWorker("reboot", func() (worker.Worker, error) {
reboot, err := st.Reboot()
@@ -843,7 +833,7 @@ func (a *MachineAgent) postUpgradeAPIWorker(

// Start networker depending on configuration and job.
intrusiveMode := false
for _, job := range entity.Jobs() {
for _, job := range machineJobs {
if job == multiwatcher.JobManageNetworking {
intrusiveMode = true
break
@@ -864,14 +854,14 @@
}

// Perform the operations needed to set up hosting for containers.
if err := a.setupContainerSupport(runner, st, entity, agentConfig); err != nil {
if err := a.setupContainerSupport(runner, st, agentConfig); err != nil {
cause := errors.Cause(err)
if params.IsCodeDead(cause) || cause == worker.ErrTerminateAgent {
return nil, worker.ErrTerminateAgent
}
return nil, fmt.Errorf("setting up container support: %v", err)
}
for _, job := range entity.Jobs() {
for _, job := range machineJobs {
switch job {
case multiwatcher.JobHostUnits:
runner.StartWorker("deployer", func() (worker.Worker, error) {
@@ -947,7 +937,7 @@ var shouldWriteProxyFiles = func(conf agent.Config) bool {

// setupContainerSupport determines what containers can be run on this machine and
// initialises suitable infrastructure to support such containers.
func (a *MachineAgent) setupContainerSupport(runner worker.Runner, st api.Connection, entity *apiagent.Entity, agentConfig agent.Config) error {
func (a *MachineAgent) setupContainerSupport(runner worker.Runner, st api.Connection, agentConfig agent.Config) error {
var supportedContainers []instance.ContainerType
// LXC containers are only supported on bare metal and fully virtualized linux systems
// Nested LXC containers and Windows machines cannot run LXC containers
@@ -966,7 +956,7 @@ func (a *MachineAgent) setupContainerSupport(runner worker.Runner, st api.Connec
if err == nil && supportsKvm {
supportedContainers = append(supportedContainers, instance.KVM)
}
return a.updateSupportedContainers(runner, st, entity.Tag(), supportedContainers, agentConfig)
return a.updateSupportedContainers(runner, st, supportedContainers, agentConfig)
}

// updateSupportedContainers records in state that a machine can run the specified containers.
@@ -976,15 +966,11 @@ func (a *MachineAgent) setupContainerSupport(runner worker.Runner, st api.Connec
func (a *MachineAgent) updateSupportedContainers(
runner worker.Runner,
st api.Connection,
machineTag string,
containers []instance.ContainerType,
agentConfig agent.Config,
) error {
pr := st.Provisioner()
tag, err := names.ParseMachineTag(machineTag)
if err != nil {
return err
}
tag := agentConfig.Tag().(names.MachineTag)
machine, err := pr.Machine(tag)
if errors.IsNotFound(err) || err == nil && machine.Life() == params.Dead {
return worker.ErrTerminateAgent
@@ -1741,8 +1727,23 @@ func (a *MachineAgent) createJujuRun(dataDir string) error {
return symlink.New(jujud, JujuRun)
}

// writeUninstallAgentFile creates the uninstall-agent file on disk,
// which will cause the agent to uninstall itself when it encounters
// the ErrTerminateAgent error.
func writeUninstallAgentFile(dataDir string) error {
uninstallFile := filepath.Join(dataDir, agent.UninstallAgentFile)
return ioutil.WriteFile(uninstallFile, nil, 0644)
}

func (a *MachineAgent) uninstallAgent(agentConfig agent.Config) error {
logger.Infof("machine agent uninstalling itself")
// We should only uninstall if the uninstall file is present.
uninstallFile := filepath.Join(agentConfig.DataDir(), agent.UninstallAgentFile)
if _, err := os.Stat(uninstallFile); err != nil {
logger.Debugf("uninstall file %q does not exist", uninstallFile)
return nil
}
logger.Infof("%q found, uninstalling agent", uninstallFile)

var errors []error
agentServiceName := agentConfig.Value(agent.AgentServiceName)
if agentServiceName == "" {
62 changes: 21 additions & 41 deletions cmd/jujud/agent/machine_test.go
@@ -392,14 +392,30 @@ func (s *MachineSuite) TestWithDeadMachine(c *gc.C) {
}

func (s *MachineSuite) TestWithRemovedMachine(c *gc.C) {
m, _, _ := s.primeAgent(c, state.JobHostUnits)
m, ac, _ := s.primeAgent(c, state.JobHostUnits)
err := m.EnsureDead()
c.Assert(err, jc.ErrorIsNil)
err = m.Remove()
c.Assert(err, jc.ErrorIsNil)
a := s.newAgent(c, m)
err = runWithTimeout(a)
c.Assert(err, jc.ErrorIsNil)

// Since the machine is removed from state, when
// the agent attempts to open the API connection
// it will receive an error stating that either
// the entity does not exist, or the agent is
// not authorised. Since we don't know which, we
// do not uninstall unless the uninstall-agent
// file is found (which we haven't written in
// this test).
//
// Since the machine agent is responsible for
// setting itself to Dead, this could only happen
// if it failed to write the uninstall-agent file,
// and then bounced.
_, err = os.Stat(ac.DataDir())
c.Assert(err, jc.ErrorIsNil)
}

func (s *MachineSuite) TestDyingMachine(c *gc.C) {
@@ -1270,11 +1286,10 @@ func (s *MachineSuite) runOpenAPISTateTest(c *gc.C, machine *state.Machine, conf
agent := NewAgentConf(conf.DataDir())
err := agent.ReadConfig(tagString)
c.Assert(err, jc.ErrorIsNil)
st, gotEntity, err := apicaller.OpenAPIState(agent)
st, err := apicaller.OpenAPIState(agent)
c.Assert(err, jc.ErrorIsNil)
c.Assert(st, gc.NotNil)
st.Close()
c.Assert(gotEntity.Tag(), gc.Equals, tagString)
}
assertOpen()

@@ -1841,16 +1856,12 @@ func (s *MachineSuite) TestMachineAgentNetworkerMode(c *gc.C) {
func (s *MachineSuite) TestMachineAgentIgnoreAddresses(c *gc.C) {
for _, expectedIgnoreValue := range []bool{true, false} {
ignoreAddressCh := make(chan bool, 1)
s.AgentSuite.PatchValue(&newMachiner, func(
accessor machiner.MachineAccessor,
conf agent.Config,
ignoreMachineAddresses bool,
) worker.Worker {
s.AgentSuite.PatchValue(&newMachiner, func(cfg machiner.Config) (worker.Worker, error) {
select {
case ignoreAddressCh <- ignoreMachineAddresses:
case ignoreAddressCh <- cfg.ClearMachineAddressesOnStart:
default:
}
return machiner.NewMachiner(accessor, conf, ignoreMachineAddresses)
return machiner.NewMachiner(cfg)
})

attrs := coretesting.Attrs{"ignore-machine-addresses": expectedIgnoreValue}
@@ -2269,37 +2280,6 @@ func (s *shouldWriteProxyFilesSuite) TestAll(c *gc.C) {
}
}

type machineAgentTerminationSuite struct {
coretesting.BaseSuite
}

var _ = gc.Suite(&machineAgentTerminationSuite{})

func (*machineAgentTerminationSuite) TestStartTerminationWorker(c *gc.C) {
var stub gitjujutesting.Stub
statFile := func(path string) (os.FileInfo, error) {
stub.AddCall("Stat", path)
return nil, stub.NextErr()
}
var errorFunction func() error
newTerminationWorker := func(f func() error) worker.Worker {
errorFunction = f
return nil
}
startTerminationWorker("data-dir", newTerminationWorker, statFile)
c.Assert(errorFunction, gc.NotNil)

stub.SetErrors(os.ErrNotExist, nil)
errorResult := errorFunction()
c.Assert(errorResult, gc.FitsTypeOf, (*cmdutil.FatalError)(nil))
c.Assert(errorResult, gc.ErrorMatches, `"[aA]borted" signal received`)
stub.CheckCall(c, 0, "Stat", filepath.Join("data-dir", "uninstall-agent"))

// No error returned from Stat == uninstall-agent exists.
c.Assert(errorFunction(), gc.Equals, worker.ErrTerminateAgent)
stub.CheckCall(c, 1, "Stat", filepath.Join("data-dir", "uninstall-agent"))
}

type mockAgentConfig struct {
agent.Config
providerType string
14 changes: 2 additions & 12 deletions cmd/jujud/agent/unit_test.go
@@ -277,11 +277,10 @@ func (s *UnitSuite) TestOpenAPIState(c *gc.C) {
agent := NewAgentConf(conf.DataDir())
err := agent.ReadConfig(conf.Tag().String())
c.Assert(err, jc.ErrorIsNil)
st, gotEntity, err := apicaller.OpenAPIState(agent)
st, err := apicaller.OpenAPIState(agent)
c.Assert(err, jc.ErrorIsNil)
c.Assert(st, gc.NotNil)
st.Close()
c.Assert(gotEntity.Tag(), gc.Equals, unit.Tag().String())
}
assertOpen()

@@ -309,16 +308,7 @@ func (s *UnitSuite) TestOpenAPIState(c *gc.C) {

func (s *UnitSuite) TestOpenAPIStateWithBadCredsTerminates(c *gc.C) {
conf, _ := s.PrimeAgent(c, names.NewUnitTag("missing/0"), "no-password")
_, _, err := apicaller.OpenAPIState(fakeConfAgent{conf: conf})
c.Assert(err, gc.Equals, worker.ErrTerminateAgent)
}

func (s *UnitSuite) TestOpenAPIStateWithDeadEntityTerminates(c *gc.C) {
_, unit, conf, _ := s.primeAgent(c)
err := unit.EnsureDead()
c.Assert(err, jc.ErrorIsNil)

_, _, err = apicaller.OpenAPIState(fakeConfAgent{conf: conf})
_, err := apicaller.OpenAPIState(fakeConfAgent{conf: conf})
c.Assert(err, gc.Equals, worker.ErrTerminateAgent)
}
