// Copyright 2019 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package upgradedatabase
import (
	"fmt"
	"time"

	"github.com/juju/errors"
	"github.com/juju/names/v5"
	"github.com/juju/retry"
	"github.com/juju/version/v2"
	"github.com/juju/worker/v3"
	"gopkg.in/tomb.v2"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/core/status"
	"github.com/juju/juju/state"
	"github.com/juju/juju/upgrades"
	jujuversion "github.com/juju/juju/version"
	"github.com/juju/juju/worker/gate"
	"github.com/juju/juju/wrench"
)

// NewLock creates a gate.Lock to be used to synchronise workers
// that need to start after database upgrades have completed.
// The returned Lock should be passed to NewWorker.
// If the agent has already upgraded to the current version,
// then the lock will be returned in the released state.
func NewLock(agentConfig agent.Config) gate.Lock {
	lock := gate.NewLock()

	// Build numbers are irrelevant to upgrade steps.
	upgradedToVersion := agentConfig.UpgradedToVersion().ToPatch()
	currentVersion := jujuversion.Current.ToPatch()
	if upgradedToVersion == currentVersion {
		lock.Unlock()
	}
	return lock
}
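
// exampleNewWorker is a minimal sketch of how NewLock and NewWorker are
// intended to fit together. It is illustrative only: in the real agent this
// wiring is done by a dependency-engine manifold, and the openState,
// performUpgrade, logger, clock and strategy arguments here are assumptions
// standing in for whatever that manifold actually supplies.
func exampleNewWorker(
	a agent.Agent,
	openState func() (Pool, error),
	performUpgrade func(version.Number, []upgrades.Target, func() upgrades.Context) error,
	logger Logger,
	clk Clock,
	strategy retry.CallArgs,
) (worker.Worker, error) {
	cfg := a.CurrentConfig()
	return NewWorker(Config{
		// Released immediately if the agent is already on jujuversion.Current.
		UpgradeComplete: NewLock(cfg),
		Tag:             cfg.Tag(),
		Agent:           a,
		Logger:          logger,
		OpenState:       openState,
		PerformUpgrade:  performUpgrade,
		RetryStrategy:   strategy,
		Clock:           clk,
	})
}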

// Config is the configuration needed to construct an upgradeDB worker.
type Config struct {
	// UpgradeComplete is a lock used to synchronise workers that must start
	// after database upgrades are verified as completed.
	UpgradeComplete gate.Lock

	// Tag is the current machine tag.
	Tag names.Tag

	// Agent is the running machine agent.
	Agent agent.Agent

	// Logger is the logger for this worker.
	Logger Logger

	// OpenState is a function pointer for returning a state pool indirection.
	OpenState func() (Pool, error)

	// PerformUpgrade is a function pointer for executing the DB upgrade steps.
	// Context retrieval is lazy because it requires a real state.StatePool
	// that we cast our Pool indirection back to.
	// We need the concrete type, because we are unable to indirect all the
	// state methods that upgrade steps might require.
	// This is OK for in-theatre operation, but is not suitable for testing.
	PerformUpgrade func(version.Number, []upgrades.Target, func() upgrades.Context) error

	// RetryStrategy is the strategy to use for re-attempting failed upgrades.
	RetryStrategy retry.CallArgs

	// Clock is used to enforce time-out logic for controllers waiting for
	// the upgrade to run on the MongoDB primary.
	Clock Clock
}

// Validate returns an error if the worker config is not valid.
func (cfg Config) Validate() error {
	if cfg.UpgradeComplete == nil {
		return errors.NotValidf("nil UpgradeComplete lock")
	}
	if cfg.Tag == nil {
		return errors.NotValidf("nil machine tag")
	}
	k := cfg.Tag.Kind()
	if k != names.MachineTagKind && k != names.ControllerAgentTagKind {
		return errors.NotValidf("%q tag kind", k)
	}
	if cfg.Agent == nil {
		return errors.NotValidf("nil Agent")
	}
	if cfg.Logger == nil {
		return errors.NotValidf("nil Logger")
	}
	if cfg.OpenState == nil {
		return errors.NotValidf("nil OpenState function")
	}
	if cfg.PerformUpgrade == nil {
		return errors.NotValidf("nil PerformUpgrade function")
	}
	if cfg.RetryStrategy.Clock == nil {
		return errors.NotValidf("nil RetryStrategy Clock")
	}
	if cfg.RetryStrategy.Delay == 0 {
		return errors.NotValidf("zero value for RetryStrategy Delay")
	}
	if cfg.RetryStrategy.Attempts == 0 && cfg.RetryStrategy.MaxDuration == 0 {
		return errors.NotValidf("zero value for RetryStrategy Attempts and MaxDuration")
	}
	if cfg.Clock == nil {
		return errors.NotValidf("nil Clock")
	}
	return nil
}

// upgradeDB is a worker that will run on a controller machine.
// It is responsible for running upgrade steps of type `DatabaseMaster` on the
// primary MongoDB instance.
type upgradeDB struct {
	tomb            tomb.Tomb
	upgradeComplete gate.Lock

	tag            names.Tag
	agent          agent.Agent
	logger         Logger
	pool           Pool
	performUpgrade func(version.Number, []upgrades.Target, func() upgrades.Context) error
	upgradeInfo    UpgradeInfo
	retryStrategy  retry.CallArgs
	clock          Clock

	fromVersion version.Number
	toVersion   version.Number
}

// NewWorker validates the input configuration, then uses it to create,
// start and return an upgradeDB worker.
func NewWorker(cfg Config) (worker.Worker, error) {
	var err error
	if err = cfg.Validate(); err != nil {
		return nil, errors.Trace(err)
	}

	w := &upgradeDB{
		upgradeComplete: cfg.UpgradeComplete,
		tag:             cfg.Tag,
		agent:           cfg.Agent,
		logger:          cfg.Logger,
		performUpgrade:  cfg.PerformUpgrade,
		retryStrategy:   cfg.RetryStrategy,
		clock:           cfg.Clock,
	}
	if w.pool, err = cfg.OpenState(); err != nil {
		return nil, err
	}

	w.tomb.Go(w.run)
	return w, nil
}
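
// run is the worker's main loop. It exits immediately if the upgrade is
// already done; otherwise it either runs the database upgrade steps (on the
// MongoDB primary) or watches for the primary to complete them.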
func (w *upgradeDB) run() error {
	defer func() {
		if err := w.pool.Close(); err != nil {
			w.logger.Errorf("failed closing state pool: %v", err)
		}
	}()

	if w.upgradeDone() {
		return nil
	}

	isPrimary, err := w.pool.IsPrimary(w.tag.Id())
	if err != nil {
		return errors.Trace(err)
	}

	// Ensure that an upgrade document exists in order to monitor this
	// upgrade. This is the same document that will be used by the
	// `upgradesteps` worker that executes subsequently.
	// In this worker we use it as a distributed lock - once the status
	// reports `UpgradeDBComplete`, the `upgradeComplete` member unlocks on
	// each controller running this worker.
	if w.upgradeInfo, err = w.pool.EnsureUpgradeInfo(w.tag.Id(), w.fromVersion, w.toVersion); err != nil {
		return errors.Annotate(err, "retrieving upgrade info")
	}

	// If we are the primary we need to run the upgrade steps.
	// Otherwise we watch state and unlock once the primary has run the steps.
	if isPrimary {
		err = w.runUpgrade()
	} else {
		err = w.watchUpgrade()
	}
	return errors.Trace(err)
}

// upgradeDone returns true if this worker
// does not need to run any upgrade logic.
func (w *upgradeDB) upgradeDone() bool {
	// If we are already unlocked, there is nothing to do.
	if w.upgradeComplete.IsUnlocked() {
		return true
	}

	// If we are already on the current version, there is nothing to do.
	w.fromVersion = w.agent.CurrentConfig().UpgradedToVersion()
	w.toVersion = jujuversion.Current
	if w.fromVersion == w.toVersion {
		w.logger.Infof("database upgrade for %v already completed", w.toVersion)
		w.upgradeComplete.Unlock()
		return true
	}

	return false
}
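
// runUpgrade executes the database upgrade steps on the MongoDB primary,
// then updates the shared upgrade document so that the other controllers
// observe completion and release their own locks.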
func (w *upgradeDB) runUpgrade() error {
	w.logger.Infof("running database upgrade for %v on mongodb primary", w.toVersion)
	w.setStatus(status.Started, fmt.Sprintf("upgrading database for %v", w.toVersion))

	if err := w.agent.ChangeConfig(w.runUpgradeSteps); err != nil {
		w.setFailStatus()
		return errors.Trace(err)
	}

	// Update the upgrade status document to unlock the other controllers.
	if err := w.upgradeInfo.SetStatus(state.UpgradeDBComplete); err != nil {
		w.setFailStatus()
		return errors.Trace(err)
	}

	w.logger.Infof("database upgrade for %v completed successfully", w.toVersion)
	w.setStatus(status.Started, fmt.Sprintf("database upgrade for %v completed", w.toVersion))
	w.upgradeComplete.Unlock()
	return nil
}

// runUpgradeSteps runs the required database upgrade steps for the agent,
// retrying on failure.
func (w *upgradeDB) runUpgradeSteps(agentConfig agent.ConfigSetter) error {
	contextGetter := w.contextGetter(agentConfig)

	retryStrategy := w.retryStrategy
	retryStrategy.Func = func() error {
		return w.performUpgrade(w.fromVersion, []upgrades.Target{upgrades.DatabaseMaster}, contextGetter)
	}
	retryStrategy.NotifyFunc = func(lastError error, attempt int) {
		w.reportUpgradeFailure(lastError, attempt != retryStrategy.Attempts)
	}

	err := retry.Call(retryStrategy)
	if retry.IsAttemptsExceeded(err) || retry.IsDurationExceeded(err) {
		err = retry.LastError(err)
	}
	return errors.Trace(err)
}
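
// retrySketch is a minimal sketch of the retry pattern used above, assuming
// the caller supplies a strategy already populated the way Validate demands
// (non-nil Clock, non-zero Delay, and Attempts or MaxDuration set). The
// strategy's Func runs repeatedly until it succeeds or the strategy is
// exhausted, NotifyFunc observes each failure, and a terminal "attempts
// exceeded" or "duration exceeded" error is unwrapped to the last real
// failure before being returned.
func retrySketch(strategy retry.CallArgs, step func() error, logger Logger) error {
	strategy.Func = step
	strategy.NotifyFunc = func(lastError error, attempt int) {
		logger.Errorf("attempt %d failed: %v", attempt, lastError)
	}

	err := retry.Call(strategy)
	if retry.IsAttemptsExceeded(err) || retry.IsDurationExceeded(err) {
		// Report the underlying failure rather than the retry bookkeeping.
		err = retry.LastError(err)
	}
	return errors.Trace(err)
}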

// contextGetter returns a function that creates an upgrade context.
// Note that the performUpgrade method passed by the manifold calls
// upgrades.PerformStateUpgrade, which only uses the StateContext from this
// context. We can set the API connection to nil - it should never be used.
func (w *upgradeDB) contextGetter(agentConfig agent.ConfigSetter) func() upgrades.Context {
	return func() upgrades.Context {
		return upgrades.NewContext(agentConfig, nil, upgrades.NewStateBackend(w.pool.(*pool).StatePool))
	}
}
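
// watchUpgrade runs on controllers other than the MongoDB primary. It waits,
// subject to a timeout, for the primary to report completion of the database
// upgrade steps via the shared upgrade document, while also polling for a
// change in this node's own primacy, which invalidates the wait.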
func (w *upgradeDB) watchUpgrade() error {
	w.logger.Infof("waiting for database upgrade on mongodb primary")
	w.setStatus(status.Started, fmt.Sprintf("waiting on primary database upgrade for %v", w.toVersion))

	if wrench.IsActive("upgrade-database", "watch-upgrade") {
		// Simulate an error causing the upgrade to fail.
		w.setFailStatus()
		return errors.New("unable to upgrade - wrench in works")
	}

	timeout := w.clock.After(10 * time.Minute)

	watcher := w.upgradeInfo.Watch()
	defer func() { _ = watcher.Stop() }()

	// Re-read the upgrade document after starting the watcher, so that we
	// operate on the latest information. Otherwise there is a race in which
	// we could fail to observe a change.
	if err := w.upgradeInfo.Refresh(); err != nil {
		w.logger.Errorf("unable to refresh upgrade info: %v", err)
		w.setFailStatus()
		return err
	}

	// To be here, this node previously reported false for isPrimary.
	// The primary can change, or primacy can be reported incorrectly when
	// checked too early. If this node's state changes while we are watching,
	// escalate an error, which results in the worker being restarted.
	stateChanged := make(chan struct{})
	done := make(chan struct{})
	defer close(done)
	go func() {
		for {
			select {
			case <-done:
				return
			case <-w.clock.After(5 * time.Second):
				isPrimary, err := w.pool.IsPrimary(w.tag.Id())
				if isPrimary || err != nil {
					if err != nil {
						w.logger.Errorf("failed to check if this node is primary: %v", err)
					}
					close(stateChanged)
					return
				}
			}
		}
	}()

	for {
		// If the primary has already run the database steps then the status
		// will be "db-complete", but it may have progressed further on to
		// the upgrade steps, so we check for that status too.
		switch w.upgradeInfo.Status() {
		case state.UpgradeDBComplete, state.UpgradeRunning:
			w.logger.Infof("finished waiting - database upgrade steps completed on mongodb primary")
			w.setStatus(status.Started, fmt.Sprintf("confirmed primary database upgrade for %v", w.toVersion))
			w.upgradeComplete.Unlock()
			return nil
		default:
			// Continue waiting for another change.
		}

		select {
		case <-watcher.Changes():
			if err := w.upgradeInfo.Refresh(); err != nil {
				w.setFailStatus()
				return errors.Trace(err)
			}
		case <-stateChanged:
			w.logger.Infof("primary changed mid-upgrade to this watching host; restarting upgrade")
			return errors.New("mongo primary state changed")
		case <-timeout:
			w.setFailStatus()
			return errors.New("timed out waiting for primary database upgrade")
		case <-w.tomb.Dying():
			return tomb.ErrDying
		}
	}
}
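
// reportUpgradeFailure logs a failed upgrade attempt, indicating whether a
// retry will follow, and records the failure in the agent's status.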
func (w *upgradeDB) reportUpgradeFailure(err error, willRetry bool) {
	retryText := "will retry"
	if !willRetry {
		retryText = "giving up"
	}
	w.logger.Errorf("database upgrade from %v to %v for %q failed (%s): %v",
		w.fromVersion, w.toVersion, w.tag, retryText, err)
	w.setFailStatus()
}
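
// setFailStatus sets the agent status to reflect a failed database upgrade.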
func (w *upgradeDB) setFailStatus() {
	w.setStatus(status.Error, fmt.Sprintf("upgrading database for %v", w.toVersion))
}
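
// setStatus records the given agent status and message, logging any error
// rather than returning it.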
func (w *upgradeDB) setStatus(sts status.Status, msg string) {
	if err := w.pool.SetStatus(w.tag.Id(), sts, msg); err != nil {
		w.logger.Errorf("setting agent status: %v", err)
	}
}

// Kill is part of the worker.Worker interface.
func (w *upgradeDB) Kill() {
	w.tomb.Kill(nil)
}

// Wait is part of the worker.Worker interface.
func (w *upgradeDB) Wait() error {
	return w.tomb.Wait()
}