forked from hyperledger/fabric
-
Notifications
You must be signed in to change notification settings - Fork 0
/
election.go
487 lines (427 loc) · 13.5 KB
/
election.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
/*
Copyright IBM Corp. 2016 All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package election
import (
"bytes"
"sync"
"sync/atomic"
"time"
"github.com/hyperledger/fabric/gossip/util"
"github.com/op/go-logging"
"github.com/spf13/viper"
)
// Gossip leader election module
// Algorithm properties:
// - Peers break symmetry by comparing IDs
// - Each peer is either a leader or a follower,
// and the aim is to have exactly 1 leader if the membership view
// is the same for all peers
// - If the network is partitioned into 2 or more sets, the number of leaders
// is the number of network partitions, but when the partition heals,
// only 1 leader should be left eventually
// - Peers communicate by gossiping leadership proposal or declaration messages
// The Algorithm, in pseudo code:
//
//
// variables:
// leaderKnown = false
//
// Invariant:
// Peer listens for messages from remote peers
// and whenever it receives a leadership declaration,
// leaderKnown is set to true
//
// Startup():
// wait for membership view to stabilize, or for a leadership declaration is received
// or the startup timeout expires.
// goto SteadyState()
//
// SteadyState():
// while true:
// If leaderKnown is false:
// LeaderElection()
// If you are the leader:
// Broadcast leadership declaration
// If a leadership declaration was received from
// a peer with a lower ID,
// become a follower
// Else, you're a follower:
// If haven't received a leadership declaration within
// a time threshold:
// set leaderKnown to false
//
// LeaderElection():
// Gossip leadership proposal message
// Collect messages from other peers sent within a time period
// If received a leadership declaration:
// return
// Iterate over all proposal messages collected.
// If a proposal message from a peer with an ID lower
// than yourself was received, return.
// Else, declare yourself a leader
// LeaderElectionAdapter is used by the leader election module
// to send and receive messages and to get membership information
type LeaderElectionAdapter interface {
// Gossip gossips a message to other peers
Gossip(Msg)
// Accept returns a channel that emits messages
Accept() <-chan Msg
// CreateProposalMessage
CreateMessage(isDeclaration bool) Msg
// Peers returns a list of peers considered alive
Peers() []Peer
}
type leadershipCallback func(isLeader bool)
// LeaderElectionService is the object that runs the leader election algorithm
type LeaderElectionService interface {
// IsLeader returns whether this peer is a leader or not
IsLeader() bool
// Stop stops the LeaderElectionService
Stop()
// Yield relinquishes the leadership until a new leader is elected,
// or a timeout expires
Yield()
}
type peerID []byte
// Peer describes a remote peer
type Peer interface {
// ID returns the ID of the peer
ID() peerID
}
// Msg describes a message sent from a remote peer
type Msg interface {
// SenderID returns the ID of the peer sent the message
SenderID() peerID
// IsProposal returns whether this message is a leadership proposal
IsProposal() bool
// IsDeclaration returns whether this message is a leadership declaration
IsDeclaration() bool
}
func noopCallback(_ bool) {
}
// NewLeaderElectionService returns a new LeaderElectionService
func NewLeaderElectionService(adapter LeaderElectionAdapter, id string, callback leadershipCallback) LeaderElectionService {
if len(id) == 0 {
panic("Empty id")
}
le := &leaderElectionSvcImpl{
id: peerID(id),
proposals: util.NewSet(),
adapter: adapter,
stopChan: make(chan struct{}, 1),
interruptChan: make(chan struct{}, 1),
logger: util.GetLogger(util.LoggingElectionModule, ""),
callback: noopCallback,
}
if callback != nil {
le.callback = callback
}
go le.start()
return le
}
// leaderElectionSvcImpl is an implementation of a LeaderElectionService
type leaderElectionSvcImpl struct {
id peerID
proposals *util.Set
sync.Mutex
stopChan chan struct{}
interruptChan chan struct{}
stopWG sync.WaitGroup
isLeader int32
toDie int32
leaderExists int32
yield int32
sleeping bool
adapter LeaderElectionAdapter
logger *logging.Logger
callback leadershipCallback
yieldTimer *time.Timer
}
func (le *leaderElectionSvcImpl) start() {
le.stopWG.Add(2)
go le.handleMessages()
le.waitForMembershipStabilization(getStartupGracePeriod())
go le.run()
}
func (le *leaderElectionSvcImpl) handleMessages() {
le.logger.Debug(le.id, ": Entering")
defer le.logger.Debug(le.id, ": Exiting")
defer le.stopWG.Done()
msgChan := le.adapter.Accept()
for {
select {
case <-le.stopChan:
le.stopChan <- struct{}{}
return
case msg := <-msgChan:
if !le.isAlive(msg.SenderID()) {
le.logger.Debug(le.id, ": Got message from", msg.SenderID(), "but it is not in the view")
break
}
le.handleMessage(msg)
}
}
}
func (le *leaderElectionSvcImpl) handleMessage(msg Msg) {
msgType := "proposal"
if msg.IsDeclaration() {
msgType = "declaration"
}
le.logger.Debug(le.id, ":", msg.SenderID(), "sent us", msgType)
le.Lock()
defer le.Unlock()
if msg.IsProposal() {
le.proposals.Add(string(msg.SenderID()))
} else if msg.IsDeclaration() {
atomic.StoreInt32(&le.leaderExists, int32(1))
if le.sleeping && len(le.interruptChan) == 0 {
le.interruptChan <- struct{}{}
}
if bytes.Compare(msg.SenderID(), le.id) < 0 && le.IsLeader() {
le.stopBeingLeader()
}
} else {
// We shouldn't get here
le.logger.Error("Got a message that's not a proposal and not a declaration")
}
}
// waitForInterrupt sleeps until the interrupt channel is triggered
// or given timeout expires
func (le *leaderElectionSvcImpl) waitForInterrupt(timeout time.Duration) {
le.logger.Debug(le.id, ": Entering")
defer le.logger.Debug(le.id, ": Exiting")
le.Lock()
le.sleeping = true
le.Unlock()
select {
case <-le.interruptChan:
case <-le.stopChan:
le.stopChan <- struct{}{}
case <-time.After(timeout):
}
le.Lock()
le.sleeping = false
// We drain the interrupt channel
// because we might get 2 leadership declarations messages
// while sleeping, but we would only read 1 of them in the select block above
le.drainInterruptChannel()
le.Unlock()
}
func (le *leaderElectionSvcImpl) run() {
defer le.stopWG.Done()
for !le.shouldStop() {
if !le.isLeaderExists() {
le.leaderElection()
}
// If we are yielding and some leader has been elected,
// stop yielding
if le.isLeaderExists() && le.isYielding() {
le.stopYielding()
}
if le.shouldStop() {
return
}
if le.IsLeader() {
le.leader()
} else {
le.follower()
}
}
}
func (le *leaderElectionSvcImpl) leaderElection() {
le.logger.Debug(le.id, ": Entering")
defer le.logger.Debug(le.id, ": Exiting")
// If we're yielding to other peers, do not participate
// in leader election
if le.isYielding() {
return
}
// Propose ourselves as a leader
le.propose()
// Collect other proposals
le.waitForInterrupt(getLeaderElectionDuration())
// If someone declared itself as a leader, give up
// on trying to become a leader too
if le.isLeaderExists() {
le.logger.Debug(le.id, ": Some peer is already a leader")
return
}
if le.isYielding() {
le.logger.Debug(le.id, ": Aborting leader election because yielding")
return
}
// Leader doesn't exist, let's see if there is a better candidate than us
// for being a leader
for _, o := range le.proposals.ToArray() {
id := o.(string)
if bytes.Compare(peerID(id), le.id) < 0 {
return
}
}
// If we got here, there is no one that proposed being a leader
// that's a better candidate than us.
le.beLeader()
atomic.StoreInt32(&le.leaderExists, int32(1))
}
// propose sends a leadership proposal message to remote peers
func (le *leaderElectionSvcImpl) propose() {
le.logger.Debug(le.id, ": Entering")
le.logger.Debug(le.id, ": Exiting")
leadershipProposal := le.adapter.CreateMessage(false)
le.adapter.Gossip(leadershipProposal)
}
func (le *leaderElectionSvcImpl) follower() {
le.logger.Debug(le.id, ": Entering")
defer le.logger.Debug(le.id, ": Exiting")
le.proposals.Clear()
atomic.StoreInt32(&le.leaderExists, int32(0))
select {
case <-time.After(getLeaderAliveThreshold()):
case <-le.stopChan:
le.stopChan <- struct{}{}
}
}
func (le *leaderElectionSvcImpl) leader() {
leaderDeclaration := le.adapter.CreateMessage(true)
le.adapter.Gossip(leaderDeclaration)
le.waitForInterrupt(getLeadershipDeclarationInterval())
}
// waitForMembershipStabilization waits for membership view to stabilize
// or until a time limit expires, or until a peer declares itself as a leader
func (le *leaderElectionSvcImpl) waitForMembershipStabilization(timeLimit time.Duration) {
le.logger.Debug(le.id, ": Entering")
defer le.logger.Debug(le.id, ": Exiting, peers found", len(le.adapter.Peers()))
endTime := time.Now().Add(timeLimit)
viewSize := len(le.adapter.Peers())
for !le.shouldStop() {
time.Sleep(getMembershipSampleInterval())
newSize := len(le.adapter.Peers())
if newSize == viewSize || time.Now().After(endTime) || le.isLeaderExists() {
return
}
viewSize = newSize
}
}
// drainInterruptChannel clears the interruptChannel
// if needed
func (le *leaderElectionSvcImpl) drainInterruptChannel() {
if len(le.interruptChan) == 1 {
<-le.interruptChan
}
}
// isAlive returns whether peer of given id is considered alive
func (le *leaderElectionSvcImpl) isAlive(id peerID) bool {
for _, p := range le.adapter.Peers() {
if bytes.Equal(p.ID(), id) {
return true
}
}
return false
}
func (le *leaderElectionSvcImpl) isLeaderExists() bool {
return atomic.LoadInt32(&le.leaderExists) == int32(1)
}
// IsLeader returns whether this peer is a leader
func (le *leaderElectionSvcImpl) IsLeader() bool {
isLeader := atomic.LoadInt32(&le.isLeader) == int32(1)
le.logger.Debug(le.id, ": Returning", isLeader)
return isLeader
}
func (le *leaderElectionSvcImpl) beLeader() {
le.logger.Debug(le.id, ": Becoming a leader")
atomic.StoreInt32(&le.isLeader, int32(1))
le.callback(true)
}
func (le *leaderElectionSvcImpl) stopBeingLeader() {
le.logger.Debug(le.id, "Stopped being a leader")
atomic.StoreInt32(&le.isLeader, int32(0))
le.callback(false)
}
func (le *leaderElectionSvcImpl) shouldStop() bool {
return atomic.LoadInt32(&le.toDie) == int32(1)
}
func (le *leaderElectionSvcImpl) isYielding() bool {
return atomic.LoadInt32(&le.yield) == int32(1)
}
func (le *leaderElectionSvcImpl) stopYielding() {
le.logger.Debug("Stopped yielding")
le.Lock()
defer le.Unlock()
atomic.StoreInt32(&le.yield, int32(0))
le.yieldTimer.Stop()
}
// Yield relinquishes the leadership until a new leader is elected,
// or a timeout expires
func (le *leaderElectionSvcImpl) Yield() {
le.Lock()
defer le.Unlock()
if !le.IsLeader() || le.isYielding() {
return
}
// Turn on the yield flag
atomic.StoreInt32(&le.yield, int32(1))
// Stop being a leader
le.stopBeingLeader()
// Clear the leader exists flag since it could be that we are the leader
atomic.StoreInt32(&le.leaderExists, int32(0))
// Clear the yield flag in any case afterwards
le.yieldTimer = time.AfterFunc(getLeaderAliveThreshold()*6, func() {
atomic.StoreInt32(&le.yield, int32(0))
})
}
// Stop stops the LeaderElectionService
func (le *leaderElectionSvcImpl) Stop() {
le.logger.Debug(le.id, ": Entering")
defer le.logger.Debug(le.id, ": Exiting")
atomic.StoreInt32(&le.toDie, int32(1))
le.stopChan <- struct{}{}
le.stopWG.Wait()
}
// SetStartupGracePeriod configures startup grace period interval,
// the period of time to wait until election algorithm will start
func SetStartupGracePeriod(t time.Duration) {
viper.Set("peer.gossip.election.startupGracePeriod", t)
}
// SetMembershipSampleInterval setups/initializes the frequency the
// membership view should be checked
func SetMembershipSampleInterval(t time.Duration) {
viper.Set("peer.gossip.election.membershipSampleInterval", t)
}
// SetLeaderAliveThreshold configures leader election alive threshold
func SetLeaderAliveThreshold(t time.Duration) {
viper.Set("peer.gossip.election.leaderAliveThreshold", t)
}
// SetLeaderElectionDuration configures expected leadership election duration,
// interval to wait until leader election will be completed
func SetLeaderElectionDuration(t time.Duration) {
viper.Set("peer.gossip.election.leaderElectionDuration", t)
}
func getStartupGracePeriod() time.Duration {
return util.GetDurationOrDefault("peer.gossip.election.startupGracePeriod", time.Second*15)
}
func getMembershipSampleInterval() time.Duration {
return util.GetDurationOrDefault("peer.gossip.election.membershipSampleInterval", time.Second)
}
func getLeaderAliveThreshold() time.Duration {
return util.GetDurationOrDefault("peer.gossip.election.leaderAliveThreshold", time.Second*10)
}
func getLeadershipDeclarationInterval() time.Duration {
return time.Duration(getLeaderAliveThreshold() / 2)
}
func getLeaderElectionDuration() time.Duration {
return util.GetDurationOrDefault("peer.gossip.election.leaderElectionDuration", time.Second*5)
}
// GetMsgExpirationTimeout return leadership message expiration timeout
func GetMsgExpirationTimeout() time.Duration {
return getLeaderAliveThreshold() * 10
}