
Commit eb3910c

Fix deadlock in 10-node cluster convergence (#2467)
This PR fixes #2286.

- CheckQuorum was causing us multiple issues. When bootstrapping a 5-node Zero cluster, it would make the leader step down while the cluster size was still 2, which then blocked all the remaining joins indefinitely. It would also cause leader step-down in a seemingly healthy cluster that was busy processing proposals. CheckQuorum was mandated by raft.ReadOnlyLeaseBased, which is a less safe way to do linearizable reads. Switch ReadOnlyOption back to raft.ReadOnlySafe. Moreover, we don't need quorum-based lin reads in the Alpha servers, because of the switch to proposing and then applying transaction updates.
- raft.ReadIndex is not working for some reason, so its usage is commented out in Zero (and removed from Alpha permanently). This needs to be fixed once etcd-io/etcd#9893 is resolved.
- The logic to do lin reads was replicated in both Zero and Alpha. Refactor it into one place in conn/node.go.
- Retry conf change proposals if they time out. This mechanism is similar to the one introduced for normal proposals in a previous commit 06ea4c.
- Use a lock to allow only one JoinCluster call at a time, and block JoinCluster until node.AddToCluster succeeds (or return its error).
- Upgrade the raft library to 3.2.23; before the upgrade we were at 3.2.6.

Commit log:
* Trying to understand why JoinCluster doesn't work properly.
* Fucking works. Fucking works.
* It all works now.
* More Dgraph servers. Found a new issue where requesting read quorum doesn't respond.
* Refactor wait lin read code and move it to conn/node.go
* Remove lin read wait for server, because txn timestamp should be sufficient for waiting. Also, for the time being, comment out lin read wait from Zero as well.
1 parent 3b5bc66 commit eb3910c
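
For reference, below is a minimal sketch (not code from this diff) of the etcd/raft configuration trade-off the description above refers to. The tick values, storage, and the tiny main are illustrative assumptions; only ReadOnlyOption and CheckQuorum are the point.

```go
package main

import (
	"fmt"

	"github.com/coreos/etcd/raft"
)

// buildConfig shows the choice this commit makes: raft.ReadOnlySafe confirms a
// quorum for every ReadIndex request and works without CheckQuorum, whereas
// raft.ReadOnlyLeaseBased requires CheckQuorum: true, which is what was causing
// leaders to step down during bootstrap and under load.
func buildConfig(id uint64, storage *raft.MemoryStorage) *raft.Config {
	return &raft.Config{
		ID:              id,
		ElectionTick:    20, // illustrative values
		HeartbeatTick:   1,
		Storage:         storage,
		MaxSizePerMsg:   256 << 10,
		MaxInflightMsgs: 256,
		ReadOnlyOption:  raft.ReadOnlySafe, // default, quorum-checked read path
		// CheckQuorum stays false; it is only mandatory for ReadOnlyLeaseBased.
	}
}

func main() {
	cfg := buildConfig(1, raft.NewMemoryStorage())
	fmt.Printf("read mode: %v, check quorum: %v\n", cfg.ReadOnlyOption, cfg.CheckQuorum)
}
```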

File tree: 17 files changed (+450, -666 lines)

conn/node.go (+163, -38)
@@ -18,6 +18,7 @@ import (

 	"github.com/coreos/etcd/raft"
 	"github.com/coreos/etcd/raft/raftpb"
+	"github.com/dgraph-io/badger/y"
 	"github.com/dgraph-io/dgo/protos/api"
 	"github.com/dgraph-io/dgraph/protos/intern"
 	"github.com/dgraph-io/dgraph/raftwal"
@@ -37,6 +38,11 @@ type sendmsg struct {

 type Node struct {
 	x.SafeMutex

+	joinLock sync.Mutex
+
+	// Used to keep track of lin read requests.
+	requestCh chan linReadReq
+
 	// SafeMutex is for fields which can be changed after init.
 	_confState *raftpb.ConfState
 	_raft      raft.Node
@@ -88,27 +94,30 @@ func NewNode(rc *intern.RaftContext, store *raftwal.DiskStorage) *Node {
 			MaxSizePerMsg:   256 << 10,
 			MaxInflightMsgs: 256,
 			Logger:          &raft.DefaultLogger{Logger: x.Logger},
-			// We use lease-based linearizable ReadIndex for performance, at the cost of
-			// correctness. With it, communication goes follower->leader->follower, instead of
-			// follower->leader->majority_of_followers->leader->follower. We lose correctness
-			// because the Raft ticker might not arrive promptly, in which case the leader would
-			// falsely believe that its lease is still good.
-			CheckQuorum:    true,
-			ReadOnlyOption: raft.ReadOnlyLeaseBased,
+			// We don't need lease based reads. They cause issues because they require CheckQuorum
+			// to be true, and that causes a lot of issues for us during cluster bootstrapping and
+			// later. A seemingly healthy cluster would just cause leader to step down due to
+			// "inactive" quorum, and then disallow anyone from becoming leader. So, let's stick to
+			// default options. Let's achieve correctness, then we achieve performance. Plus, for
+			// the Dgraph servers, we'll be soon relying only on Timestamps for blocking reads and
+			// achieving linearizability, than checking quorums (Zero would still check quorums).
+			ReadOnlyOption: raft.ReadOnlySafe,
 		},
 		// processConfChange etc are not throttled so some extra delta, so that we don't
 		// block tick when applyCh is full
-		peers:       make(map[uint64]string),
-		confChanges: make(map[uint64]chan error),
-		RaftContext: rc,
-		messages:    make(chan sendmsg, 100),
 		Applied:     x.WaterMark{Name: fmt.Sprintf("Applied watermark")},
+		RaftContext: rc,
 		Rand:        rand.New(&lockedSource{src: rand.NewSource(time.Now().UnixNano())}),
+		confChanges: make(map[uint64]chan error),
+		messages:    make(chan sendmsg, 100),
+		peers:       make(map[uint64]string),
+		requestCh:   make(chan linReadReq),
 	}
 	n.Applied.Init()
 	// TODO: n_ = n is a hack. We should properly init node, and make it part of the server struct.
 	// This can happen once we get rid of groups.
 	n_ = n
+
 	return n
 }

@@ -375,6 +384,34 @@ func (n *Node) DeletePeer(pid uint64) {
 	delete(n.peers, pid)
 }

+var errInternalRetry = errors.New("Retry proposal again")
+
+func (n *Node) proposeConfChange(ctx context.Context, pb raftpb.ConfChange) error {
+	cctx, cancel := context.WithTimeout(ctx, 3*time.Second)
+	defer cancel()
+
+	ch := make(chan error, 1)
+	id := n.storeConfChange(ch)
+	// TODO: Delete id from the map.
+	pb.ID = id
+	if err := n.Raft().ProposeConfChange(cctx, pb); err != nil {
+		if cctx.Err() != nil {
+			return errInternalRetry
+		}
+		x.Printf("Error while proposing conf change: %v", err)
+		return err
+	}
+	select {
+	case err := <-ch:
+		return err
+	case <-ctx.Done():
+		return ctx.Err()
+	case <-cctx.Done():
+		return errInternalRetry
+	}
+	return nil
+}
+
 func (n *Node) AddToCluster(ctx context.Context, pid uint64) error {
 	addr, ok := n.Peer(pid)
 	x.AssertTruef(ok, "Unable to find conn pool for peer: %d", pid)
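
The function above distinguishes two deadlines: its own 3-second cctx, whose expiry is reported as errInternalRetry so the caller can simply propose again, and the caller's ctx, whose expiry is final. Below is a self-contained sketch of that retry contract, with a hypothetical attempt function standing in for ProposeConfChange plus the apply callback; it is an illustration, not code from this commit.

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

var errInternalRetry = errors.New("Retry proposal again")

// proposeOnce makes a single attempt under a short internal deadline. If that
// internal deadline expired, the error is translated into errInternalRetry.
func proposeOnce(ctx context.Context, attempt func(context.Context) error) error {
	cctx, cancel := context.WithTimeout(ctx, 3*time.Second)
	defer cancel()
	if err := attempt(cctx); err != nil {
		if cctx.Err() != nil {
			return errInternalRetry
		}
		return err
	}
	return nil
}

// proposeWithRetry re-proposes for as long as only the internal deadline expires,
// and stops on success, a hard error, or the caller's context being done.
func proposeWithRetry(ctx context.Context, attempt func(context.Context) error) error {
	err := errInternalRetry
	for err == errInternalRetry {
		if ctx.Err() != nil {
			return ctx.Err()
		}
		err = proposeOnce(ctx, attempt)
	}
	return err
}

func main() {
	// Simulated proposal that gets applied well within the internal deadline.
	attempt := func(ctx context.Context) error {
		select {
		case <-time.After(100 * time.Millisecond):
			return nil
		case <-ctx.Done():
			return ctx.Err()
		}
	}
	fmt.Println(proposeWithRetry(context.Background(), attempt)) // <nil>
}
```
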
@@ -386,18 +423,17 @@ func (n *Node) AddToCluster(ctx context.Context, pid uint64) error {
 	rcBytes, err := rc.Marshal()
 	x.Check(err)

-	ch := make(chan error, 1)
-	id := n.storeConfChange(ch)
-	err = n.Raft().ProposeConfChange(ctx, raftpb.ConfChange{
-		ID:      id,
+	cc := raftpb.ConfChange{
 		Type:    raftpb.ConfChangeAddNode,
 		NodeID:  pid,
 		Context: rcBytes,
-	})
-	if err != nil {
-		return err
 	}
-	err = <-ch
+	err = errInternalRetry
+	for err == errInternalRetry {
+		x.Printf("Trying to add %d to cluster. Addr: %v\n", pid, addr)
+		x.Printf("Current confstate at %d: %+v\n", n.Id, n.ConfState())
+		err = n.proposeConfChange(ctx, cc)
+	}
 	return err
 }

@@ -408,20 +444,111 @@ func (n *Node) ProposePeerRemoval(ctx context.Context, id uint64) error {
 	if _, ok := n.Peer(id); !ok && id != n.RaftContext.Id {
 		return x.Errorf("Node %d not part of group", id)
 	}
-	ch := make(chan error, 1)
-	pid := n.storeConfChange(ch)
-	err := n.Raft().ProposeConfChange(ctx, raftpb.ConfChange{
-		ID:     pid,
+	cc := raftpb.ConfChange{
 		Type:   raftpb.ConfChangeRemoveNode,
 		NodeID: id,
-	})
-	if err != nil {
-		return err
 	}
-	err = <-ch
+	err := errInternalRetry
+	for err == errInternalRetry {
+		err = n.proposeConfChange(ctx, cc)
+	}
 	return err
 }

+type linReadReq struct {
+	// A one-shot chan which we send a raft index upon
+	indexCh chan<- uint64
+}
+
+var errReadIndex = x.Errorf("cannot get linearized read (time expired or no configured leader)")
+
+func (n *Node) WaitLinearizableRead(ctx context.Context) error {
+	indexCh := make(chan uint64, 1)
+	select {
+	case n.requestCh <- linReadReq{indexCh: indexCh}:
+	case <-ctx.Done():
+		return ctx.Err()
+	}
+
+	select {
+	case index := <-indexCh:
+		if index == 0 {
+			return errReadIndex
+		}
+		return n.Applied.WaitForMark(ctx, index)
+	case <-ctx.Done():
+		return ctx.Err()
+	}
+}
+
+func (n *Node) RunReadIndexLoop(closer *y.Closer, readStateCh <-chan raft.ReadState) {
+	defer closer.Done()
+	readIndex := func() (uint64, error) {
+		// Read Request can get rejected then we would wait idefinitely on the channel
+		// so have a timeout.
+		ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
+		defer cancel()
+
+		activeRctx := make([]byte, 8)
+		x.Check2(n.Rand.Read(activeRctx[:]))
+		if err := n.Raft().ReadIndex(ctx, activeRctx[:]); err != nil {
+			x.Errorf("Error while trying to call ReadIndex: %v\n", err)
+			return 0, err
+		}
+
+	again:
+		select {
+		case <-closer.HasBeenClosed():
+			return 0, errors.New("closer has been called")
+		case rs := <-readStateCh:
+			if !bytes.Equal(activeRctx[:], rs.RequestCtx) {
+				goto again
+			}
+			return rs.Index, nil
+		case <-ctx.Done():
+			x.Errorf("[%d] Read index context timed out\n")
+			return 0, errInternalRetry
+		}
+	}
+
+	// We maintain one linearizable ReadIndex request at a time. Others wait queued behind
+	// requestCh.
+	requests := []linReadReq{}
+	for {
+		select {
+		case <-closer.HasBeenClosed():
+			return
+		case rs := <-readStateCh:
+			// Do nothing, discard ReadState as we don't have any pending ReadIndex requests.
+			x.Errorf("Received a read state unexpectedly: %+v\n", rs)
+		case req := <-n.requestCh:
+		slurpLoop:
+			for {
+				requests = append(requests, req)
+				select {
+				case req = <-n.requestCh:
+				default:
+					break slurpLoop
+				}
+			}
+			for {
+				index, err := readIndex()
+				if err == errInternalRetry {
+					continue
+				}
+				if err != nil {
+					index = 0
+					x.Errorf("[%d] While trying to do lin read index: %v", n.Id, err)
+				}
+				for _, req := range requests {
+					req.indexCh <- index
+				}
+			}
+			requests = requests[:0]
+		}
+	}
+}
+
 // TODO: Get rid of this in the upcoming changes.
 var n_ *Node
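
A hedged sketch of how these two pieces are intended to be wired together (this wiring is not part of the diff; the package name, the buffered channel size, and the shape of the Ready loop are assumptions). The raft Ready loop forwards rd.ReadStates into the channel RunReadIndexLoop consumes, and read-only handlers block on WaitLinearizableRead before touching local state.

```go
package linread

import (
	"context"

	"github.com/coreos/etcd/raft"
	"github.com/dgraph-io/badger/y"
	"github.com/dgraph-io/dgraph/conn"
)

// startReadIndexLoop starts the loop and hands back the channel that the node's
// raft Ready loop should feed with rd.ReadStates (assumed wiring, not from this diff).
func startReadIndexLoop(n *conn.Node, closer *y.Closer) chan raft.ReadState {
	readStateCh := make(chan raft.ReadState, 100)
	go n.RunReadIndexLoop(closer, readStateCh)
	return readStateCh
}

// serveLinearizableRead gates a read-only request: it returns once the local state
// has applied at least up to the commit index observed for this request.
func serveLinearizableRead(ctx context.Context, n *conn.Node, serve func() error) error {
	if err := n.WaitLinearizableRead(ctx); err != nil {
		return err
	}
	return serve()
}
```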

@@ -466,6 +593,10 @@ func (w *RaftServer) JoinCluster(ctx context.Context,
 	if node == nil || node.Raft() == nil {
 		return nil, errNoNode
 	}
+	// Only process one JoinCluster request at a time.
+	node.joinLock.Lock()
+	defer node.joinLock.Unlock()
+
 	// Check that the new node is from the same group as me.
 	if rc.Group != node.RaftContext.Group {
 		return nil, x.Errorf("Raft group mismatch")
@@ -474,25 +605,19 @@ func (w *RaftServer) JoinCluster(ctx context.Context,
 	if rc.Id == node.RaftContext.Id {
 		return nil, ErrDuplicateRaftId
 	}
+
 	// Check that the new node is not already part of the group.
-	if addr, ok := node.peers[rc.Id]; ok && rc.Addr != addr {
-		Get().Connect(addr)
+	if addr, ok := node.Peer(rc.Id); ok && rc.Addr != addr {
 		// There exists a healthy connection to server with same id.
 		if _, err := Get().Get(addr); err == nil {
 			return &api.Payload{}, ErrDuplicateRaftId
 		}
 	}
 	node.Connect(rc.Id, rc.Addr)

-	c := make(chan error, 1)
-	go func() { c <- node.AddToCluster(ctx, rc.Id) }()
-
-	select {
-	case <-ctx.Done():
-		return &api.Payload{}, ctx.Err()
-	case err := <-c:
-		return &api.Payload{}, err
-	}
+	err := node.AddToCluster(context.Background(), rc.Id)
+	x.Printf("[%d] Done joining cluster with err: %v", rc.Id, err)
+	return &api.Payload{}, err
 }

 var (
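
Two things are going on in the JoinCluster change above: membership changes are serialized behind joinLock, and the conf change itself runs under context.Background(), so a caller whose RPC deadline expires can no longer abandon a half-finished join. A generic sketch of that shape (hypothetical types, not this file's code):

```go
package join

import (
	"context"
	"sync"
)

// membership is a hypothetical wrapper illustrating the pattern used by JoinCluster.
type membership struct {
	mu  sync.Mutex
	add func(ctx context.Context, id uint64) error // e.g. node.AddToCluster
}

func (m *membership) Join(ctx context.Context, id uint64) error {
	// Only one join at a time; concurrent callers queue behind the lock.
	m.mu.Lock()
	defer m.mu.Unlock()

	// Detach from the caller's context: even if the RPC that triggered this join
	// times out, the conf change keeps being retried until it applies or fails.
	return m.add(context.Background(), id)
}
```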

conn/pool.go (+2, -2)
@@ -105,7 +105,7 @@ func (p *Pools) Connect(addr string) *Pool {
 		p.Unlock()
 		return existingPool
 	}
-	x.Printf("== CONNECT ==> Setting %v\n", addr)
+	x.Printf("== CONNECTED ==> Setting %v\n", addr)
 	p.all[addr] = pool
 	p.Unlock()
 	return pool
@@ -117,7 +117,7 @@ func NewPool(addr string) (*Pool, error) {
 		grpc.WithDefaultCallOptions(
 			grpc.MaxCallRecvMsgSize(x.GrpcMaxSize),
 			grpc.MaxCallSendMsgSize(x.GrpcMaxSize)),
-		grpc.WithBackoffMaxDelay(10*time.Second),
+		grpc.WithBackoffMaxDelay(time.Second),
 		grpc.WithInsecure())
 	if err != nil {
 		return nil, err
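
On the conn/pool.go side, grpc.WithBackoffMaxDelay caps gRPC's exponential reconnect backoff, so lowering it from 10s to 1s means a peer that was briefly unreachable is re-dialed within about a second, which matters while a cluster is still converging. A minimal dial sketch follows; the address is a placeholder, and the options are the gRPC ones of that era, not Dgraph code.

```go
package main

import (
	"log"
	"time"

	"google.golang.org/grpc"
)

func main() {
	// Cap the reconnect backoff at one second, mirroring the change above.
	conn, err := grpc.Dial("localhost:7080",
		grpc.WithBackoffMaxDelay(time.Second),
		grpc.WithInsecure())
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()
	log.Printf("dialing %s with capped backoff", conn.Target())
}
```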
