Skip to content

Commit

Permalink
Raft: Check suspect info once per suspect interval (#1600)
Browse files Browse the repository at this point in the history
Today's existing suspect logic has a periodic checker, which checks
every 10s if the Raft cluster still has quorum.  If the cluster has lost
quorum, it marks the time this event begins, then, every 10s checks to
see if 'enough' time has elapsed since the quorum was lost to suspect
that the OSN has been evicted.

If the OSN has not been evicted, or cannot determine its eviction
status, then every 10s the OSN attempts to re-check its suspicion
status, which can lead to large volumes of network traffic, especially
in significiantly multichannel environments.

This commit modifies the logic to track the number of times that the
suspect checking logic has actually executed, to ensure that we check no
more than once every suspect interval (by default every 10m, instead of
every 10s).

Signed-off-by: Jason Yellick <jyellick@us.ibm.com>
(cherry picked from commit c90015c)
  • Loading branch information
Jason Yellick authored and lindluni committed Jul 15, 2020
1 parent 03f0544 commit 3741860
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 1 deletion.
1 change: 1 addition & 0 deletions orderer/consensus/etcdraft/chain.go
Expand Up @@ -369,6 +369,7 @@ func (c *Chain) Start() {
c.periodicChecker = &PeriodicCheck{
Logger: c.logger,
Report: es.confirmSuspicion,
ReportCleared: es.clearSuspicion,
CheckInterval: interval,
Condition: c.suspectEviction,
}
Expand Down
14 changes: 13 additions & 1 deletion orderer/consensus/etcdraft/eviction.go
Expand Up @@ -26,6 +26,7 @@ type PeriodicCheck struct {
CheckInterval time.Duration
Condition func() bool
Report func(cumulativePeriod time.Duration)
ReportCleared func()
conditionHoldsSince time.Time
once sync.Once // Used to prevent double initialization
stopped uint32
Expand Down Expand Up @@ -60,6 +61,9 @@ func (pc *PeriodicCheck) check() {
}

func (pc *PeriodicCheck) conditionNotFulfilled() {
if pc.ReportCleared != nil && !pc.conditionHoldsSince.IsZero() {
pc.ReportCleared()
}
pc.conditionHoldsSince = time.Time{}
}

Expand All @@ -81,12 +85,20 @@ type evictionSuspector struct {
writeBlock func(block *common.Block) error
triggerCatchUp func(sn *raftpb.Snapshot)
halted bool
timesTriggered int
}

func (es *evictionSuspector) clearSuspicion() {
es.timesTriggered = 0
}

func (es *evictionSuspector) confirmSuspicion(cumulativeSuspicion time.Duration) {
if es.evictionSuspicionThreshold > cumulativeSuspicion || es.halted {
// The goal here is to only execute the body of the function once every es.evictionSuspicionThreshold
if es.evictionSuspicionThreshold*time.Duration(es.timesTriggered+1) > cumulativeSuspicion || es.halted {
return
}
es.timesTriggered++

es.logger.Infof("Suspecting our own eviction from the channel for %v", cumulativeSuspicion)
puller, err := es.createPuller()
if err != nil {
Expand Down
19 changes: 19 additions & 0 deletions orderer/consensus/etcdraft/eviction_test.go
Expand Up @@ -48,11 +48,18 @@ func TestPeriodicCheck(t *testing.T) {
reports <- duration
}

clears := make(chan struct{}, 1000)

reportCleared := func() {
clears <- struct{}{}
}

check := &PeriodicCheck{
Logger: flogging.MustGetLogger("test"),
Condition: condition,
CheckInterval: time.Millisecond,
Report: report,
ReportCleared: reportCleared,
}

go check.Run()
Expand Down Expand Up @@ -85,6 +92,9 @@ func TestPeriodicCheck(t *testing.T) {
}
}

g.Eventually(clears).Should(gomega.Receive())
g.Consistently(clears).ShouldNot(gomega.Receive())

// ensure the checks have been made
checksDoneSoFar := atomic.LoadUint32(&checkNum)
g.Consistently(reports, time.Second*2, time.Millisecond).Should(gomega.BeEmpty())
Expand All @@ -107,6 +117,7 @@ func TestPeriodicCheck(t *testing.T) {
time.Sleep(check.CheckInterval * 50)
// Ensure that we cease checking the condition, hence the PeriodicCheck is stopped.
g.Expect(atomic.LoadUint32(&checkNum)).To(gomega.BeNumerically("<", checkCountAfterStop+2))
g.Consistently(clears).ShouldNot(gomega.Receive())
}

func TestEvictionSuspector(t *testing.T) {
Expand Down Expand Up @@ -137,13 +148,20 @@ func TestEvictionSuspector(t *testing.T) {
blockPuller BlockPuller
blockPullerErr error
height uint64
timesTriggered int
halt func()
}{
{
description: "suspected time is lower than threshold",
evictionSuspicionThreshold: 11 * time.Minute,
halt: t.Fail,
},
{
description: "timesTriggered multiplier prevents threshold",
evictionSuspicionThreshold: 6 * time.Minute,
timesTriggered: 1,
halt: t.Fail,
},
{
description: "puller creation fails",
evictionSuspicionThreshold: 10*time.Minute - time.Second,
Expand Down Expand Up @@ -219,6 +237,7 @@ func TestEvictionSuspector(t *testing.T) {
},
logger: flogging.MustGetLogger("test"),
triggerCatchUp: func(sn *raftpb.Snapshot) {},
timesTriggered: testCase.timesTriggered,
}

foundExpectedLog := testCase.expectedLog == ""
Expand Down

0 comments on commit 3741860

Please sign in to comment.