Skip to content

Commit

Permalink
p2p-governor: add some randomness to the reconnection delay
Browse files Browse the repository at this point in the history
In case of a TCP simultaneous open the handshake will fail, and both
nodes will retry at exactly the same time, causing the same
failure to repeat forever. The random delay breaks the cycle.

The random delay is between -2s and +2s (a 4s window), which is more
than the expected RTT around the world.
  • Loading branch information
coot committed Sep 28, 2021
1 parent e2d2601 commit fcfb5c0
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 12 deletions.
Expand Up @@ -444,16 +444,17 @@ peerSelectionGovernor :: (MonadAsync m, MonadMask m, MonadTime m, MonadTimer m,
=> Tracer m (TracePeerSelection peeraddr)
-> Tracer m (DebugPeerSelection peeraddr peerconn)
-> Tracer m PeerSelectionCounters
-> StdGen
-> PeerSelectionActions peeraddr peerconn m
-> PeerSelectionPolicy peeraddr m
-> m Void
peerSelectionGovernor tracer debugTracer countersTracer actions policy =
peerSelectionGovernor tracer debugTracer countersTracer fuzzRng actions policy =
JobPool.withJobPool $ \jobPool ->
peerSelectionGovernorLoop
tracer (debugTracer <> contramap transform countersTracer)
actions policy
jobPool
emptyPeerSelectionState
(emptyPeerSelectionState fuzzRng)
where
transform :: Ord peeraddr => DebugPeerSelection peeraddr peerconn -> PeerSelectionCounters
transform (TraceGovernorState _ _ st) = peerStateToCounters st
Expand Down
Expand Up @@ -16,6 +16,7 @@ import Control.Concurrent.JobPool (Job(..))
import Control.Monad.Class.MonadSTM
import Control.Monad.Class.MonadTime
import Control.Exception (SomeException)
import System.Random (randomR)

import qualified Ouroboros.Network.PeerSelection.EstablishedPeers as EstablishedPeers
import qualified Ouroboros.Network.PeerSelection.KnownPeers as KnownPeers
Expand Down Expand Up @@ -239,6 +240,7 @@ jobPromoteColdPeer PeerSelectionActions {
handler e = return $
Completion $ \st@PeerSelectionState {
establishedPeers,
fuzzRng,
targets = PeerSelectionTargets {
targetNumberOfEstablishedPeers
}
Expand All @@ -247,12 +249,14 @@ jobPromoteColdPeer PeerSelectionActions {
let (failCount, knownPeers') = KnownPeers.incrementFailCount
peeraddr
(knownPeers st)
(fuzz, fuzzRng') = randomR (-2, 2 :: Double) fuzzRng

-- exponential backoff: 5s, 10s, 20s, 40s, 80s, 160s.
delay :: DiffTime
delay = fromIntegral $
baseColdPeerRetryDiffTime
* 2 ^ (pred failCount `min` maxColdPeerRetryBackoff)
delay = realToFrac fuzz
+ fromIntegral
( baseColdPeerRetryDiffTime
* 2 ^ (pred failCount `min` maxColdPeerRetryBackoff)
)
in
Decision {
decisionTrace = TracePromoteColdFailed targetNumberOfEstablishedPeers
Expand All @@ -264,7 +268,8 @@ jobPromoteColdPeer PeerSelectionActions {
(delay `addTime` now)
knownPeers',
inProgressPromoteCold = Set.delete peeraddr
(inProgressPromoteCold st)
(inProgressPromoteCold st),
fuzzRng = fuzzRng'
},
decisionJobs = []
}
Expand Down
Expand Up @@ -50,6 +50,7 @@ import Control.Concurrent.JobPool (Job)
import Control.Monad.Class.MonadSTM
import Control.Monad.Class.MonadTime
import Control.Exception (assert, SomeException)
import System.Random (StdGen)

import qualified Ouroboros.Network.PeerSelection.EstablishedPeers as EstablishedPeers
import Ouroboros.Network.PeerSelection.EstablishedPeers (EstablishedPeers)
Expand Down Expand Up @@ -289,7 +290,10 @@ data PeerSelectionState peeraddr peerconn = PeerSelectionState {
inProgressPromoteCold :: !(Set peeraddr),
inProgressPromoteWarm :: !(Set peeraddr),
inProgressDemoteWarm :: !(Set peeraddr),
inProgressDemoteHot :: !(Set peeraddr)
inProgressDemoteHot :: !(Set peeraddr),

-- | Rng for fuzzy delay
fuzzRng :: !StdGen

-- TODO: need something like this to distinguish between lots of bad peers
-- and us getting disconnected from the network locally. We don't want a
Expand All @@ -315,8 +319,8 @@ peerStateToCounters st = PeerSelectionCounters { coldPeers, warmPeers, hotPeers
warmPeers = Set.size $ establishedPeersSet Set.\\ activePeers st
hotPeers = Set.size $ activePeers st

emptyPeerSelectionState :: PeerSelectionState peeraddr peerconn
emptyPeerSelectionState =
emptyPeerSelectionState :: StdGen -> PeerSelectionState peeraddr peerconn
emptyPeerSelectionState rng =
PeerSelectionState {
targets = nullPeerSelectionTargets,
localRootPeers = LocalRootPeers.empty,
Expand All @@ -331,7 +335,8 @@ emptyPeerSelectionState =
inProgressPromoteCold = Set.empty,
inProgressPromoteWarm = Set.empty,
inProgressDemoteWarm = Set.empty,
inProgressDemoteHot = Set.empty
inProgressDemoteHot = Set.empty,
fuzzRng = rng
}


Expand Down
Expand Up @@ -28,6 +28,7 @@ import Data.Void (Void)
import qualified Data.Signal as Signal
import Data.Signal (Signal, Events, E(E), TS(TS))
import qualified Data.OrdPSQ as PSQ
import System.Random (mkStdGen)

import Control.Monad.Class.MonadSTM.Strict (STM)
import Control.Monad.Class.MonadTime
Expand Down Expand Up @@ -1867,7 +1868,8 @@ selectGovState :: Eq a
selectGovState f =
Signal.nub
. fmap f
. Signal.fromChangeEvents Governor.emptyPeerSelectionState
-- TODO: #3182 Rng seed should come from quickcheck.
. Signal.fromChangeEvents (Governor.emptyPeerSelectionState $ mkStdGen 42)
. Signal.selectEvents
(\case GovernorDebug (TraceGovernorState _ _ st) -> Just st
_ -> Nothing)
Expand Down Expand Up @@ -1907,6 +1909,8 @@ _governorFindingPublicRoots targetNumberOfRootPeers readDomains =

peerSelectionGovernor
tracer tracer tracer
-- TODO: #3182 Rng seed should come from quickcheck.
(mkStdGen 42)
actions { requestPublicRootPeers }
policy
where
Expand Down
Expand Up @@ -35,6 +35,7 @@ import Data.Set (Set)
import qualified Data.Set as Set
import Data.Typeable (Typeable)
import Data.Void (Void)
import System.Random (mkStdGen)

import Control.Exception (throw)
import Control.Monad (forM_)
Expand Down Expand Up @@ -168,6 +169,7 @@ runGovernorInMockEnvironment mockEnv =
tracerTracePeerSelection
tracerDebugPeerSelection
tracerTracePeerSelectionCounters
(mkStdGen 42)
actions
policy

Expand Down

0 comments on commit fcfb5c0

Please sign in to comment.