/
ErrorPolicy.hs
150 lines (130 loc) · 7.29 KB
/
ErrorPolicy.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
{-# LANGUAGE ScopedTypeVariables #-}
module Ouroboros.Consensus.Node.ErrorPolicy (consensusErrorPolicy) where
import Data.Time.Clock (DiffTime)
import Control.Monad.Class.MonadAsync (ExceptionInLinkedThread (..))
import Ouroboros.Network.ErrorPolicy
import Ouroboros.Storage.ChainDB.API (ChainDbError (..),
ChainDbFailure)
import Ouroboros.Storage.FS.API.Types (FsError)
import Ouroboros.Storage.ImmutableDB.Types (ImmutableDBError)
import Ouroboros.Storage.VolatileDB.Types (VolatileDBError)
import Ouroboros.Consensus.BlockFetchServer
(BlockFetchServerException)
import Ouroboros.Consensus.ChainSyncClient
(ChainSyncClientException (..))
import Ouroboros.Consensus.Node.DbMarker (DbMarkerError)
import Ouroboros.Consensus.Node.ProtocolInfo.Byron
(PBftLeaderCredentialsError)
import Ouroboros.Consensus.Util.ResourceRegistry
(RegistryClosedException, ResourceRegistryThreadException)
-- | Error policies contributed by the consensus layer.
--
-- Each entry maps one exception type to a 'SuspendDecision'; the decisions
-- themselves are the named values in the @where@ clause below.
consensusErrorPolicy :: ErrorPolicies addr ()
consensusErrorPolicy = ErrorPolicies {
      -- Exceptions raised while connecting.
      --
      -- These are purely a network-side concern, so no policy is listed.
      epConErrorPolicies = []

      -- Decision taken when the protocol exits cleanly.
      --
      -- This is not expected to happen (we always throw an exception), so
      -- this callback should never run; if it nevertheless /does/, the
      -- situation is classified as a bug on our side ('ourBug').
    , epReturnCallback = \_time _addr () -> ourBug

      -- Exceptions raised while interacting with the peer.
      --
      -- Every type that is an instance of 'Exception' within
      -- ouroboros-consensus should have an entry in this list.
      --
      -- An exception that no policy handles falls back to a default, which
      -- currently means: log the exception and disconnect from the peer (in
      -- both directions), while allowing an immediate reconnect. That is
      -- acceptable for exceptions that only concern that one peer. It is
      -- essential, however, that the exceptions which /must/ shut down the
      -- node (mainly storage layer errors) are handled here.
      --
      -- TODO: Talk to devops about what they should do when the node does
      -- terminate with a storage layer exception (restart with full
      -- recovery).
    , epAppErrorPolicies = [
          -- Storage layer exceptions always terminate the node.
          --
          -- NOTE: IOExceptions are deliberately not caught here; they
          -- /ought/ to be caught by the FS layer (and turned into FsError).
          -- Catching them here would require telling apart IO exceptions
          -- caused by disk I/O (shutdownNode) from those caused by network
          -- failures (SuspendConsumer).
          ErrorPolicy (\(_ :: DbMarkerError)  -> Just shutdownNode)
        , ErrorPolicy (\(_ :: ChainDbFailure) -> Just shutdownNode)

          -- The next three are always wrapped in a 'ChainDbFailure'; they
          -- are listed anyway, just in case.
        , ErrorPolicy (\(_ :: VolatileDBError)  -> Just shutdownNode)
        , ErrorPolicy (\(_ :: FsError)          -> Just shutdownNode)
        , ErrorPolicy (\(_ :: ImmutableDBError) -> Just shutdownNode)

          -- Node configuration failure.
        , ErrorPolicy (\(_ :: PBftLeaderCredentialsError) -> Just shutdownNode)

          -- Chain DB errors: see 'chainDbErrorDecision'.
        , ErrorPolicy chainDbErrorDecision

          -- Some resource registries are used per connection, so on a
          -- ResourceRegistry related exception we close the connection but
          -- leave the rest of the node running.
        , ErrorPolicy (\(_ :: RegistryClosedException)         -> Just ourBug)
        , ErrorPolicy (\(_ :: ResourceRegistryThreadException) -> Just ourBug)

          -- A block fetch server exception means the client asked for
          -- blocks we used to have but that have since been GCed, i.e. the
          -- peer is on a chain that forks off more than @k@ blocks away.
        , ErrorPolicy (\(_ :: BlockFetchServerException) -> Just distantPeer)

          -- Chain sync client exceptions: see 'chainSyncDecision'.
        , ErrorPolicy chainSyncDecision

          -- Unwrap an exception from a linked thread and run the nested
          -- exception through this same list of policies.
        , ErrorPolicy (\(ExceptionInLinkedThread _ inner) ->
            evalErrorPolicies inner (epAppErrorPolicies consensusErrorPolicy))
        ]
    }
  where
    -- Some chain DB errors point at a bug in our own code, others at an
    -- invalid request from the peer. A DB that is closed entirely is only
    -- reopened after a node restart.
    chainDbErrorDecision :: ChainDbError -> Maybe (SuspendDecision DiffTime)
    chainDbErrorDecision NoGenesisBlock{}       = Just theyBuggyOrEvil
    chainDbErrorDecision ClosedDBError{}        = Just shutdownNode
    chainDbErrorDecision ClosedReaderError{}    = Just ourBug
    chainDbErrorDecision InvalidIteratorRange{} = Just theyBuggyOrEvil

    -- Some chain sync client exceptions indicate malicious behaviour; others
    -- only mean we should disconnect from this client because we have
    -- diverged too much.
    chainSyncDecision
      :: ChainSyncClientException -> Maybe (SuspendDecision DiffTime)
    chainSyncDecision HeaderExceedsClockSkew{} = Just misconfiguredPeer
    chainSyncDecision ForkTooDeep{}            = Just distantPeer
    chainSyncDecision ChainError{}             = Just theyBuggyOrEvil
    chainSyncDecision InvalidRollForward{}     = Just distantPeer
    chainSyncDecision InvalidRollBack{}        = Just theyBuggyOrEvil
    chainSyncDecision InvalidIntersection{}    = Just theyBuggyOrEvil
    chainSyncDecision NoMoreIntersection{}     = Just distantPeer
    chainSyncDecision DoesntFit{}              = Just theyBuggyOrEvil
    chainSyncDecision InvalidBlock{}           = Just theyBuggyOrEvil

    -- Shut the node down. After a storage layer failure the node /must/ be
    -- restarted (which triggers recovery).
    shutdownNode :: SuspendDecision DiffTime
    shutdownNode = Throw

    -- The peer sent wrong data, most likely because it is misconfigured.
    -- We try to reconnect in a few minutes.
    misconfiguredPeer :: SuspendDecision DiffTime
    misconfiguredPeer = SuspendPeer defaultDelay defaultDelay

    -- The peer is either on a distant chain (forking off more than k blocks
    -- ago) or simply too far behind; the chain sync client cannot really
    -- tell these two cases apart. If the peer is merely far behind, we may
    -- want to reconnect to it later.
    distantPeer :: SuspendDecision DiffTime
    distantPeer = SuspendConsumer defaultDelay

    -- The peer sent data it could have known to be invalid, which can only
    -- be explained by a bug or by malice.
    theyBuggyOrEvil :: SuspendDecision DiffTime
    theyBuggyOrEvil = SuspendPeer defaultDelay defaultDelay

    -- Something failed because of a bug in our own code. We disconnect from
    -- the peer but allow another attempt later, hoping the bug was
    -- transient. The connection in the other direction is left open: if the
    -- bug was indeed local, it might not affect that direction.
    ourBug :: SuspendDecision DiffTime
    ourBug = SuspendConsumer defaultDelay

    -- Delay shared by all suspend decisions above.
    --
    -- We may want different delays for the different kinds of problems, but
    -- that first needs a policy on how to choose them.
    defaultDelay :: DiffTime
    defaultDelay = 200 -- seconds