Skip to content

Commit

Permalink
Disconnect peers on timer ticks to unblock channel state machine
Browse files Browse the repository at this point in the history
At times, we've noticed that channels with `lnd` counterparties do not
receive messages we expect to in a timely manner (or at all) after
sending them a `ChannelReestablish` upon reconnection, or a
`CommitmentSigned` message. This can block the channel state machine
from making progress, eventually leading to force closes, if any pending
HTLCs are committed and their expiration is met.

It seems common wisdom for `lnd` node operators to periodically restart
their node/reconnect to their peers, allowing them to start from a fresh
state such that the message we expect to receive hopefully gets sent. We
can achieve the same end result by disconnecting peers ourselves
(regardless of whether they're a `lnd` node), which we opt to implement
here by disconnecting a peer if we do not receive its response within
two timer ticks of sending ours.
  • Loading branch information
wpaulino committed May 24, 2023
1 parent 98768f2 commit 09c051f
Show file tree
Hide file tree
Showing 3 changed files with 197 additions and 3 deletions.
59 changes: 57 additions & 2 deletions lightning/src/ln/channel.rs
Expand Up @@ -479,6 +479,13 @@ pub(crate) const MIN_AFFORDABLE_HTLC_COUNT: usize = 4;
/// * `EXPIRE_PREV_CONFIG_TICKS` = convergence_delay / tick_interval
pub(crate) const EXPIRE_PREV_CONFIG_TICKS: usize = 5;

/// The number of timer ticks that may elapse while we're waiting for a response to a
/// [`msgs::RevokeAndACK`] or [`msgs::ChannelReestablish`] message before we attempt to disconnect
/// the peer, in the hope that a fresh connection (and the resulting retransmissions) will unblock
/// the channel state machine.
///
/// See [`Channel::sent_message_awaiting_response`] for more information.
pub(crate) const DISCONNECT_PEER_AWAITING_RESPONSE_TICKS: usize = 2;

struct PendingChannelMonitorUpdate {
update: ChannelMonitorUpdate,
/// In some cases we need to delay letting the [`ChannelMonitorUpdate`] go until after an
Expand Down Expand Up @@ -715,6 +722,19 @@ pub(super) struct Channel<Signer: ChannelSigner> {
/// See-also <https://github.com/lightningnetwork/lnd/issues/4006>
pub workaround_lnd_bug_4006: Option<msgs::ChannelReady>,

/// An option set when we wish to track how many ticks have elapsed while waiting for a response
/// from our counterparty after sending a message. If the peer has yet to respond after reaching
/// `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`, a reconnection should be attempted to try to
/// unblock the state machine.
///
/// This behavior is mostly motivated by a lnd bug in which we don't receive a message we expect
/// to in a timely manner, which may lead to channels becoming unusable and/or force-closed. An
/// example of such can be found at <https://github.com/lightningnetwork/lnd/issues/7682>.
///
/// This is currently only used when waiting for a [`msgs::ChannelReestablish`] or
/// [`msgs::RevokeAndACK`] message from the counterparty.
sent_message_awaiting_response: Option<usize>,

#[cfg(any(test, fuzzing))]
// When we receive an HTLC fulfill on an outbound path, we may immediately fulfill the
// corresponding HTLC on the inbound path. If, then, the outbound path channel is
Expand Down Expand Up @@ -1130,6 +1150,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
next_remote_commitment_tx_fee_info_cached: Mutex::new(None),

workaround_lnd_bug_4006: None,
sent_message_awaiting_response: None,

latest_inbound_scid_alias: None,
outbound_scid_alias,
Expand Down Expand Up @@ -1489,6 +1510,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
next_remote_commitment_tx_fee_info_cached: Mutex::new(None),

workaround_lnd_bug_4006: None,
sent_message_awaiting_response: None,

latest_inbound_scid_alias: None,
outbound_scid_alias,
Expand Down Expand Up @@ -3526,6 +3548,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
// OK, we step the channel here and *then* if the new generation fails we can fail the
// channel based on that, but stepping stuff here should be safe either way.
self.channel_state &= !(ChannelState::AwaitingRemoteRevoke as u32);
self.sent_message_awaiting_response = None;
self.counterparty_prev_commitment_point = self.counterparty_cur_commitment_point;
self.counterparty_cur_commitment_point = Some(msg.next_per_commitment_point);
self.cur_counterparty_commitment_transaction_number -= 1;
Expand Down Expand Up @@ -3841,6 +3864,8 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
}
}

self.sent_message_awaiting_response = None;

self.channel_state |= ChannelState::PeerDisconnected as u32;
log_trace!(logger, "Peer disconnection resulted in {} remote-announced HTLC drops on channel {}", inbound_drop_count, log_bytes!(self.channel_id()));
}
Expand Down Expand Up @@ -3943,6 +3968,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
Some(self.get_last_revoke_and_ack())
} else { None };
let commitment_update = if self.monitor_pending_commitment_signed {
self.mark_awaiting_response();
Some(self.get_last_commitment_update(logger))
} else { None };

Expand Down Expand Up @@ -4132,6 +4158,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
// Go ahead and unmark PeerDisconnected as various calls we may make check for it (and all
// remaining cases either succeed or ErrorMessage-fail).
self.channel_state &= !(ChannelState::PeerDisconnected as u32);
self.sent_message_awaiting_response = None;

let shutdown_msg = if self.channel_state & (ChannelState::LocalShutdownSent as u32) != 0 {
assert!(self.shutdown_scriptpubkey.is_some());
Expand Down Expand Up @@ -4192,7 +4219,11 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
// revoke_and_ack, not on sending commitment_signed, so we add one if have
// AwaitingRemoteRevoke set, which indicates we sent a commitment_signed but haven't gotten
// the corresponding revoke_and_ack back yet.
let next_counterparty_commitment_number = INITIAL_COMMITMENT_NUMBER - self.cur_counterparty_commitment_transaction_number + if (self.channel_state & ChannelState::AwaitingRemoteRevoke as u32) != 0 { 1 } else { 0 };
let is_awaiting_remote_revoke = self.channel_state & ChannelState::AwaitingRemoteRevoke as u32 != 0;
if is_awaiting_remote_revoke && !self.is_awaiting_monitor_update() {
self.mark_awaiting_response();
}
let next_counterparty_commitment_number = INITIAL_COMMITMENT_NUMBER - self.cur_counterparty_commitment_transaction_number + if is_awaiting_remote_revoke { 1 } else { 0 };

let channel_ready = if msg.next_local_commitment_number == 1 && INITIAL_COMMITMENT_NUMBER - self.cur_holder_commitment_transaction_number == 1 {
// We should never have to worry about MonitorUpdateInProgress resending ChannelReady
Expand Down Expand Up @@ -4361,6 +4392,28 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
}), None))
}

/// Marks the channel as waiting for a response from the counterparty to a message we just sent.
///
/// If no response-clearing message is received within
/// [`DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`] timer ticks (counted by
/// [`Channel::should_disconnect_peer_awaiting_response`]), we'll disconnect the peer and attempt
/// a reconnection.
fn mark_awaiting_response(&mut self) {
	self.sent_message_awaiting_response = Some(0);
}

/// Determines whether we should disconnect the counterparty because we haven't received a
/// response to an outstanding message within our expected timeframe.
///
/// Each call counts as one elapsed tick, so this should be called exactly once per
/// [`super::channelmanager::ChannelManager::timer_tick_occurred`].
pub fn should_disconnect_peer_awaiting_response(&mut self) -> bool {
	match self.sent_message_awaiting_response.as_mut() {
		// No tracked message outstanding, so there's nothing to time out.
		None => false,
		Some(ticks_elapsed) => {
			// Another timer tick has passed since the tracked message was sent; disconnect
			// once the threshold is reached.
			*ticks_elapsed += 1;
			*ticks_elapsed >= DISCONNECT_PEER_AWAITING_RESPONSE_TICKS
		},
	}
}

pub fn shutdown<SP: Deref>(
&mut self, signer_provider: &SP, their_features: &InitFeatures, msg: &msgs::Shutdown
) -> Result<(Option<msgs::Shutdown>, Option<&ChannelMonitorUpdate>, Vec<(HTLCSource, PaymentHash)>), ChannelError>
Expand Down Expand Up @@ -5733,7 +5786,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {

/// May panic if called on a channel that wasn't immediately-previously
/// self.remove_uncommitted_htlcs_and_mark_paused()'d
pub fn get_channel_reestablish<L: Deref>(&self, logger: &L) -> msgs::ChannelReestablish where L::Target: Logger {
pub fn get_channel_reestablish<L: Deref>(&mut self, logger: &L) -> msgs::ChannelReestablish where L::Target: Logger {
assert_eq!(self.channel_state & ChannelState::PeerDisconnected as u32, ChannelState::PeerDisconnected as u32);
assert_ne!(self.cur_counterparty_commitment_transaction_number, INITIAL_COMMITMENT_NUMBER);
// Prior to static_remotekey, my_current_per_commitment_point was critical to claiming
Expand All @@ -5752,6 +5805,7 @@ impl<Signer: WriteableEcdsaChannelSigner> Channel<Signer> {
log_info!(logger, "Sending a data_loss_protect with no previous remote per_commitment_secret for channel {}", log_bytes!(self.channel_id()));
[0;32]
};
self.mark_awaiting_response();
msgs::ChannelReestablish {
channel_id: self.channel_id(),
// The protocol has two different commitment number concepts - the "commitment
Expand Down Expand Up @@ -7090,6 +7144,7 @@ impl<'a, 'b, 'c, ES: Deref, SP: Deref> ReadableArgs<(&'a ES, &'b SP, u32, &'c Ch
next_remote_commitment_tx_fee_info_cached: Mutex::new(None),

workaround_lnd_bug_4006: None,
sent_message_awaiting_response: None,

latest_inbound_scid_alias,
// Later in the ChannelManager deserialization phase we scan for channels and assign scid aliases if its missing
Expand Down
14 changes: 14 additions & 0 deletions lightning/src/ln/channelmanager.rs
Expand Up @@ -3921,6 +3921,20 @@ where

chan.maybe_expire_prev_config();

if chan.should_disconnect_peer_awaiting_response() {
log_debug!(self.logger, "Disconnecting peer {} due to not making any progress on channel {}",
counterparty_node_id, log_bytes!(*chan_id));
pending_msg_events.push(MessageSendEvent::HandleError {
node_id: counterparty_node_id,
action: msgs::ErrorAction::DisconnectPeerWithWarning {
msg: msgs::WarningMessage {
channel_id: *chan_id,
data: "Disconnecting due to timeout awaiting response".to_owned(),
},
},
});
}

true
});
if peer_state.ok_to_remove(true) {
Expand Down
127 changes: 126 additions & 1 deletion lightning/src/ln/functional_tests.rs
Expand Up @@ -22,7 +22,7 @@ use crate::events::{Event, MessageSendEvent, MessageSendEventsProvider, PathFail
use crate::ln::{PaymentPreimage, PaymentSecret, PaymentHash};
use crate::ln::channel::{commitment_tx_base_weight, COMMITMENT_TX_WEIGHT_PER_HTLC, CONCURRENT_INBOUND_HTLC_FEE_BUFFER, FEE_SPIKE_BUFFER_FEE_INCREASE_MULTIPLE, MIN_AFFORDABLE_HTLC_COUNT};
use crate::ln::channelmanager::{self, PaymentId, RAACommitmentOrder, PaymentSendFailure, RecipientOnionFields, BREAKDOWN_TIMEOUT, ENABLE_GOSSIP_TICKS, DISABLE_GOSSIP_TICKS, MIN_CLTV_EXPIRY_DELTA};
use crate::ln::channel::{Channel, ChannelError};
use crate::ln::channel::{DISCONNECT_PEER_AWAITING_RESPONSE_TICKS, Channel, ChannelError};
use crate::ln::{chan_utils, onion_utils};
use crate::ln::chan_utils::{OFFERED_HTLC_SCRIPT_WEIGHT, htlc_success_tx_weight, htlc_timeout_tx_weight, HTLCOutputInCommitment};
use crate::routing::gossip::{NetworkGraph, NetworkUpdate};
Expand Down Expand Up @@ -9955,3 +9955,128 @@ fn test_payment_with_custom_min_cltv_expiry_delta() {
do_payment_with_custom_min_final_cltv_expiry(true, false);
do_payment_with_custom_min_final_cltv_expiry(true, true);
}

#[test]
fn test_disconnects_peer_awaiting_response_ticks() {
	// Tests that nodes which are awaiting on a response critical for channel responsiveness
	// disconnect their counterparty after `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS` timer ticks
	// without receiving it.
	let mut chanmon_cfgs = create_chanmon_cfgs(2);
	let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
	let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
	let nodes = create_network(2, &node_cfgs, &node_chanmgrs);

	// Asserts whether a `DisconnectPeerWithWarning` event is (or is not) queued to the user.
	let check_disconnect_event = |node: &Node, should_disconnect: bool| {
		let disconnect_event = node.node.get_and_clear_pending_msg_events().iter().find_map(|event|
			if let MessageSendEvent::HandleError { action, .. } = event {
				if let msgs::ErrorAction::DisconnectPeerWithWarning { .. } = action {
					Some(())
				} else {
					None
				}
			} else {
				None
			}
		);
		assert_eq!(disconnect_event.is_some(), should_disconnect);
	};

	// Fires timer ticks ensuring we only attempt to disconnect peers after reaching
	// `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
	let check_disconnect = |node: &Node| {
		// No disconnect without any timer ticks.
		check_disconnect_event(node, false);

		// No disconnect with 1 timer tick less than required.
		for _ in 0..DISCONNECT_PEER_AWAITING_RESPONSE_TICKS - 1 {
			node.node.timer_tick_occurred();
			check_disconnect_event(node, false);
		}

		// Disconnect after reaching the required ticks.
		node.node.timer_tick_occurred();
		check_disconnect_event(node, true);

		// Disconnect again on the next tick if the peer hasn't been disconnected yet.
		node.node.timer_tick_occurred();
		check_disconnect_event(node, true);
	};

	create_chan_between_nodes(&nodes[0], &nodes[1]);

	// We'll start by performing a fee update with Alice (nodes[0]) on the channel.
	// NOTE(review): the fee bump forces Alice to send a `CommitmentSigned`, kicking off a full
	// commitment dance that we can interrupt mid-way below.
	*nodes[0].fee_estimator.sat_per_kw.lock().unwrap() *= 2;
	nodes[0].node.timer_tick_occurred();
	check_added_monitors!(&nodes[0], 1);
	let alice_fee_update = get_htlc_update_msgs(&nodes[0], &nodes[1].node.get_our_node_id());
	nodes[1].node.handle_update_fee(&nodes[0].node.get_our_node_id(), alice_fee_update.update_fee.as_ref().unwrap());
	nodes[1].node.handle_commitment_signed(&nodes[0].node.get_our_node_id(), &alice_fee_update.commitment_signed);
	check_added_monitors!(&nodes[1], 1);

	// This will prompt Bob (nodes[1]) to respond with his `CommitmentSigned` and `RevokeAndACK`.
	let (bob_revoke_and_ack, bob_commitment_signed) = get_revoke_commit_msgs!(&nodes[1], nodes[0].node.get_our_node_id());
	nodes[0].node.handle_revoke_and_ack(&nodes[1].node.get_our_node_id(), &bob_revoke_and_ack);
	check_added_monitors!(&nodes[0], 1);
	nodes[0].node.handle_commitment_signed(&nodes[1].node.get_our_node_id(), &bob_commitment_signed);
	check_added_monitors(&nodes[0], 1);

	// Alice then needs to send her final `RevokeAndACK` to complete the commitment dance. We
	// pretend Bob hasn't received the message and check whether he'll disconnect Alice after
	// reaching `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
	let alice_revoke_and_ack = get_event_msg!(nodes[0], MessageSendEvent::SendRevokeAndACK, nodes[1].node.get_our_node_id());
	check_disconnect(&nodes[1]);

	// Now, we'll reconnect them to test awaiting a `ChannelReestablish` message.
	//
	// Note that since the commitment dance didn't complete above, Alice is expected to resend her
	// final `RevokeAndACK` to Bob to complete it.
	nodes[0].node.peer_disconnected(&nodes[1].node.get_our_node_id());
	nodes[1].node.peer_disconnected(&nodes[0].node.get_our_node_id());
	let bob_init = msgs::Init { features: nodes[1].node.init_features(), remote_network_address: None };
	nodes[0].node.peer_connected(&nodes[1].node.get_our_node_id(), &bob_init, true).unwrap();
	let alice_init = msgs::Init { features: nodes[0].node.init_features(), remote_network_address: None };
	nodes[1].node.peer_connected(&nodes[0].node.get_our_node_id(), &alice_init, true).unwrap();

	// Upon reconnection, Alice sends her `ChannelReestablish` to Bob. Alice, however, hasn't
	// received Bob's yet, so she should disconnect him after reaching
	// `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
	let alice_channel_reestablish = get_event_msg!(
		nodes[0], MessageSendEvent::SendChannelReestablish, nodes[1].node.get_our_node_id()
	);
	nodes[1].node.handle_channel_reestablish(&nodes[0].node.get_our_node_id(), &alice_channel_reestablish);
	check_disconnect(&nodes[0]);

	// Bob now sends his `ChannelReestablish` to Alice to resume the channel and consider it "live".
	let bob_channel_reestablish = nodes[1].node.get_and_clear_pending_msg_events().iter().find_map(|event|
		if let MessageSendEvent::SendChannelReestablish { node_id, msg } = event {
			assert_eq!(*node_id, nodes[0].node.get_our_node_id());
			Some(msg.clone())
		} else {
			None
		}
	).unwrap();
	nodes[0].node.handle_channel_reestablish(&nodes[1].node.get_our_node_id(), &bob_channel_reestablish);

	// Sanity check that Alice won't disconnect Bob since she's no longer waiting for any messages.
	for _ in 0..DISCONNECT_PEER_AWAITING_RESPONSE_TICKS {
		nodes[0].node.timer_tick_occurred();
		check_disconnect_event(&nodes[0], false);
	}

	// However, Bob is still waiting on Alice's `RevokeAndACK`, so he should disconnect her after
	// reaching `DISCONNECT_PEER_AWAITING_RESPONSE_TICKS`.
	check_disconnect(&nodes[1]);

	// Finally, have Bob process the last message, completing the interrupted commitment dance.
	nodes[1].node.handle_revoke_and_ack(&nodes[0].node.get_our_node_id(), &alice_revoke_and_ack);
	check_added_monitors(&nodes[1], 1);

	// At this point, neither node should attempt to disconnect each other, since they aren't
	// waiting on any messages.
	for node in &nodes {
		for _ in 0..DISCONNECT_PEER_AWAITING_RESPONSE_TICKS {
			node.node.timer_tick_occurred();
			check_disconnect_event(node, false);
		}
	}
}

0 comments on commit 09c051f

Please sign in to comment.