Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Retry send_tx on account sequence mismatch #1349

Merged
merged 16 commits into from
Dec 21, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/unreleased/bug-fixes/1264-recover-acct-seq.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
- Added a recovery mechanism to automatically retry or drop tx upon account
sequence mismatch errors ([#1264](https://github.com/informalsystems/ibc-
rs/issues/1264))
192 changes: 151 additions & 41 deletions relayer/src/chain/cosmos.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use chrono::DateTime;
use itertools::Itertools;
use prost::Message;
use prost_types::Any;
use tendermint::abci::{Event, Path as TendermintABCIPath};
use tendermint::abci::{Code, Event, Path as TendermintABCIPath};
use tendermint::account::Id as AccountId;
use tendermint::block::Height;
use tendermint::consensus::Params;
Expand Down Expand Up @@ -87,6 +87,7 @@ use crate::keyring::{KeyEntry, KeyRing};
use crate::light_client::tendermint::LightClient as TmLightClient;
use crate::light_client::LightClient;
use crate::light_client::Verified;
use crate::util::retry::{retry_with_index, RetryResult};
use crate::{
chain::handle::requests::AppVersion, chain::QueryResponse, chain::StatusResponse,
event::monitor::TxMonitorCmd,
Expand Down Expand Up @@ -114,10 +115,22 @@ const DEFAULT_FEE_GRANTER: &str = "";
/// fraction of the maximum block size defined in the Tendermint core consensus parameters.
pub const GENESIS_MAX_BYTES_MAX_FRACTION: f64 = 0.9;

// The error "incorrect account sequence" is defined as the unique error code 32 in cosmos-sdk:
// https://github.com/cosmos/cosmos-sdk/blob/v0.44.0/types/errors/errors.go#L115-L117
pub const INCORRECT_ACCOUNT_SEQUENCE_ERR: u32 = 32;

mod retry_strategy {
use crate::util::retry::Fixed;
use core::time::Duration;

// Maximum number of retries for send_tx in the case of
// an account sequence mismatch.
pub const MAX_ACCOUNT_SEQUENCE_RETRY: u32 = 5;

// Backoff multiplier to apply while retrying in the case
// of account sequence mismatch.
pub const BACKOFF_MULTIPLIER_ACCOUNT_SEQUENCE_RETRY: u64 = 200;

pub fn wait_for_block_commits(max_total_wait: Duration) -> impl Iterator<Item = Duration> {
let backoff_millis = 300; // The periodic backoff
let count: usize = (max_total_wait.as_millis() / backoff_millis as u128) as usize;
Expand Down Expand Up @@ -284,11 +297,11 @@ impl CosmosSdkChain {
self.rt.block_on(f)
}

fn send_tx(&mut self, proto_msgs: Vec<Any>) -> Result<Response, Error> {
crate::time!("send_tx");

let account_seq = self.account_sequence()?;

fn send_tx_with_account_sequence(
&mut self,
proto_msgs: Vec<Any>,
account_seq: u64,
) -> Result<Response, Error> {
debug!(
"[{}] send_tx: sending {} messages using account sequence {}",
self.id(),
Expand Down Expand Up @@ -316,6 +329,7 @@ impl CosmosSdkChain {
signatures: vec![signed_doc],
};

// This may result in an account sequence mismatch error
let estimated_gas = self.estimate_gas(simulate_tx)?;

if estimated_gas > self.max_gas() {
Expand Down Expand Up @@ -352,31 +366,110 @@ impl CosmosSdkChain {
prost::Message::encode(&tx_raw, &mut tx_bytes)
.map_err(|e| Error::protobuf_encode(String::from("Transaction"), e))?;

let response = self.block_on(broadcast_tx_sync(
self.block_on(broadcast_tx_sync(
self.rpc_client(),
&self.config.rpc_addr,
tx_bytes,
))?;
))
}

match response.code {
tendermint::abci::Code::Ok => {
// A success means the account s.n. was increased
self.incr_account_sequence()?;
debug!("[{}] send_tx: broadcast_tx_sync: {:?}", self.id(), response);
/// Try to `send_tx` with retry on account sequence error.
/// An account sequence error can occur if the account sequence that
/// the relayer caches becomes outdated. This may happen if the relayer
/// wallet is used concurrently elsewhere, or when tx fees are mis-configured
/// leading to transactions hanging in the mempool.
///
/// Account sequence mismatch error can occur at two separate steps:
/// 1. as Err variant, propagated from the `estimate_gas` step.
/// 2. as an Ok variant, with an Code::Err response, propagated from
/// the `broadcast_tx_sync` step.
///
/// We treat both cases by re-fetching the account sequence number
/// from the full node.
/// Upon case #1, we do not retry submitting the same tx (retry happens
/// nonetheless at the worker `step` level). Upon case #2, we retry
/// submitting the same transaction.
fn send_tx_with_account_sequence_retry(
&mut self,
proto_msgs: Vec<Any>,
retry_counter: u32,
) -> Result<Response, Error> {
let account_sequence = self.account_sequence()?;

match self.send_tx_with_account_sequence(proto_msgs.clone(), account_sequence) {
// Gas estimation failed with acct. s.n. mismatch.
// This indicates that the full node did not yet push the previous tx out of its
// mempool. Possible explanations: fees too low, network congested, or full node
// congested. Whichever the case, it is more expedient in production to drop the tx
// and refresh the s.n., to allow proceeding to the other transactions.
Err(e) if mismatching_account_sequence_number(&e) => {
warn!("send_tx failed at estimate_gas step mismatching account sequence: dropping the tx & refreshing account sequence number");
self.refresh_account()?;
// Note: propagating error here can lead to bug:
// https://github.com/informalsystems/ibc-rs/issues/1153
// But periodic packet clearing will catch any dropped packets.
Err(e)
}
tendermint::abci::Code::Err(code) => {
// Avoid increasing the account s.n. if CheckTx failed
// Log the error
error!(
"[{}] send_tx: broadcast_tx_sync: {:?}: diagnostic: {:?}",
self.id(),
response,
sdk_error_from_tx_sync_error_code(code)
);

// Gas estimation succeeded. Broadcasting failed with a retry-able error.
Ok(response) if response.code == Code::Err(INCORRECT_ACCOUNT_SEQUENCE_ERR) => {
if retry_counter < retry_strategy::MAX_ACCOUNT_SEQUENCE_RETRY {
let retry_counter = retry_counter + 1;
warn!("send_tx failed at broadcast step with incorrect account sequence. retrying ({}/{})",
retry_counter, retry_strategy::MAX_ACCOUNT_SEQUENCE_RETRY);
// Backoff & re-fetch the account s.n.
let backoff = (retry_counter as u64)
* retry_strategy::BACKOFF_MULTIPLIER_ACCOUNT_SEQUENCE_RETRY;
thread::sleep(Duration::from_millis(backoff));
self.refresh_account()?;

// Now retry.
self.send_tx_with_account_sequence_retry(proto_msgs, retry_counter + 1)
} else {
// If after the max retry we still get an account sequence mismatch error,
// we ignore the error and return the original response to downstream.
// We do not return an error here, because the current convention
// let the caller handle error responses separately.
error!("failed to send_tx due to account sequence errors. the relayer wallet may be used elsewhere concurrently.");
Ok(response)
}
}

// Catch-all arm for the Ok variant.
// This is the case when gas estimation succeeded.
Ok(response) => {
// Complete success.
match response.code {
tendermint::abci::Code::Ok => {
debug!("[{}] send_tx: broadcast_tx_sync: {:?}", self.id(), response);

self.incr_account_sequence();
Ok(response)
}
// Gas estimation succeeded, but broadcasting failed with unrecoverable error.
tendermint::abci::Code::Err(code) => {
// Avoid increasing the account s.n. if CheckTx failed
// Log the error
error!(
"[{}] send_tx: broadcast_tx_sync: {:?}: diagnostic: {:?}",
self.id(),
response,
sdk_error_from_tx_sync_error_code(code)
);
Ok(response)
}
}
}

// Catch-all case for the Err variant.
// Gas estimation failure or other unrecoverable error, propagate.
Err(e) => Err(e),
}
}

Ok(response)
fn send_tx(&mut self, proto_msgs: Vec<Any>) -> Result<Response, Error> {
crate::time!("send_tx");
self.send_tx_with_account_sequence_retry(proto_msgs, 0)
}

/// Try to simulate the given tx in order to estimate how much gas will be needed to submit it.
Expand All @@ -387,7 +480,7 @@ impl CosmosSdkChain {
///
/// If the batch is split in two TX-es, the second one will fail the simulation in `deliverTx` check.
/// In this case we use the `default_gas` param.
fn estimate_gas(&self, tx: Tx) -> Result<u64, Error> {
fn estimate_gas(&mut self, tx: Tx) -> Result<u64, Error> {
let simulated_gas = self.send_tx_simulate(tx).map(|sr| sr.gas_info);

match simulated_gas {
Expand Down Expand Up @@ -426,11 +519,11 @@ impl CosmosSdkChain {

Err(e) => {
error!(
"[{}] estimate_gas: failed to simulate tx with non-recoverable error: {}",
"[{}] estimate_gas: failed to simulate tx. propagating error to caller: {}",
self.id(),
e.detail()
);

// Propagate the error, the retrying mechanism at caller may catch & retry.
Err(e)
}
}
Expand Down Expand Up @@ -600,21 +693,27 @@ impl CosmosSdkChain {
Ok((key, key_bytes))
}

fn account(&mut self) -> Result<&mut BaseAccount, Error> {
fn refresh_account(&mut self) -> Result<(), Error> {
let account = self.block_on(query_account(self, self.key()?.account))?;
debug!(
sequence = %account.sequence,
number = %account.account_number,
"[{}] send_tx: retrieved account",
self.id()
);

self.account = Some(account);
Ok(())
}

fn account(&mut self) -> Result<&BaseAccount, Error> {
if self.account == None {
let account = self.block_on(query_account(self, self.key()?.account))?;
debug!(
sequence = %account.sequence,
number = %account.account_number,
"[{}] send_tx: retrieved account",
self.id()
);
self.account = Some(account);
self.refresh_account()?;
}

Ok(self
.account
.as_mut()
.as_ref()
.expect("account was supposedly just cached"))
}

Expand All @@ -626,9 +725,10 @@ impl CosmosSdkChain {
Ok(self.account()?.sequence)
}

fn incr_account_sequence(&mut self) -> Result<(), Error> {
self.account()?.sequence += 1;
Ok(())
fn incr_account_sequence(&mut self) {
if let Some(account) = &mut self.account {
account.sequence += 1;
}
}

fn signer(&self, sequence: u64) -> Result<SignerInfo, Error> {
Expand Down Expand Up @@ -712,8 +812,6 @@ impl CosmosSdkChain {
&self,
mut tx_sync_results: Vec<TxSyncResult>,
) -> Result<Vec<TxSyncResult>, Error> {
use crate::util::retry::{retry_with_index, RetryResult};

let hashes = tx_sync_results
.iter()
.map(|res| res.response.hash.to_string())
Expand Down Expand Up @@ -2414,6 +2512,18 @@ fn can_recover_from_simulation_failure(e: &Error) -> bool {
}
}

/// Determine whether the given error yielded by `tx_simulate`
/// indicates that the current sequence number cached in Hermes
/// may be out-of-sync with the full node's version of the s.n.
fn mismatching_account_sequence_number(e: &Error) -> bool {
use crate::error::ErrorDetail::*;

match e.detail() {
GrpcStatus(detail) => detail.is_account_sequence_mismatch(),
_ => false,
}
}

struct PrettyFee<'a>(&'a Fee);

impl fmt::Display for PrettyFee<'_> {
Expand Down
16 changes: 12 additions & 4 deletions relayer/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -534,11 +534,19 @@ impl GrpcStatusSubdetail {

/// Check whether this gRPC error matches
/// - status: InvalidArgument
/// - message: account sequence mismatch ...
/// - message: "account sequence mismatch, expected 166791, got 166793: incorrect account sequence: invalid request"
///
/// # Note:
/// This predicate is tested and validated against errors
/// that appear at the `estimate_gas` step. The error
/// predicate to be used at the `broadcast_tx_sync` step
/// is different & relies on parsing the Response error code.
pub fn is_account_sequence_mismatch(&self) -> bool {
if self.status.code() != tonic::Code::InvalidArgument {
return false;
}
// The code changed in SDK 0.44 from `InvalidArgument` to `Unknown`.
// Workaround: Ignore code. We'll match only on the status message.
// if self.status.code() != tonic::Code::InvalidArgument {
// return false;
// }

self.status
.message()
Expand Down