From 0919737bdaa4b0bae0a89be48b1f001476cac382 Mon Sep 17 00:00:00 2001
From: Kamil Braun
Date: Thu, 9 Nov 2023 16:18:04 +0100
Subject: [PATCH] migration_manager: more verbose logging for schema versions

We're observing nodes getting stuck during bootstrap inside
`storage_service::wait_for_ring_to_settle()`, which periodically checks
`migration_manager::have_schema_agreement()` until it becomes `true`:
scylladb/scylladb#15393.

There is no obvious reason why that happens -- according to the nodes'
logs, their latest in-memory schema version is the same. So the gossiped
schema version is either different for some reason (perhaps there is a
race in publishing `application_state::SCHEMA`) or missing entirely.
Alternatively, `wait_for_ring_to_settle` is leaving the
`have_schema_agreement` loop and getting stuck in
`update_topology_change_info` trying to acquire a lock.

Modify the logging inside `have_schema_agreement` so that details about a
missing schema or a version mismatch are logged at INFO level, and an
INFO-level message is printed before we return `true`. To prevent the log
from getting spammed, rate-limit the periodic messages to at most one
every 5 seconds. This still reveals the reason in our tests, which allow
a node to hang for many minutes before timing out. Also, these schema
agreement checks are done only on relatively rare occasions, such as
bootstrap, so the additional logging should not be harmful.

Furthermore, when publishing the schema version to gossip, log it at INFO
level. This happens at most once per schema change, so it is a rare
event. If there is a race in publishing schema versions, this should
allow us to observe it.

Ref: scylladb/scylladb#15393
---
 service/migration_manager.cc | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/service/migration_manager.cc b/service/migration_manager.cc
index e5e2f55e5d2a..5f0304e9c18b 100644
--- a/service/migration_manager.cc
+++ b/service/migration_manager.cc
@@ -228,6 +228,7 @@ bool migration_manager::have_schema_agreement() {
     }
     auto our_version = _storage_proxy.get_db().local().get_version();
     bool match = false;
+    static thread_local logger::rate_limit rate_limit{std::chrono::seconds{5}};
     _gossiper.for_each_endpoint_state_until([&] (const gms::inet_address& endpoint, const gms::endpoint_state& eps) {
         if (endpoint == utils::fb_utilities::get_broadcast_address() || !_gossiper.is_alive(endpoint)) {
             return stop_iteration::no;
@@ -235,13 +236,14 @@ bool migration_manager::have_schema_agreement() {
         mlogger.debug("Checking schema state for {}.", endpoint);
         auto schema = eps.get_application_state_ptr(gms::application_state::SCHEMA);
         if (!schema) {
-            mlogger.debug("Schema state not yet available for {}.", endpoint);
+            mlogger.log(log_level::info, rate_limit, "Schema state not yet available for {}.", endpoint);
             match = false;
             return stop_iteration::yes;
         }
         auto remote_version = table_schema_version(utils::UUID{schema->value()});
         if (our_version != remote_version) {
-            mlogger.debug("Schema mismatch for {} ({} != {}).", endpoint, our_version, remote_version);
+            mlogger.log(log_level::info, rate_limit, "Schema mismatch for {} ({} != {}).",
+                    endpoint, our_version, remote_version);
             match = false;
             return stop_iteration::yes;
         } else {
@@ -249,6 +251,9 @@
         }
         return stop_iteration::no;
     });
+    if (match) {
+        mlogger.info("Schema agreement check passed.");
+    }
     return match;
 }
@@ -1009,7 +1014,7 @@ void migration_manager::passive_announce(table_schema_version version) {
 future<> migration_manager::passive_announce() {
     assert(this_shard_id() == 0);
-    mlogger.debug("Gossiping my schema version {}", _schema_version_to_publish);
+    mlogger.info("Gossiping my schema version {}", _schema_version_to_publish);
     return _gossiper.add_local_application_state(gms::application_state::SCHEMA, gms::versioned_value::schema(_schema_version_to_publish));
 }
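
Note (not part of the commit): the sketch below illustrates the rate-limited
logging pattern the patch relies on, using the same `logger::rate_limit` and
`logger::log(level, rate_limit, ...)` calls from Seastar's util/log.hh that
appear in the diff. The logger name, function and message here are made up
for illustration; only the rate-limiting mechanism itself is taken from the
patch.

#include <chrono>
#include <seastar/util/log.hh>

static seastar::logger example_logger("example");

// Called repeatedly, e.g. from a periodic agreement check. The rate_limit
// object is shared across calls on this shard, so at most one INFO message
// is emitted per 5-second window; calls inside the window are suppressed.
void log_version_mismatch(int our_version, int remote_version) {
    static thread_local seastar::logger::rate_limit rate_limit{std::chrono::seconds{5}};
    if (our_version != remote_version) {
        example_logger.log(seastar::log_level::info, rate_limit,
                "Version mismatch ({} != {}).", our_version, remote_version);
    }
}

Making the rate_limit `static thread_local`, as the patch does, keeps one
limiter per shard for the lifetime of the process, so repeated invocations of
the check share the same 5-second window instead of each call logging anew.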