Skip to content

Commit

Permalink
Fix START/STOP SLAVE deadlock caused by slave stats daemon
Browse files Browse the repository at this point in the history
Under load, if START SLAVE IO_THREAD and STOP SLAVE execute concurrently, the
following deadlock is possible:

1) The START thread waits to acquire the channel map RW lock in shared mode and
does not check whether it was signaled to quit by the STOP thread:

frame facebook#5: 0x000000010075303c mysqld`Checkable_rwlock::rdlock(this=0x0000600001cd80e0) at rpl_gtid.h:464:5
frame facebook#6: 0x0000000100752168 mysqld`Multisource_info::rdlock(this=0x0000000105d2a780) at rpl_msr.h:408:25
frame facebook#7: 0x0000000100751dc8 mysqld`start_handle_slave_stats_daemon() at slave_stats_daemon.cc:233:15
frame facebook#8: 0x00000001020520ec mysqld`handle_slave_io(arg=0x000000011a869000) at rpl_replica.cc:6051:36

2) The STOP thread waits for the previous thread to stop while holding the
channel map RW lock in exclusive mode:

frame facebook#5: 0x0000000102072a38 mysqld`inline_mysql_cond_timedwait(that=0x000000011a869638, mutex=0x000000011a8694f0, abstime=0x00000001732d9588, src_file="/Users/laurynas/vilniusdb/myr-native-dd-proto/sql/rpl_replica.cc", src_line=2368) at mysql_cond.h:224:16
frame facebook#6: 0x0000000102050128 mysqld`terminate_slave_thread(thd=0x000000011b3ed800, term_lock=0x000000011a8694f0, term_cond=0x000000011a869638, slave_running=0x000000011a8696f4, stop_wait_timeout=0x00000001732d9700, need_lock_term=false, force=false) at rpl_replica.cc:2368:9
frame facebook#7: 0x000000010204fa6c mysqld`terminate_slave_threads(mi=0x000000011a869000, thread_mask=1, stop_wait_timeout=31536000, need_lock_term=false) at rpl_replica.cc:2204:18
frame facebook#8: 0x0000000102048020 mysqld`stop_slave(thd=0x000000011a908000, mi=0x000000011a869000, net_report=true, for_one_channel=true, push_temp_tables_warning=0x00000001732d9a37, invoked_by_raft=false) at rpl_replica.cc:10033:9
frame facebook#9: 0x0000000102048ca8 mysqld`stop_slave_cmd(thd=0x000000011a908000) at rpl_replica.cc:971:13

For the fix, observe that the starting replica I/O thread only tries to signal the
stats thread to start, thus move this code to the START REPLICA
command-executing thread instead, which already happens to hold the channel map
lock. This also forces to move the stopping of the stats thread from the replica
I/O thread to the STOP REPLICA command-executing thread.

This fixes intermittent but often-seen failures on
rpl.rpl_multi_source_channel_map_stress.

Squash with b015dd3
  • Loading branch information
laurynas-biveinis committed Oct 19, 2023
1 parent 1b0e69b commit fcbd7f7
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 16 deletions.
25 changes: 12 additions & 13 deletions sql/rpl_replica.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2173,6 +2173,10 @@ int terminate_slave_threads(Master_info *mi, int thread_mask,

if (thread_mask & (SLAVE_IO | SLAVE_FORCE_ALL)) {
DBUG_PRINT("info", ("Terminating IO thread"));

// stop sending secondary lag stats to primary
stop_handle_slave_stats_daemon();

mi->abort_slave = true;
DBUG_EXECUTE_IF("pause_after_queue_event",
{ rpl_replica_debug_point(DBUG_RPL_S_PAUSE_QUEUE_EV); });
Expand Down Expand Up @@ -2513,10 +2517,17 @@ bool start_slave_threads(bool need_lock_slave, bool wait_for_start,
lock_cond_sql = &mi->rli->run_lock;
}

if ((thread_mask & SLAVE_IO) && !enable_raft_plugin)
if ((thread_mask & SLAVE_IO) && !enable_raft_plugin) {
is_error = start_slave_thread(key_thread_replica_io, handle_slave_io,
lock_io, lock_cond_io, cond_io,
&mi->slave_running, &mi->slave_run_id, mi);
if (!is_error) {
// clean up - stop previous run of slave_stats_daemon, if any
stop_handle_slave_stats_daemon();
// start sending secondary lag stats to primary
start_handle_slave_stats_daemon();
}
}

if (!is_error && (thread_mask & (SLAVE_IO | SLAVE_MONITOR)) &&
mi->is_source_connection_auto_failover() &&
Expand Down Expand Up @@ -5877,7 +5888,6 @@ extern "C" void *handle_slave_io(void *arg) {
uint retry_count;
bool suppress_warnings;
int ret;
bool slave_stats_daemon_created = false;
Global_THD_manager *thd_manager = Global_THD_manager::get_instance();
// needs to call my_thread_init(), otherwise we get a coredump in DBUG_ stuff
my_thread_init();
Expand Down Expand Up @@ -6043,12 +6053,6 @@ extern "C" void *handle_slave_io(void *arg) {
goto connected;
}

if (!slave_stats_daemon_created) {
// clean up - stop previous run of slave_stats_daemon, if any
stop_handle_slave_stats_daemon();
// start sending secondary lag stats to primary
slave_stats_daemon_created = start_handle_slave_stats_daemon();
}
DBUG_PRINT("info", ("Starting reading binary log from master"));
while (!io_slave_killed(thd, mi)) {
MYSQL_RPL rpl;
Expand Down Expand Up @@ -6270,11 +6274,6 @@ extern "C" void *handle_slave_io(void *arg) {

// error = 0;
err:
if (slave_stats_daemon_created) {
// stop sending secondary lag stats to primary
stop_handle_slave_stats_daemon();
}

/*
If source_connection_auto_failover (async connection failover) is
enabled, this server is not a Group Replication SECONDARY and
Expand Down
4 changes: 1 addition & 3 deletions sql/slave_stats_daemon.cc
Original file line number Diff line number Diff line change
Expand Up @@ -230,18 +230,16 @@ static void *handle_slave_stats_daemon(void *arg MY_ATTRIBUTE((unused))) {
bool start_handle_slave_stats_daemon() {
DBUG_ENTER("start_handle_slave_stats_daemon");

channel_map.rdlock();
channel_map.assert_some_lock();
if (channel_map.get_num_instances() != 1) {
// more than one channels exists for this slave. We only support
// single source slave topologies for now. Skip creating the thread.
sql_print_information(
"Number of channels = %lu. There should be only one channel"
" with slave_stats_daemon. Not creating the thread.",
channel_map.get_num_instances());
channel_map.unlock();
DBUG_RETURN(false);
}
channel_map.unlock();

my_thread_handle thread_handle;
slave_stats_daemon_thread_counter++;
Expand Down

0 comments on commit fcbd7f7

Please sign in to comment.