Skip to content

Commit

Permalink
Fix destroy command to be resilient to missing monitor. (#52)
Browse files Browse the repository at this point in the history
* Fix destroy command to be resilient to missing monitor.

In case when we can't contact the monitor at `pg_autoctl do destroy` time,
it's better to just continue with the destroy process rather than fail here.
After all, we might have destroyed the monitor already, and we want to get
rid of the local node anyway.
  • Loading branch information
DimCitus committed Aug 28, 2019
1 parent 61d47ff commit 155b435
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 29 deletions.
3 changes: 2 additions & 1 deletion src/bin/pg_autoctl/cli_create_drop_node.c
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,7 @@ cli_drop_node(int argc, char **argv)
KeeperConfig config = keeperOptions;
bool missing_pgdata_is_ok = true;
bool pg_is_not_running_is_ok = true;
bool ignore_monitor_errors = false;
pid_t pid;

if (!keeper_config_read_file(&config,
Expand All @@ -515,7 +516,7 @@ cli_drop_node(int argc, char **argv)
exit(EXIT_CODE_BAD_CONFIG);
}

if (!keeper_remove(&keeper, &config))
if (!keeper_remove(&keeper, &config, ignore_monitor_errors))
{
log_fatal("Failed to remove local node from the pg_auto_failover monitor, "
"see above for details");
Expand Down
7 changes: 5 additions & 2 deletions src/bin/pg_autoctl/cli_do_misc.c
Original file line number Diff line number Diff line change
Expand Up @@ -504,12 +504,15 @@ keeper_cli_destroy_keeper_node(Keeper *keeper, KeeperConfig *config)
/* only keeper_remove when we still have a state file around */
if (file_exists(config->pathnames.state))
{
bool ignore_monitor_errors = true;

/* keeper_remove uses log_info() to explain what's happening */
if (!keeper_remove(keeper, config))
if (!keeper_remove(keeper, config, ignore_monitor_errors))
{
log_fatal("Failed to remove local node from the pg_auto_failover "
"monitor, see above for details");
exit(EXIT_CODE_INTERNAL_ERROR);

exit(EXIT_CODE_BAD_STATE);
}
}
else
Expand Down
37 changes: 13 additions & 24 deletions src/bin/pg_autoctl/keeper.c
Original file line number Diff line number Diff line change
Expand Up @@ -741,13 +741,15 @@ keeper_register_and_init(Keeper *keeper,
* local state file.
*/
bool
keeper_remove(Keeper *keeper, KeeperConfig *config)
keeper_remove(Keeper *keeper, KeeperConfig *config, bool ignore_monitor_errors)
{
int errors = 0;

/*
* We don't require keeper_init() to have been done before calling
* keeper_remove, because then we would fail to finish a remove that was
* half-done only: keeper_init loads the state from the state file, which
* might not been there anymore.
* might not exists anymore.
*
* That said, we're going to require keeper->config to have been set the
* usual way, so do that at least.
Expand All @@ -759,24 +761,6 @@ keeper_remove(Keeper *keeper, KeeperConfig *config)
return false;
}

/*
* First, if we are dealing with a Citus Worker node, get the coordinator
* address from the monitor and remove the local node from there, only when
* we are in state SINGLE. In other states, either we are a primary and the
* standby will then be promoted and call master_update_node() on the
* coordinator, or we are a standby and we need to leave the metadata on
* the coordinator as it is.
*/
if (file_exists(keeper->config.pathnames.state))
{
if (!keeper_load_state(keeper))
{
/* here it's a non-fatal error of course. */
log_warn("Failed to read existing state file \"%s\"",
keeper->config.pathnames.state);
}
}

log_info("Removing local node from the pg_auto_failover monitor.");

/*
Expand All @@ -790,15 +774,20 @@ keeper_remove(Keeper *keeper, KeeperConfig *config)
config->pgSetup.pgport))
{
/* we already logged about errors */
return false;
errors++;

if (!ignore_monitor_errors)
{
return false;
}
}

log_info("Removing local node state file: \"%s\"", config->pathnames.state);

if (!unlink_file(config->pathnames.state))
{
/* we already logged about errors */
return false;
errors++;
}

log_info("Removing local node init state file: \"%s\"",
Expand All @@ -807,10 +796,10 @@ keeper_remove(Keeper *keeper, KeeperConfig *config)
if (!unlink_file(config->pathnames.init))
{
/* we already logged about errors */
return false;
errors++;
}

return true;
return errors == 0;
}


Expand Down
3 changes: 2 additions & 1 deletion src/bin/pg_autoctl/keeper.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ bool keeper_restart_postgres(Keeper *keeper);
bool keeper_ensure_current_state(Keeper *keeper);
bool keeper_update_pg_state(Keeper *keeper);
bool ReportPgIsRunning(Keeper *keeper);
bool keeper_remove(Keeper *keeper, KeeperConfig *config);
bool keeper_remove(Keeper *keeper, KeeperConfig *config,
bool ignore_monitor_errors);
bool keeper_check_monitor_extension_version(Keeper *keeper);

bool keeper_init_state_write(Keeper *keeper);
Expand Down
4 changes: 3 additions & 1 deletion src/bin/pg_autoctl/keeper_pg_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -132,14 +132,16 @@ keeper_pg_init(Keeper *keeper, KeeperConfig *config)

if (keeper->state.assigned_role != SINGLE_STATE)
{
bool ignore_monitor_errors = false;

log_error("There is already another postgres node, so the monitor "
"wants us to be in state %s. However, that would involve "
"removing the database directory.",
NodeStateToString(keeper->state.assigned_role));

log_warn("Removing the node from the monitor");

if (!keeper_remove(keeper, config))
if (!keeper_remove(keeper, config, ignore_monitor_errors))
{
log_fatal("Failed to remove the node from the monitor, run "
"`pg_autoctl drop node` to manually remove "
Expand Down

0 comments on commit 155b435

Please sign in to comment.