Skip to content

Commit

Permalink
Have pg_autoctl drop node command wait until the node has been remove…
Browse files Browse the repository at this point in the history
…d. (#748)
  • Loading branch information
DimCitus committed Jul 2, 2021
1 parent a88ded0 commit 5304068
Show file tree
Hide file tree
Showing 5 changed files with 237 additions and 23 deletions.
9 changes: 9 additions & 0 deletions docs/ref/pg_autoctl_drop_node.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ This command drops a Postgres node from the pg_auto_failover monitor::
--pgport drop the node with given hostname and pgport
--destroy also destroy Postgres database
--force force dropping the node from the monitor
--wait how many seconds to wait, default to 60

Description
-----------
Expand Down Expand Up @@ -89,6 +90,14 @@ Options
possible to use the option ``--force`` to immediately remove the node from
the monitor.

--wait

How many seconds to wait for the node to be dropped entirely. The command
stops as soon as the target node can no longer be found on the monitor, or
when the timeout has elapsed, whichever comes first. The value 0 (zero)
disables the timeout and skips waiting entirely, making the command
asynchronous.

Examples
--------

Expand Down
122 changes: 115 additions & 7 deletions src/bin/pg_autoctl/cli_drop_node.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,10 @@ static int cli_drop_node_getopts(int argc, char **argv);
static void cli_drop_node(int argc, char **argv);
static void cli_drop_monitor(int argc, char **argv);

static void cli_drop_node_from_monitor(KeeperConfig *config);
static void cli_drop_node_from_monitor(KeeperConfig *config,
int64_t *nodeId,
int *groupId);

static void cli_drop_local_node(KeeperConfig *config, bool dropAndDestroy);
static void cli_drop_local_monitor(MonitorConfig *mconfig, bool dropAndDestroy);

Expand All @@ -62,6 +65,7 @@ static void cli_drop_node_files_and_directories(KeeperConfig *config);
static void stop_postgres_and_remove_pgdata_and_config(ConfigFilePaths *pathnames,
PostgresSetup *pgSetup);

static void cli_drop_node_from_monitor_and_wait(KeeperConfig *config);

CommandLine drop_monitor_command =
make_command("monitor",
Expand All @@ -85,7 +89,8 @@ CommandLine drop_node_command =
" --hostname drop the node with given hostname and pgport\n"
" --pgport drop the node with given hostname and pgport\n"
" --destroy also destroy Postgres database\n"
" --force force dropping the node from the monitor\n",
" --force force dropping the node from the monitor\n"
" --wait how many seconds to wait, default to 60 \n",
cli_drop_node_getopts,
cli_drop_node);

Expand All @@ -108,6 +113,7 @@ cli_drop_node_getopts(int argc, char **argv)
{ "hostname", required_argument, NULL, 'n' },
{ "pgport", required_argument, NULL, 'p' },
{ "formation", required_argument, NULL, 'f' },
{ "wait", required_argument, NULL, 'w' },
{ "name", required_argument, NULL, 'a' },
{ "version", no_argument, NULL, 'V' },
{ "verbose", no_argument, NULL, 'v' },
Expand All @@ -118,6 +124,9 @@ cli_drop_node_getopts(int argc, char **argv)

optind = 0;

options.listen_notifications_timeout =
PG_AUTOCTL_LISTEN_NOTIFICATIONS_TIMEOUT;

while ((c = getopt_long(argc, argv, "D:dn:p:Vvqh",
long_options, &option_index)) != -1)
{
Expand Down Expand Up @@ -191,6 +200,19 @@ cli_drop_node_getopts(int argc, char **argv)
break;
}

case 'w':
{
/* { "wait", required_argument, NULL, 'w' }, */
if (!stringToInt(optarg, &options.listen_notifications_timeout))
{
log_fatal("--wait argument is not a valid timeout: \"%s\"",
optarg);
exit(EXIT_CODE_BAD_ARGS);
}
log_trace("--wait %d", options.listen_notifications_timeout);
break;
}

case 'V':
{
/* keeper_cli_print_version prints version and exits. */
Expand Down Expand Up @@ -395,7 +417,7 @@ cli_drop_node(int argc, char **argv)
exit(EXIT_CODE_BAD_ARGS);
}

(void) cli_drop_node_from_monitor(&config);
(void) cli_drop_node_from_monitor_and_wait(&config);
}
}

Expand Down Expand Up @@ -483,7 +505,7 @@ cli_drop_monitor(int argc, char **argv)
* --name.
*/
static void
cli_drop_node_from_monitor(KeeperConfig *config)
cli_drop_node_from_monitor(KeeperConfig *config, int64_t *nodeId, int *groupId)
{
Monitor monitor = { 0 };

Expand All @@ -498,7 +520,9 @@ cli_drop_node_from_monitor(KeeperConfig *config)
if (!monitor_remove_by_nodename(&monitor,
(char *) config->formation,
(char *) config->name,
dropForce))
dropForce,
nodeId,
groupId))
{
/* errors have already been logged */
exit(EXIT_CODE_MONITOR);
Expand All @@ -518,7 +542,9 @@ cli_drop_node_from_monitor(KeeperConfig *config)
if (!monitor_remove_by_hostname(&monitor,
(char *) config->hostname,
pgport,
dropForce))
dropForce,
nodeId,
groupId))
{
/* errors have already been logged */
exit(EXIT_CODE_MONITOR);
Expand Down Expand Up @@ -569,7 +595,10 @@ cli_drop_local_node(KeeperConfig *config, bool dropAndDestroy)
/* first drop the node from the monitor */
if (keeperState->assigned_role != DROPPED_STATE)
{
(void) cli_drop_node_from_monitor(config);
int64_t nodeId = -1;
int groupId = -1;

(void) cli_drop_node_from_monitor(config, &nodeId, &groupId);
}

/*
Expand Down Expand Up @@ -851,3 +880,82 @@ stop_postgres_and_remove_pgdata_and_config(ConfigFilePaths *pathnames,
exit(EXIT_CODE_BAD_CONFIG);
}
}


/*
 * cli_drop_node_from_monitor_and_wait removes the node from the monitor, then
 * waits until the node can't be found on the monitor anymore, meaning it has
 * been fully dropped.
 *
 * The wait is bounded by config->listen_notifications_timeout, in seconds:
 *
 *  - zero returns right after the remove call, without waiting at all;
 *  - a positive value makes the process exit with EXIT_CODE_INTERNAL_ERROR
 *    once more than that many seconds have elapsed;
 *  - a negative value never expires (this is what the former implicit
 *    signed-to-unsigned comparison amounted to, now spelled out).
 *
 * The process exits with EXIT_CODE_MONITOR when the monitor can't be queried
 * for the node.
 */
static void
cli_drop_node_from_monitor_and_wait(KeeperConfig *config)
{
	bool dropped = false;
	Monitor monitor = { 0 };

	(void) cli_monitor_init_from_option_or_config(&monitor, config);

	/*
	 * Call pgautofailover.remove_node() on the monitor. The ids are
	 * initialized to -1, as in the other callers, so that they never hold
	 * garbage when used in the messages and queries below.
	 */
	int64_t nodeId = -1;
	int groupId = -1;

	(void) cli_drop_node_from_monitor(config, &nodeId, &groupId);

	int timeoutSecs = config->listen_notifications_timeout;

	/* if the timeout is zero, just don't wait at all */
	if (timeoutSecs == 0)
	{
		return;
	}

	log_info("Waiting until the node with id %lld in group %d has been "
			 "dropped from the monitor, or for %ds, whichever comes first",
			 (long long) nodeId, groupId, timeoutSecs);

	uint64_t start = time(NULL);

	/* establish a connection for notifications if none present */
	(void) pgsql_prepare_to_wait(&(monitor.notificationClient));

	while (!dropped)
	{
		NodeAddressArray nodesArray = { 0 };

		bool groupStateHasChanged = false;
		int timeoutMs = PG_AUTOCTL_KEEPER_SLEEP_TIME * 1000;

		uint64_t now = time(NULL);

		/*
		 * Only a positive timeout can expire. The explicit cast is safe
		 * because timeoutSecs is known to be positive at this point.
		 */
		if (timeoutSecs > 0 && (now - start) > (uint64_t) timeoutSecs)
		{
			log_error("Failed to wait until the node has been dropped");
			exit(EXIT_CODE_INTERNAL_ERROR);
		}

		/*
		 * Block for at most timeoutMs waiting for a state change. The result
		 * is deliberately ignored: the monitor is queried right after,
		 * whether or not a notification has been received.
		 */
		(void) monitor_wait_for_state_change(&monitor,
											 config->formation,
											 groupId,
											 nodeId,
											 timeoutMs,
											 &groupStateHasChanged);

		if (!monitor_find_node_by_nodeid(&monitor,
										 config->formation,
										 groupId,
										 nodeId,
										 &nodesArray))
		{
			log_error("Failed to query monitor to see if node id %lld "
					  "has been dropped already",
					  (long long) nodeId);
			exit(EXIT_CODE_MONITOR);
		}

		/* the node is gone once the monitor doesn't return it anymore */
		dropped = nodesArray.count == 0;

		if (dropped)
		{
			log_info("Node with id %lld in group %d has been successfully "
					 "dropped from the monitor",
					 (long long) nodeId, groupId);
		}
	}
}
7 changes: 6 additions & 1 deletion src/bin/pg_autoctl/cli_enable_disable.c
Original file line number Diff line number Diff line change
Expand Up @@ -1714,11 +1714,16 @@ cli_disable_monitor(int argc, char **argv)
nodesArray.nodes[nodeIndex].host,
nodesArray.nodes[nodeIndex].port);

int64_t nodeId = -1;
int groupId = -1;

if (!monitor_remove_by_hostname(
monitor,
nodesArray.nodes[nodeIndex].host,
nodesArray.nodes[nodeIndex].port,
optForce))
optForce,
&nodeId,
&groupId))
{
/* errors have already been logged */
exit(EXIT_CODE_MONITOR);
Expand Down

0 comments on commit 5304068

Please sign in to comment.