Skip to content

Commit

Permalink
Have drop node --force --destroy work as expected again (#743)
Browse files Browse the repository at this point in the history
The following command would get stuck for 30 seconds when the pg_autoctl service was still running.
```
pg_autoctl drop node --pgdata node1 --force --destroy
```
  • Loading branch information
JelteF committed Jul 2, 2021
1 parent 5304068 commit f665321
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 43 deletions.
53 changes: 48 additions & 5 deletions src/bin/pg_autoctl/cli_drop_node.c
Original file line number Diff line number Diff line change
Expand Up @@ -607,9 +607,47 @@ cli_drop_local_node(KeeperConfig *config, bool dropAndDestroy)
*/
pid_t pid = 0;

if (!wait_for_pid_to_exit(config->pathnames.pid, 30, &pid))
/*
* Before continuing we need to make sure that a currently running service
* has stopped.
*/
bool stopped;
if (dropForce)
{
/*
* If --force is used, we skip the transition to "dropped". So a
* currently running process won't realise it's dropped, which means it
* will not exit by itself. Thus all we need to know is if it's running
* now or not.
*/
if (!is_process_stopped(config->pathnames.pid, &stopped, &pid))
{
/* errors have already been logged */
exit(EXIT_CODE_INTERNAL_ERROR);
}
}
else
{
/*
* If --force isn't used then a running pg_autoctl process will detect
* that it is dropped and clean itself up nicely and finally it will
* exit. We give the process 30 seconds to exit by itself.
*/
if (!wait_for_process_to_stop(config->pathnames.pid, 30, &stopped, &pid))
{
/* errors have already been logged */
exit(EXIT_CODE_INTERNAL_ERROR);
}
}

/*
* If the service is not stopped yet, we just want to process to exit
* so we can take over. This can happen either because --force was used
* or because 30 seconds was not enough time for the service to exit.
*/
if (!stopped)
{
/* if the service isn't terminated in 30s, signal it to quit now */
/* if the service isn't terminated, signal it to quit now */
log_info("Sending signal %s to pg_autoctl process %d",
signal_to_string(SIGQUIT),
pid);
Expand All @@ -620,7 +658,8 @@ cli_drop_local_node(KeeperConfig *config, bool dropAndDestroy)
exit(EXIT_CODE_INTERNAL_ERROR);
}

if (!wait_for_pid_to_exit(config->pathnames.pid, 30, &pid))
if (!wait_for_process_to_stop(config->pathnames.pid, 30, &stopped, &pid) ||
!stopped)
{
log_fatal("Failed to stop the pg_autoctl process with pid %d", pid);
exit(EXIT_CODE_INTERNAL_ERROR);
Expand Down Expand Up @@ -712,7 +751,9 @@ cli_drop_node_with_monitor_disabled(KeeperConfig *config, bool dropAndDestroy)
exit(EXIT_CODE_INTERNAL_ERROR);
}

if (!wait_for_pid_to_exit(config->pathnames.pid, 30, &pid))
bool stopped;
if (!wait_for_process_to_stop(config->pathnames.pid, 30, &stopped, &pid) ||
!stopped)
{
log_fatal(
"Failed to stop the pg_autoctl process with pid %d",
Expand Down Expand Up @@ -779,7 +820,9 @@ cli_drop_local_monitor(MonitorConfig *mconfig, bool dropAndDestroy)
exit(EXIT_CODE_INTERNAL_ERROR);
}

if (!wait_for_pid_to_exit(mconfig->pathnames.pid, 30, &pid))
bool stopped;
if (!wait_for_process_to_stop(mconfig->pathnames.pid, 30, &stopped, &pid) ||
!stopped)
{
log_fatal("Failed to stop the pg_autoctl process with pid %d", pid);
exit(EXIT_CODE_INTERNAL_ERROR);
Expand Down
74 changes: 43 additions & 31 deletions src/bin/pg_autoctl/pidfile.c
Original file line number Diff line number Diff line change
Expand Up @@ -499,47 +499,59 @@ pidfile_as_json(JSON_Value *js, const char *pidfile, bool includeStatus)
}


bool
is_process_stopped(const char *pidfile, bool *stopped, pid_t *pid)
{
if (!file_exists(pidfile))
{
*stopped = true;
return true;
}

if (!read_pidfile(pidfile, pid))
{
log_error("Failed to read PID file \"%s\"", pidfile);
return false;
}

*stopped = false;
return true;
}


/*
* wait_for_pid_to_exit waits until the PID found in the pidfile is not running
* wait_for_process_to_stop waits until the PID found in the pidfile is not running
* anymore.
*/
bool
wait_for_pid_to_exit(const char *pidfile, int timeout, pid_t *pid)
wait_for_process_to_stop(const char *pidfile, int timeout, bool *stopped, pid_t *pid)
{
if (file_exists(pidfile))
if (!is_process_stopped(pidfile, stopped, pid))
{
if (read_pidfile(pidfile, pid))
{
log_info("An instance of pg_autoctl is running with PID %d, "
"waiting for it to stop.", *pid);

int timeout_counter = timeout;
/* errors have already been logged */
return false;
}

while (timeout_counter > 0)
{
if (kill(*pid, 0) == -1 && errno == ESRCH)
{
log_info("The pg_autoctl instance with pid %d "
"has now terminated.",
*pid);
break;
}
log_info("An instance of pg_autoctl is running with PID %d, "
"waiting for it to stop.", *pid);

sleep(1);
--timeout_counter;
}
int timeout_counter = timeout;

return timeout_counter > 0;
}
else
while (timeout_counter > 0)
{
if (kill(*pid, 0) == -1 && errno == ESRCH)
{
log_error("Failed to read PID file \"%s\"", pidfile);
return false;
log_info("The pg_autoctl instance with pid %d "
"has now terminated.",
*pid);
*stopped = true;
return true;
}

sleep(1);
--timeout_counter;
}
else
{
/* when the pidfile doesn't exist, assume the service is not running */
return true;
}

*stopped = false;
return true;
}
4 changes: 3 additions & 1 deletion src/bin/pg_autoctl/pidfile.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ void check_pidfile(const char *pidfile, pid_t start_pid);

void pidfile_as_json(JSON_Value *js, const char *pidfile, bool includeStatus);

bool wait_for_pid_to_exit(const char *pidfile, int timeout, pid_t *pid);
bool is_process_stopped(const char *pidfile, bool *stopped, pid_t *pid);
bool wait_for_process_to_stop(const char *pidfile, int timeout, bool *stopped,
pid_t *pid);

#endif /* PIDFILE_H */
22 changes: 16 additions & 6 deletions tests/pgautofailover_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,12 +181,12 @@ def pg_createcluster(self, datadir, port=5432):

return abspath

def destroy(self):
def destroy(self, force=True):
"""
Cleanup whatever was created for this Cluster.
"""
for datanode in list(reversed(self.datanodes)):
datanode.destroy()
datanode.destroy(force=force, ignore_failure=True, timeout=3)
if self.monitor:
self.monitor.destroy()
self.vlan.destroy()
Expand Down Expand Up @@ -1100,23 +1100,33 @@ def get_nodename(self, nodeId=None):

return self.name

def destroy(self):
def destroy(
self, force=False, ignore_failure=False, timeout=COMMAND_TIMEOUT
):
"""
Cleans up processes and files created for this data node.
"""

self.stop_pg_autoctl()

flags = ["--destroy"]
if force:
flags.append("--force")

try:
destroy = PGAutoCtl(self)
destroy.execute(
"pg_autoctl drop node --destroy",
"drop",
"node",
"--destroy",
timeout=3,
*flags,
timeout=timeout,
)
except Exception as e:
print(str(e))
if ignore_failure:
print(str(e))
else:
raise

try:
os.remove(self.config_file_path())
Expand Down

0 comments on commit f665321

Please sign in to comment.