Skip to content

Commit

Permalink
Only exit at upgrade when the on-disk binary is ready. (#771)
Browse files Browse the repository at this point in the history
* Only exit at upgrade when the on-disk binary is ready.

* Fix connection retry when ping succeeds and then connection fail.

This happens when e.g. the HBA rules have not been set yet on the monitor.
We have a timing where we could ping successfully and then fail to connect,
and in that situation we had a spurious call to pgsql_finish() that would
reset the connection statement type to SINGLE, even when it was before set
to SINGLE.

* Prepare an environment where we can test upgrade paths.

The new directory tests/upgrade contains a docker compose file that allows
running a monitor and three nodes in a way that will allow testing upgrades
thanks to the following command:

  $ docker-compose up --build monitor

This will rebuild the docker image used in the docker compose environment
from the local repository checkout and restart the monitor with the new
binaries.

To test upgrades from earlier versions of pg_auto_failover though, hacking
the docker compose file to its argument-based style (instead of the current
environment based style) would be necessary, because this PR introduces new
code to enhance our support for environment variables at create time.

* Implement a retry policy for getaddrinfo().

* When starting the monitor, review the HBA settings later.

Only allow connections to be made to the monitor when everything else has
been checked and is known to be ready.

* Use buster-slim instead of stable-slim

Debian bullseye got released 5 days ago and this broke the docker build
file.

* Use docker-compose instead of docker compose

Co-authored-by: Jelte Fennema <github-tech@jeltef.nl>
  • Loading branch information
DimCitus and JelteF committed Aug 19, 2021
1 parent 65458e2 commit 9284de8
Show file tree
Hide file tree
Showing 14 changed files with 627 additions and 76 deletions.
17 changes: 14 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/lib/p
ENV PG_AUTOCTL_DEBUG 1


FROM debian:stable-slim as run
FROM debian:buster-slim as run

ENV PGVERSION 10

Expand Down Expand Up @@ -102,7 +102,7 @@ RUN apt-get update\
&& apt-get install -y --no-install-recommends postgresql-${PGVERSION} \
&& rm -rf /var/lib/apt/lists/*

RUN adduser --disabled-password --gecos '' docker
RUN adduser --disabled-password --gecos '' --home /var/lib/postgres docker
RUN adduser docker sudo
RUN adduser docker postgres
RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
Expand All @@ -111,8 +111,19 @@ COPY --from=build-test /usr/lib/postgresql/${PGVERSION}/lib/pgautofailover.so /u
COPY --from=build-test /usr/share/postgresql/${PGVERSION}/extension/pgautofailover* /usr/share/postgresql/${PGVERSION}/extension/
COPY --from=build-test /usr/lib/postgresql/${PGVERSION}/bin/pg_autoctl /usr/local/bin

#
# In tests/upgrade/docker-compose.yml we use internal docker volumes in
# order to be able to restart the nodes and keep the data around. For that
# to work, we must prepare a mount-point that is owned by our target user
# (docker), so that once the volume in mounted there by docker compose,
# pg_autoctl has the necessary set of privileges.
#
RUN mkdir -p /var/lib/postgres \
&& chown -R docker /var/lib/postgres

USER docker
ENV PATH /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/lib/postgresql/${PGVERSION}/bin
ENV PG_AUTOCTL_DEBUG 1
ENV PGDATA /var/lib/postgres/pgaf

CMD pg_autoctl do tmux session --nodes 3
CMD pg_autoctl do tmux session --nodes 3 --binpath /usr/local/bin/pg_autoctl
17 changes: 12 additions & 5 deletions src/bin/pg_autoctl/cli_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -704,12 +704,19 @@ cli_create_node_getopts(int argc, char **argv,
log_fatal("Use either --monitor or --disable-monitor, not both.");
exit(EXIT_CODE_BAD_ARGS);
}
else if (IS_EMPTY_STRING_BUFFER(options->monitor_pguri) &&
!options->monitorDisabled)
else if (!options->monitorDisabled)
{
log_fatal("Failed to set the monitor URI: "
"use either --monitor postgresql://... or --disable-monitor");
exit(EXIT_CODE_BAD_ARGS);
if (IS_EMPTY_STRING_BUFFER(options->monitor_pguri) &&
!(env_exists(PG_AUTOCTL_MONITOR) &&
get_env_copy(PG_AUTOCTL_MONITOR,
options->monitor_pguri,
sizeof(options->monitor_pguri))))
{
log_fatal("Failed to set the monitor URI: "
"use either --monitor postgresql://... "
"or --disable-monitor");
exit(EXIT_CODE_BAD_ARGS);
}
}
else if (options->monitorDisabled)
{
Expand Down
27 changes: 22 additions & 5 deletions src/bin/pg_autoctl/cli_do_demoapp.c
Original file line number Diff line number Diff line change
Expand Up @@ -329,16 +329,33 @@ cli_do_demoapp_getopts(int argc, char **argv)

if (IS_EMPTY_STRING_BUFFER(options.monitor_pguri))
{
log_fatal("Please provide --monitor");
errors++;
if (env_exists(PG_AUTOCTL_MONITOR) &&
get_env_copy(PG_AUTOCTL_MONITOR,
options.monitor_pguri,
sizeof(options.monitor_pguri)))
{
log_debug("Using environment PG_AUTOCTL_MONITOR \"%s\"",
options.monitor_pguri);
}
else
{
log_fatal("Please provide --monitor");
errors++;
}
}

if (IS_EMPTY_STRING_BUFFER(options.username))
{
PostgresSetup pgSetup = { 0 };
char *username = pg_setup_get_username(&pgSetup);
if (!get_env_copy_with_fallback("PGUSER",
options.username,
NAMEDATALEN,
""))
{
PostgresSetup pgSetup = { 0 };
char *username = pg_setup_get_username(&pgSetup);

strlcpy(options.username, username, sizeof(options.username));
strlcpy(options.username, username, sizeof(options.username));
}
}

/* set our Postgres username as the PGUSER environment variable now */
Expand Down
30 changes: 30 additions & 0 deletions src/bin/pg_autoctl/cli_do_show.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ static void cli_show_cidr(int argc, char **argv);
static void cli_show_lookup(int argc, char **argv);
static void cli_show_hostname(int argc, char **argv);
static void cli_show_reverse(int argc, char **argv);
static void cli_show_version(int arg, char **argv);

static CommandLine do_show_ipaddr_command =
make_command("ipaddr",
Expand Down Expand Up @@ -65,12 +66,18 @@ static CommandLine do_show_reverse_command =
"Lookup given hostname and check reverse DNS setup", "", "",
NULL, cli_show_reverse);

static CommandLine do_show_version_command =
make_command("version",
"Run pg_autoctl version --json and parses the output", "", "",
NULL, cli_show_version);

CommandLine *do_show_subcommands[] = {
&do_show_ipaddr_command,
&do_show_cidr_command,
&do_show_lookup_command,
&do_show_hostname_command,
&do_show_reverse_command,
&do_show_version_command,
NULL
};

Expand Down Expand Up @@ -339,3 +346,26 @@ cli_show_reverse(int argc, char **argv)
hostname,
ipaddr);
}


/*
* cli_show_version runs pg_autoctl version --json and parses the version
* string.
*/
static void
cli_show_version(int arg, char **argv)
{
Keeper keeper = { 0 };
KeeperVersion version = { 0 };

log_debug("cli_show_version");

if (!keeper_pg_autoctl_get_version_from_disk(&keeper, &version))
{
/* errors have already been logged */
exit(EXIT_CODE_INTERNAL_ERROR);
}

log_info("pg_autoctl \"%s\"", version.pg_autoctl_version);
log_info("pgautofailover \"%s\"", version.required_extension_version);
}
95 changes: 73 additions & 22 deletions src/bin/pg_autoctl/ipaddr.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include "ipaddr.h"
#include "log.h"
#include "pgsetup.h"
#include "pgsql.h"
#include "string_utils.h"

static unsigned int countSetBits(unsigned int n);
Expand All @@ -42,6 +43,10 @@ static bool fetchIPAddressFromInterfaceList(char *localIpAddress, int size);
static bool ipaddr_sockaddr_to_string(struct addrinfo *ai,
char *ipaddr, size_t size);
static bool ipaddr_getsockname(int sock, char *ipaddr, size_t size);
static bool GetAddrInfo(const char *restrict node,
const char *restrict service,
const struct addrinfo *restrict hints,
struct addrinfo **restrict res);


/*
Expand Down Expand Up @@ -74,14 +79,12 @@ fetchLocalIPAddress(char *localIpAddress, int size,
hints.ai_socktype = SOCK_STREAM; /* we only want TCP sockets */
hints.ai_protocol = IPPROTO_TCP; /* we only want TCP sockets */

int error = getaddrinfo(serviceName,
intToString(servicePort).strValue,
&hints,
&lookup);
if (error != 0)
if (!GetAddrInfo(serviceName,
intToString(servicePort).strValue,
&hints,
&lookup))
{
log_warn("Failed to parse host name or IP address \"%s\": %s",
serviceName, gai_strerror(error));
/* errors have already been logged */
return false;
}

Expand Down Expand Up @@ -473,11 +476,9 @@ findHostnameLocalAddress(const char *hostname, char *localIpAddress, int size)
struct addrinfo *dns_addr;
struct ifaddrs *ifaddrList, *ifaddr;

int error = getaddrinfo(hostname, NULL, 0, &dns_lookup_addr);
if (error != 0)
if (!GetAddrInfo(hostname, NULL, 0, &dns_lookup_addr))
{
log_warn("Failed to resolve DNS name \"%s\": %s",
hostname, gai_strerror(error));
/* errors have already been logged */
return false;
}

Expand Down Expand Up @@ -628,19 +629,17 @@ findHostnameFromLocalIpAddress(char *localIpAddress, char *hostname, int size)
struct addrinfo *lookup, *ai;

/* parse ipv4 or ipv6 address using getaddrinfo() */
int ret = getaddrinfo(localIpAddress, NULL, 0, &lookup);
if (ret != 0)
if (!GetAddrInfo(localIpAddress, NULL, 0, &lookup))
{
log_warn("Failed to resolve DNS name \"%s\": %s",
localIpAddress, gai_strerror(ret));
/* errors have already been logged */
return false;
}

/* now reverse lookup (NI_NAMEREQD) the address with getnameinfo() */
for (ai = lookup; ai; ai = ai->ai_next)
{
ret = getnameinfo(ai->ai_addr, ai->ai_addrlen,
hbuf, sizeof(hbuf), NULL, 0, NI_NAMEREQD);
int ret = getnameinfo(ai->ai_addr, ai->ai_addrlen,
hbuf, sizeof(hbuf), NULL, 0, NI_NAMEREQD);

if (ret != 0)
{
Expand Down Expand Up @@ -686,12 +685,9 @@ resolveHostnameForwardAndReverse(const char *hostname, char *ipaddr, int size,

*foundHostnameFromAddress = false;

int error = getaddrinfo(hostname, NULL, 0, &lookup);

if (error != 0)
if (!GetAddrInfo(hostname, NULL, 0, &lookup))
{
log_warn("Failed to resolve DNS name \"%s\": %s",
hostname, gai_strerror(error));
/* errors have already been logged */
return false;
}

Expand Down Expand Up @@ -867,3 +863,58 @@ ipaddrGetLocalHostname(char *hostname, size_t size)

return true;
}


/*
* GetAddrInfo calls getaddrinfo and implement a retry policy in case we get a
* transient failure from the system. And for kubernetes compatibility, we also
* retry when the plain EAI_FAIL error code is returned, because DNS entries in
* this environments are dynamic.
*/
static bool
GetAddrInfo(const char *restrict node,
const char *restrict service,
const struct addrinfo *restrict hints,
struct addrinfo **restrict res)
{
bool success = false;
ConnectionRetryPolicy retryPolicy = { 0 };

(void) pgsql_set_interactive_retry_policy(&retryPolicy);

while (!pgsql_retry_policy_expired(&retryPolicy))
{
int error = getaddrinfo(node, service, hints, res);

/*
* Given docker/kubernetes environments, we treat permanent DNS
* failures (EAI_FAIL) as a retryable condition, same as EAI_AGAIN.
*/
if (error != 0 && error != EAI_AGAIN && error != EAI_FAIL)
{
log_warn("Failed to resolve DNS name \"%s\": %s",
node, gai_strerror(error));
return false;
}
else if (error != 0)
{
log_debug("Failed to resolve DNS name \"%s\": %s",
node, gai_strerror(error));
}

success = (error == 0);

if (success)
{
break;
}

int sleepTimeMs =
pgsql_compute_connection_retry_sleep_time(&retryPolicy);

/* we have milliseconds, pg_usleep() wants microseconds */
(void) pg_usleep(sleepTimeMs * 1000);
}

return success;
}

0 comments on commit 9284de8

Please sign in to comment.