Skip to content

Commit

Permalink
workbench: WIP: add a backend agnostic healthcheck service
Browse files Browse the repository at this point in the history
  • Loading branch information
fmaste committed May 29, 2023
1 parent d625a47 commit 2c0652b
Show file tree
Hide file tree
Showing 8 changed files with 754 additions and 20 deletions.
38 changes: 36 additions & 2 deletions nix/workbench/backend/nomad-job.nix
Expand Up @@ -370,9 +370,11 @@ let
# - host: Advertise the host port for this service. port must match
# a port label specified in the network block.
port = portName;
# TODO: Use it to heartbeat with cardano-ping!!!
# Checks of type "script" need "consul" instead of "nomad" as the
# service provider. As a healthcheck we are using a supervisord
# "program" instead.
# https://developer.hashicorp.com/nomad/docs/job-specification/check
# check = {};
check = null;
};

# Specifies the set of templates to render for the task. Templates can
Expand Down Expand Up @@ -405,6 +407,24 @@ let
change_mode = "noop";
error_on_missing_key = true;
}
## Make the profile.json file available (mainly for healthchecks)
{
env = false;
destination = "${task_statedir}/profile.json";
data = escapeTemplate (__readFile
profileData.JSON.outPath);
change_mode = "noop";
error_on_missing_key = true;
}
## Make the node-specs.json file available (mainly for healthchecks)
{
env = false;
destination = "${task_statedir}/node-specs.json";
data = escapeTemplate (__readFile
profileData.node-specs.JSON.outPath);
change_mode = "noop";
error_on_missing_key = true;
}
# entrypoint
{
env = false;
Expand Down Expand Up @@ -587,6 +607,20 @@ let
error_on_missing_key = true;
}
])
++
# healthcheck
[
## healthcheck start.sh script.
## Rendered into the Task's state dir; "change_mode = noop" means the
## Task is not restarted if the template is ever re-rendered.
{
env = false;
destination = "${task_statedir}/healthcheck/start.sh";
data = escapeTemplate
profileData.healthcheck-service.startupScript.value;
change_mode = "noop";
error_on_missing_key = true;
perms = "744"; # Executable by owner, as for every "start.sh" script. Default: "644"
}
]
;

# Specifies logging configuration for the stdout and stderr of the
Expand Down
23 changes: 22 additions & 1 deletion nix/workbench/backend/nomad.nix
Expand Up @@ -69,7 +69,22 @@ let
flake-output = "legacyPackages.x86_64-linux.python3Packages.supervisor";
installable = "${flake-reference}/${gitrev}#${flake-output}";
};
# TODO: profileData.node-services."node-0".serviceConfig.value.eventlog
# Extra tools, presumably required by the new healthcheck service's
# "start.sh" script — TODO confirm against the script.
gnugrep = rec {
nix-store-path = pkgs.gnugrep;
# Same pattern as "supervisor" above: resolve through the cardano-node
# flake pinned at `gitrev`, via its "legacyPackages" output.
flake-reference = "github:input-output-hk/cardano-node";
flake-output = "legacyPackages.x86_64-linux.gnugrep";
installable = "${flake-reference}/${gitrev}#${flake-output}";
};
jq = rec {
nix-store-path = pkgs.jq;
flake-reference = "github:input-output-hk/cardano-node";
flake-output = "legacyPackages.x86_64-linux.jq";
installable = "${flake-reference}/${gitrev}#${flake-output}";
};
# TODO: - cardano-node.passthru.profiled
# - cardano-node.passthru.eventlogged
# - cardano-node.passthru.asserted
# profileData.node-services."node-0".serviceConfig.value.eventlog
# builtins.trace (builtins.attrNames profileData.node-services."node-0".serviceConfig.value.eventlog) XXXX
cardano-node = rec {
nix-store-path = with pkgs;
Expand All @@ -85,6 +100,12 @@ let
;
installable = "${flake-reference}/${gitrev}#${flake-output}";
};
cardano-cli = rec {
nix-store-path = pkgs.cardanoNodePackages.cardano-cli;
# Resolve through the cardano-node flake like its siblings
# ("cardano-node", "cardano-tracer"): `installable` below pins `gitrev`,
# which is a cardano-node commit, so referencing the standalone
# "cardano-cli" repository here could not resolve that revision.
flake-reference = "github:input-output-hk/cardano-node";
flake-output = "cardanoNodePackages.cardano-cli";
installable = "${flake-reference}/${gitrev}#${flake-output}";
};
cardano-tracer = rec {
nix-store-path = pkgs.cardanoNodePackages.cardano-tracer;
flake-reference = "github:input-output-hk/cardano-node";
Expand Down
138 changes: 130 additions & 8 deletions nix/workbench/backend/nomad.sh
Expand Up @@ -69,11 +69,12 @@ backend_nomad() {
* ) break;; esac; shift; done

# Create the dispatcher's local directories hierarchy.
backend_nomad allocate-run-directory-nomad "${dir}"
backend_nomad allocate-run-directory-supervisor "${dir}"
backend_nomad allocate-run-directory-nodes "${dir}"
backend_nomad allocate-run-directory-generator "${dir}"
backend_nomad allocate-run-directory-tracers "${dir}"
backend_nomad allocate-run-directory-nomad "${dir}"
backend_nomad allocate-run-directory-supervisor "${dir}"
backend_nomad allocate-run-directory-nodes "${dir}"
backend_nomad allocate-run-directory-generator "${dir}"
backend_nomad allocate-run-directory-tracers "${dir}"
backend_nomad allocate-run-directory-healthcheck "${dir}"

# These ones are decided at "setenv-defaults" of each sub-backend.
local nomad_environment=$(envjqr 'nomad_environment')
Expand Down Expand Up @@ -187,12 +188,28 @@ backend_nomad() {
# FIXME: Looks like I'm not using these ones!!!
#cp $(jq '."tracer-config"' -r ${dir}/profile/tracer-service.json) "${dir}"/tracer/tracer-config.json
#cp $(jq '."service-config"' -r ${dir}/profile/tracer-service.json) "${dir}"/tracer/service-config.json
cp $(jq '."config"' -r ${dir}/profile/tracer-service.json) "${dir}"/tracer/config.json
cp $(jq '."start"' -r ${dir}/profile/tracer-service.json) "${dir}"/tracer/start.sh
cp $(jq '."config"' -r ${dir}/profile/tracer-service.json) "${dir}"/tracer/config.json
cp $(jq '."start"' -r ${dir}/profile/tracer-service.json) "${dir}"/tracer/start.sh
fi
fi
;;

allocate-run-directory-healthcheck )
  local usage="USAGE: wb backend $op RUN-DIR"
  local dir=${1:?$usage}; shift
  # One top-level dir plus one subdirectory per node in "node-specs.json".
  mkdir "${dir}"/healthcheck
  local node
  for node in $(jq_tolist keys "${dir}"/node-specs.json)
  do
    # The "start.sh" script that usually goes in here is copied from the
    # Task/container once it's started, because its contents are created
    # or patched using Nomad's "template" stanza in the job spec and we
    # want to hold a copy of what was actually run.
    mkdir "${dir}"/healthcheck/"${node}"
  done
;;

# Change the Nomad job name to the current run tag. This allows running
# multiple clusters simultaneously (as long as the network isolation mode
# and/or topology.json allows no port clashing)
Expand Down Expand Up @@ -447,7 +464,14 @@ backend_nomad() {
backend_nomad download-config-tracer "${dir}" "tracer" &
jobs_array+=("$!")
fi

# For every node ...
local nodes=($(jq_tolist keys "$dir"/node-specs.json))
for node in ${nodes[*]}
do
# Only used for debugging!
backend_nomad download-config-healthcheck "${dir}" "${node}" &
jobs_array+=("$!")
done
# Wait and check!
if test -n "${jobs_array}"
then
Expand Down Expand Up @@ -803,6 +827,19 @@ backend_nomad() {
# TODO: Make it in parallel ?
msg "Fetch logs ..."

# Download healthcheck(s) logs.
###############################
# Remove "live" symlinks before downloading the "originals"
if test "${nomad_environment}" != "cloud"
then
rm -f "${dir}"/healthcheck/{stdout,stderr,exit_code}
fi
# Download retry "infinite" loop.
while ! backend_nomad download-logs-healthcheck "${dir}" "node-0"
do
msg "Retrying \"healthcheck\" logs download"
done
msg "$(green "Finished downloading \"healthcheck\" logs")"
# Download generator logs.
##########################
# Remove "live" symlinks before downloading the "originals"
Expand Down Expand Up @@ -1198,6 +1235,12 @@ backend_nomad() {
# It was "intentionally started and should not automagically stop" flag!
touch "${dir}"/generator/started
fi

# TODO ########################################
# It still needs to be decided who/what starts the healthchecks.
# My bet is on `scenario.sh`.
backend_nomad start-healthcheck "${dir}" node-0
###############################################
;;

# Called by "start" that has no exit trap, don't use fatal here!
Expand Down Expand Up @@ -1329,6 +1372,34 @@ backend_nomad() {
fi
;;

# Nomad backend specific subcommand, not part of the generic backend API.
# TODO: decide and document who/what calls this subcommand.
start-healthcheck )
  local usage="USAGE: wb backend $op RUN-DIR TASK"
  local dir=${1:?$usage}; shift
  local task=${1:?$usage}; shift

  if ! backend_nomad task-program-start "${dir}" "${task}" healthcheck
  then
    msg "$(yellow "healthcheck of \"${task}\" startup failed!")"
  else
    local nomad_environment=$(envjqr 'nomad_environment')
    # When not running in the cloud, keep "live" symlinks pointing at the
    # logs of the healthcheck program inside the Nomad allocation dir.
    if test "${nomad_environment}" != "cloud"
    then
      local logfile
      for logfile in stdout stderr exit_code
      do
        ln -s \
          ../../nomad/alloc/"${task}"/local/run/current/healthcheck/"${logfile}" \
          "${dir}"/healthcheck/"${task}"/"${logfile}"
      done
    fi
  fi
;;

# Called by "start-node" that has no exit trap, don't use fatal here!
wait-node )
local usage="USAGE: wb backend $op RUN-DIR [NODE-NAME]"
Expand Down Expand Up @@ -1708,6 +1779,34 @@ backend_nomad() {
echo "${array[@]}"
;;

# For debugging when something fails, downloads and prints details!
download-logs-healthcheck )
  local usage="USAGE: wb backend pass $op RUN-DIR TASK-NAME"
  local dir=${1:?$usage}; shift
  local task=${1:?$usage}; shift
  local ok="true"
  # Should show the output/log of `supervisord` (runs as "entrypoint").
  msg "$(blue Fetching) $(yellow "entrypoint's stdout and stderr") of Nomad $(yellow "Task \"${task}\"") ..."
  backend_nomad task-entrypoint-stdout "${dir}" "${task}" \
    > "${dir}"/nomad/"${task}"/stdout \
  || ok="false"
  backend_nomad task-entrypoint-stderr "${dir}" "${task}" \
    > "${dir}"/nomad/"${task}"/stderr \
  || ok="false"
  # Downloads "exit_code", "stdout", "stderr" and GHC files.
  # Depending on when the start command failed, logs may not be available!
  backend_nomad download-zstd-healthcheck "${dir}" "${task}" \
  || ok="false"
  # Success path first: everything above came down cleanly.
  if test "${ok}" = "true"
  then
    return 0
  fi
  msg "$(red "Failed to download \"healthcheck\" run files from \"${task}\"")"
  return 1
;;

# For debugging when something fails, downloads and prints details!
download-logs-generator )
local usage="USAGE: wb backend pass $op RUN-DIR TASK-NAME"
Expand Down Expand Up @@ -1861,6 +1960,20 @@ backend_nomad() {
fi
;;

download-zstd-healthcheck )
  local usage="USAGE: wb backend pass $op RUN-DIR TASK-NAME"
  local dir=${1:?$usage}; shift
  local task=${1:?$usage}; shift

  msg "$(blue Fetching) $(yellow "\"healthcheck\"") run files from Nomad $(yellow "Task \"${task}\"") ..."
  # TODO: Add compression, either "--zstd" or "--xz"
  backend_nomad task-exec-program-run-files-tar-zstd \
    "${dir}" "${task}" "healthcheck" \
  | tar --extract \
      --directory="${dir}"/healthcheck/ --file=- \
      --no-same-owner --no-same-permissions
  # A pipeline's exit status is only the last command's, so a failed remote
  # fetch feeding `tar` an empty/partial archive could still look successful
  # to callers (e.g. "download-logs-healthcheck" relies on this status).
  # Check both pipeline stages explicitly.
  local fetch_tar_statuses=("${PIPESTATUS[@]}")
  test "${fetch_tar_statuses[0]}" -eq 0 && test "${fetch_tar_statuses[1]}" -eq 0
;;

download-zstd-generator )
local usage="USAGE: wb backend pass $op RUN-DIR TASK-NAME"
local dir=${1:?$usage}; shift
Expand Down Expand Up @@ -2056,6 +2169,15 @@ backend_nomad() {
fi
;;

# Only used for debugging: fetches the healthcheck "start.sh" actually
# rendered inside the Task (Nomad's "template" stanza creates/patches it
# at deployment time, so the local copy may differ from what really ran).
download-config-healthcheck )
local usage="USAGE: wb backend pass $op RUN-DIR NODE-NAME"
local dir=${1:?$usage}; shift
local node=${1:?$usage}; shift
backend_nomad task-file-contents "${dir}" "${node}" \
/local/run/current/healthcheck/start.sh \
> "${dir}"/healthcheck/"${node}"/start.sh
;;

## Nomad job tasks supervisord queries
######################################

Expand Down
18 changes: 18 additions & 0 deletions nix/workbench/backend/supervisor-conf.nix
Expand Up @@ -136,6 +136,24 @@ let
startsecs = 5;
};
}
//
{
# Supervisord "program" running the healthcheck service. With
# "autostart = false" it must be started explicitly (the supervisor
# backend issues `supervisorctl start healthcheck` after supervisord
# comes up).
"program:healthcheck" = {
# "command" below assumes "directory" is set accordingly.
directory = "${stateDir}/healthcheck";
# NOTE(review): `command` is interpolated from an outer binding not
# visible here — confirm it resolves to the healthcheck "start.sh".
command = "${command}";
stdout_logfile = "${stateDir}/healthcheck/stdout";
stderr_logfile = "${stateDir}/healthcheck/stderr";
stopasgroup = false;
killasgroup = false;
autostart = false;
autorestart = false;
# Don't attempt any restart!
startretries = 0;
# Seconds it needs to stay running to consider the start successful
startsecs = 5;
};
}
;

in {
Expand Down
6 changes: 5 additions & 1 deletion nix/workbench/backend/supervisor.sh
Expand Up @@ -205,7 +205,11 @@ EOF
echo "$(white -------------------------------------------------)" >&2
fatal "could not start $(white supervisord)"
fi
backend_supervisor save-child-pids "$dir";;
backend_supervisor save-child-pids "$dir"
if ! supervisorctl start healthcheck
then
msg "$(red "supervisorctl start healthcheck failed")"
fi;;

wait-node-stopped )
local usage="USAGE: wb backend $op RUN-DIR NODE"
Expand Down
20 changes: 19 additions & 1 deletion nix/workbench/profile/profile.nix
Expand Up @@ -51,6 +51,15 @@ rec {
inherit runJq nodeSpecs;
})
tracer-service;

## Healthcheck service definition, built the same way as tracer-service
## above so the service stays agnostic of the backend that runs it.
inherit
(pkgs.callPackage
../service/healthcheck.nix
{
inherit backend profile;
inherit runJq nodeSpecs;
})
healthcheck-service;
};

## WARNING: IFD !!
Expand Down Expand Up @@ -96,7 +105,8 @@ rec {
})
node-services
generator-service
tracer-service;
tracer-service
healthcheck-service;
};

profileData = { profile }:
Expand Down Expand Up @@ -136,11 +146,18 @@ rec {
config = config.JSON;
start = startupScript.JSON;
};
healthcheckService =
with profile.healthcheck-service;
__toJSON
# Unlike "tracerService" above, only the startup script is published:
# no separate config file is exposed for the healthcheck here.
{ name = "healthcheck";
start = startupScript.JSON;
};
passAsFile =
[
"nodeServices"
"generatorService"
"tracerService"
"healthcheckService"
"topologyJson"
"topologyDot"
];
Expand All @@ -154,6 +171,7 @@ rec {
cp $nodeServicesPath $out/node-services.json
cp $generatorServicePath $out/generator-service.json
cp $tracerServicePath $out/tracer-service.json
cp $healthcheckServicePath $out/healthcheck-service.json
''
// profile;

Expand Down

0 comments on commit 2c0652b

Please sign in to comment.