From 6dc0c21d951137dcdc492a4d65b94dd6dd11df5c Mon Sep 17 00:00:00 2001 From: Grzegorz Uriasz Date: Fri, 14 Jul 2023 10:21:49 -0700 Subject: [PATCH] Add health check to agent and ensure configfs is mounted (#123) * Add health check to agent * lint * Ensure configfs is mounted in the agent container --- entrypoints/agent.ts | 16 ++++++++++++++++ ops/bin/dev/setup-agent-dependencies.sh | 5 ----- ops/bin/local-up.sh | 5 ++--- ops/bin/setup-tcmu.sh | 10 ++++++++++ ops/docker/agent-in-hocus.docker-compose.yml | 1 - ops/docker/agent-tests.docker-compose.yml | 1 - ops/docker/deploy-smoke-test.yml | 2 +- ops/docker/hocus-local.yml | 18 +++++++++++++++--- 8 files changed, 44 insertions(+), 14 deletions(-) diff --git a/entrypoints/agent.ts b/entrypoints/agent.ts index 5895f5c6..3d0d3514 100644 --- a/entrypoints/agent.ts +++ b/entrypoints/agent.ts @@ -1,3 +1,4 @@ +import fs from "fs/promises"; import { join } from "path"; // eslint-disable-next-line @typescript-eslint/no-restricted-imports @@ -61,7 +62,22 @@ async function run() { // eslint-disable-next-line no-console console.log("Starting worker..."); + const healtcheckInterval = setInterval(async () => { + let file: fs.FileHandle | null = null; + try { + file = await fs.open("/run/.hocus-agent-healthcheck", "w"); + await file.write("A"); + } catch (err) { + if (telemetryService) telemetryService.captureException(err); + // eslint-disable-next-line no-console + console.error(err); + } finally { + if (file !== null) await file.close(); + } + }, 2000); + await Promise.race([worker.run(), overlaybdProcessPromise]) + .finally(() => clearInterval(healtcheckInterval)) .finally(worker.shutdown.bind(worker)) // https://hocus.sentry.io/issues/4308331837/events/dc3b651528aa4155a57a24da65b9d161/?project=4505510149095424&referrer=issue-list .finally(brService.hideEverything.bind(brService)) diff --git a/ops/bin/dev/setup-agent-dependencies.sh b/ops/bin/dev/setup-agent-dependencies.sh index 101c6eec..add6ff65 100755 --- a/ops/bin/dev/setup-agent-dependencies.sh +++ b/ops/bin/dev/setup-agent-dependencies.sh @@ -11,9 +11,4 @@ fi export SCRIPT_DIR="$(dirname "$0")" export REPO_DIR="$(realpath "${SCRIPT_DIR}/../../..")" -# Check if configfs is mounted(for some reason right now hocus workspaces don't automount this) -if ! mountpoint -q /sys/kernel/config/; then - mount -t configfs none /sys/kernel/config -fi - "$REPO_DIR"/ops/bin/setup-tcmu.sh diff --git a/ops/bin/local-up.sh b/ops/bin/local-up.sh index fa4785f8..872d75cd 100755 --- a/ops/bin/local-up.sh +++ b/ops/bin/local-up.sh @@ -136,7 +136,7 @@ build_service () { DT=$(printf %.2f\\n "$(( $T1 - $T0 ))e-3") echo -e "\r\033[KBuilding $2 failed in $DT s ❌\n" - echo -e "$BUILD_OUTPUT" | grep --color -E '^|ERROR:.*' + echo -e "$BUILD_OUTPUT" | grep --color=always -E '^|ERROR:.*' echo "We were unable to build Hocus 😭" echo "Above you will find the docker build logs with the errors highlighted" fatal_error @@ -195,9 +195,8 @@ start_service () { T1=$(date +%s%N | cut -b1-13) DT=$(printf %.2f\\n "$(( $T1 - $T0 ))e-3") echo -e "\r\033[KStarting $2 - ❌ in $DT\n" - - echo -e "$DOCKER_UP_LOGS" | grep -v "variable is not set" | grep --color -E '^|Bind for.*failed' $REPO_DIR/ops/bin/local-cmd.sh logs $1 2> /dev/null + echo -e "$DOCKER_UP_LOGS" | grep -v "variable is not set" | grep --color=always -i -E '^|Bind for.*failed|unhealthy' echo -e "\nAbove you will find the logs" fatal_error else diff --git a/ops/bin/setup-tcmu.sh b/ops/bin/setup-tcmu.sh index 211e6688..06513fcf 100755 --- a/ops/bin/setup-tcmu.sh +++ b/ops/bin/setup-tcmu.sh @@ -10,6 +10,16 @@ if [ "$(id -u)" != "0" ]; then exit 1 fi +# Check if configfs is mounted +if ! mountpoint -q /sys/kernel/config/; then + mount -t configfs none /sys/kernel/config +fi + +if ! mountpoint -q /sys/kernel/config/; then + echo "Configfs not available"; + exit 1 +fi + # First check whether we need to load target_core_user if ! [ -d /sys/kernel/config/target ] ; then if [ -f /proc/modules ] ; then diff --git a/ops/docker/agent-in-hocus.docker-compose.yml b/ops/docker/agent-in-hocus.docker-compose.yml index 331624f1..ab94dec5 100644 --- a/ops/docker/agent-in-hocus.docker-compose.yml +++ b/ops/docker/agent-in-hocus.docker-compose.yml @@ -19,7 +19,6 @@ services: - "${HOCUS_RESOURCES_DIR}:/srv/jailer" - /dev/kvm:/dev/kvm - /dev:/dev/hocus - - /sys/kernel/config:/sys/kernel/config environment: - "HOCUS_DEV_GIT_NAME=${HOCUS_DEV_GIT_NAME}" - "HOCUS_DEV_GIT_EMAIL=${HOCUS_DEV_GIT_EMAIL}" diff --git a/ops/docker/agent-tests.docker-compose.yml b/ops/docker/agent-tests.docker-compose.yml index 05241885..c95417eb 100644 --- a/ops/docker/agent-tests.docker-compose.yml +++ b/ops/docker/agent-tests.docker-compose.yml @@ -43,5 +43,4 @@ services: - /dev/kvm:/dev/kvm - /dev:/dev/hocus - /lib/modules:/lib/modules:ro - - /sys/kernel/config:/sys/kernel/config command: sleep infinity diff --git a/ops/docker/deploy-smoke-test.yml b/ops/docker/deploy-smoke-test.yml index 3de9fe82..adfac809 100644 --- a/ops/docker/deploy-smoke-test.yml +++ b/ops/docker/deploy-smoke-test.yml @@ -12,7 +12,7 @@ services: hocus-ui: condition: service_healthy hocus-agent: - condition: service_started + condition: service_healthy # Place the DB on a ramdisk db: command: postgres -c 'max_connections=10000' -c 'shared_buffers=2GB' -c 'effective_cache_size=6GB' -c 'maintenance_work_mem=512MB' -c 'random_page_cost=1.1' -c 'effective_io_concurrency=200' -c 'work_mem=4MB' -c 'min_wal_size=2GB' -c 'max_wal_size=8GB' -c 'max_worker_processes=8' -c 'max_parallel_workers_per_gather=4' -c 'max_parallel_workers=8' -c 'max_parallel_maintenance_workers=4' -c 'fsync=off' -c 'full_page_writes=off' -c 'synchronous_commit=off' -c 'wal_level=minimal' -c 'max_wal_senders=0' -c 'log_error_verbosity=terse' diff --git a/ops/docker/hocus-local.yml b/ops/docker/hocus-local.yml index 40f04054..01f43ee3 100644 --- a/ops/docker/hocus-local.yml +++ b/ops/docker/hocus-local.yml @@ -63,6 +63,7 @@ services: command: "start" build: dockerfile: keycloak-dev.Dockerfile + restart: unless-stopped networks: - hocus_net environment: @@ -72,7 +73,6 @@ services: - KC_DB_PASSWORD=${POSTGRES_PASSWORD} ports: - "${NO_EXPOSE_PORTS-4200:}8080" - restart: unless-stopped healthcheck: test: ["CMD-SHELL", "curl --fail http://localhost:8080/health"] interval: 10s @@ -92,6 +92,7 @@ services: - POSTGRES_DB=temporal #- DYNAMIC_CONFIG_FILE_PATH=config/dynamicconfig/development_es.yaml image: temporalio/auto-setup:1.20.0 + restart: unless-stopped networks: - hocus_net healthcheck: @@ -150,6 +151,7 @@ services: build: dockerfile: ops/docker/hocus-ui.Dockerfile context: ${REPO_DIR} + restart: unless-stopped environment: - LOG_LEVEL=info - OIDC_ISSUER_BASE_URL=http://keycloak:8080/realms/hocus @@ -190,6 +192,7 @@ services: build: dockerfile: ops/docker/hocus-agent.Dockerfile context: ${REPO_DIR} + restart: unless-stopped environment: - LOG_LEVEL=info - AGENT_TEMPORAL_ADDRESS=temporal:7233 @@ -207,7 +210,6 @@ services: - CONTROL_PLANE_AGENT_HOSTNAME=${HOCUS_HOSTNAME} networks: - hocus_net - restart: "no" privileged: true ports: - "${NO_EXPOSE_PORTS-8822:}22" @@ -215,6 +217,16 @@ services: - agent_data:/srv/jailer:rw - /dev/kvm:/dev/kvm - /dev:/dev/hocus - - /sys/kernel/config:/sys/kernel/config # used by setup-tcmu.sh - /lib/modules:/lib/modules:ro + healthcheck: + # Use the mtime of the file in /run + # /tmp is not guaranteed to be a tmpfs, atime is not used due to the possibility of noatime being enabled + test: + [ + "CMD-SHELL", + "if [ `stat -c %Y /run/.hocus-agent-healthcheck` -ge `date +%s -d'3 seconds ago'` ]; then exit 0; else exit 1; fi", + ] + interval: 1s + timeout: 1s + retries: 7