Skip to content

Commit

Permalink
Add health check to agent and ensure configfs is mounted (#123)
Browse files Browse the repository at this point in the history
* Add health check to agent

* lint

* Ensure configfs is mounted in the agent container
  • Loading branch information
gorbak25 committed Jul 14, 2023
1 parent ffa329c commit 6dc0c21
Show file tree
Hide file tree
Showing 8 changed files with 44 additions and 14 deletions.
16 changes: 16 additions & 0 deletions entrypoints/agent.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import fs from "fs/promises";
import { join } from "path";

// eslint-disable-next-line @typescript-eslint/no-restricted-imports
Expand Down Expand Up @@ -61,7 +62,22 @@ async function run() {
// eslint-disable-next-line no-console
console.log("Starting worker...");

// Heartbeat for the container healthcheck: every 2 seconds rewrite
// /run/.hocus-agent-healthcheck so that the file's mtime keeps advancing.
// NOTE: the identifier (including its spelling) is referenced by the
// clearInterval() call in the shutdown chain, so it is kept unchanged.
const healtcheckInterval = setInterval(async () => {
  try {
    // fs.writeFile opens the path with the default "w" flag, writes the payload and
    // closes the handle itself. Keeping the whole operation inside the try block means
    // a failure while closing is reported below instead of escaping this async timer
    // callback as an unhandled promise rejection.
    await fs.writeFile("/run/.hocus-agent-healthcheck", "A");
  } catch (err) {
    // Best effort: report the failure and let the next tick try again.
    if (telemetryService) telemetryService.captureException(err);
    // eslint-disable-next-line no-console
    console.error(err);
  }
}, 2000);

await Promise.race([worker.run(), overlaybdProcessPromise])
.finally(() => clearInterval(healtcheckInterval))
.finally(worker.shutdown.bind(worker))
// https://hocus.sentry.io/issues/4308331837/events/dc3b651528aa4155a57a24da65b9d161/?project=4505510149095424&referrer=issue-list
.finally(brService.hideEverything.bind(brService))
Expand Down
5 changes: 0 additions & 5 deletions ops/bin/dev/setup-agent-dependencies.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,4 @@ fi
export SCRIPT_DIR="$(dirname "$0")"
export REPO_DIR="$(realpath "${SCRIPT_DIR}/../../..")"

# Check if configfs is mounted (for some reason, hocus workspaces don't automount this right now)
if ! mountpoint -q /sys/kernel/config/; then
mount -t configfs none /sys/kernel/config
fi

"$REPO_DIR"/ops/bin/setup-tcmu.sh
5 changes: 2 additions & 3 deletions ops/bin/local-up.sh
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ build_service () {
DT=$(printf %.2f\\n "$(( $T1 - $T0 ))e-3")
echo -e "\r\033[KBuilding $2 failed in $DT s ❌\n"

echo -e "$BUILD_OUTPUT" | grep --color -E '^|ERROR:.*'
echo -e "$BUILD_OUTPUT" | grep --color=always -E '^|ERROR:.*'
echo "We were unable to build Hocus 😭"
echo "Above you will find the docker build logs with the errors highlighted"
fatal_error
Expand Down Expand Up @@ -195,9 +195,8 @@ start_service () {
T1=$(date +%s%N | cut -b1-13)
DT=$(printf %.2f\\n "$(( $T1 - $T0 ))e-3")
echo -e "\r\033[KStarting $2 - ❌ in $DT\n"

echo -e "$DOCKER_UP_LOGS" | grep -v "variable is not set" | grep --color -E '^|Bind for.*failed'
$REPO_DIR/ops/bin/local-cmd.sh logs $1 2> /dev/null
echo -e "$DOCKER_UP_LOGS" | grep -v "variable is not set" | grep --color=always -i -E '^|Bind for.*failed|unhealthy'
echo -e "\nAbove you will find the logs"
fatal_error
else
Expand Down
10 changes: 10 additions & 0 deletions ops/bin/setup-tcmu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,16 @@ if [ "$(id -u)" != "0" ]; then
exit 1
fi

# Make sure configfs is mounted at /sys/kernel/config: try to mount it when it is
# missing, and abort when it is still not a mountpoint afterwards.
if ! mountpoint -q /sys/kernel/config/; then
  mount -t configfs none /sys/kernel/config

  # Re-check, since the mount attempt above may not have produced a mountpoint.
  if ! mountpoint -q /sys/kernel/config/; then
    echo "Configfs not available"
    exit 1
  fi
fi

# First check whether we need to load target_core_user
if ! [ -d /sys/kernel/config/target ] ; then
if [ -f /proc/modules ] ; then
Expand Down
1 change: 0 additions & 1 deletion ops/docker/agent-in-hocus.docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ services:
- "${HOCUS_RESOURCES_DIR}:/srv/jailer"
- /dev/kvm:/dev/kvm
- /dev:/dev/hocus
- /sys/kernel/config:/sys/kernel/config
environment:
- "HOCUS_DEV_GIT_NAME=${HOCUS_DEV_GIT_NAME}"
- "HOCUS_DEV_GIT_EMAIL=${HOCUS_DEV_GIT_EMAIL}"
Expand Down
1 change: 0 additions & 1 deletion ops/docker/agent-tests.docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,4 @@ services:
- /dev/kvm:/dev/kvm
- /dev:/dev/hocus
- /lib/modules:/lib/modules:ro
- /sys/kernel/config:/sys/kernel/config
command: sleep infinity
2 changes: 1 addition & 1 deletion ops/docker/deploy-smoke-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ services:
hocus-ui:
condition: service_healthy
hocus-agent:
condition: service_started
condition: service_healthy
# Place the DB on a ramdisk
db:
command: postgres -c 'max_connections=10000' -c 'shared_buffers=2GB' -c 'effective_cache_size=6GB' -c 'maintenance_work_mem=512MB' -c 'random_page_cost=1.1' -c 'effective_io_concurrency=200' -c 'work_mem=4MB' -c 'min_wal_size=2GB' -c 'max_wal_size=8GB' -c 'max_worker_processes=8' -c 'max_parallel_workers_per_gather=4' -c 'max_parallel_workers=8' -c 'max_parallel_maintenance_workers=4' -c 'fsync=off' -c 'full_page_writes=off' -c 'synchronous_commit=off' -c 'wal_level=minimal' -c 'max_wal_senders=0' -c 'log_error_verbosity=terse'
Expand Down
18 changes: 15 additions & 3 deletions ops/docker/hocus-local.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ services:
command: "start"
build:
dockerfile: keycloak-dev.Dockerfile
restart: unless-stopped
networks:
- hocus_net
environment:
Expand All @@ -72,7 +73,6 @@ services:
- KC_DB_PASSWORD=${POSTGRES_PASSWORD}
ports:
- "${NO_EXPOSE_PORTS-4200:}8080"
restart: unless-stopped
healthcheck:
test: ["CMD-SHELL", "curl --fail http://localhost:8080/health"]
interval: 10s
Expand All @@ -92,6 +92,7 @@ services:
- POSTGRES_DB=temporal
#- DYNAMIC_CONFIG_FILE_PATH=config/dynamicconfig/development_es.yaml
image: temporalio/auto-setup:1.20.0
restart: unless-stopped
networks:
- hocus_net
healthcheck:
Expand Down Expand Up @@ -150,6 +151,7 @@ services:
build:
dockerfile: ops/docker/hocus-ui.Dockerfile
context: ${REPO_DIR}
restart: unless-stopped
environment:
- LOG_LEVEL=info
- OIDC_ISSUER_BASE_URL=http://keycloak:8080/realms/hocus
Expand Down Expand Up @@ -190,6 +192,7 @@ services:
build:
dockerfile: ops/docker/hocus-agent.Dockerfile
context: ${REPO_DIR}
restart: unless-stopped
environment:
- LOG_LEVEL=info
- AGENT_TEMPORAL_ADDRESS=temporal:7233
Expand All @@ -207,14 +210,23 @@ services:
- CONTROL_PLANE_AGENT_HOSTNAME=${HOCUS_HOSTNAME}
networks:
- hocus_net
restart: "no"
privileged: true
ports:
- "${NO_EXPOSE_PORTS-8822:}22"
volumes:
- agent_data:/srv/jailer:rw
- /dev/kvm:/dev/kvm
- /dev:/dev/hocus
- /sys/kernel/config:/sys/kernel/config
# used by setup-tcmu.sh
- /lib/modules:/lib/modules:ro
healthcheck:
# Use the mtime of the file in /run
# /tmp is not guaranteed to be a tmpfs; atime is not used because noatime may be enabled
test:
[
"CMD-SHELL",
"if [ `stat -c %Y /run/.hocus-agent-healthcheck` -ge `date +%s -d'3 seconds ago'` ]; then exit 0; else exit 1; fi",
]
interval: 1s
timeout: 1s
retries: 7

0 comments on commit 6dc0c21

Please sign in to comment.