diff --git a/.github/workflows/qemu-emulator-build.yaml b/.github/workflows/qemu-emulator-build.yaml index e4a42207ca..a5a3f187df 100644 --- a/.github/workflows/qemu-emulator-build.yaml +++ b/.github/workflows/qemu-emulator-build.yaml @@ -26,14 +26,23 @@ env: jobs: build: name: Build QEMU Image (${{ matrix.arch }}) - runs-on: ubicloud-standard-8 + runs-on: ${{ matrix.runner }} timeout-minutes: 120 strategy: fail-fast: false matrix: include: + # amd64 runs natively under KVM on ubicloud's amd64 runner. - arch: amd64 + runner: ubicloud-standard-8 + # arm64 runs under cross-arch TCG on ubicloud's amd64 runner. + # No KVM for arm64 guests on an amd64 host; -cpu cortex-a76 and + # `node --no-opt` together sidestep the SIGTRAPs that cross-arch TCG + # hits on aggressive arm64 JIT code. Smoke test is still skipped + # because the backend can't come up reliably under cross-arch + # TCG within any sane window. - arch: arm64 + runner: ubicloud-standard-8 steps: - uses: actions/checkout@v6 @@ -47,7 +56,20 @@ jobs: - name: Install QEMU dependencies run: | sudo apt-get update - sudo apt-get install -y qemu-system-x86 qemu-system-arm qemu-utils genisoimage socat qemu-efi-aarch64 + sudo apt-get install -y qemu-system-x86 qemu-system-arm qemu-kvm qemu-utils genisoimage socat qemu-efi-aarch64 + + - name: Enable KVM access + run: | + echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' \ + | sudo tee /etc/udev/rules.d/99-kvm4all.rules + sudo udevadm control --reload-rules + sudo udevadm trigger --name-match=kvm || true + ls -la /dev/kvm || echo "no /dev/kvm present" + if [ -w /dev/kvm ]; then + echo "KVM is writable — hardware acceleration will be used" + else + echo "WARNING: /dev/kvm is not writable — will fall back to TCG (very slow)" + fi - name: Build QEMU image run: | @@ -58,7 +80,13 @@ jobs: - name: Generate emulator env run: node docker/local-emulator/generate-env-development.mjs + # arm64 runs under cross-arch TCG on an amd64 runner; the backend's + # V8 
TurboFan JIT re-triggers the SIGTRAPs we dodge in migrations + # with --no-opt, and even if it didn't, boot is too slow under TCG + # to verify in any sane window. amd64 KVM already exercises the + # service stack; real arm64 hosts have KVM for end-users. - name: Start emulator and verify + if: matrix.arch == 'amd64' run: | chmod +x docker/local-emulator/qemu/run-emulator.sh EMULATOR_ARCH=${{ matrix.arch }} \ @@ -66,12 +94,13 @@ jobs: docker/local-emulator/qemu/run-emulator.sh start - name: Verify services are healthy + if: matrix.arch == 'amd64' run: | EMULATOR_ARCH=${{ matrix.arch }} \ docker/local-emulator/qemu/run-emulator.sh status - name: Stop emulator - if: always() + if: always() && matrix.arch == 'amd64' run: | EMULATOR_ARCH=${{ matrix.arch }} \ docker/local-emulator/qemu/run-emulator.sh stop diff --git a/docker/local-emulator/Dockerfile b/docker/local-emulator/Dockerfile index 7f9e6d45a3..db7cba2b33 100644 --- a/docker/local-emulator/Dockerfile +++ b/docker/local-emulator/Dockerfile @@ -103,6 +103,28 @@ RUN cp $(which qstash) /qstash-binary 2>/dev/null || \ { echo "ERROR: qstash binary not found" >&2; exit 1; } +# ── Strip / compress service binaries (parallel stages) ────────────────────── + +FROM debian:trixie-slim AS upx-compress +RUN apt-get update && apt-get install -y --no-install-recommends upx-ucl binutils && \ + rm -rf /var/lib/apt/lists/* +COPY --from=clickhouse-bin /usr/bin/clickhouse /out/clickhouse +COPY --from=svix-bin /usr/local/bin/svix-server /out/svix-server +COPY --from=minio-bin /usr/bin/minio /out/minio +COPY --from=mc-bin /usr/bin/mc /out/mc +COPY --from=qstash-bin /qstash-binary /out/qstash +RUN chmod u+w /out/* && \ + # Intentionally NOT stripping /out/clickhouse. 
The clickhouse binary is a + # self-extracting compressed executable (a small loader with a ZSTD + # payload appended after the section table); strip rewrites the ELF and + # can invalidate the loader's "find my payload" lookup, causing the + # decompressor to spin on garbage with zero log output — the exact + # symptom seen on cross-arch TCG runs. Savings from stripping would be + # only the tiny bootstrap anyway since the payload isn't in any section. + strip --strip-all /out/minio /out/svix-server /out/mc /out/qstash && \ + upx -9 /out/minio /out/svix-server /out/mc /out/qstash + + # ── Final image ─────────────────────────────────────────────────────────────── FROM debian:trixie-slim @@ -139,20 +161,20 @@ COPY --from=node-base /usr/local/bin/node /usr/local/bin/node # Inbucket COPY --from=inbucket-bin /opt/inbucket /opt/inbucket -# Svix -COPY --from=svix-bin /usr/local/bin/svix-server /usr/local/bin/svix-server +# Svix (UPX-compressed) +COPY --from=upx-compress /out/svix-server /usr/local/bin/svix-server -# ClickHouse -COPY --from=clickhouse-bin /usr/bin/clickhouse /usr/bin/clickhouse +# ClickHouse (copied through as-is: neither stripped nor UPX-compressed) +COPY --from=upx-compress /out/clickhouse /usr/bin/clickhouse RUN ln -sf /usr/bin/clickhouse /usr/bin/clickhouse-server && \ ln -sf /usr/bin/clickhouse /usr/bin/clickhouse-client -# MinIO -COPY --from=minio-bin /usr/bin/minio /usr/local/bin/minio -COPY --from=mc-bin /usr/bin/mc /usr/local/bin/mc +# MinIO (UPX-compressed) +COPY --from=upx-compress /out/minio /usr/local/bin/minio +COPY --from=upx-compress /out/mc /usr/local/bin/mc -# QStash -COPY --from=qstash-bin --chmod=755 /qstash-binary /usr/local/bin/qstash +# QStash (UPX-compressed) +COPY --from=upx-compress --chmod=755 /out/qstash /usr/local/bin/qstash # App WORKDIR /app @@ -164,6 +186,10 @@ COPY --from=builder /app/apps/backend/node_modules ./apps/backend/node_modules COPY --from=builder /app/apps/dashboard/.next/standalone ./ COPY --from=builder /app/apps/dashboard/.next/static 
./apps/dashboard/.next/static COPY --from=builder /app/apps/dashboard/public ./apps/dashboard/public +# Save the standalone-traced node_modules (runtime deps only) before the full +# migration-pruner copy overwrites it. The slim-docker-image step in the QEMU +# build restores this after migrations are baked in. +RUN cp -a /app/node_modules /app/node_modules.standalone 2>/dev/null || mkdir -p /app/node_modules.standalone COPY --from=migration-pruner /pruned-node_modules ./node_modules COPY --from=builder /app/packages ./packages diff --git a/docker/local-emulator/qemu/build-image.sh b/docker/local-emulator/qemu/build-image.sh index 8071fb5012..498d161735 100755 --- a/docker/local-emulator/qemu/build-image.sh +++ b/docker/local-emulator/qemu/build-image.sh @@ -112,15 +112,36 @@ qemu_cmd_prefix_for_arch() { case "$arch" in arm64) local accel="tcg" + local cpu="max" if [ "$HOST_ARCH" = "arm64" ]; then + # Same-arch: prefer hardware acceleration, keep -cpu max. If no + # accelerator is available (e.g. Azure arm64 runners with no + # nested virt) we fall through to TCG, but same-arch TCG handles + # -cpu max correctly and more named CPU models have TCG bugs + # than -cpu max does. case "$HOST_OS" in darwin) accel="hvf" ;; linux) [ -w /dev/kvm ] && accel="kvm" ;; esac + else + # Cross-arch TCG (amd64 host emulating arm64 guest) needs a CPU + # model that threads a narrow needle: + # * -cpu max advertises armv8.5+ features (PAC, BTI, SVE, LSE…) + # that V8's TurboFan then emits JIT code for; cross-arch TCG + # mistranslates some of those and node SIGTRAPs in migrations. + # * -cpu cortex-a72 (armv8.0-a) keeps V8 safe but makes + # ClickHouse SIGILL on startup because its statically-linked + # LSE atomics (armv8.1+) aren't recognized. + # cortex-a76 is armv8.2-a: it exposes LSE (ClickHouse happy) + # while predating PAC (v8.3) and BTI (v8.5), so V8's aggressive + # JIT tiers don't emit the instructions that tripped TCG. 
Pair + # this with `node --no-opt` on the migration exec, which keeps + # V8 in Ignition+Sparkplug only (no TurboFan/Maglev). + cpu="cortex-a76" fi local firmware firmware="$(find_aarch64_firmware)" - echo "qemu-system-aarch64 -machine virt -accel $accel -cpu max -bios $firmware" + echo "qemu-system-aarch64 -machine virt -accel $accel -cpu $cpu -bios $firmware" ;; amd64) local accel="tcg" @@ -176,6 +197,40 @@ prepare_bundle_artifacts() { printf "%s" "$current_ids" > "$bundle_meta" } +contains_provision_marker() { + local provision_log="$1" + local serial_log="$2" + local marker="$3" + + if [ -f "$provision_log" ] && grep -Fqx "$marker" "$provision_log" 2>/dev/null; then + return 0 + fi + + if [ -f "$serial_log" ] && LC_ALL=C strings -a "$serial_log" 2>/dev/null | grep -Fqx "$marker" 2>/dev/null; then + return 0 + fi + + return 1 +} + +line_count() { + local file="$1" + local count=0 + if [ -f "$file" ]; then + count="$(wc -l < "$file" | tr -d '[:space:]')" || count=0 + fi + printf '%s\n' "$count" +} + +persist_provision_logs() { + local arch="$1" + local serial_log="$2" + local provision_log="$3" + + cp "$serial_log" "$IMAGE_DIR/provision-emulator-${arch}.log" 2>/dev/null || true + cp "$provision_log" "$IMAGE_DIR/provision-emulator-${arch}.progress.log" 2>/dev/null || true +} + build_one() { local arch="$1" local base_img="$IMAGE_DIR/debian-${DEBIAN_VERSION}-base-${arch}.qcow2" @@ -192,8 +247,12 @@ build_one() { local bundle_iso="$tmp_dir/bundle.iso" local bundle_dir="$tmp_dir/bundle" local serial_log="$tmp_dir/serial.log" + local provision_log="$tmp_dir/provision.log" local pidfile="$tmp_dir/qemu.pid" - local qemu_base pid elapsed + local qemu_base pid elapsed total_build_lines + local last_build_lines=0 + local guest_exited=false + local guest_failed=false local start_time=$SECONDS cp "$base_img" "$tmp_img" @@ -209,21 +268,28 @@ build_one() { mkdir -p "$bundle_dir" cp "$bundle_tgz" "$bundle_dir/img.tgz" + cp "$BUILD_ENV_FILE" "$bundle_dir/build.env" + # Tell the 
guest which arch it's being built for so cross-arch (TCG) builds + # can skip the smoke test, which isn't reliable under software emulation. + printf 'STACK_EMULATOR_BUILD_ARCH=%s\n' "$arch" > "$bundle_dir/build-arch.env" make_iso_from_dir "$bundle_iso" "STACKBUNDLE" "$bundle_dir" : > "$serial_log" + : > "$provision_log" qemu_base="$(qemu_cmd_prefix_for_arch "$arch")" + log "QEMU command prefix (${arch}): $qemu_base" # shellcheck disable=SC2086 $qemu_base \ -boot order=c \ -m "$RAM" \ -smp "$CPUS" \ - -drive "file=$tmp_img,format=qcow2,if=virtio" \ + -drive "file=$tmp_img,format=qcow2,if=virtio,discard=on,detect-zeroes=unmap" \ -drive "file=$seed_iso,format=raw,if=virtio,readonly=on" \ -drive "file=$bundle_iso,format=raw,if=virtio,readonly=on" \ -netdev user,id=net0 \ -device virtio-net-pci,netdev=net0 \ + -virtfs "local,path=$tmp_dir,mount_tag=hostfs,security_model=none" \ -serial "file:$serial_log" \ -display none \ -daemonize \ @@ -232,23 +298,62 @@ build_one() { pid="$(cat "$pidfile")" elapsed=0 while [ "$elapsed" -lt "$PROVISION_TIMEOUT" ]; do - if grep -q "STACK_CLOUD_INIT_DONE" "$serial_log" 2>/dev/null; then + if contains_provision_marker "$provision_log" "$serial_log" "STACK_CLOUD_INIT_DONE"; then break fi + + if contains_provision_marker "$provision_log" "$serial_log" "STACK_CLOUD_INIT_FAILED"; then + guest_failed=true + break + fi + + if [ -f "$provision_log" ]; then + total_build_lines="$(line_count "$provision_log")" + if [ "$total_build_lines" -gt "$last_build_lines" ]; then + echo "" + sed -n "$((last_build_lines + 1)),${total_build_lines}p" "$provision_log" 2>/dev/null | while IFS= read -r msg; do + if [ "$msg" = "STACK_CLOUD_INIT_DONE" ]; then + continue + fi + printf " [%3ds] %s\n" "$elapsed" "$msg" + done + last_build_lines="$total_build_lines" + fi + fi + + if ! kill -0 "$pid" 2>/dev/null; then + guest_exited=true + break + fi + sleep 5 elapsed=$((SECONDS - start_time)) printf "\r [%3ds / %ds] provisioning emulator..." 
"$elapsed" "$PROVISION_TIMEOUT" done echo "" - if ! grep -q "STACK_CLOUD_INIT_DONE" "$serial_log" 2>/dev/null; then - err "Provisioning timed out for emulator (${arch})" - tail -50 "$serial_log" >&2 || true + if ! contains_provision_marker "$provision_log" "$serial_log" "STACK_CLOUD_INIT_DONE"; then + if [ "$guest_failed" = true ]; then + err "Guest provisioning reported failure for emulator (${arch})" + elif [ "$guest_exited" = true ]; then + err "Provisioning exited before completion for emulator (${arch})" + else + err "Provisioning timed out for emulator (${arch})" + fi + + if [ -s "$provision_log" ]; then + tail -50 "$provision_log" >&2 || true + else + LC_ALL=C strings -a "$serial_log" 2>/dev/null | tail -50 >&2 || tail -50 "$serial_log" >&2 || true + fi + if kill -0 "$pid" 2>/dev/null; then kill "$pid" 2>/dev/null || true sleep 1 kill -9 "$pid" 2>/dev/null || true fi + + persist_provision_logs "$arch" "$serial_log" "$provision_log" rm -rf "$tmp_dir" exit 1 fi @@ -266,19 +371,21 @@ build_one() { kill -9 "$pid" 2>/dev/null || true fi - cp "$tmp_img" "$final_img" - cp "$serial_log" "$IMAGE_DIR/provision-emulator-${arch}.log" - rm -rf "$tmp_dir" + persist_provision_logs "$arch" "$serial_log" "$provision_log" log "Compressing final image (this may take several minutes)..." - qemu-img convert -p -O qcow2 -c "$final_img" "$final_img.tmp" - mv "$final_img.tmp" "$final_img" + qemu-img convert -p -O qcow2 -c "$tmp_img" "$final_img" + rm -rf "$tmp_dir" local size size="$(du -h "$final_img" | cut -f1)" log "━━━ Emulator image ready: $final_img (${size}) ━━━" } +log "Generating emulator build env file..." 
+node "$REPO_ROOT/docker/local-emulator/generate-env-development.mjs" +BUILD_ENV_FILE="$REPO_ROOT/docker/local-emulator/.env.development" + for arch in "${TARGET_ARCHS[@]}"; do local_base="$IMAGE_DIR/debian-${DEBIAN_VERSION}-base-${arch}.qcow2" download_cloud_image "$arch" "$local_base" diff --git a/docker/local-emulator/qemu/cloud-init/emulator/user-data b/docker/local-emulator/qemu/cloud-init/emulator/user-data index 39b8c33cdb..07b0bc5f4e 100644 --- a/docker/local-emulator/qemu/cloud-init/emulator/user-data +++ b/docker/local-emulator/qemu/cloud-init/emulator/user-data @@ -43,6 +43,15 @@ write_files: gzip -dc /mnt/stack-bundle/img.tgz | docker load + if [ -f /mnt/stack-bundle/build.env ]; then + cp /mnt/stack-bundle/build.env /etc/stack-build.env + fi + + # build-arch.env lets the guest skip the smoke test on cross-arch TCG. + if [ -f /mnt/stack-bundle/build-arch.env ]; then + cp /mnt/stack-bundle/build-arch.env /etc/stack-build-arch.env + fi + - path: /usr/local/bin/render-stack-env permissions: '0755' content: | @@ -71,25 +80,33 @@ write_files: cat /mnt/stack-runtime/runtime.env # Computed vars — depend on port prefix or deps host + # Host-side ports (for browser URLs — browser runs on host, not in VM) + HP_BACKEND="$STACK_EMULATOR_BACKEND_HOST_PORT" + HP_DASHBOARD="$STACK_EMULATOR_DASHBOARD_HOST_PORT" + HP_MINIO="$STACK_EMULATOR_MINIO_HOST_PORT" + HP_INBUCKET="$STACK_EMULATOR_INBUCKET_HOST_PORT" + cat <3s to + # respond; let curl wait, outer DEPS_TIMEOUT bounds the whole dep wait. + SERVICES=( + 'postgres|nc -z 127.0.0.1 5432' + 'clickhouse|curl -sf http://127.0.0.1:8123/ping' + 'svix|curl -sf http://127.0.0.1:8071/api/v1/health/' + 'minio|curl -sf http://127.0.0.1:9090/minio/health/live' + 'qstash|[ "$(curl -s -o /dev/null -w "%{http_code}" http://127.0.0.1:8080/ 2>/dev/null || true)" = "401" ]' + ) + + dump_diagnostics() { + log "dumping diagnostics for stuck dep wait..." 
+ log "--- docker ps -a ---" + docker ps -a 2>&1 | /usr/local/bin/log-provision-stream "wait-for-deps: ps" || true + log "--- docker logs ${DEPS_CONTAINER} (last 300 lines) ---" + docker logs --tail 300 "$DEPS_CONTAINER" 2>&1 | /usr/local/bin/log-provision-stream "wait-for-deps: deps" || true + log "--- per-service probes (3s timeout) ---" + nc -z -w 3 127.0.0.1 5432 >/dev/null 2>&1 && log "postgres:5432 reachable" || log "postgres:5432 NOT reachable" + curl -sf --max-time 3 http://127.0.0.1:8123/ping >/dev/null 2>&1 && log "clickhouse:8123 reachable" || log "clickhouse:8123 NOT reachable" + curl -sf --max-time 3 http://127.0.0.1:8071/api/v1/health/ >/dev/null 2>&1 && log "svix:8071 reachable" || log "svix:8071 NOT reachable" + curl -sf --max-time 3 http://127.0.0.1:9090/minio/health/live >/dev/null 2>&1 && log "minio:9090 reachable" || log "minio:9090 NOT reachable" + code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 3 http://127.0.0.1:8080/ 2>/dev/null || true) + [ "$code" = "401" ] && log "qstash:8080 reachable (401)" || log "qstash:8080 NOT reachable (code=${code:-none})" + } + + wait_for() { + local name="$1" probe="$2" elapsed + local svc_start=$SECONDS + local next_heartbeat=$((svc_start + 30)) + while true; do + if eval "$probe" >/dev/null 2>&1; then + elapsed=$((SECONDS - svc_start)) + log "${name} ready (${elapsed}s)" + return 0 + fi + if [ "$SECONDS" -ge "$next_heartbeat" ]; then + log "still waiting for ${name} ($((SECONDS - svc_start))s elapsed)" + next_heartbeat=$((SECONDS + 30)) + fi + if [ "$((SECONDS - start))" -ge "$DEPS_TIMEOUT" ]; then + elapsed=$((SECONDS - start)) + log "TIMEOUT waiting for ${name} after ${elapsed}s (hard cap ${DEPS_TIMEOUT}s)" + dump_diagnostics + exit 1 + fi + sleep 2 + done + } + + log "starting dep wait (timeout=${DEPS_TIMEOUT}s)" + for entry in "${SERVICES[@]}"; do + wait_for "${entry%%|*}" "${entry#*|}" + done + log "all deps ready ($((SECONDS - start))s total)" + + - path: /etc/stack-build-computed.env + content: 
| + USE_INLINE_ENV_VARS=true + NEXT_PUBLIC_STACK_API_URL=http://localhost:8102 + NEXT_PUBLIC_STACK_DASHBOARD_URL=http://localhost:8101 + NEXT_PUBLIC_BROWSER_STACK_API_URL=http://localhost:8102 + NEXT_PUBLIC_BROWSER_STACK_DASHBOARD_URL=http://localhost:8101 + NEXT_PUBLIC_SERVER_STACK_API_URL=http://127.0.0.1:8102 + NEXT_PUBLIC_SERVER_STACK_DASHBOARD_URL=http://127.0.0.1:8101 + NEXT_PUBLIC_STACK_SVIX_SERVER_URL=http://localhost:8071 + NEXT_PUBLIC_STACK_PORT_PREFIX=81 + STACK_CLICKHOUSE_DATABASE=analytics + BACKEND_PORT=8102 + DASHBOARD_PORT=8101 + + - path: /usr/local/bin/log-provision + permissions: '0755' + content: | + #!/bin/bash + set -euo pipefail + + msg="$*" + echo "STACK_PROVISION: $msg" + if [ -n "${STACK_PROVISION_LOG_FILE:-}" ]; then + printf '%s\n' "$msg" >> "$STACK_PROVISION_LOG_FILE" + fi + + - path: /usr/local/bin/log-provision-stream + permissions: '0755' + content: | + #!/bin/bash + set -uo pipefail + + prefix="${1:-}" + while IFS= read -r line; do + /usr/local/bin/log-provision "${prefix}: ${line}" + done + + - path: /usr/local/bin/run-build-migrations + permissions: '0755' + content: | + #!/bin/bash + set -euo pipefail + + log() { /usr/local/bin/log-provision "$*"; } + + log "Starting deps container..." + docker run --rm --name stack-build-init \ + --network host \ + -e STACK_DEPS_ONLY=true \ + -v stack-postgres-data:/data/postgres \ + -v stack-redis-data:/data/redis \ + -v stack-clickhouse-data:/data/clickhouse \ + -v stack-minio-data:/data/minio \ + -v stack-inbucket-data:/data/inbucket \ + -d stack-local-emulator + + log "Waiting for deps (postgres, redis, clickhouse, minio, qstash)..." + /usr/local/bin/wait-for-deps + log "Deps ready." + + # Wait for init-services.sh (MinIO buckets, ClickHouse DB creation) + log "Waiting for init-services.sh..." 
+ timeout=120 + elapsed=0 + while [ "$elapsed" -lt "$timeout" ]; do + if docker exec stack-build-init test -f /var/run/stack-local-init-services.done 2>/dev/null; then + break + fi + sleep 1 + elapsed=$((elapsed + 1)) + done + if [ "$elapsed" -ge "$timeout" ]; then + log "ERROR: init-services.sh did not finish within ${timeout}s" + exit 1 + fi + log "init-services done (${elapsed}s)." + + log "Running migrations..." + # Cross-arch TCG mistranslates V8's JIT-emitted arm64, and V8's wasm + # tier-up path trips an InnerPointerToCodeCache check deep in the heap + # (Runtime_WasmTriggerTierUp → StackFrameIterator::Advance crashes + # when Wasm code has been freed while a frame still references it). + # --no-opt keeps JS off TurboFan/Maglev + # --no-wasm-tier-up keeps Wasm on Liftoff (no TurboFan) + # --no-wasm-dynamic-tiering suppresses the tier-up decision runtime call + # --no-wasm-code-gc keeps Wasm code alive across stack walks + # All four are no-ops under KVM, and must be passed on node's CLI + # (NODE_OPTIONS rejects them). + migrate_log="$(mktemp)" + set +e + docker exec \ + --env-file /etc/stack-build.env \ + --env-file /etc/stack-build-computed.env \ + stack-build-init \ + sh -c 'cd /app/apps/backend && node --no-opt --no-wasm-tier-up --no-wasm-dynamic-tiering --no-wasm-code-gc dist/db-migrations.mjs migrate && node --no-opt --no-wasm-tier-up --no-wasm-dynamic-tiering --no-wasm-code-gc dist/db-migrations.mjs seed' \ + > "$migrate_log" 2>&1 + migrate_status=$? + set -e + if [ "$migrate_status" -ne 0 ]; then + log "MIGRATIONS FAILED (exit ${migrate_status}) — last 200 lines of migration output:" + tail -200 "$migrate_log" | /usr/local/bin/log-provision-stream "migrate" || true + rm -f "$migrate_log" + exit "$migrate_status" + fi + rm -f "$migrate_log" + log "Migrations + seed complete." + + log "Stopping deps container..." + docker stop stack-build-init || true + log "run-build-migrations done." 
+ + - path: /usr/local/bin/slim-docker-image permissions: '0755' content: | #!/bin/bash set -euo pipefail - until nc -z 127.0.0.1 5432 >/dev/null 2>&1; do sleep 1; done - until curl -sf http://127.0.0.1:8123/ping >/dev/null 2>&1; do sleep 1; done - until curl -sf http://127.0.0.1:8071/api/v1/health/ >/dev/null 2>&1; do sleep 1; done - until curl -sf http://127.0.0.1:9090/minio/health/live >/dev/null 2>&1; do sleep 1; done - until [ "$(curl -s -o /dev/null -w '%{http_code}' http://127.0.0.1:8080/ 2>/dev/null || true)" = "401" ]; do sleep 1; done + log() { /usr/local/bin/log-provision "$*"; } + + log "Building slim Docker image..." + docker build -t stack-local-emulator-slim - <<'DOCKERFILE' + FROM stack-local-emulator + RUN rm -rf /app/node_modules /app/apps/backend/dist && \ + mv /app/node_modules.standalone /app/node_modules && \ + for entry in /app/node_modules/.pnpm/node_modules/*; do \ + name="$(basename "$entry")"; \ + [ "$name" = ".bin" ] && continue; \ + ln -sf ".pnpm/node_modules/$name" "/app/node_modules/$name" 2>/dev/null || true; \ + done + DOCKERFILE + log "Slim image built." + + # Determine build arch to decide whether to run the smoke test. Cross-arch + # (TCG) builds can't reliably run the Next.js backend inside the smoke + # test container: V8 JIT ↔ QEMU TCG mistranslations crash the process, + # and even with --jitless the backend is too slow to respond within any + # sane timeout. amd64 builds run under KVM and are unaffected. + BUILD_ARCH="" + if [ -f /etc/stack-build-arch.env ]; then + # shellcheck disable=SC1091 + . /etc/stack-build-arch.env + BUILD_ARCH="${STACK_EMULATOR_BUILD_ARCH:-}" + fi + + if [ "$BUILD_ARCH" = "arm64" ]; then + log "Skipping smoke test: build arch is arm64 and cross-arch TCG can't reliably run the backend." + else + log "Running smoke test on slim image..." 
+ docker run --rm --name smoke-test \ + --network host \ + --env-file /etc/stack-build.env \ + --env-file /etc/stack-build-computed.env \ + -e STACK_SKIP_MIGRATIONS=true \ + -e STACK_SKIP_SEED_SCRIPT=true \ + -e STACK_RUNTIME_WORK_DIR=/app \ + -v stack-postgres-data:/data/postgres \ + -v stack-redis-data:/data/redis \ + -v stack-clickhouse-data:/data/clickhouse \ + -v stack-minio-data:/data/minio \ + -v stack-inbucket-data:/data/inbucket \ + -d stack-local-emulator-slim + + smoke_timeout=300 + smoke_elapsed=0 + smoke_passed=false + while [ "$smoke_elapsed" -lt "$smoke_timeout" ]; do + code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 3 http://127.0.0.1:8102/health?db=1 2>/dev/null || true) + if [ "$code" = "200" ]; then + smoke_passed=true + break + fi + sleep 2 + smoke_elapsed=$((smoke_elapsed + 2)) + done + + if [ "$smoke_passed" = "false" ]; then + log "SMOKE TEST FAILED: backend /health?db=1 did not return 200 within ${smoke_timeout}s" + log "--- docker ps -a ---" + docker ps -a 2>&1 | /usr/local/bin/log-provision-stream "ps" || true + log "--- smoke-test container logs (last 200 lines) ---" + docker logs --tail 200 smoke-test 2>&1 | /usr/local/bin/log-provision-stream "smoke-test" || true + log "--- free -m ---" + free -m 2>&1 | /usr/local/bin/log-provision-stream "mem" || true + log "--- curl -v /health?db=1 ---" + curl -v --max-time 5 http://127.0.0.1:8102/health?db=1 2>&1 | /usr/local/bin/log-provision-stream "curl" || true + docker stop smoke-test 2>/dev/null || true + exit 1 + fi + + docker stop smoke-test 2>/dev/null || true + sleep 2 + log "Smoke test passed (${smoke_elapsed}s)." + fi + + log "Flattening image (docker export/import)..." 
+ docker create --name flatten stack-local-emulator-slim /bin/true + docker export flatten | docker import \ + --change 'WORKDIR /app' \ + --change 'ENTRYPOINT ["/entrypoint.sh"]' \ + --change 'EXPOSE 5432 6379 2500 9001 1100 8071 8123 9009 9090 8080 8101 8102' \ + --change 'ENV DEBIAN_FRONTEND=noninteractive' \ + - stack-local-emulator:final + log "Flatten done." + + log "Saving final image to /var/tmp..." + docker rm flatten + docker save stack-local-emulator:final -o /var/tmp/final-image.tar + mv /var/lib/docker/volumes /var/tmp/volumes-backup + log "Nuking Docker storage and reloading..." + systemctl stop docker containerd + rm -rf /var/lib/docker /var/lib/containerd + systemctl start docker containerd + until docker info >/dev/null 2>&1; do sleep 1; done + docker load -i /var/tmp/final-image.tar + docker tag stack-local-emulator:final stack-local-emulator + docker rmi stack-local-emulator:final || true + rm -f /var/tmp/final-image.tar + systemctl stop docker + rm -rf /var/lib/docker/volumes + mv /var/tmp/volumes-backup /var/lib/docker/volumes + systemctl start docker + log "Docker storage rebuilt." + + log "Zeroing free space for qcow2 compression..." + dd if=/dev/zero of=/zero.fill bs=1M 2>/dev/null || true + rm -f /zero.fill + sync + fstrim -av 2>/dev/null || true + log "slim-docker-image done." - path: /etc/systemd/system/stack.service content: | @@ -162,24 +468,77 @@ write_files: [Install] WantedBy=multi-user.target + - path: /usr/local/bin/provision-build + permissions: '0755' + content: | + #!/bin/bash + set -euo pipefail + + if bash /usr/local/bin/mount-host-fs 2>/dev/null; then + export STACK_PROVISION_LOG_FILE=/host/provision.log + : > "$STACK_PROVISION_LOG_FILE" + else + export STACK_PROVISION_LOG_FILE="" + fi + + write_marker_to_consoles() { + local marker="$1" + for dev in /dev/console /dev/ttyAMA0 /dev/ttyS0; do + echo "$marker" > "$dev" 2>/dev/null || true + done + } + + cleanup() { + local status=$? 
+ if [ "$status" -ne 0 ]; then + if [ -n "${STACK_PROVISION_LOG_FILE:-}" ]; then + printf 'ERROR: provision-build exited with code %s\n' "$status" >> "$STACK_PROVISION_LOG_FILE" + printf '%s\n' "STACK_CLOUD_INIT_FAILED" >> "$STACK_PROVISION_LOG_FILE" + fi + write_marker_to_consoles "STACK_CLOUD_INIT_FAILED" + sync || true + (sleep 2 && shutdown -P now) & + (sleep 15 && poweroff -f) & + fi + } + trap cleanup EXIT + + SERIAL="" + for d in /dev/ttyAMA0 /dev/ttyS0; do + [ -c "$d" ] && SERIAL="$d" && break + done + if [ -n "$SERIAL" ]; then + exec > >(tee -a "$SERIAL") 2>&1 + fi + + log_provision() { + /usr/local/bin/log-provision "$*" + } + + log_provision "runcmd starting" + + systemctl disable --now ssh || true + systemctl mask ssh || true + + log_provision "installing emulator containers" + bash /usr/local/bin/install-emulator-containers + + systemctl daemon-reload + systemctl enable stack.service + + log_provision "starting build migrations" + bash /usr/local/bin/run-build-migrations + + log_provision "starting slim-docker-image" + bash /usr/local/bin/slim-docker-image + + log_provision "build pipeline complete" + if [ -n "${STACK_PROVISION_LOG_FILE:-}" ]; then + printf '%s\n' "STACK_CLOUD_INIT_DONE" >> "$STACK_PROVISION_LOG_FILE" + fi + write_marker_to_consoles "STACK_CLOUD_INIT_DONE" + + shutdown -P now + runcmd: - - systemctl disable --now ssh || true - - systemctl mask ssh || true - - bash /usr/local/bin/install-emulator-containers - - systemctl daemon-reload - - systemctl enable stack.service - - docker run --rm --name stack-build-init - --network host - -e STACK_DEPS_ONLY=true - -v stack-postgres-data:/data/postgres - -v stack-redis-data:/data/redis - -v stack-clickhouse-data:/data/clickhouse - -v stack-minio-data:/data/minio - -v stack-inbucket-data:/data/inbucket - -d stack-local-emulator - - bash /usr/local/bin/wait-for-deps - - docker stop stack-build-init || true - - echo "STACK_CLOUD_INIT_DONE" > /dev/console 2>/dev/null || true - - echo 
"STACK_CLOUD_INIT_DONE" > /dev/ttyAMA0 2>/dev/null || true - - echo "STACK_CLOUD_INIT_DONE" > /dev/ttyS0 2>/dev/null || true - - shutdown -P now + - [bash, /usr/local/bin/provision-build] diff --git a/docker/local-emulator/qemu/run-emulator.sh b/docker/local-emulator/qemu/run-emulator.sh index f2f3028ca6..0a82c1b883 100755 --- a/docker/local-emulator/qemu/run-emulator.sh +++ b/docker/local-emulator/qemu/run-emulator.sh @@ -85,6 +85,10 @@ prepare_runtime_config_iso() { mkdir -p "$cfg_dir" { printf "STACK_EMULATOR_PORT_PREFIX=%s\n" "$PORT_PREFIX" + printf "STACK_EMULATOR_DASHBOARD_HOST_PORT=%s\n" "$EMULATOR_DASHBOARD_PORT" + printf "STACK_EMULATOR_BACKEND_HOST_PORT=%s\n" "$EMULATOR_BACKEND_PORT" + printf "STACK_EMULATOR_MINIO_HOST_PORT=%s\n" "$EMULATOR_MINIO_PORT" + printf "STACK_EMULATOR_INBUCKET_HOST_PORT=%s\n" "$EMULATOR_INBUCKET_PORT" } > "$cfg_dir/runtime.env" cp "$SCRIPT_DIR/../.env.development" "$cfg_dir/base.env" make_iso_from_dir "$cfg_iso" "STACKCFG" "$cfg_dir" diff --git a/docker/local-emulator/qemu/test-serial.sh b/docker/local-emulator/qemu/test-serial.sh new file mode 100755 index 0000000000..e118db6c4e --- /dev/null +++ b/docker/local-emulator/qemu/test-serial.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash +# Quick test: boot the base QEMU image with a minimal cloud-init that writes to +# serial via runcmd. Verifies that our logging approach works without running +# the full emulator build (~10s instead of ~10min). +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +source "$SCRIPT_DIR/common.sh" + +detect_host +ARCH="${1:-$HOST_ARCH}" + +BASE_IMG="$SCRIPT_DIR/images/debian-13-base-${ARCH}.qcow2" +if [ ! 
-f "$BASE_IMG" ]; then + echo "Base image not found: $BASE_IMG" >&2 + exit 1 +fi + +TMP_DIR="$(mktemp -d /tmp/stack-serial-test-XXXXXX)" +trap 'kill "$(cat "$TMP_DIR/qemu.pid" 2>/dev/null)" 2>/dev/null; rm -rf "$TMP_DIR"' EXIT + +# Create a temporary disk +cp "$BASE_IMG" "$TMP_DIR/disk.qcow2" + +# Minimal cloud-init user-data that tests serial output from runcmd +cat > "$TMP_DIR/user-data" << 'EOF' +#cloud-config +write_files: + - path: /usr/local/bin/provision-build + permissions: '0755' + content: | + #!/bin/bash + set -euo pipefail + + SERIAL="" + for d in /dev/ttyAMA0 /dev/ttyS0; do + [ -c "$d" ] && SERIAL="$d" && break + done + if [ -n "$SERIAL" ]; then + exec > >(tee -a "$SERIAL") 2>&1 + fi + + echo "STACK_PROVISION: script started" + sleep 1 + echo "STACK_PROVISION: step 2" + for dev in /dev/console /dev/ttyAMA0 /dev/ttyS0; do + echo "STACK_CLOUD_INIT_DONE" > "$dev" 2>/dev/null || true + done + shutdown -P now + +runcmd: + - [bash, /usr/local/bin/provision-build] +EOF + +cat > "$TMP_DIR/meta-data" << 'EOF' +instance-id: serial-test +local-hostname: serial-test +EOF + +# Build seed ISO +make_iso_from_dir "$TMP_DIR/seed.iso" "cidata" "$TMP_DIR" + +: > "$TMP_DIR/serial.log" + +case "$ARCH" in + arm64) + accel="hvf" + firmware="$(find_aarch64_firmware)" + qemu_base="qemu-system-aarch64 -machine virt -accel $accel -cpu max -bios $firmware" + ;; + amd64) + qemu_base="qemu-system-x86_64 -machine q35 -accel hvf -cpu max" + ;; +esac + +$qemu_base \ + -boot order=c \ + -m 1024 \ + -smp 2 \ + -drive "file=$TMP_DIR/disk.qcow2,format=qcow2,if=virtio" \ + -drive "file=$TMP_DIR/seed.iso,format=raw,if=virtio,readonly=on" \ + -netdev user,id=net0 \ + -device virtio-net-pci,netdev=net0 \ + -serial "file:$TMP_DIR/serial.log" \ + -display none \ + -daemonize \ + -pidfile "$TMP_DIR/qemu.pid" + +echo "QEMU started, waiting for serial output..." 
+echo "Serial log: $TMP_DIR/serial.log" + +elapsed=0 +timeout=120 +while [ "$elapsed" -lt "$timeout" ]; do + if grep -q "STACK_CLOUD_INIT_DONE" "$TMP_DIR/serial.log" 2>/dev/null; then + echo "" + echo "=== SUCCESS: STACK_CLOUD_INIT_DONE received ===" + echo "" + echo "=== All STACK_PROVISION lines ===" + grep "STACK_PROVISION" "$TMP_DIR/serial.log" || echo "(none found)" + exit 0 + fi + + # Show any STACK_PROVISION lines as they appear + if grep -q "STACK_PROVISION" "$TMP_DIR/serial.log" 2>/dev/null; then + grep "STACK_PROVISION" "$TMP_DIR/serial.log" | while IFS= read -r line; do + echo " [${elapsed}s] $line" + done + fi + + sleep 2 + elapsed=$((elapsed + 2)) + printf "\r [%ds / %ds] waiting..." "$elapsed" "$timeout" +done + +echo "" +echo "=== TIMEOUT: no STACK_CLOUD_INIT_DONE after ${timeout}s ===" +echo "" +echo "=== Last 30 lines of serial log ===" +tail -30 "$TMP_DIR/serial.log" +echo "" +echo "=== STACK_PROVISION lines ===" +grep "STACK_PROVISION" "$TMP_DIR/serial.log" || echo "(none found)" +exit 1