Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
7d9e156
local emulator image opt
BilalG1 Apr 9, 2026
d724eb2
emulator fixes
BilalG1 Apr 10, 2026
cd087c5
Merge branch 'dev' into local-emulator-image-optimization
BilalG1 Apr 10, 2026
784f17c
emulator: fail-fast on provision errors, diagnose smoke test failures
BilalG1 Apr 10, 2026
7bf4a15
emulator: make cross-arch arm64 build survive TCG
BilalG1 Apr 10, 2026
6c5615b
emulator: drop --jitless, capture migration errors on failure
BilalG1 Apr 10, 2026
2538382
ci: run arm64 emulator build on ubuntu-24.04-arm (same-arch TCG)
BilalG1 Apr 10, 2026
54ecd7c
emulator: bounded dep wait with per-service diagnostics
BilalG1 Apr 10, 2026
5c3c436
emulator: only use -cpu cortex-a72 for cross-arch TCG
BilalG1 Apr 11, 2026
e636151
emulator: move arm64 back to ubicloud cross-arch, run migrations with…
BilalG1 Apr 13, 2026
f4aca6d
emulator: swap --jitless for --no-opt on migration exec
BilalG1 Apr 13, 2026
144866a
emulator: pass --no-opt on node CLI, not via NODE_OPTIONS
BilalG1 Apr 13, 2026
95054ca
emulator: don't strip the clickhouse binary (breaks self-extractor)
BilalG1 Apr 13, 2026
999843b
emulator: bump cross-arch TCG -cpu to cortex-a76 (LSE for ClickHouse)
BilalG1 Apr 13, 2026
0896f14
ci: skip emulator boot/verify on arm64 (cross-arch TCG)
BilalG1 Apr 13, 2026
44e4079
emulator: add --no-wasm-tier-up to migration exec
BilalG1 Apr 13, 2026
9ec08f4
emulator: dedupe probe list, factor log-stream and console-marker hel…
BilalG1 Apr 13, 2026
f8524e9
Merge branch 'dev' into emulator-arm64-ubicloud-jitless
BilalG1 Apr 14, 2026
1104023
emulator: add --no-wasm-dynamic-tiering and --no-wasm-code-gc to migr…
BilalG1 Apr 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 32 additions & 3 deletions .github/workflows/qemu-emulator-build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,23 @@ env:
jobs:
build:
name: Build QEMU Image (${{ matrix.arch }})
runs-on: ubicloud-standard-8
runs-on: ${{ matrix.runner }}
timeout-minutes: 120
strategy:
fail-fast: false
matrix:
include:
# amd64 runs natively under KVM on ubicloud's amd64 runner.
- arch: amd64
runner: ubicloud-standard-8
# arm64 runs under cross-arch TCG on ubicloud's amd64 runner.
# No KVM for arm64 guests on an amd64 host; -cpu cortex-a76 plus
# `node --no-opt` on the migration exec together sidestep the SIGTRAPs
# that cross-arch TCG hits on aggressive arm64 JIT code. Smoke test is
# still skipped because the backend can't come up reliably under
# cross-arch TCG within any sane window.
- arch: arm64
runner: ubicloud-standard-8

steps:
- uses: actions/checkout@v6
Expand All @@ -47,7 +56,20 @@ jobs:
- name: Install QEMU dependencies
run: |
sudo apt-get update
sudo apt-get install -y qemu-system-x86 qemu-system-arm qemu-utils genisoimage socat qemu-efi-aarch64
sudo apt-get install -y qemu-system-x86 qemu-system-arm qemu-kvm qemu-utils genisoimage socat qemu-efi-aarch64

- name: Enable KVM access
run: |
echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' \
| sudo tee /etc/udev/rules.d/99-kvm4all.rules
sudo udevadm control --reload-rules
sudo udevadm trigger --name-match=kvm || true
ls -la /dev/kvm || echo "no /dev/kvm present"
if [ -w /dev/kvm ]; then
echo "KVM is writable — hardware acceleration will be used"
else
echo "WARNING: /dev/kvm is not writable — will fall back to TCG (very slow)"
fi

- name: Build QEMU image
run: |
Expand All @@ -58,20 +80,27 @@ jobs:
- name: Generate emulator env
run: node docker/local-emulator/generate-env-development.mjs

# arm64 runs under cross-arch TCG on an amd64 runner; the backend's
# V8 TurboFan JIT re-triggers the SIGTRAPs we dodge in migrations
# with --no-opt, and even if it didn't, boot is too slow under TCG
# to verify in any sane window. amd64 KVM already exercises the
# service stack; real arm64 hosts have KVM for end-users.
- name: Start emulator and verify
if: matrix.arch == 'amd64'
run: |
chmod +x docker/local-emulator/qemu/run-emulator.sh
EMULATOR_ARCH=${{ matrix.arch }} \
EMULATOR_READY_TIMEOUT=3200 \
docker/local-emulator/qemu/run-emulator.sh start

- name: Verify services are healthy
if: matrix.arch == 'amd64'
run: |
EMULATOR_ARCH=${{ matrix.arch }} \
docker/local-emulator/qemu/run-emulator.sh status

- name: Stop emulator
if: always()
if: always() && matrix.arch == 'amd64'
run: |
EMULATOR_ARCH=${{ matrix.arch }} \
docker/local-emulator/qemu/run-emulator.sh stop
Expand Down
44 changes: 35 additions & 9 deletions docker/local-emulator/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,28 @@ RUN cp $(which qstash) /qstash-binary 2>/dev/null || \
{ echo "ERROR: qstash binary not found" >&2; exit 1; }


# ── Strip / compress service binaries (parallel stages) ──────────────────────

FROM debian:trixie-slim AS upx-compress
RUN apt-get update && apt-get install -y --no-install-recommends upx-ucl binutils && \
rm -rf /var/lib/apt/lists/*
COPY --from=clickhouse-bin /usr/bin/clickhouse /out/clickhouse
COPY --from=svix-bin /usr/local/bin/svix-server /out/svix-server
COPY --from=minio-bin /usr/bin/minio /out/minio
COPY --from=mc-bin /usr/bin/mc /out/mc
COPY --from=qstash-bin /qstash-binary /out/qstash
RUN chmod u+w /out/* && \
# Intentionally NOT stripping /out/clickhouse. The clickhouse binary is a
# self-extracting compressed executable (a small loader with a ZSTD
# payload appended after the section table); strip rewrites the ELF and
# can invalidate the loader's "find my payload" lookup, causing the
# decompressor to spin on garbage with zero log output — the exact
# symptom seen on cross-arch TCG runs. Savings from stripping would be
# only the tiny bootstrap anyway since the payload isn't in any section.
strip --strip-all /out/minio /out/svix-server /out/mc /out/qstash && \
upx -9 /out/minio /out/svix-server /out/mc /out/qstash


# ── Final image ───────────────────────────────────────────────────────────────

FROM debian:trixie-slim
Expand Down Expand Up @@ -139,20 +161,20 @@ COPY --from=node-base /usr/local/bin/node /usr/local/bin/node
# Inbucket
COPY --from=inbucket-bin /opt/inbucket /opt/inbucket

# Svix
COPY --from=svix-bin /usr/local/bin/svix-server /usr/local/bin/svix-server
# Svix (UPX-compressed)
COPY --from=upx-compress /out/svix-server /usr/local/bin/svix-server

# ClickHouse
COPY --from=clickhouse-bin /usr/bin/clickhouse /usr/bin/clickhouse
# ClickHouse (copied via the upx-compress stage as-is: neither stripped nor UPX-compressed)
COPY --from=upx-compress /out/clickhouse /usr/bin/clickhouse
RUN ln -sf /usr/bin/clickhouse /usr/bin/clickhouse-server && \
ln -sf /usr/bin/clickhouse /usr/bin/clickhouse-client

# MinIO
COPY --from=minio-bin /usr/bin/minio /usr/local/bin/minio
COPY --from=mc-bin /usr/bin/mc /usr/local/bin/mc
# MinIO (UPX-compressed)
COPY --from=upx-compress /out/minio /usr/local/bin/minio
COPY --from=upx-compress /out/mc /usr/local/bin/mc

# QStash
COPY --from=qstash-bin --chmod=755 /qstash-binary /usr/local/bin/qstash
# QStash (UPX-compressed)
COPY --from=upx-compress --chmod=755 /out/qstash /usr/local/bin/qstash

# App
WORKDIR /app
Expand All @@ -164,6 +186,10 @@ COPY --from=builder /app/apps/backend/node_modules ./apps/backend/node_modules
COPY --from=builder /app/apps/dashboard/.next/standalone ./
COPY --from=builder /app/apps/dashboard/.next/static ./apps/dashboard/.next/static
COPY --from=builder /app/apps/dashboard/public ./apps/dashboard/public
# Save the standalone-traced node_modules (runtime deps only) before the full
# migration-pruner copy overwrites it. The slim-docker-image step in the QEMU
# build restores this after migrations are baked in.
RUN cp -a /app/node_modules /app/node_modules.standalone 2>/dev/null || mkdir -p /app/node_modules.standalone
COPY --from=migration-pruner /pruned-node_modules ./node_modules
COPY --from=builder /app/packages ./packages

Expand Down
131 changes: 119 additions & 12 deletions docker/local-emulator/qemu/build-image.sh
Original file line number Diff line number Diff line change
Expand Up @@ -112,15 +112,36 @@ qemu_cmd_prefix_for_arch() {
case "$arch" in
arm64)
local accel="tcg"
local cpu="max"
if [ "$HOST_ARCH" = "arm64" ]; then
# Same-arch: prefer hardware acceleration, keep -cpu max. If no
# accelerator is available (e.g. Azure arm64 runners with no
# nested virt) we fall through to TCG, but same-arch TCG handles
# -cpu max correctly and more named CPU models have TCG bugs
# than -cpu max does.
case "$HOST_OS" in
darwin) accel="hvf" ;;
linux) [ -w /dev/kvm ] && accel="kvm" ;;
esac
else
# Cross-arch TCG (amd64 host emulating arm64 guest) needs a CPU
# model that threads a narrow needle:
# * -cpu max advertises armv8.5+ features (PAC, BTI, SVE, LSE…)
# that V8's TurboFan then emits JIT code for; cross-arch TCG
# mistranslates some of those and node SIGTRAPs in migrations.
# * -cpu cortex-a72 (armv8.0-a) keeps V8 safe but makes
# ClickHouse SIGILL on startup because its statically-linked
# LSE atomics (armv8.1+) aren't recognized.
# cortex-a76 is armv8.2-a: it exposes LSE (ClickHouse happy)
# while predating PAC (v8.3) and BTI (v8.5), so V8's aggressive
# JIT tiers don't emit the instructions that tripped TCG. Pair
# this with `node --no-opt` on the migration exec, which keeps
# V8 in Ignition+Sparkplug only (no TurboFan/Maglev).
cpu="cortex-a76"
fi
local firmware
firmware="$(find_aarch64_firmware)"
echo "qemu-system-aarch64 -machine virt -accel $accel -cpu max -bios $firmware"
echo "qemu-system-aarch64 -machine virt -accel $accel -cpu $cpu -bios $firmware"
;;
amd64)
local accel="tcg"
Expand Down Expand Up @@ -176,6 +197,40 @@ prepare_bundle_artifacts() {
printf "%s" "$current_ids" > "$bundle_meta"
}

# Report whether a marker appears as an entire line in either provisioning log.
#
# Arguments:
#   $1 - provision log, searched directly
#   $2 - serial log, searched through `strings -a` so only printable runs
#        are compared
#   $3 - marker, matched as a fixed string against whole lines (-F -x)
# Returns:
#   0 if the marker is found in either log, 1 otherwise (including when
#   neither file exists).
contains_provision_marker() {
  local provision_log="$1"
  local serial_log="$2"
  local marker="$3"

  if [ -f "$provision_log" ] \
    && grep -Fqx -- "$marker" "$provision_log" 2>/dev/null; then
    return 0
  fi

  # Feed grep through process substitution rather than a pipeline: grep -q
  # exits on the first match, which can terminate `strings` with SIGPIPE on
  # a large log. In a pipeline under `set -o pipefail` that would turn a
  # successful match into a non-zero status; here only grep's own exit
  # status decides the result.
  if [ -f "$serial_log" ] \
    && grep -Fqx -- "$marker" \
      < <(LC_ALL=C strings -a "$serial_log" 2>/dev/null) 2>/dev/null; then
    return 0
  fi

  return 1
}

# Print the number of newline-terminated lines in a file.
#
# Arguments:
#   $1 - path to the file
# Outputs:
#   The count on stdout; "0" when the path is not a regular file or when
#   the counting pipeline reports failure.
line_count() {
  local path="$1"
  local total

  # Nothing to count: report zero without running wc.
  if [ ! -f "$path" ]; then
    printf '%s\n' 0
    return
  fi

  # Some wc implementations pad the number with blanks; tr removes them.
  if ! total="$(wc -l < "$path" | tr -d '[:space:]')"; then
    total=0
  fi
  printf '%s\n' "$total"
}

# Copy the serial and provision logs of a build into $IMAGE_DIR.
#
# Globals:
#   IMAGE_DIR (read) - destination directory
# Arguments:
#   $1 - architecture name, used in the destination file names
#   $2 - serial log path, copied to provision-emulator-<arch>.log
#   $3 - provision log path, copied to provision-emulator-<arch>.progress.log
# Returns:
#   Always 0: each copy is best-effort and its errors are discarded.
persist_provision_logs() {
  local arch="$1"
  local serial_log="$2"
  local provision_log="$3"
  local dest_prefix="$IMAGE_DIR/provision-emulator-${arch}"

  cp "$serial_log" "${dest_prefix}.log" 2>/dev/null || true
  cp "$provision_log" "${dest_prefix}.progress.log" 2>/dev/null || true
}

build_one() {
local arch="$1"
local base_img="$IMAGE_DIR/debian-${DEBIAN_VERSION}-base-${arch}.qcow2"
Expand All @@ -192,8 +247,12 @@ build_one() {
local bundle_iso="$tmp_dir/bundle.iso"
local bundle_dir="$tmp_dir/bundle"
local serial_log="$tmp_dir/serial.log"
local provision_log="$tmp_dir/provision.log"
local pidfile="$tmp_dir/qemu.pid"
local qemu_base pid elapsed
local qemu_base pid elapsed total_build_lines
local last_build_lines=0
local guest_exited=false
local guest_failed=false
local start_time=$SECONDS

cp "$base_img" "$tmp_img"
Expand All @@ -209,21 +268,28 @@ build_one() {

mkdir -p "$bundle_dir"
cp "$bundle_tgz" "$bundle_dir/img.tgz"
cp "$BUILD_ENV_FILE" "$bundle_dir/build.env"
# Tell the guest which arch it's being built for so cross-arch (TCG) builds
# can skip the smoke test, which isn't reliable under software emulation.
printf 'STACK_EMULATOR_BUILD_ARCH=%s\n' "$arch" > "$bundle_dir/build-arch.env"
make_iso_from_dir "$bundle_iso" "STACKBUNDLE" "$bundle_dir"

: > "$serial_log"
: > "$provision_log"
qemu_base="$(qemu_cmd_prefix_for_arch "$arch")"
log "QEMU command prefix (${arch}): $qemu_base"

# shellcheck disable=SC2086
$qemu_base \
-boot order=c \
-m "$RAM" \
-smp "$CPUS" \
-drive "file=$tmp_img,format=qcow2,if=virtio" \
-drive "file=$tmp_img,format=qcow2,if=virtio,discard=on,detect-zeroes=unmap" \
-drive "file=$seed_iso,format=raw,if=virtio,readonly=on" \
-drive "file=$bundle_iso,format=raw,if=virtio,readonly=on" \
-netdev user,id=net0 \
-device virtio-net-pci,netdev=net0 \
-virtfs "local,path=$tmp_dir,mount_tag=hostfs,security_model=none" \
-serial "file:$serial_log" \
-display none \
-daemonize \
Expand All @@ -232,23 +298,62 @@ build_one() {
pid="$(cat "$pidfile")"
elapsed=0
while [ "$elapsed" -lt "$PROVISION_TIMEOUT" ]; do
if grep -q "STACK_CLOUD_INIT_DONE" "$serial_log" 2>/dev/null; then
if contains_provision_marker "$provision_log" "$serial_log" "STACK_CLOUD_INIT_DONE"; then
break
fi

if contains_provision_marker "$provision_log" "$serial_log" "STACK_CLOUD_INIT_FAILED"; then
guest_failed=true
break
fi

if [ -f "$provision_log" ]; then
total_build_lines="$(line_count "$provision_log")"
if [ "$total_build_lines" -gt "$last_build_lines" ]; then
echo ""
sed -n "$((last_build_lines + 1)),${total_build_lines}p" "$provision_log" 2>/dev/null | while IFS= read -r msg; do
if [ "$msg" = "STACK_CLOUD_INIT_DONE" ]; then
continue
fi
printf " [%3ds] %s\n" "$elapsed" "$msg"
done
last_build_lines="$total_build_lines"
fi
fi

if ! kill -0 "$pid" 2>/dev/null; then
guest_exited=true
break
fi

sleep 5
elapsed=$((SECONDS - start_time))
printf "\r [%3ds / %ds] provisioning emulator..." "$elapsed" "$PROVISION_TIMEOUT"
done
echo ""

if ! grep -q "STACK_CLOUD_INIT_DONE" "$serial_log" 2>/dev/null; then
err "Provisioning timed out for emulator (${arch})"
tail -50 "$serial_log" >&2 || true
if ! contains_provision_marker "$provision_log" "$serial_log" "STACK_CLOUD_INIT_DONE"; then
if [ "$guest_failed" = true ]; then
err "Guest provisioning reported failure for emulator (${arch})"
elif [ "$guest_exited" = true ]; then
err "Provisioning exited before completion for emulator (${arch})"
else
err "Provisioning timed out for emulator (${arch})"
fi

if [ -s "$provision_log" ]; then
tail -50 "$provision_log" >&2 || true
else
LC_ALL=C strings -a "$serial_log" 2>/dev/null | tail -50 >&2 || tail -50 "$serial_log" >&2 || true
fi

if kill -0 "$pid" 2>/dev/null; then
kill "$pid" 2>/dev/null || true
sleep 1
kill -9 "$pid" 2>/dev/null || true
fi

persist_provision_logs "$arch" "$serial_log" "$provision_log"
rm -rf "$tmp_dir"
exit 1
fi
Expand All @@ -266,19 +371,21 @@ build_one() {
kill -9 "$pid" 2>/dev/null || true
fi

cp "$tmp_img" "$final_img"
cp "$serial_log" "$IMAGE_DIR/provision-emulator-${arch}.log"
rm -rf "$tmp_dir"
persist_provision_logs "$arch" "$serial_log" "$provision_log"

log "Compressing final image (this may take several minutes)..."
qemu-img convert -p -O qcow2 -c "$final_img" "$final_img.tmp"
mv "$final_img.tmp" "$final_img"
qemu-img convert -p -O qcow2 -c "$tmp_img" "$final_img"
rm -rf "$tmp_dir"

local size
size="$(du -h "$final_img" | cut -f1)"
log "━━━ Emulator image ready: $final_img (${size}) ━━━"
}

log "Generating emulator build env file..."
node "$REPO_ROOT/docker/local-emulator/generate-env-development.mjs"
BUILD_ENV_FILE="$REPO_ROOT/docker/local-emulator/.env.development"

for arch in "${TARGET_ARCHS[@]}"; do
local_base="$IMAGE_DIR/debian-${DEBIAN_VERSION}-base-${arch}.qcow2"
download_cloud_image "$arch" "$local_base"
Expand Down
Loading
Loading