diff --git a/.github/workflows/qemu-emulator-build.yaml b/.github/workflows/qemu-emulator-build.yaml index 5df1497465..5792b7f600 100644 --- a/.github/workflows/qemu-emulator-build.yaml +++ b/.github/workflows/qemu-emulator-build.yaml @@ -22,8 +22,16 @@ concurrency: env: EMULATOR_IMAGE_NAME: stack-local-emulator + # Shell scripts (build-image.sh, run-emulator.sh) read these directly. EMULATOR_IMAGE_DIR: ${{ github.workspace }}/docker/local-emulator/qemu/images EMULATOR_RUN_DIR: ${{ github.workspace }}/docker/local-emulator/qemu/run + # The stack-cli ignores EMULATOR_IMAGE_DIR/RUN_DIR and derives its own paths + # from STACK_EMULATOR_HOME. Point it at the same workspace so `emulator + # start` finds the freshly-built qcow2 from build-image.sh and cold-boots + # it, instead of auto-pulling from a prior release. CI doesn't capture a + # savevm (EMULATOR_CAPTURE_SAVEVM defaults to 0); users capture locally + # on first `stack emulator pull`. + STACK_EMULATOR_HOME: ${{ github.workspace }}/docker/local-emulator/qemu jobs: build: @@ -34,15 +42,16 @@ jobs: fail-fast: false matrix: include: - # amd64 runs natively under KVM on ubicloud's amd64 runner. + # Both arches build on ubicloud's amd64 runner. amd64 uses KVM; + # arm64 runs under cross-arch TCG (slow, but only cloud-init + # provisioning has to complete — the boot/verify smoke test below + # is gated to amd64 because TCG can't boot Next.js in any + # reasonable time). Snapshots are NOT published — `stack emulator + # pull` captures one locally on first run, which is the only way + # to guarantee KVM/HVF/TCG + `-cpu max` compatibility on the + # user's machine. - arch: amd64 runner: ubicloud-standard-8 - # arm64 runs under cross-arch TCG on ubicloud's amd64 runner. - # No KVM for arm64 guests on an amd64 host; cortex-a72 + V8 - # --jitless together sidestep the SIGTRAPs that cross-arch TCG - # hits on aggressive arm64 JIT code. 
Smoke test is still skipped - # because the backend can't come up reliably under cross-arch - # TCG within any sane window. - arch: arm64 runner: ubicloud-standard-8 @@ -55,10 +64,60 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - - name: Install QEMU dependencies + # Node/pnpm are needed on both arches: arm64 also runs + # generate-env-development.mjs inside build-image.sh. amd64 additionally + # builds and runs the CLI for the verification steps below. + - uses: pnpm/action-setup@v4 + with: + version: 10.23.0 + + - uses: actions/setup-node@v4 + with: + node-version: 22 + cache: pnpm + + - name: Install system dependencies run: | sudo apt-get update - sudo apt-get install -y qemu-system-x86 qemu-system-arm qemu-kvm qemu-utils genisoimage socat qemu-efi-aarch64 + # qemu-utils gives us qemu-img; qemu-efi-aarch64 provides the arm64 + # UEFI firmware. The actual qemu-system-* binaries come from the + # source build below — Ubuntu 24.04 ships QEMU 8.2 which predates + # the mapped-ram migration capability we rely on. + sudo apt-get install -y qemu-utils qemu-efi-aarch64 socat genisoimage zstd \ + ninja-build pkg-config python3-venv \ + libglib2.0-dev libpixman-1-dev libslirp-dev libepoxy-dev libgbm-dev + + # QEMU 10.2.2 is required for the mapped-ram + multifd migration path + # used by the fast-resume snapshot. Cache the compiled prefix so CI + # only pays the ~5-8 min build cost once per runner image. 
+ - name: Restore QEMU 10.2.2 cache + id: qemu-cache + uses: actions/cache@v4 + with: + path: /opt/qemu + key: qemu-10.2.2-${{ runner.os }}-${{ runner.arch }}-v1 + + - name: Build QEMU 10.2.2 from source + if: steps.qemu-cache.outputs.cache-hit != 'true' + run: | + set -euxo pipefail + curl -fsSL https://download.qemu.org/qemu-10.2.2.tar.xz -o /tmp/qemu.tar.xz + mkdir -p /tmp/qemu-src + tar -xf /tmp/qemu.tar.xz -C /tmp/qemu-src --strip-components=1 + cd /tmp/qemu-src + ./configure --prefix=/opt/qemu \ + --target-list=x86_64-softmmu,aarch64-softmmu \ + --enable-kvm --enable-slirp --enable-tcg \ + --disable-docs --disable-gtk --disable-sdl --disable-vnc \ + --disable-guest-agent --disable-tools + make -j"$(nproc)" + sudo make install + + - name: Put QEMU 10.2.2 on PATH + run: | + echo "/opt/qemu/bin" >> "$GITHUB_PATH" + /opt/qemu/bin/qemu-system-x86_64 --version + /opt/qemu/bin/qemu-system-aarch64 --version - name: Enable KVM access run: | @@ -82,41 +141,56 @@ jobs: - name: Generate emulator env run: node docker/local-emulator/generate-env-development.mjs - # arm64 runs under cross-arch TCG on an amd64 runner; the backend's - # V8 TurboFan JIT re-triggers the SIGTRAPs we dodge in migrations - # with --no-opt, and even if it didn't, boot is too slow under TCG - # to verify in any sane window. amd64 KVM already exercises the - # service stack; real arm64 hosts have KVM for end-users. - - name: Start emulator and verify + # amd64 runs under KVM on the runner so we can boot the newly-built + # image to verify it works end-to-end before publishing. arm64 runs + # under cross-arch TCG on an amd64 host, which can't reliably boot + # Next.js within any sane window — skipped. 
+ - name: Build stack-cli (for emulator CLI) if: matrix.arch == 'amd64' run: | - chmod +x docker/local-emulator/qemu/run-emulator.sh - EMULATOR_ARCH=${{ matrix.arch }} \ - EMULATOR_READY_TIMEOUT=3200 \ - docker/local-emulator/qemu/run-emulator.sh start + pnpm install --frozen-lockfile --filter '@stackframe/stack-cli...' + # Turbo's trailing `...` filter builds stack-cli AND its workspace + # deps (@stackframe/js, @stackframe/stack-shared, etc.) — stack-cli + # imports them at runtime from their dist/ outputs. + pnpm exec turbo run build --filter='@stackframe/stack-cli...' + + - name: Start emulator and verify + if: matrix.arch == 'amd64' + env: + EMULATOR_ARCH: ${{ matrix.arch }} + EMULATOR_READY_TIMEOUT: 3200 + EMULATOR_IMAGE_DIR: ${{ env.EMULATOR_IMAGE_DIR }} + EMULATOR_RUN_DIR: ${{ env.EMULATOR_RUN_DIR }} + run: node packages/stack-cli/dist/index.js emulator start - name: Verify services are healthy if: matrix.arch == 'amd64' - run: | - EMULATOR_ARCH=${{ matrix.arch }} \ - docker/local-emulator/qemu/run-emulator.sh status + env: + EMULATOR_ARCH: ${{ matrix.arch }} + EMULATOR_IMAGE_DIR: ${{ env.EMULATOR_IMAGE_DIR }} + EMULATOR_RUN_DIR: ${{ env.EMULATOR_RUN_DIR }} + run: node packages/stack-cli/dist/index.js emulator status - name: Stop emulator if: always() && matrix.arch == 'amd64' - run: | - EMULATOR_ARCH=${{ matrix.arch }} \ - docker/local-emulator/qemu/run-emulator.sh stop + env: + EMULATOR_ARCH: ${{ matrix.arch }} + EMULATOR_IMAGE_DIR: ${{ env.EMULATOR_IMAGE_DIR }} + EMULATOR_RUN_DIR: ${{ env.EMULATOR_RUN_DIR }} + run: node packages/stack-cli/dist/index.js emulator stop - name: Package image run: | BASE_IMG="docker/local-emulator/qemu/images/stack-emulator-${{ matrix.arch }}.qcow2" cp "$BASE_IMG" "stack-emulator-${{ matrix.arch }}.qcow2" + ls -lh "stack-emulator-${{ matrix.arch }}.qcow2" - name: Upload image artifact uses: actions/upload-artifact@v4 with: name: qemu-emulator-${{ matrix.arch }} path: stack-emulator-${{ matrix.arch }}.qcow2 + 
if-no-files-found: error retention-days: 30 compression-level: 0 @@ -134,31 +208,80 @@ jobs: steps: - uses: actions/checkout@v6 - - name: Install QEMU dependencies + - name: Install system dependencies run: | sudo apt-get update - sudo apt-get install -y qemu-system-x86 qemu-utils genisoimage socat + sudo apt-get install -y qemu-utils socat zstd \ + ninja-build pkg-config python3-venv \ + libglib2.0-dev libpixman-1-dev libslirp-dev libepoxy-dev libgbm-dev + + - name: Restore QEMU 10.2.2 cache + id: qemu-cache + uses: actions/cache@v4 + with: + path: /opt/qemu + key: qemu-10.2.2-${{ runner.os }}-${{ runner.arch }}-v1 + + - name: Build QEMU 10.2.2 from source + if: steps.qemu-cache.outputs.cache-hit != 'true' + run: | + set -euxo pipefail + curl -fsSL https://download.qemu.org/qemu-10.2.2.tar.xz -o /tmp/qemu.tar.xz + mkdir -p /tmp/qemu-src + tar -xf /tmp/qemu.tar.xz -C /tmp/qemu-src --strip-components=1 + cd /tmp/qemu-src + ./configure --prefix=/opt/qemu \ + --target-list=x86_64-softmmu,aarch64-softmmu \ + --enable-kvm --enable-slirp --enable-tcg \ + --disable-docs --disable-gtk --disable-sdl --disable-vnc \ + --disable-guest-agent --disable-tools + make -j"$(nproc)" + sudo make install + + - name: Put QEMU 10.2.2 on PATH + run: | + echo "/opt/qemu/bin" >> "$GITHUB_PATH" + /opt/qemu/bin/qemu-system-x86_64 --version + + - uses: pnpm/action-setup@v4 + with: + version: 10.23.0 + + - uses: actions/setup-node@v4 + with: + node-version: 22 + cache: pnpm + + - name: Install stack-cli deps + build + run: | + pnpm install --frozen-lockfile --filter '@stackframe/stack-cli...' + # Turbo's trailing `...` filter builds stack-cli AND its workspace + # deps (@stackframe/js, @stackframe/stack-shared, etc.) — stack-cli + # imports them at runtime from their dist/ outputs. + pnpm exec turbo run build --filter='@stackframe/stack-cli...' 
- name: Download built image uses: actions/download-artifact@v4 with: name: qemu-emulator-${{ matrix.arch }} - path: docker/local-emulator/qemu/images/ + path: ${{ github.workspace }}/.stack-emulator-images/ - - name: Generate emulator env - run: node docker/local-emulator/generate-env-development.mjs + - name: Place qcow2 into STACK_EMULATOR_HOME layout + run: | + mkdir -p "$STACK_EMULATOR_HOME/images" + cp "${{ github.workspace }}/.stack-emulator-images/stack-emulator-${{ matrix.arch }}.qcow2" "$STACK_EMULATOR_HOME/images/" + ls -lh "$STACK_EMULATOR_HOME/images/" - - name: Start emulator from artifact + # No savevm.zst artifact (users capture locally via `emulator pull`), + # so `emulator start` cold-boots the qcow2. Budget accordingly. + - name: Start emulator via CLI run: | - chmod +x docker/local-emulator/qemu/run-emulator.sh docker/local-emulator/qemu/common.sh EMULATOR_ARCH=${{ matrix.arch }} \ EMULATOR_READY_TIMEOUT=600 \ - docker/local-emulator/qemu/run-emulator.sh start + node packages/stack-cli/dist/index.js emulator start - name: Verify services are healthy - run: | - EMULATOR_ARCH=${{ matrix.arch }} \ - docker/local-emulator/qemu/run-emulator.sh status + run: node packages/stack-cli/dist/index.js emulator status - name: Smoke test — backend health run: curl -sf http://localhost:26701/health?db=1 @@ -174,13 +297,11 @@ jobs: - name: Stop emulator if: always() - run: | - EMULATOR_ARCH=${{ matrix.arch }} \ - docker/local-emulator/qemu/run-emulator.sh stop + run: node packages/stack-cli/dist/index.js emulator stop - name: Print serial log on failure if: failure() - run: tail -100 docker/local-emulator/qemu/run/vm/serial.log 2>/dev/null || true + run: tail -100 "$STACK_EMULATOR_HOME/run/vm/serial.log" 2>/dev/null || true publish: name: Publish to GitHub Releases @@ -220,8 +341,14 @@ jobs: ### Images | File | Description | |------|-------------| - | \`stack-emulator-arm64.qcow2\` | ARM64 emulator image | - | \`stack-emulator-amd64.qcow2\` | AMD64 emulator 
image | + | \`stack-emulator-arm64.qcow2\` | ARM64 disk image | + | \`stack-emulator-amd64.qcow2\` | AMD64 disk image | + + \`emulator pull\` downloads the qcow2 and captures a local fast-start + snapshot (~1-3 min). Subsequent \`emulator start\`s resume in ~3-8 s. + Snapshots are captured locally because QEMU migration state isn't + portable across accelerators (KVM / HVF / TCG) or \`-cpu max\` + feature sets. ### Usage \`\`\`bash diff --git a/.gitignore b/.gitignore index eab9ce0b48..8fa5c69404 100644 --- a/.gitignore +++ b/.gitignore @@ -144,3 +144,6 @@ packages/stack/* !packages/react/package.json !packages/next/package.json !packages/stack/package.json + +# claude code +.claude/scheduled_tasks.lock diff --git a/apps/backend/src/lib/seed-dummy-data.ts b/apps/backend/src/lib/seed-dummy-data.ts index 346724680f..c784d6e439 100644 --- a/apps/backend/src/lib/seed-dummy-data.ts +++ b/apps/backend/src/lib/seed-dummy-data.ts @@ -1485,6 +1485,12 @@ async function seedDummySessionActivityEvents(options: SessionActivityEventSeedO await tx.event.createMany({ data: events, }); + }, { + // Under cross-arch arm64 TCG in the emulator qcow2 build, this batch + // takes ~10s; Prisma's default is 5s. Production (KVM/native) runs it + // in well under 1s, so the looser bound only kicks in when the DB is + // genuinely slow. + timeout: 30_000, }); if (clickhouseClient && clickhouseRows.length > 0) { diff --git a/docker/local-emulator/Dockerfile b/docker/local-emulator/Dockerfile index 4cb1ea7d60..f08a977a92 100644 --- a/docker/local-emulator/Dockerfile +++ b/docker/local-emulator/Dockerfile @@ -58,8 +58,22 @@ ENV NEXT_PUBLIC_STACK_STRIPE_PUBLISHABLE_KEY=pk_test_mock_publishable_key_for_lo # Build the backend NextJS app RUN pnpm turbo run docker-build --filter=@stackframe/backend... --filter=@stackframe/dashboard... -# Build the self-host seed script -RUN cd apps/backend && pnpm build-self-host-migration-script +# Build the self-host seed script. 
+# tsdown -> rolldown is multi-threaded Rust; under qemu-user (cross-arch +# arm64-on-amd64) its futex emulation occasionally deadlocks and the build +# hangs forever. Bound each attempt and retry to ride out the race. +RUN cd apps/backend && \ + attempt=1; \ + while :; do \ + timeout --kill-after=30s 600s pnpm build-self-host-migration-script && break; \ + rc=$?; \ + if [ "$attempt" -ge 3 ]; then \ + echo "build-self-host-migration-script failed after $attempt attempts (last rc=$rc)" >&2; \ + exit "$rc"; \ + fi; \ + echo "build-self-host-migration-script attempt $attempt failed (rc=$rc); retrying..." >&2; \ + attempt=$((attempt + 1)); \ + done # Prune node_modules for runtime: remove dev tools, heavy UI packages, @@ -263,10 +277,11 @@ COPY docker/local-emulator/run-cron-jobs.sh /run-cron-jobs.sh COPY docker/local-emulator/entrypoint.sh /entrypoint.sh COPY docker/local-emulator/init-services.sh /init-services.sh COPY docker/local-emulator/start-app.sh /start-app.sh +COPY docker/local-emulator/rotate-secrets.sh /usr/local/bin/rotate-secrets COPY docker/local-emulator/clickhouse-config.xml /etc/clickhouse-server/config.xml COPY docker/local-emulator/clickhouse-users.xml /etc/clickhouse-server/users.xml COPY docker/server/entrypoint.sh /app-entrypoint.sh -RUN chmod +x /entrypoint.sh /init-services.sh /start-app.sh /app-entrypoint.sh /run-cron-jobs.sh +RUN chmod +x /entrypoint.sh /init-services.sh /start-app.sh /app-entrypoint.sh /run-cron-jobs.sh /usr/local/bin/rotate-secrets # PostgreSQL: 5432, Redis: 6379, Inbucket: 2500/9001/1100, # Svix: 8071, ClickHouse: 8123/9009, MinIO: 9090, QStash: 8080 diff --git a/docker/local-emulator/entrypoint.sh b/docker/local-emulator/entrypoint.sh index 562cb67955..58157c7914 100644 --- a/docker/local-emulator/entrypoint.sh +++ b/docker/local-emulator/entrypoint.sh @@ -33,6 +33,12 @@ fi # baked-in mock value from .env.development to be a usable credential against # a running emulator. 
Overriding here propagates to both the backend and the # run-cron-jobs.sh loop via supervisord's inherited environment. -export CRON_SECRET="$(openssl rand -hex 32)" +# +# In snapshot-build mode the VM supplies a deterministic placeholder via the +# --env-file so the baked snapshot doesn't contain a real secret; on resume, +# /usr/local/bin/rotate-secrets swaps in a fresh per-install value. +if [ -z "${CRON_SECRET:-}" ]; then + export CRON_SECRET="$(openssl rand -hex 32)" +fi exec /usr/bin/supervisord -n -c /etc/supervisor/conf.d/supervisord.conf diff --git a/docker/local-emulator/qemu/build-image.sh b/docker/local-emulator/qemu/build-image.sh index f4d91771b7..741d60d029 100755 --- a/docker/local-emulator/qemu/build-image.sh +++ b/docker/local-emulator/qemu/build-image.sh @@ -12,9 +12,34 @@ REPO_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)" DEBIAN_VERSION="${DEBIAN_VERSION:-13}" DISK_SIZE="${EMULATOR_DISK_SIZE:-12G}" RAM="${EMULATOR_BUILD_RAM:-4096}" -CPUS="${EMULATOR_BUILD_CPUS:-$(sysctl -n hw.ncpu 2>/dev/null || nproc 2>/dev/null || echo 4)}" PROVISION_TIMEOUT="${EMULATOR_PROVISION_TIMEOUT:-3200}" EMULATOR_IMAGE_NAME="${EMULATOR_IMAGE_NAME:-stack-local-emulator}" +# Snapshot-ready qcow2: bake deterministic placeholder secrets (PCK/SSK/SAK/ +# CRON_SECRET) into the image so runtime `rotate-secrets` can swap them for +# fresh per-install values on every `emulator start`. Without this, the image +# would ship with random shared secrets — a security regression. Cheap to +# build (no extra wall-clock cost in CI), so it stays on by default. +EMULATOR_BUILD_SNAPSHOT="${EMULATOR_BUILD_SNAPSHOT:-1}" +# Capture RAM/device state via QMP at build time, producing a +# `stack-emulator-.savevm.zst` next to the qcow2. 
Off by default — +# users capture locally on first `stack emulator pull` (run-emulator.sh +# capture) because migration state isn't portable across accelerators +# (KVM/HVF/TCG) or `-cpu max` feature sets, so a CI-captured snapshot +# couldn't resume reliably on arbitrary user hardware. Implies +# EMULATOR_BUILD_SNAPSHOT=1. +EMULATOR_CAPTURE_SAVEVM="${EMULATOR_CAPTURE_SAVEVM:-0}" +if [ "$EMULATOR_CAPTURE_SAVEVM" = "1" ] && [ "$EMULATOR_BUILD_SNAPSHOT" != "1" ]; then + echo "EMULATOR_CAPTURE_SAVEVM=1 requires EMULATOR_BUILD_SNAPSHOT=1" >&2 + exit 1 +fi +# Capture mode pins SMP to a fixed value so the resume QEMU command (which +# uses EMULATOR_CPUS, default 4) can match the captured device topology — +# RAM migration replay requires identical vCPU count. +if [ "$EMULATOR_CAPTURE_SAVEVM" = "1" ]; then + CPUS="${EMULATOR_BUILD_CPUS:-4}" +else + CPUS="${EMULATOR_BUILD_CPUS:-$(sysctl -n hw.ncpu 2>/dev/null || nproc 2>/dev/null || echo 4)}" +fi RED='\033[0;31m' GREEN='\033[0;32m' @@ -47,9 +72,18 @@ check_deps() { command -v "$qemu_bin" >/dev/null 2>&1 || missing+=("$qemu_bin") done - for cmd in qemu-img curl docker gzip; do + for cmd in qemu-img curl gzip; do command -v "$cmd" >/dev/null 2>&1 || missing+=("$cmd") done + if [ "${SKIP_DOCKER_BUILD:-0}" != "1" ]; then + command -v docker >/dev/null 2>&1 || missing+=("docker") + fi + + if [ "$EMULATOR_CAPTURE_SAVEVM" = "1" ]; then + for cmd in socat zstd; do + command -v "$cmd" >/dev/null 2>&1 || missing+=("$cmd") + done + fi if ! command -v mkisofs >/dev/null 2>&1 && ! command -v genisoimage >/dev/null 2>&1 && ! command -v hdiutil >/dev/null 2>&1; then missing+=("mkisofs/genisoimage/hdiutil") @@ -231,6 +265,9 @@ persist_provision_logs() { cp "$provision_log" "$IMAGE_DIR/provision-emulator-${arch}.progress.log" 2>/dev/null || true } +# qmp_session() and capture_vm_state() live in common.sh; both build-image.sh +# (CI) and run-emulator.sh (stack emulator pull local capture) call them. 
+ build_one() { local arch="$1" local base_img="$IMAGE_DIR/debian-${DEBIAN_VERSION}-base-${arch}.qcow2" @@ -245,7 +282,9 @@ build_one() { local tmp_img="$tmp_dir/disk.qcow2" local seed_iso="$tmp_dir/seed.iso" local bundle_iso="$tmp_dir/bundle.iso" + local runtime_iso="$tmp_dir/runtime.iso" local bundle_dir="$tmp_dir/bundle" + local runtime_cfg_dir="$tmp_dir/runtime" local serial_log="$tmp_dir/serial.log" local provision_log="$tmp_dir/provision.log" local pidfile="$tmp_dir/qemu.pid" @@ -269,16 +308,83 @@ build_one() { mkdir -p "$bundle_dir" cp "$bundle_tgz" "$bundle_dir/img.tgz" cp "$BUILD_ENV_FILE" "$bundle_dir/build.env" + if [ "$EMULATOR_BUILD_SNAPSHOT" = "1" ]; then + # Guest reads this flag to use deterministic placeholder secrets so that + # runtime rotate-secrets can swap them out per-install. + printf 'STACK_EMULATOR_BUILD_SNAPSHOT=1\n' >> "$bundle_dir/build.env" + fi + if [ "$EMULATOR_CAPTURE_SAVEVM" = "1" ]; then + # Guest reads this flag to start stack.service during provision-build, + # wait for backend+dashboard health, then block forever waiting for the + # host to capture VM state via QMP (stop + migrate + quit). + printf 'STACK_EMULATOR_CAPTURE_SAVEVM=1\n' >> "$bundle_dir/build.env" + fi # Tell the guest which arch it's being built for so cross-arch (TCG) builds # can skip the smoke test, which isn't reliable under software emulation. printf 'STACK_EMULATOR_BUILD_ARCH=%s\n' "$arch" > "$bundle_dir/build-arch.env" make_iso_from_dir "$bundle_iso" "STACKBUNDLE" "$bundle_dir" + if [ "$EMULATOR_CAPTURE_SAVEVM" = "1" ]; then + # render-stack-env (inside the guest) mounts a STACKCFG disk containing + # runtime.env + base.env. At runtime the host-side run-emulator.sh builds + # this ISO; in capture mode stack.service also starts during the build, + # so we must provide the same shape here. 
Values mirror the defaults the + # runtime would supply — port-prefix 81 and matching host-port numbers + # (unused at build time since nothing is port-forwarded, but + # render-stack-env embeds them into /run/stack-auth/local-emulator.env). + mkdir -p "$runtime_cfg_dir" + { + printf 'STACK_EMULATOR_PORT_PREFIX=81\n' + printf 'STACK_EMULATOR_DASHBOARD_HOST_PORT=26700\n' + printf 'STACK_EMULATOR_BACKEND_HOST_PORT=26701\n' + printf 'STACK_EMULATOR_MINIO_HOST_PORT=26702\n' + printf 'STACK_EMULATOR_INBUCKET_HOST_PORT=26703\n' + printf 'STACK_EMULATOR_VM_DIR_HOST=\n' + } > "$runtime_cfg_dir/runtime.env" + cp "$BUILD_ENV_FILE" "$runtime_cfg_dir/base.env" + make_iso_from_dir "$runtime_iso" "STACKCFG" "$runtime_cfg_dir" + fi + : > "$serial_log" : > "$provision_log" qemu_base="$(qemu_cmd_prefix_for_arch "$arch")" log "QEMU command prefix (${arch}): $qemu_base" + local monitor_sock="$tmp_dir/monitor.sock" + local qga_sock="$tmp_dir/qga.sock" + local snapshot_args=() + local runtime_disk_args=() + local virtfs_args=(-virtfs "local,path=$tmp_dir,mount_tag=hostfs,security_model=none") + if [ "$EMULATOR_CAPTURE_SAVEVM" = "1" ]; then + # STACKCFG runtime ISO lets stack.service start during the build — same + # disk shape render-stack-env expects at runtime. Placed before netdev + # so its virtio-blk PCI slot precedes virtio-net-pci, matching the + # resume argv order in run-emulator.sh (slots must line up or + # migrate-incoming fails the device-tree check). + runtime_disk_args=( + -drive "file=$runtime_iso,format=raw,if=virtio,readonly=on" + ) + # QMP for stop/migrate/quit; virtio-serial + QGA channel so we can exec + # inside the guest post-resume (only needed at runtime but harmless here). 
+ snapshot_args=( + -chardev "socket,id=monitor,path=$monitor_sock,server=on,wait=off" + -mon "chardev=monitor,mode=control" + -chardev "socket,path=$qga_sock,server=on,wait=off,id=qga0" + -device virtio-serial + -device "virtserialport,chardev=qga0,name=org.qemu.guest_agent.0" + # Empty PCIe root port reserved for runtime hot-plug of virtio-9p. + # The integrated pcie.0 bus on q35 / arm64-virt is static — hotplug + # only works through a root port. Must be present at snapshot capture + # so the resumed device tree matches. + -device "pcie-root-port,id=hostfs-port,bus=pcie.0,chassis=1" + ) + # QEMU disallows migration when virtfs is mounted in the guest — virtfs + # has guest-side state (open handles, mount table) that isn't migratable. + # Drop the host fs mount in capture mode; STACK_SERVICES_READY still + # arrives on the serial log so contains_provision_marker can detect it. + virtfs_args=() + fi + # shellcheck disable=SC2086 $qemu_base \ -boot order=c \ @@ -287,18 +393,24 @@ build_one() { -drive "file=$tmp_img,format=qcow2,if=virtio,discard=on,detect-zeroes=unmap" \ -drive "file=$seed_iso,format=raw,if=virtio,readonly=on" \ -drive "file=$bundle_iso,format=raw,if=virtio,readonly=on" \ + ${runtime_disk_args[@]+"${runtime_disk_args[@]}"} \ -netdev user,id=net0 \ -device virtio-net-pci,netdev=net0 \ - -virtfs "local,path=$tmp_dir,mount_tag=hostfs,security_model=none" \ + ${virtfs_args[@]+"${virtfs_args[@]}"} \ + ${snapshot_args[@]+"${snapshot_args[@]}"} \ -serial "file:$serial_log" \ -display none \ -daemonize \ -pidfile "$pidfile" pid="$(cat "$pidfile")" + local ready_marker="STACK_CLOUD_INIT_DONE" + if [ "$EMULATOR_CAPTURE_SAVEVM" = "1" ]; then + ready_marker="STACK_SERVICES_READY" + fi elapsed=0 while [ "$elapsed" -lt "$PROVISION_TIMEOUT" ]; do - if contains_provision_marker "$provision_log" "$serial_log" "STACK_CLOUD_INIT_DONE"; then + if contains_provision_marker "$provision_log" "$serial_log" "$ready_marker"; then break fi @@ -312,7 +424,7 @@ build_one() { 
if [ "$total_build_lines" -gt "$last_build_lines" ]; then echo "" sed -n "$((last_build_lines + 1)),${total_build_lines}p" "$provision_log" 2>/dev/null | while IFS= read -r msg; do - if [ "$msg" = "STACK_CLOUD_INIT_DONE" ]; then + if [ "$msg" = "STACK_CLOUD_INIT_DONE" ] || [ "$msg" = "STACK_SERVICES_READY" ]; then continue fi printf " [%3ds] %s\n" "$elapsed" "$msg" @@ -332,7 +444,7 @@ build_one() { done echo "" - if ! contains_provision_marker "$provision_log" "$serial_log" "STACK_CLOUD_INIT_DONE"; then + if ! contains_provision_marker "$provision_log" "$serial_log" "$ready_marker"; then if [ "$guest_failed" = true ]; then err "Guest provisioning reported failure for emulator (${arch})" elif [ "$guest_exited" = true ]; then @@ -358,17 +470,69 @@ build_one() { exit 1 fi - local shutdown_wait=0 - while [ "$shutdown_wait" -lt 90 ] && kill -0 "$pid" 2>/dev/null; do - sleep 1 - shutdown_wait=$((shutdown_wait + 1)) - done + if [ "$EMULATOR_CAPTURE_SAVEVM" = "1" ]; then + local savevm_file="$IMAGE_DIR/stack-emulator-${arch}.savevm.zst" + local savevm_raw="$tmp_dir/state.raw" + local savevm_tmp="$tmp_dir/state.zst" + + # Capture raw RAM/device state via QEMU's native file: migration; then + # compress on the host side. Avoids any reliance on QEMU spawning a shell + # that has zstd in PATH. + log "Capturing VM state via QMP (${arch})..." + if ! capture_vm_state "$monitor_sock" "$savevm_raw"; then + err "Failed to capture VM state for ${arch}" + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" 2>/dev/null || true + sleep 1 + kill -9 "$pid" 2>/dev/null || true + fi + persist_provision_logs "$arch" "$serial_log" "$provision_log" + rm -rf "$tmp_dir" + exit 1 + fi - if kill -0 "$pid" 2>/dev/null; then - warn "Guest did not power off cleanly; forcing shutdown." - kill "$pid" 2>/dev/null || true - sleep 2 - kill -9 "$pid" 2>/dev/null || true + # QEMU exited cleanly via `quit`. Wait briefly to release the pid file. 
+ local shutdown_wait=0 + while [ "$shutdown_wait" -lt 30 ] && kill -0 "$pid" 2>/dev/null; do + sleep 1 + shutdown_wait=$((shutdown_wait + 1)) + done + if kill -0 "$pid" 2>/dev/null; then + warn "QEMU did not exit after quit; forcing." + kill "$pid" 2>/dev/null || true + sleep 2 + kill -9 "$pid" 2>/dev/null || true + fi + + if [ ! -s "$savevm_raw" ]; then + err "VM state file missing or empty at $savevm_raw" + persist_provision_logs "$arch" "$serial_log" "$provision_log" + rm -rf "$tmp_dir" + exit 1 + fi + + # zstd -1 trades ~30% larger file for ~40% faster decompression at resume. + # For shipping-and-decompress-once-per-start, that's the right balance. + log "Compressing VM state with zstd..." + zstd -1 -T0 --rm -o "$savevm_tmp" "$savevm_raw" + + mv "$savevm_tmp" "$savevm_file" + local savevm_size + savevm_size="$(du -h "$savevm_file" | cut -f1)" + log "Saved VM state: $savevm_file (${savevm_size})" + else + local shutdown_wait=0 + while [ "$shutdown_wait" -lt 90 ] && kill -0 "$pid" 2>/dev/null; do + sleep 1 + shutdown_wait=$((shutdown_wait + 1)) + done + + if kill -0 "$pid" 2>/dev/null; then + warn "Guest did not power off cleanly; forcing shutdown." + kill "$pid" 2>/dev/null || true + sleep 2 + kill -9 "$pid" 2>/dev/null || true + fi fi persist_provision_logs "$arch" "$serial_log" "$provision_log" @@ -389,8 +553,16 @@ BUILD_ENV_FILE="$REPO_ROOT/docker/local-emulator/.env.development" for arch in "${TARGET_ARCHS[@]}"; do local_base="$IMAGE_DIR/debian-${DEBIAN_VERSION}-base-${arch}.qcow2" download_cloud_image "$arch" "$local_base" - build_local_emulator_image "$arch" - prepare_bundle_artifacts "$arch" + if [ "${SKIP_DOCKER_BUILD:-0}" = "1" ]; then + log "SKIP_DOCKER_BUILD=1: reusing pre-built Docker bundle" + if [ ! 
-f "$IMAGE_DIR/emulator-${arch}-docker-images.tar.gz" ]; then + err "Pre-built bundle not found: $IMAGE_DIR/emulator-${arch}-docker-images.tar.gz" + exit 1 + fi + else + build_local_emulator_image "$arch" + prepare_bundle_artifacts "$arch" + fi build_one "$arch" done diff --git a/docker/local-emulator/qemu/cloud-init/emulator/user-data b/docker/local-emulator/qemu/cloud-init/emulator/user-data index 38fe2b0646..6ec0976192 100644 --- a/docker/local-emulator/qemu/cloud-init/emulator/user-data +++ b/docker/local-emulator/qemu/cloud-init/emulator/user-data @@ -75,12 +75,24 @@ write_files: # ssk/sak: required by the emulator's own dashboard (StackServerApp # construction throws without them). Not used by user-app flows; the # /local-emulator/project route mints separate per-project credentials. + # + # Snapshot-build mode (STACK_EMULATOR_BUILD_SNAPSHOT=1 in /etc/stack-build.env): + # use deterministic placeholder hex strings instead of random values. The + # built image then contains these placeholders; at every `emulator start` + # resume the host generates fresh per-install secrets and + # /usr/local/bin/rotate-secrets (inside the stack container) swaps them in. umask 077 - for key in internal-pck internal-ssk internal-sak; do - if [ ! -s "/var/lib/stack-auth/$key" ]; then - openssl rand -hex 32 > "/var/lib/stack-auth/$key" - fi - done + if [ -f /etc/stack-build.env ] && grep -q '^STACK_EMULATOR_BUILD_SNAPSHOT=1' /etc/stack-build.env 2>/dev/null; then + printf '%s' '00000000000000000000000000000000ffffffffffffffffffffffffffffffff' > /var/lib/stack-auth/internal-pck + printf '%s' '00000000000000000000000000000000eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee' > /var/lib/stack-auth/internal-ssk + printf '%s' '00000000000000000000000000000000dddddddddddddddddddddddddddddddd' > /var/lib/stack-auth/internal-sak + else + for key in internal-pck internal-ssk internal-sak; do + if [ ! 
-s "/var/lib/stack-auth/$key" ]; then + openssl rand -hex 32 > "/var/lib/stack-auth/$key" + fi + done + fi INTERNAL_PCK="$(cat /var/lib/stack-auth/internal-pck)" INTERNAL_SSK="$(cat /var/lib/stack-auth/internal-ssk)" INTERNAL_SAK="$(cat /var/lib/stack-auth/internal-sak)" @@ -92,6 +104,15 @@ write_files: HOST_SERVICES_HOST=10.0.2.2 P="$STACK_EMULATOR_PORT_PREFIX" + # Snapshot-build mode: ship a deterministic placeholder CRON_SECRET so the + # baked VM contains a known-public value that rotate-secrets swaps out on + # every resume. Outside snapshot-build mode, leave CRON_SECRET unset so + # docker/local-emulator/entrypoint.sh generates a fresh random one. + EMULATOR_CRON_SECRET="" + if [ -f /etc/stack-build.env ] && grep -q '^STACK_EMULATOR_BUILD_SNAPSHOT=1' /etc/stack-build.env 2>/dev/null; then + EMULATOR_CRON_SECRET="00000000000000000000000000000000cccccccccccccccccccccccccccccccc" + fi + { # Static vars from base config and runtime (e.g. API keys, feature flags) cat /mnt/stack-runtime/base.env @@ -99,6 +120,9 @@ write_files: printf 'STACK_SEED_INTERNAL_PROJECT_PUBLISHABLE_CLIENT_KEY=%s\n' "$INTERNAL_PCK" printf 'STACK_SEED_INTERNAL_PROJECT_SECRET_SERVER_KEY=%s\n' "$INTERNAL_SSK" printf 'STACK_SEED_INTERNAL_PROJECT_SUPER_SECRET_ADMIN_KEY=%s\n' "$INTERNAL_SAK" + if [ -n "$EMULATOR_CRON_SECRET" ]; then + printf 'CRON_SECRET=%s\n' "$EMULATOR_CRON_SECRET" + fi # Computed vars — depend on port prefix or deps host # Host-side ports (for browser URLs — browser runs on host, not in VM) @@ -142,15 +166,46 @@ write_files: permissions: '0755' content: | #!/bin/bash - set -euo pipefail + # Mount the host filesystem at /host. Two modes: + # (no args) — cold-boot: bind /host on itself, make it a shared + # mount point, then mount virtio-9p on top. The + # bind+shared step is what lets the docker bind + # mount (-v /host:/host:rshared) receive later + # propagation events. 
+ # --post-resume — snapshot-resume: /host is already shared (set up + # at build time and preserved across the snapshot, + # plus the docker bind mount has rshared + # propagation). The host has just hot-plugged + # virtio-9p; mount it on /host and the new mount + # propagates into the running container. + set -uo pipefail mkdir -p /host + + # Idempotent: bind /host on itself once so it becomes a mount point + # with its own propagation, then make it shared. mount --make-shared + # requires a mount point, hence the bind first. if ! mountpoint -q /host; then - if ! mount -t 9p -o trans=virtio,version=9p2000.L hostfs /host; then - echo "Failed to mount host filesystem at /host" >&2 - exit 1 + mount --bind /host /host + fi + mount --make-shared /host + + if [ "${1:-}" = "--post-resume" ]; then + if mount -t 9p -o trans=virtio,version=9p2000.L hostfs /host; then + exit 0 fi + echo "post-resume 9p mount failed" >&2 + exit 1 fi + # Cold boot. In snapshot-build mode the host detaches virtfs (QEMU + # disallows migration while it's mounted), so the 9p mount may not be + # available — tolerate that and fall through to an empty /host. + if mount -t 9p -o trans=virtio,version=9p2000.L hostfs /host 2>/dev/null; then + exit 0 + fi + echo "host filesystem unavailable; continuing with empty /host" >&2 + exit 0 + - path: /usr/local/bin/run-stack-container permissions: '0755' content: | @@ -190,7 +245,7 @@ write_files: -v stack-clickhouse-data:/data/clickhouse \ -v stack-minio-data:/data/minio \ -v stack-inbucket-data:/data/inbucket \ - -v /host:/host \ + -v /host:/host:rshared \ stack-local-emulator 2>&1 | tee -a "$host_log" else exec docker run \ @@ -204,7 +259,7 @@ write_files: -v stack-clickhouse-data:/data/clickhouse \ -v stack-minio-data:/data/minio \ -v stack-inbucket-data:/data/inbucket \ - -v /host:/host \ + -v /host:/host:rshared \ stack-local-emulator fi @@ -522,6 +577,74 @@ write_files: fstrim -av 2>/dev/null || true log "slim-docker-image done." 
+ - path: /usr/local/bin/wait-for-stack-ready + permissions: '0755' + content: | + #!/bin/bash + # Poll the stack container's backend + dashboard on the guest's own + # localhost until both respond healthy. Used at snapshot-build time to + # gate "emit STACK_SERVICES_READY" on the app actually being warm. + set -uo pipefail + + TIMEOUT="${STACK_READY_TIMEOUT:-600}" + BACKEND_PORT="${STACK_READY_BACKEND_PORT:-8102}" + DASHBOARD_PORT="${STACK_READY_DASHBOARD_PORT:-8101}" + + log() { /usr/local/bin/log-provision "wait-for-stack-ready: $*"; } + + start=$SECONDS + next_heartbeat=$((start + 30)) + log "waiting for backend:$BACKEND_PORT and dashboard:$DASHBOARD_PORT (timeout=${TIMEOUT}s)" + while true; do + backend_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 3 "http://127.0.0.1:${BACKEND_PORT}/health?db=1" 2>/dev/null || true) + dashboard_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 3 "http://127.0.0.1:${DASHBOARD_PORT}/handler/sign-in" 2>/dev/null || true) + if [ "$backend_code" = "200" ] && [ "$dashboard_code" = "200" ]; then + log "ready ($((SECONDS - start))s)" + exit 0 + fi + if [ "$SECONDS" -ge "$next_heartbeat" ]; then + log "still waiting (backend=$backend_code dashboard=$dashboard_code, $((SECONDS - start))s elapsed)" + next_heartbeat=$((SECONDS + 30)) + fi + if [ "$((SECONDS - start))" -ge "$TIMEOUT" ]; then + log "TIMEOUT after $((SECONDS - start))s (backend=$backend_code dashboard=$dashboard_code)" + docker ps -a 2>&1 | /usr/local/bin/log-provision-stream "wait-for-stack-ready: ps" || true + docker logs --tail 200 stack 2>&1 | /usr/local/bin/log-provision-stream "wait-for-stack-ready: stack" || true + systemctl status stack.service --no-pager -l 2>&1 | /usr/local/bin/log-provision-stream "wait-for-stack-ready: svc" || true + journalctl -u stack.service --no-pager -n 100 2>&1 | /usr/local/bin/log-provision-stream "wait-for-stack-ready: jrnl" || true + docker image ls 2>&1 | /usr/local/bin/log-provision-stream "wait-for-stack-ready: img" 
|| true + exit 1 + fi + sleep 2 + done + + - path: /usr/local/bin/trigger-fast-rotate + permissions: '0755' + content: | + #!/bin/bash + # Called via qemu-guest-agent on every snapshot resume. Reads fresh + # secrets from stdin (key=value lines, written by the host via QGA's + # guest-exec input-data) and execs rotate-secrets inside the stack + # container with those values exported. + set -euo pipefail + + tmp="$(mktemp /var/run/stack-fresh-XXXXXX.env)" + cat > "$tmp" + chmod 0600 "$tmp" + + # shellcheck disable=SC1090 + set -a + source "$tmp" + set +a + rm -f "$tmp" + + exec docker exec \ + -e STACK_SEED_INTERNAL_PROJECT_PUBLISHABLE_CLIENT_KEY \ + -e STACK_SEED_INTERNAL_PROJECT_SECRET_SERVER_KEY \ + -e STACK_SEED_INTERNAL_PROJECT_SUPER_SECRET_ADMIN_KEY \ + -e CRON_SECRET \ + stack /usr/local/bin/rotate-secrets + - path: /etc/systemd/system/stack.service content: | [Unit] @@ -591,6 +714,14 @@ write_files: systemctl disable --now ssh || true systemctl mask ssh || true + # qemu-guest-agent: used by the host to inject fresh secrets + trigger + # rotate-secrets after a snapshot resume. Must be running INSIDE the VM + # at snapshot capture time — the virtio-serial port's "open" state is + # part of the migrated device state. If QGA wasn't connected at capture, + # the resumed VM's port stays closed and the host can't reach it. + systemctl enable qemu-guest-agent || true + systemctl start qemu-guest-agent || true + log_provision "installing emulator containers" bash /usr/local/bin/install-emulator-containers @@ -603,6 +734,53 @@ write_files: log_provision "starting slim-docker-image" bash /usr/local/bin/slim-docker-image + # Capture mode: bring the stack container up, wait for full + # readiness, emit STACK_SERVICES_READY, then wait indefinitely for the + # host build script to capture VM state over QMP (stop + migrate + quit). + # The VM never shuts itself down in this path — the host tears it down + # once the savevm file has been written. 
+ # + # CI never sets STACK_EMULATOR_CAPTURE_SAVEVM=1 (snapshots aren't + # portable across accelerators, so they're captured locally on first + # `stack emulator pull`). This branch only fires for opt-in local + # builds run with EMULATOR_CAPTURE_SAVEVM=1. + if [ -f /etc/stack-build.env ] && grep -q '^STACK_EMULATOR_CAPTURE_SAVEVM=1' /etc/stack-build.env 2>/dev/null; then + log_provision "capture mode: starting stack.service" + systemctl start stack.service || true + + log_provision "waiting for backend + dashboard to be ready" + if ! /usr/local/bin/wait-for-stack-ready; then + log_provision "ERROR: stack services did not become ready" + exit 1 + fi + + # Ensure qemu-guest-agent is running so its virtio-serial port stays + # "open" in the snapshot — the host needs that port at runtime to + # trigger rotate-secrets. + log_provision "ensuring qemu-guest-agent is up" + systemctl restart qemu-guest-agent || true + sleep 2 + if ! systemctl is-active --quiet qemu-guest-agent; then + log_provision "ERROR: qemu-guest-agent failed to start" + systemctl status qemu-guest-agent --no-pager -l 2>&1 | /usr/local/bin/log-provision-stream "qga" + exit 1 + fi + log_provision "qemu-guest-agent active" + + log_provision "services ready; signalling STACK_SERVICES_READY" + if [ -n "${STACK_PROVISION_LOG_FILE:-}" ]; then + printf '%s\n' "STACK_SERVICES_READY" >> "$STACK_PROVISION_LOG_FILE" + fi + write_marker_to_consoles "STACK_SERVICES_READY" + sync || true + + # Clear the EXIT trap so the cleanup path doesn't mark this as failed + # when the host powers us off via QMP quit. + trap - EXIT + # Block forever; host will issue qmp quit after migrate completes. 
+ while true; do sleep 3600; done + fi + log_provision "build pipeline complete" if [ -n "${STACK_PROVISION_LOG_FILE:-}" ]; then printf '%s\n' "STACK_CLOUD_INIT_DONE" >> "$STACK_PROVISION_LOG_FILE" diff --git a/docker/local-emulator/qemu/common.sh b/docker/local-emulator/qemu/common.sh index 1e3374dad4..f5d3392d9d 100755 --- a/docker/local-emulator/qemu/common.sh +++ b/docker/local-emulator/qemu/common.sh @@ -68,3 +68,142 @@ make_iso_from_dir() { exit 1 fi } + +# Send one or more QMP commands over the monitor socket. Stdin is a stream of +# JSON objects; qmp_capabilities is always sent first to exit negotiation mode. +# Keep stdin open briefly after writing so socat doesn't close before QEMU +# responds — QMP replies in milliseconds so 0.5s is plenty. +# +# Callers: build-image.sh capture flow, run-emulator.sh cmd_capture. +qmp_session() { + local sock="$1" + local payload + payload="$(cat)" + ( printf '%s\n' "$payload"; sleep 0.5 ) | socat -t30 - "UNIX-CONNECT:${sock}" +} + +# Drive the snapshot capture over QMP: +# 1. qmp_capabilities — exit negotiation mode. +# 2. stop — pause the VM so no more disk writes happen. +# 3. migrate-set-capabilities — enable mapped-ram + multifd for fast resume. +# 4. migrate to file: — streams RAM/device state out. +# 5. Poll query-migrate until status=completed (or failed). +# 6. quit — terminate QEMU cleanly. +# +# Depends on log/err/warn being defined by the sourcing script. +capture_vm_state() { + local sock="$1" + local guest_path="$2" + + if [ ! -S "$sock" ]; then + err "QMP monitor socket missing: $sock" + return 1 + fi + + log " QMP: stopping VM..." + { + printf '%s\n' '{"execute":"qmp_capabilities"}' + printf '%s\n' '{"execute":"stop"}' + } | qmp_session "$sock" >/dev/null || { + err "QMP stop failed" + return 1 + } + + log " QMP: enabling mapped-ram + multifd for fast resume..." + # mapped-ram: writes each RAM page to a fixed offset in the output file + # (vs the legacy streamed format). 
This lets the target QEMU mmap the file + # and fault pages lazily — and combined with multifd, load RAM in parallel. + # multifd-channels=4 matches our pinned SMP so the channels don't starve + # each other on the target's 4 vCPUs. + local caps_cmd params_cmd + caps_cmd='{"execute":"migrate-set-capabilities","arguments":{"capabilities":[{"capability":"mapped-ram","state":true},{"capability":"multifd","state":true}]}}' + params_cmd='{"execute":"migrate-set-parameters","arguments":{"multifd-channels":4}}' + local setup_resp + setup_resp=$({ + printf '%s\n' '{"execute":"qmp_capabilities"}' + printf '%s\n' "$caps_cmd" + printf '%s\n' "$params_cmd" + } | qmp_session "$sock") || { + err "QMP capabilities setup failed" + return 1 + } + if printf '%s' "$setup_resp" | grep -q '"error"[[:space:]]*:'; then + err "QMP capabilities returned error: $setup_resp" + return 1 + fi + + log " QMP: migrating RAM state to ${guest_path}..." + # Use file: migration (native QEMU) instead of exec: to avoid relying on a + # spawned shell finding zstd in PATH. Compressed as a separate host step + # after migrate completes. + local migrate_cmd + migrate_cmd=$(printf '{"execute":"migrate","arguments":{"uri":"file:%s"}}' "$guest_path") + local migrate_resp + migrate_resp=$({ + printf '%s\n' '{"execute":"qmp_capabilities"}' + printf '%s\n' "$migrate_cmd" + } | qmp_session "$sock") || { + err "QMP migrate failed" + return 1 + } + if printf '%s' "$migrate_resp" | grep -q '"error"[[:space:]]*:'; then + err "QMP migrate returned error: $migrate_resp" + return 1 + fi + + # Poll migration status. Migration runs in the background after the + # migrate command returns; we watch for "completed" or "failed". 
+ local migrate_timeout=600 + local waited=0 + local last_heartbeat=0 + while [ "$waited" -lt "$migrate_timeout" ]; do + local status_line status + status_line=$({ + printf '%s\n' '{"execute":"qmp_capabilities"}' + printf '%s\n' '{"execute":"query-migrate"}' + } | qmp_session "$sock" 2>/dev/null || true) + status="$(printf '%s\n' "$status_line" | grep -o '"status"[[:space:]]*:[[:space:]]*"[a-z-]*"' | head -1 | sed -E 's/.*"([a-z-]+)".*/\1/')" + case "$status" in + completed) + log " QMP: migrate completed (${waited}s)" + break + ;; + failed|cancelled) + err " QMP: migrate ended with status=$status" + err " QMP response: $status_line" + return 1 + ;; + active|setup|device|"") + # still running + if [ "$((waited - last_heartbeat))" -ge 30 ]; then + local transferred + transferred=$(printf '%s' "$status_line" | grep -o '"transferred"[[:space:]]*:[[:space:]]*[0-9]*' | head -1 | sed -E 's/.*:[[:space:]]*([0-9]+).*/\1/') + log " QMP: migrate in progress (${waited}s, status=${status:-init}, transferred=${transferred:-0})" + last_heartbeat=$waited + fi + ;; + *) + log " QMP: migrate status=$status (${waited}s)" + ;; + esac + sleep 2 + waited=$((waited + 2)) + done + + if [ "$waited" -ge "$migrate_timeout" ]; then + err "QMP migrate timed out after ${migrate_timeout}s" + err "Last query-migrate response: $({ + printf '%s\n' '{"execute":"qmp_capabilities"}' + printf '%s\n' '{"execute":"query-migrate"}' + } | qmp_session "$sock" 2>/dev/null || true)" + return 1 + fi + + log " QMP: quitting VM..." 
+ { + printf '%s\n' '{"execute":"qmp_capabilities"}' + printf '%s\n' '{"execute":"quit"}' + } | qmp_session "$sock" >/dev/null || true + + return 0 +} diff --git a/docker/local-emulator/qemu/run-emulator.sh b/docker/local-emulator/qemu/run-emulator.sh index ba905ca36d..7b52436e19 100755 --- a/docker/local-emulator/qemu/run-emulator.sh +++ b/docker/local-emulator/qemu/run-emulator.sh @@ -12,6 +12,22 @@ VM_RAM="${EMULATOR_RAM:-4096}" VM_CPUS="${EMULATOR_CPUS:-4}" PORT_PREFIX="${PORT_PREFIX:-${NEXT_PUBLIC_STACK_PORT_PREFIX:-81}}" READY_TIMEOUT="${EMULATOR_READY_TIMEOUT:-240}" +# Shorter timeout when resuming from a snapshot: services are already running, +# we only need to wait for rotate-secrets + Node restart (~3-10s). +SNAPSHOT_READY_TIMEOUT="${EMULATOR_SNAPSHOT_READY_TIMEOUT:-45}" +# Set to 1 to force a cold boot and ignore any shipped savevm file. +EMULATOR_NO_SNAPSHOT="${EMULATOR_NO_SNAPSHOT:-0}" +# Skip the post-resume secret rotation. Keeps the baked placeholder secrets +# in place — acceptable for tests and CI that don't reach the emulator over +# a shared network. Shaves ~2-3s off `emulator start`. +EMULATOR_NO_ROTATION="${EMULATOR_NO_ROTATION:-0}" +# Internal: set to 1 by cmd_capture to build QEMU with the snapshot-compatible +# device layout (phantom ISOs, no virtfs, pcie-root-port, pinned 4096MB/4CPU) +# without the `-incoming defer` that resume mode adds. The captured snapshot +# must be byte-compatible with what the resume path will later feed to QEMU. +EMULATOR_CAPTURING_SNAPSHOT="${EMULATOR_CAPTURING_SNAPSHOT:-0}" +# Force re-capture even if a .savevm.zst is already present. +EMULATOR_FORCE_CAPTURE="${EMULATOR_FORCE_CAPTURE:-0}" # Fixed host-side ports for the QEMU emulator (267xx range). # Only user-facing services are exposed; internal deps stay inside the VM. 
@@ -62,10 +78,67 @@ image_path() { echo "$IMAGE_DIR/stack-emulator-$ARCH.qcow2" } +savevm_path() { + echo "$IMAGE_DIR/stack-emulator-$ARCH.savevm.zst" +} + +# Cached, decompressed mapped-ram file. Created on first resume from the .zst +# and reused on subsequent resumes — mapped-ram format requires a seekable +# file, so we can't stream through zstd and use multifd at the same time. +savevm_raw_path() { + echo "$IMAGE_DIR/stack-emulator-$ARCH.savevm.raw" +} + runtime_iso_path() { echo "$VM_DIR/runtime-config.iso" } +snapshot_available() { + [ "$EMULATOR_NO_SNAPSHOT" != "1" ] && [ "$EMULATOR_CAPTURING_SNAPSHOT" != "1" ] && [ -s "$(savevm_path)" ] +} + +# True when QEMU must use the snapshot-compatible device layout — either to +# resume from an existing snapshot or to capture a new one. Resume adds +# `-incoming defer`; capture does not. Everything else (phantom ISOs, no +# virtfs, pcie-root-port, pinned RAM/SMP) matches. +snapshot_layout() { + snapshot_available || [ "$EMULATOR_CAPTURING_SNAPSHOT" = "1" ] +} + +# Ensure the decompressed mapped-ram cache is up-to-date with the shipped +# .zst. Compares mtime: if .raw is older or missing, re-decompress. +ensure_savevm_raw() { + local zst raw + zst="$(savevm_path)" + raw="$(savevm_raw_path)" + + local zst_ts raw_ts + case "$HOST_OS" in + darwin) + zst_ts="$(stat -f '%m' "$zst" 2>/dev/null || echo 0)" + raw_ts="$(stat -f '%m' "$raw" 2>/dev/null || echo 0)" + ;; + *) + zst_ts="$(stat -c '%Y' "$zst" 2>/dev/null || echo 0)" + raw_ts="$(stat -c '%Y' "$raw" 2>/dev/null || echo 0)" + ;; + esac + + if [ -s "$raw" ] && [ "$raw_ts" -ge "$zst_ts" ]; then + return 0 + fi + + log "Decompressing snapshot cache (one-time; ~2-3GB sparse)..." + local tmp="${raw}.tmp" + rm -f "$tmp" + if ! zstd -dc "$zst" > "$tmp"; then + err "Failed to decompress $zst" + rm -f "$tmp" + return 1 + fi + mv "$tmp" "$raw" +} + # Returns a fast fingerprint (size:mtime) of the base QEMU image. 
# Used to detect whether the image has changed since the overlay was created. base_image_fingerprint() { @@ -77,10 +150,62 @@ base_image_fingerprint() { esac } -prepare_runtime_config_iso() { +# Fingerprint used to detect stale overlays. Includes both the base qcow2 and +# the savevm file so the overlay is rebuilt whenever either input changes. The +# overlay disk must match the disk state the snapshot was taken against for +# -incoming resume to be consistent. +runtime_fingerprint() { + local base="$1" + local savevm="$2" + local base_fp savevm_fp + base_fp="$(base_image_fingerprint "$base")" + if [ -f "$savevm" ]; then + savevm_fp="$(base_image_fingerprint "$savevm")" + else + savevm_fp="no-savevm" + fi + printf '%s|%s\n' "$base_fp" "$savevm_fp" +} + +ensure_runtime_config_iso() { + # When invoked via stack-cli, the CLI writes the runtime ISO natively + # (packages/stack-cli/src/lib/iso.ts) immediately before spawning us and + # sets STACK_EMULATOR_CLI_WROTE_ISO=1. Trust it and skip regeneration — + # otherwise we'd fall through to make_iso_from_dir and require + # hdiutil/mkisofs/genisoimage, which is exactly the host dep the CLI path + # is designed to remove. + if [ "${STACK_EMULATOR_CLI_WROTE_ISO:-}" = "1" ] && [ -s "$(runtime_iso_path)" ]; then + return 0 + fi + # In capture mode, cmd_capture already wrote a specialized ISO with an + # empty STACK_EMULATOR_VM_DIR_HOST — required because virtfs is detached + # for snapshot compatibility, and run-stack-container would otherwise + # try to publish internal-pck to /host/... and restart-loop + # stack.service. Trust that write and don't overwrite it. + if [ "${EMULATOR_CAPTURING_SNAPSHOT:-}" = "1" ] && [ -s "$(runtime_iso_path)" ]; then + return 0 + fi + # Direct-shell invocation path: regenerate unconditionally. Port env vars + # (PORT_PREFIX, EMULATOR_*_PORT) may have changed since the last run, and + # an ISO cached from a prior invocation would silently override them. 
+ write_runtime_config_iso "$VM_DIR" +} + +# Write a STACKCFG runtime-config.iso containing runtime.env + base.env. +# The VM_DIR_HOST arg is the path to publish internal-pck / stack.log to on +# /host; pass empty string to suppress publication (used by capture mode +# where /host isn't mounted — virtfs is detached for snapshot compatibility, +# so any host-side write would fail and restart-loop stack.service). +write_runtime_config_iso() { + local vm_dir_host="$1" + local base_env="$SCRIPT_DIR/../.env.development" + if [ ! -f "$base_env" ]; then + err "Cannot generate runtime config ISO: $base_env is missing." + err "Run 'pnpm run emulator:generate-env' first, or invoke via 'stack emulator start'." + exit 1 + fi + local cfg_dir="$VM_DIR/runtime-config" - local cfg_iso - cfg_iso="$(runtime_iso_path)" rm -rf "$cfg_dir" mkdir -p "$cfg_dir" { @@ -89,10 +214,10 @@ prepare_runtime_config_iso() { printf "STACK_EMULATOR_BACKEND_HOST_PORT=%s\n" "$EMULATOR_BACKEND_PORT" printf "STACK_EMULATOR_MINIO_HOST_PORT=%s\n" "$EMULATOR_MINIO_PORT" printf "STACK_EMULATOR_INBUCKET_HOST_PORT=%s\n" "$EMULATOR_INBUCKET_PORT" - printf "STACK_EMULATOR_VM_DIR_HOST=%s\n" "$VM_DIR" + printf "STACK_EMULATOR_VM_DIR_HOST=%s\n" "$vm_dir_host" } > "$cfg_dir/runtime.env" - cp "$SCRIPT_DIR/../.env.development" "$cfg_dir/base.env" - make_iso_from_dir "$cfg_iso" "STACKCFG" "$cfg_dir" + cp "$base_env" "$cfg_dir/base.env" + make_iso_from_dir "$(runtime_iso_path)" "STACKCFG" "$cfg_dir" } service_is_up() { @@ -145,7 +270,7 @@ wait_for_condition() { log "${label} ready in ${elapsed}s" return 0 fi - sleep 1 + sleep 0.2 elapsed=$((SECONDS - started)) printf "\r [%3ds] %s..." "$elapsed" "$label" done @@ -154,8 +279,9 @@ wait_for_condition() { } build_qemu_cmd() { - local base_img + local base_img savevm_file base_img="$(image_path)" + savevm_file="$(savevm_path)" if [ ! 
-f "$base_img" ]; then err "Missing QEMU image: $base_img" @@ -166,18 +292,36 @@ build_qemu_cmd() { mkdir -p "$VM_DIR" local fingerprint_file="$VM_DIR/base-image.fingerprint" local current_fp - current_fp="$(base_image_fingerprint "$base_img")" - if [ -f "$VM_DIR/disk.qcow2" ]; then - if [ -f "$fingerprint_file" ] && [ "$(cat "$fingerprint_file")" = "$current_fp" ]; then - log "Reusing existing overlay disk (changes persist)" - else - warn "QEMU base image has changed — recreating overlay." + current_fp="$(runtime_fingerprint "$base_img" "$savevm_file")" + + if snapshot_layout; then + # The savevm RAM state was captured against the base image's exact disk + # state. An overlay with writes from a previous session diverges from + # that point, so -incoming would resume RAM against inconsistent disk. + # Always start from a fresh overlay in the snapshot path; per-session + # state is not preserved. Users who want persistence can opt out with + # EMULATOR_NO_SNAPSHOT=1. Capture mode also needs a clean overlay so the + # snapshot we write is taken against the base's known disk state. + if [ -f "$VM_DIR/disk.qcow2" ]; then rm -f "$VM_DIR/disk.qcow2" "$fingerprint_file" fi - fi - if [ ! -f "$VM_DIR/disk.qcow2" ]; then qemu-img create -f qcow2 -b "$base_img" -F qcow2 "$VM_DIR/disk.qcow2" >/dev/null - base_image_fingerprint "$base_img" > "$fingerprint_file" + printf '%s' "$current_fp" > "$fingerprint_file" + else + # If the overlay was created against a different base or savevm, it will + # diverge from the snapshot's disk state — force a rebuild. + if [ -f "$VM_DIR/disk.qcow2" ]; then + if [ -f "$fingerprint_file" ] && [ "$(cat "$fingerprint_file")" = "$current_fp" ]; then + log "Reusing existing overlay disk (changes persist)" + else + warn "Base image or snapshot has changed — recreating overlay." + rm -f "$VM_DIR/disk.qcow2" "$fingerprint_file" + fi + fi + if [ ! 
-f "$VM_DIR/disk.qcow2" ]; then + qemu-img create -f qcow2 -b "$base_img" -F qcow2 "$VM_DIR/disk.qcow2" >/dev/null + printf '%s' "$current_fp" > "$fingerprint_file" + fi fi local qemu_bin machine cpu firmware_args=() @@ -213,28 +357,127 @@ build_qemu_cmd() { # are mutually exclusive. netdev+=",hostfwd=tcp:127.0.0.1:${PORT_PREFIX}14-:${PORT_PREFIX}14" - QEMU_CMD=( - "$qemu_bin" - -machine "$machine" - -accel "$ACCEL" - -cpu "$cpu" - "${firmware_args[@]}" - -boot order=c - -m "$VM_RAM" - -smp "$VM_CPUS" - -drive "file=$VM_DIR/disk.qcow2,format=qcow2,if=virtio" - -drive "file=$(runtime_iso_path),format=raw,if=virtio,readonly=on" - -netdev "$netdev" - -device virtio-net-pci,netdev=net0 - -device virtio-balloon-pci - -virtfs "local,path=/,mount_tag=hostfs,security_model=none" - -chardev "socket,id=monitor,path=$VM_DIR/monitor.sock,server=on,wait=off" - -mon "chardev=monitor,mode=control" - -serial "file:$VM_DIR/serial.log" - -display none - -daemonize - -pidfile "$VM_DIR/qemu.pid" - ) + # In snapshot-resume mode the QEMU command-line MUST match the device set + # used at snapshot capture time, otherwise migration replay fails (broken + # pipe / device tree mismatch). At capture time the build attaches: + # disk(if=virtio) + seed.iso + bundle.iso + runtime.iso (all if=virtio) + # netdev + virtio-net-pci + monitor + QGA virtio-serial + # SMP=4, RAM=4096 (pinned in build-image.sh snapshot mode) + # We mirror that exactly. The seed/bundle ISOs were used by cloud-init at + # build and are not needed at runtime, but their virtio-blk slots must + # exist so the migration replay matches device IDs. Runtime-only devices + # (virtfs, balloon) live at higher slots — extra at destination is fine. + local snapshot_args=() runtime_only_args=() snapshot_smp="$VM_CPUS" snapshot_ram="$VM_RAM" + if snapshot_layout; then + if snapshot_available; then + log "Snapshot found at $savevm_file — fast-resume enabled." + # -incoming defer: QEMU starts, waits for a QMP migrate-incoming command. 
+ # We use that to set mapped-ram + multifd capabilities before loading, + # which enables parallel RAM restore (~2-3x faster than streamed decode). + snapshot_args+=(-incoming defer) + else + log "Capture mode: booting with snapshot-compatible layout (no -incoming)." + fi + snapshot_smp="${EMULATOR_SNAPSHOT_CPUS:-4}" + # RAM size is baked into the snapshot; migration replay requires an + # identical -m value. Pin to the build-time RAM (4096) and ignore + # EMULATOR_RAM — override via EMULATOR_SNAPSHOT_RAM if a different + # snapshot was produced. + snapshot_ram="${EMULATOR_SNAPSHOT_RAM:-4096}" + if [ "$snapshot_smp" != "$VM_CPUS" ]; then + log "Pinning SMP to ${snapshot_smp} for snapshot resume (build-time value)." + fi + if [ "$snapshot_ram" != "$VM_RAM" ]; then + log "Pinning RAM to ${snapshot_ram}MB for snapshot resume (ignoring EMULATOR_RAM=${VM_RAM})." + fi + + # Tiny placeholder ISOs to match the seed.iso / bundle.iso slots present + # at snapshot time. Their content doesn't matter (cloud-init has already + # run); only the virtio-blk slot count must match. + local seed_phantom="$VM_DIR/seed.phantom" + local bundle_phantom="$VM_DIR/bundle.phantom" + if [ ! -s "$seed_phantom" ]; then + dd if=/dev/zero of="$seed_phantom" bs=1M count=1 status=none + fi + if [ ! -s "$bundle_phantom" ]; then + dd if=/dev/zero of="$bundle_phantom" bs=1M count=1 status=none + fi + runtime_only_args+=( + -drive "file=$seed_phantom,format=raw,if=virtio,readonly=on" + -drive "file=$bundle_phantom,format=raw,if=virtio,readonly=on" + ) + else + # Cold-boot: include virtio-balloon and virtfs as before. 
+ runtime_only_args+=( + -device virtio-balloon-pci + -virtfs "local,path=/,mount_tag=hostfs,security_model=none" + ) + fi + + if snapshot_layout; then + QEMU_CMD=( + "$qemu_bin" + -machine "$machine" + -accel "$ACCEL" + -cpu "$cpu" + "${firmware_args[@]}" + -boot order=c + -m "$snapshot_ram" + -smp "$snapshot_smp" + -drive "file=$VM_DIR/disk.qcow2,format=qcow2,if=virtio" + "${runtime_only_args[@]}" + -drive "file=$(runtime_iso_path),format=raw,if=virtio,readonly=on" + -netdev "$netdev" + -device virtio-net-pci,netdev=net0 + -chardev "socket,id=monitor,path=$VM_DIR/monitor.sock,server=on,wait=off" + -mon "chardev=monitor,mode=control" + -chardev "socket,path=$VM_DIR/qga.sock,server=on,wait=off,id=qga0" + -device virtio-serial + -device "virtserialport,chardev=qga0,name=org.qemu.guest_agent.0" + # Empty PCIe root port reserved for runtime hot-plug of virtio-9p. + # MUST be the last explicit -device entry — slot order has to mirror + # build-image.sh exactly or migration replay stalls in inmigrate. + -device "pcie-root-port,id=hostfs-port,bus=pcie.0,chassis=1" + # Pre-create the host-side fsdev backend so the post-resume QMP + # device_add can attach to it by id. -fsdev is host-only state — not + # part of the migrated device tree — so it's safe to add here even + # though the snapshot was captured without it. Going through -fsdev + # avoids the HMP fsdev_add command, whose error path is invisible + # via human-monitor-command (errors come back as a return string, + # not a QMP error). 
+ -fsdev "local,id=hostfs,path=/,security_model=none" + ${snapshot_args[@]+"${snapshot_args[@]}"} + -serial "file:$VM_DIR/serial.log" + -display none + -daemonize + -pidfile "$VM_DIR/qemu.pid" + ) + else + QEMU_CMD=( + "$qemu_bin" + -machine "$machine" + -accel "$ACCEL" + -cpu "$cpu" + "${firmware_args[@]}" + -boot order=c + -m "$VM_RAM" + -smp "$snapshot_smp" + -drive "file=$VM_DIR/disk.qcow2,format=qcow2,if=virtio" + -drive "file=$(runtime_iso_path),format=raw,if=virtio,readonly=on" + -netdev "$netdev" + -device virtio-net-pci,netdev=net0 + "${runtime_only_args[@]}" + -chardev "socket,id=monitor,path=$VM_DIR/monitor.sock,server=on,wait=off" + -mon "chardev=monitor,mode=control" + -chardev "socket,path=$VM_DIR/qga.sock,server=on,wait=off,id=qga0" + -device virtio-serial + -device "virtserialport,chardev=qga0,name=org.qemu.guest_agent.0" + -serial "file:$VM_DIR/serial.log" + -display none + -daemonize + -pidfile "$VM_DIR/qemu.pid" + ) + fi } @@ -269,11 +512,225 @@ ensure_ports_free() { start_vm() { mkdir -p "$VM_DIR" : > "$VM_DIR/serial.log" - prepare_runtime_config_iso + ensure_runtime_config_iso build_qemu_cmd "${QEMU_CMD[@]}" } +# Send one or more QMP commands over the monitor socket. Each line of stdin is +# a JSON object; capabilities are always negotiated first. Keep stdin open +# briefly after writing so socat doesn't close before QEMU responds — QMP +# typically replies in milliseconds so 0.3s is enough. +qmp_send() { + if [ ! -S "$VM_DIR/monitor.sock" ]; then + return 1 + fi + local payload + payload="$(cat)" + { + printf '%s\n' '{"execute":"qmp_capabilities"}' + printf '%s\n' "$payload" + sleep 0.3 + } | socat -t5 - "UNIX-CONNECT:$VM_DIR/monitor.sock" 2>/dev/null +} + +# After -incoming defer, QEMU waits for a migrate-incoming command. This sets +# up mapped-ram + multifd capabilities and kicks off the RAM load from the +# decompressed cache file. Returns once the VM is running. 
+qmp_incoming_and_cont() { + local raw_file="$1" + + # Set caps + parameters before migrate-incoming, same as source. + local setup_resp + setup_resp=$( { + printf '%s\n' '{"execute":"migrate-set-capabilities","arguments":{"capabilities":[{"capability":"mapped-ram","state":true},{"capability":"multifd","state":true}]}}' + printf '%s\n' '{"execute":"migrate-set-parameters","arguments":{"multifd-channels":4}}' + } | qmp_send) + if printf '%s' "$setup_resp" | grep -q '"error"'; then + err "QMP caps setup failed: $setup_resp" + return 1 + fi + + # Kick off the incoming migration from the mapped-ram file. + local inc_cmd inc_resp + inc_cmd=$(printf '{"execute":"migrate-incoming","arguments":{"uri":"file:%s"}}' "$raw_file") + inc_resp=$(printf '%s\n' "$inc_cmd" | qmp_send) + if printf '%s' "$inc_resp" | grep -q '"error"'; then + err "QMP migrate-incoming failed: $inc_resp" + return 1 + fi + + # Poll until status reaches a runnable state, then cont. + local deadline=$((SECONDS + 60)) + while [ "$SECONDS" -lt "$deadline" ]; do + local out status + out=$(printf '%s\n' '{"execute":"query-status"}' | qmp_send || true) + status=$(printf '%s' "$out" | grep -o '"status"[[:space:]]*:[[:space:]]*"[a-z-]*"' | head -1 | sed -E 's/.*"([a-z-]+)".*/\1/') + case "$status" in + running) + return 0 + ;; + paused|postmigrate|prelaunch) + printf '%s\n' '{"execute":"cont"}' | qmp_send >/dev/null || true + return 0 + ;; + inmigrate|"") + ;; + *) + log "unexpected QMP status: $status" + ;; + esac + sleep 0.2 + done + return 1 +} + +# Placeholder PCK baked into the snapshot. Kept in sync with the value in +# docker/local-emulator/qemu/cloud-init/emulator/user-data. +SNAPSHOT_PLACEHOLDER_PCK="00000000000000000000000000000000ffffffffffffffffffffffffffffffff" + +# Write the internal PCK to the host path the CLI reads (see +# readInternalPck() in packages/stack-cli/src/commands/emulator.ts). 
In +# cold-boot mode the guest publishes this via virtfs/9p, but snapshot mode +# drops virtfs, so the host has to write it itself. +write_internal_pck_for_cli() { + local pck="$1" + (umask 077 && printf '%s' "$pck" > "$VM_DIR/internal-pck") +} + +# Drive qemu-guest-agent via its virtserialport socket. QGA speaks the same +# JSON protocol as QMP but over a separate channel. We use guest-sync to make +# sure the agent is responsive, then guest-exec to fire trigger-fast-rotate. +qga_send() { + if [ ! -S "$VM_DIR/qga.sock" ]; then + return 1 + fi + # socat closes the connection on stdin EOF before QGA can reply, so keep + # stdin open for a short window after writing the request to give the + # agent time to respond. QGA replies in milliseconds; the only reason this + # isn't 0.1s is to absorb scheduling jitter on a busy host. + local payload + payload="$(cat)" + ( printf '%s\n' "$payload"; sleep 0.5 ) | socat -t10 - "UNIX-CONNECT:$VM_DIR/qga.sock" 2>/dev/null +} + +qga_wait_ready() { + local deadline=$((SECONDS + 30)) + while [ "$SECONDS" -lt "$deadline" ]; do + local resp + resp=$(printf '%s\n' '{"execute":"guest-sync","arguments":{"id":424242}}' | qga_send || true) + if printf '%s' "$resp" | grep -q '"return":[[:space:]]*424242'; then + return 0 + fi + sleep 0.2 + done + return 1 +} + +# Hot-plug a virtio-9p device backed by host `/` after a snapshot resume. +# The snapshot was captured WITHOUT virtfs (QEMU disallows migration while +# 9p is mounted in the guest), so the resumed VM has no host filesystem +# available until we add one here. The fsdev backend was pre-created by +# the -fsdev option in build_qemu_cmd; we only need the device_add half. 
+qmp_hotplug_9p() { + local resp + resp=$(printf '%s\n' \ + '{"execute":"device_add","arguments":{"driver":"virtio-9p-pci","id":"hostfs-dev","fsdev":"hostfs","mount_tag":"hostfs","bus":"hostfs-port"}}' \ + | qmp_send) + if printf '%s' "$resp" | grep -q '"error"'; then + err "QMP device_add virtio-9p-pci failed: $resp" + return 1 + fi + return 0 +} + +# Run /usr/local/bin/mount-host-fs --post-resume in the guest. The script +# mounts the freshly-hot-plugged 9p device on /host, which is a shared +# mount point — so the new mount propagates into the running stack +# container's `-v /host:/host:rshared` bind mount without a container +# restart. +qga_mount_host_fs() { + local cmd resp pid status_resp exited exitcode + cmd='{"execute":"guest-exec","arguments":{"path":"/usr/local/bin/mount-host-fs","arg":["--post-resume"],"capture-output":true}}' + resp=$(printf '%s\n' "$cmd" | qga_send || true) + pid=$(printf '%s' "$resp" | grep -o '"pid"[[:space:]]*:[[:space:]]*[0-9]*' | head -1 | sed -E 's/.*:[[:space:]]*([0-9]+).*/\1/') + if [ -z "$pid" ]; then + err "guest-exec mount-host-fs did not return a pid; response: $resp" + return 1 + fi + local deadline=$((SECONDS + 20)) + while [ "$SECONDS" -lt "$deadline" ]; do + status_resp=$(printf '%s\n' "{\"execute\":\"guest-exec-status\",\"arguments\":{\"pid\":${pid}}}" | qga_send || true) + exited=$(printf '%s' "$status_resp" | grep -o '"exited"[[:space:]]*:[[:space:]]*\(true\|false\)' | head -1 | sed -E 's/.*:[[:space:]]*(true|false).*/\1/') + if [ "$exited" = "true" ]; then + exitcode=$(printf '%s' "$status_resp" | grep -o '"exitcode"[[:space:]]*:[[:space:]]*-\{0,1\}[0-9]*' | head -1 | sed -E 's/.*:[[:space:]]*(-?[0-9]+).*/\1/') + if [ "${exitcode:-0}" = "0" ]; then + log "host fs mounted in guest" + return 0 + fi + err "mount-host-fs exited with code ${exitcode:-unknown}; response: $status_resp" + return 1 + fi + sleep 0.2 + done + err "mount-host-fs did not complete within 20s" + return 1 +} + +qga_trigger_fast_rotate() { + # 
guest-exec returns a pid; we then poll guest-exec-status until the + # process exits, and surface its exit code. Capture output so a failure + # message is available in serial.log. We pipe the fresh-secrets env file + # (as base64) to the script via input-data — keeps secrets off the + # filesystem and avoids needing virtfs. + local fresh_pck fresh_ssk fresh_sak fresh_cron payload secrets_b64 resp pid + fresh_pck="$(openssl rand -hex 32)" + fresh_ssk="$(openssl rand -hex 32)" + fresh_sak="$(openssl rand -hex 32)" + fresh_cron="$(openssl rand -hex 32)" + payload=$( + printf 'STACK_SEED_INTERNAL_PROJECT_PUBLISHABLE_CLIENT_KEY=%s\n' "$fresh_pck" + printf 'STACK_SEED_INTERNAL_PROJECT_SECRET_SERVER_KEY=%s\n' "$fresh_ssk" + printf 'STACK_SEED_INTERNAL_PROJECT_SUPER_SECRET_ADMIN_KEY=%s\n' "$fresh_sak" + printf 'CRON_SECRET=%s\n' "$fresh_cron" + ) + # Publish the fresh PCK to the host path the CLI reads. Writing before the + # guest-exec so a --config-file flow that polls from another process can + # pick it up the moment rotation completes. + write_internal_pck_for_cli "$fresh_pck" + secrets_b64=$(printf '%s' "$payload" | base64 | tr -d '\n') + local cmd + cmd=$(printf '{"execute":"guest-exec","arguments":{"path":"/usr/local/bin/trigger-fast-rotate","capture-output":true,"input-data":"%s"}}' "$secrets_b64") + resp=$(printf '%s\n' "$cmd" | qga_send || true) + pid=$(printf '%s' "$resp" | grep -o '"pid"[[:space:]]*:[[:space:]]*[0-9]*' | head -1 | sed -E 's/.*:[[:space:]]*([0-9]+).*/\1/') + if [ -z "$pid" ]; then + err "guest-exec did not return a pid; response: $resp" + return 1 + fi + + # Rotation (sed + UPDATE + supervisorctl restart + node startup) fits well + # inside this window. 
+ local deadline=$((SECONDS + 60)) + while [ "$SECONDS" -lt "$deadline" ]; do + local status_resp exited exitcode + status_resp=$(printf '%s\n' "{\"execute\":\"guest-exec-status\",\"arguments\":{\"pid\":${pid}}}" | qga_send || true) + exited=$(printf '%s' "$status_resp" | grep -o '"exited"[[:space:]]*:[[:space:]]*\(true\|false\)' | head -1 | sed -E 's/.*:[[:space:]]*(true|false).*/\1/') + if [ "$exited" = "true" ]; then + exitcode=$(printf '%s' "$status_resp" | grep -o '"exitcode"[[:space:]]*:[[:space:]]*-\{0,1\}[0-9]*' | head -1 | sed -E 's/.*:[[:space:]]*(-?[0-9]+).*/\1/') + if [ "${exitcode:-0}" = "0" ]; then + log "rotate-secrets completed." + return 0 + fi + err "rotate-secrets exited with code ${exitcode:-unknown}" + err "response: $status_resp" + return 1 + fi + sleep 0.2 + done + err "rotate-secrets did not complete within 60s" + return 1 +} + stop_vm() { if [ ! -f "$VM_DIR/qemu.pid" ]; then return 0 @@ -292,9 +749,10 @@ stop_vm() { kill -9 "$pid" 2>/dev/null || true fi fi - rm -f "$VM_DIR/qemu.pid" "$VM_DIR/monitor.sock" "$VM_DIR/serial.log" - rm -rf "$VM_DIR/runtime-config" - rm -f "$VM_DIR/runtime-config.iso" + rm -f "$VM_DIR/qemu.pid" "$VM_DIR/monitor.sock" "$VM_DIR/qga.sock" "$VM_DIR/serial.log" + # runtime-config.iso is left in place; ensure_runtime_config_iso regenerates + # it on the next start. `cmd_reset` wipes $RUN_DIR entirely when a full reset + # is wanted. } cmd_start() { @@ -305,18 +763,92 @@ cmd_start() { info "Arch: $ARCH | Accel: $ACCEL" info "Ports: Dashboard=$EMULATOR_DASHBOARD_PORT Backend=$EMULATOR_BACKEND_PORT MinIO=$EMULATOR_MINIO_PORT Inbucket=$EMULATOR_INBUCKET_PORT" + local using_snapshot=0 + if snapshot_available; then + if ! ensure_savevm_raw; then + warn "Snapshot decompression failed — falling back to cold boot." + snapshot_fallback_to_cold_boot + return + fi + using_snapshot=1 + fi + start_vm info "VM: ${VM_RAM}MB / ${VM_CPUS} CPUs" - if ! 
wait_for_condition "deps services" "$READY_TIMEOUT" deps_ready; then - tail_vm_logs - exit 1 - fi + if [ "$using_snapshot" = "1" ]; then + log "Resuming from snapshot (mapped-ram + multifd)..." + if ! qmp_incoming_and_cont "$(savevm_raw_path)"; then + warn "Snapshot resume did not reach a runnable state — falling back to cold boot." + snapshot_fallback_to_cold_boot + return + fi - if ! wait_for_condition "dashboard/backend" "$READY_TIMEOUT" app_ready; then - tail_vm_logs - exit 1 + log "VM resumed; waiting for guest agent..." + if ! qga_wait_ready; then + warn "Guest agent did not respond — falling back to cold boot." + snapshot_fallback_to_cold_boot + return + fi + + # Hot-plug the host filesystem. The snapshot was captured without + # virtfs, so the running container has an empty /host bind mount until + # we add the 9p device and mount it in the guest. Required for routes + # like /local-emulator/project that read user-supplied paths via /host. + log "Hot-plugging host filesystem..." + if ! qmp_hotplug_9p; then + warn "Failed to hot-plug 9p device — falling back to cold boot." + snapshot_fallback_to_cold_boot + return + fi + if ! qga_mount_host_fs; then + warn "Failed to mount host fs in guest — falling back to cold boot." + snapshot_fallback_to_cold_boot + return + fi + + if [ "$EMULATOR_NO_ROTATION" = "1" ]; then + warn "EMULATOR_NO_ROTATION=1: snapshot's placeholder secrets are in effect — do not expose this instance." + # The placeholder PCK is live in the running image; publish it to the + # host path so --config-file flows still work. + write_internal_pck_for_cli "$SNAPSHOT_PLACEHOLDER_PCK" + if ! wait_for_condition "services" "$SNAPSHOT_READY_TIMEOUT" all_ready; then + warn "Services did not respond after resume — falling back to cold boot." + tail_vm_logs + snapshot_fallback_to_cold_boot + return + fi + else + log "Generating fresh secrets + triggering rotation..." + if ! 
qga_trigger_fast_rotate; then + warn "Failed to trigger rotate-secrets — falling back to cold boot." + snapshot_fallback_to_cold_boot + return + fi + + # Wait for the *new* backend (post-supervisor-restart) to actually be + # listening. all_ready may briefly return true against the OLD Node + # processes between when supervisor sends SIGTERM and when the children + # die; sleep a beat so we measure the real readiness. + sleep 1 + if ! wait_for_condition "rotated services" "$SNAPSHOT_READY_TIMEOUT" all_ready; then + warn "Services did not recover after rotation — falling back to cold boot." + tail_vm_logs + snapshot_fallback_to_cold_boot + return + fi + fi + else + if ! wait_for_condition "deps services" "$READY_TIMEOUT" deps_ready; then + tail_vm_logs + exit 1 + fi + + if ! wait_for_condition "dashboard/backend" "$READY_TIMEOUT" app_ready; then + tail_vm_logs + exit 1 + fi fi log "All services are green." @@ -324,6 +856,20 @@ cmd_start() { info "Backend: http://localhost:${EMULATOR_BACKEND_PORT}" } +# If anything about the snapshot resume fails, stop the VM, wipe the overlay, +# and retry as a cold boot. Keeps the user unblocked even when the snapshot is +# broken (e.g. stale, incompatible host-arch/QEMU-version mismatch). +snapshot_fallback_to_cold_boot() { + warn "Retrying with cold boot (EMULATOR_NO_SNAPSHOT=1)..." + stop_vm + # Wipe the overlay + fingerprint so build_qemu_cmd re-creates a fresh one. + # runtime-config.iso is regenerated by ensure_runtime_config_iso on recursion. + rm -f "$VM_DIR/disk.qcow2" "$VM_DIR/base-image.fingerprint" \ + "$VM_DIR/seed.phantom" "$VM_DIR/bundle.phantom" + EMULATOR_NO_SNAPSHOT=1 + cmd_start +} + cmd_stop() { stop_vm log "QEMU emulator stopped." @@ -335,6 +881,100 @@ cmd_reset() { log "Emulator state reset. Next start will be a fresh boot." } +# Cold-boot the VM with the snapshot-compatible device layout, wait for all +# services to be healthy, then capture a snapshot via QMP migrate and compress +# it to .savevm.zst. 
Called by `stack emulator pull` so first-run users get a +# fast-resume snapshot that's guaranteed compatible with their host's QEMU +# version + accelerator (which CI-built snapshots can't guarantee across +# KVM/HVF/TCG). +cmd_capture() { + if [ ! -f "$(image_path)" ]; then + err "Missing qcow2: $(image_path). Run 'stack emulator pull' first." + exit 1 + fi + if [ -s "$(savevm_path)" ] && [ "$EMULATOR_FORCE_CAPTURE" != "1" ]; then + log "Snapshot already present at $(savevm_path); skipping capture." + log "Pass EMULATOR_FORCE_CAPTURE=1 to rebuild it." + return 0 + fi + if is_running; then + err "Emulator is already running; stop it first (stack emulator stop)." + exit 1 + fi + + # Start with a clean slate if we're force-recapturing; stale raw/zst would + # otherwise make snapshot_available() return true and flip QEMU into + # -incoming defer mode. + rm -f "$(savevm_path)" "$(savevm_raw_path)" + + ensure_ports_free + mkdir -p "$RUN_DIR" "$VM_DIR" + # Regenerate runtime-config.iso with STACK_EMULATOR_VM_DIR_HOST empty — + # virtfs is detached in capture mode, so run-stack-container's + # `install internal-pck → /host/$VM_DIR_HOST/...` would fail and restart-loop + # stack.service. Mirrors build-image.sh's CI runtime.env shape. + rm -f "$(runtime_iso_path)" + write_runtime_config_iso "" + + info "Cold-booting VM to capture local snapshot (one-time, ~1-3 min)..." + EMULATOR_CAPTURING_SNAPSHOT=1 + start_vm + info "VM: 4096MB / 4 CPUs (pinned for snapshot compatibility)" + + # Cold boot with snapshot-compatible layout drops virtfs, so stack.service + # starts without /host mounted — fine for capture; hostfs is hot-plugged on + # resume via qmp_hotplug_9p. + if ! wait_for_condition "all services" "$READY_TIMEOUT" all_ready; then + tail_vm_logs + stop_vm + err "Services did not come up; capture aborted." 
+ exit 1 + fi + + local raw tmp_raw zst tmp_zst + raw="$(savevm_raw_path)" + tmp_raw="${raw}.capture.tmp" + zst="$(savevm_path)" + tmp_zst="${zst}.capture.tmp" + rm -f "$tmp_raw" "$tmp_zst" + + log "Capturing VM state via QMP (mapped-ram + multifd)..." + if ! capture_vm_state "$VM_DIR/monitor.sock" "$tmp_raw"; then + err "QMP capture failed." + stop_vm + exit 1 + fi + + # capture_vm_state sent QMP quit; wait for QEMU to exit, then clean sockets. + local waited=0 + while [ "$waited" -lt 30 ] && is_running; do + sleep 1 + waited=$((waited + 1)) + done + if is_running; then + warn "QEMU did not exit after QMP quit; forcing." + stop_vm + fi + rm -f "$VM_DIR/qemu.pid" "$VM_DIR/monitor.sock" "$VM_DIR/qga.sock" + + if [ ! -s "$tmp_raw" ]; then + err "Captured raw file is empty: $tmp_raw" + exit 1 + fi + + log "Compressing snapshot with zstd..." + zstd -1 -T0 -f -o "$tmp_zst" "$tmp_raw" + mv "$tmp_zst" "$zst" + # Keep the uncompressed file too — resume reads it directly via mapped-ram, + # and ensure_savevm_raw skips re-decompression when the raw's mtime >= zst's. + mv "$tmp_raw" "$raw" + touch -r "$zst" "$raw" + + local size + size="$(du -h "$zst" | cut -f1)" + log "Snapshot captured: $zst (${size})" +} + STATUS_FAILED=0 print_service_status() { @@ -382,12 +1022,12 @@ ACTION="start" while [[ $# -gt 0 ]]; do case "$1" in - start|stop|reset|status|bench) + start|stop|reset|status|bench|capture) ACTION="$1" shift ;; *) - echo "Usage: $0 [start|stop|reset|status|bench]" + echo "Usage: $0 [start|stop|reset|status|bench|capture]" exit 1 ;; esac @@ -399,4 +1039,5 @@ case "$ACTION" in reset) cmd_reset ;; status) cmd_status ;; bench) cmd_bench ;; + capture) cmd_capture ;; esac diff --git a/docker/local-emulator/rotate-secrets.sh b/docker/local-emulator/rotate-secrets.sh new file mode 100644 index 0000000000..d374446426 --- /dev/null +++ b/docker/local-emulator/rotate-secrets.sh @@ -0,0 +1,104 @@ +#!/bin/bash +# Rotate baked-in placeholder secrets with fresh host-generated values. 
+# +# Called inside the stack container by the emulator snapshot-resume path. +# Host writes fresh secrets to /host/stack-runtime/fresh-secrets.env before +# invoking this script (via `docker exec stack /usr/local/bin/rotate-secrets`). +# +# Flow: +# 1. Read fresh secrets from host-supplied env file. +# 2. Validate they are 64-char hex (the build placeholders are too). +# 3. Write rotated-secrets.env that app-entrypoint and run-cron-jobs source +# on restart. +# 4. Targeted sed across built files: swap the placeholder PCK for the fresh +# one (this is the only secret baked into JS via sentinel replacement at +# build time — SSK/SAK/CRON_SECRET flow through process.env only). +# 5. UPDATE the internal ApiKeySet row in Postgres. +# 6. supervisorctl restart stack-app + cron-jobs so the new values take +# effect in the running Node processes. + +set -euo pipefail + +OUTPUT=/run/stack-auth/rotated-secrets.env +WORK_DIR="${STACK_RUNTIME_WORK_DIR:-/app}" + +PLACEHOLDER_PCK="00000000000000000000000000000000ffffffffffffffffffffffffffffffff" + +log() { printf '[rotate-secrets] %s\n' "$*"; } + +# Fresh secrets arrive via env vars (passed by trigger-fast-rotate using +# `docker exec -e`). For backward compatibility, fall back to a file path if +# STACK_ROTATE_INPUT is set. +if [ -n "${STACK_ROTATE_INPUT:-}" ] && [ -f "$STACK_ROTATE_INPUT" ]; then + log "reading fresh secrets from $STACK_ROTATE_INPUT" + set -a + # shellcheck disable=SC1090 + source "$STACK_ROTATE_INPUT" + set +a +fi + +for var in STACK_SEED_INTERNAL_PROJECT_PUBLISHABLE_CLIENT_KEY \ + STACK_SEED_INTERNAL_PROJECT_SECRET_SERVER_KEY \ + STACK_SEED_INTERNAL_PROJECT_SUPER_SECRET_ADMIN_KEY \ + CRON_SECRET; do + val="${!var:-}" + if [ -z "$val" ]; then + log "ERROR: $var is missing from environment" + exit 1 + fi + if ! 
printf '%s' "$val" | grep -Eq '^[0-9a-fA-F]{64}$'; then + log "ERROR: $var is not a 64-char hex string" + exit 1 + fi +done + +mkdir -p "$(dirname "$OUTPUT")" +umask 077 +{ + printf 'STACK_SEED_INTERNAL_PROJECT_PUBLISHABLE_CLIENT_KEY=%s\n' "$STACK_SEED_INTERNAL_PROJECT_PUBLISHABLE_CLIENT_KEY" + printf 'STACK_SEED_INTERNAL_PROJECT_SECRET_SERVER_KEY=%s\n' "$STACK_SEED_INTERNAL_PROJECT_SECRET_SERVER_KEY" + printf 'STACK_SEED_INTERNAL_PROJECT_SUPER_SECRET_ADMIN_KEY=%s\n' "$STACK_SEED_INTERNAL_PROJECT_SUPER_SECRET_ADMIN_KEY" + printf 'CRON_SECRET=%s\n' "$CRON_SECRET" + # Mirror these so process.env lookups in Node match env after restart. + printf 'NEXT_PUBLIC_STACK_PUBLISHABLE_CLIENT_KEY=%s\n' "$STACK_SEED_INTERNAL_PROJECT_PUBLISHABLE_CLIENT_KEY" + printf 'STACK_SECRET_SERVER_KEY=%s\n' "$STACK_SEED_INTERNAL_PROJECT_SECRET_SERVER_KEY" + printf 'STACK_SUPER_SECRET_ADMIN_KEY=%s\n' "$STACK_SEED_INTERNAL_PROJECT_SUPER_SECRET_ADMIN_KEY" +} > "$OUTPUT" +chmod 0600 "$OUTPUT" +log "wrote $OUTPUT" + +# The PCK is baked into built JS via STACK_ENV_VAR_SENTINEL replacement at +# container start (see /app-entrypoint.sh). Swap the placeholder hex for the +# fresh value across the built tree. Only *.js files need patching; this +# runs in ~1s on the standalone Next.js bundles. +if [ "$STACK_SEED_INTERNAL_PROJECT_PUBLISHABLE_CLIENT_KEY" != "$PLACEHOLDER_PCK" ]; then + log "rewriting PCK placeholder in $WORK_DIR" + # grep -rl narrows the find to only files that contain the placeholder, so + # the follow-up sed doesn't walk the whole tree. 
+ mapfile -t files < <(grep -rl --include='*.js' "$PLACEHOLDER_PCK" "$WORK_DIR/apps" 2>/dev/null || true) + if [ "${#files[@]}" -gt 0 ]; then + sed -i "s|${PLACEHOLDER_PCK}|${STACK_SEED_INTERNAL_PROJECT_PUBLISHABLE_CLIENT_KEY}|g" "${files[@]}" + log "patched ${#files[@]} file(s)" + else + log "no files contained the placeholder (already rotated?)" + fi +fi + +# Update the internal ApiKeySet row so existing dashboard sessions keep +# working with the new keys. Values are already validated as hex above, so +# inlining is safe. +if [ -n "${STACK_DATABASE_CONNECTION_STRING:-}" ]; then + log "updating internal ApiKeySet" + psql "$STACK_DATABASE_CONNECTION_STRING" -v ON_ERROR_STOP=1 < { + it("renders B / KB / MB / GB across unit boundaries", () => { + expect(formatBytes(0)).toBe("0 B"); + expect(formatBytes(1)).toBe("1 B"); + expect(formatBytes(1023)).toBe("1023 B"); + expect(formatBytes(1024)).toBe("1.0 KB"); + expect(formatBytes(1536)).toBe("1.5 KB"); + expect(formatBytes(1024 * 1024)).toBe("1.0 MB"); + expect(formatBytes(1024 * 1024 * 1024)).toBe("1.0 GB"); + expect(formatBytes(1024 * 1024 * 1024 * 1024)).toBe("1.0 TB"); + }); + + it("switches precision at v>=10 within a unit", () => { + expect(formatBytes(1024 * 10)).toBe("10 KB"); + expect(formatBytes(1024 * 9.5)).toBe("9.5 KB"); + }); + + it("returns '?' for non-finite and negative values", () => { + expect(formatBytes(NaN)).toBe("?"); + expect(formatBytes(Infinity)).toBe("?"); + expect(formatBytes(-1)).toBe("?"); + }); + + it("caps at TB for very large values", () => { + // Even if we exceed TB, we don't walk off the end of the units array. 
+ const huge = 1024 ** 6; // exabyte-scale + expect(formatBytes(huge)).toMatch(/ TB$/); + }); +}); + +describe("formatDuration", () => { + it("uses s/m/h units at the right boundaries", () => { + expect(formatDuration(0)).toBe("0s"); + expect(formatDuration(59)).toBe("59s"); + expect(formatDuration(60)).toBe("1m00s"); + expect(formatDuration(61)).toBe("1m01s"); + expect(formatDuration(3599)).toBe("59m59s"); + expect(formatDuration(3600)).toBe("1h00m"); + expect(formatDuration(3660)).toBe("1h01m"); + }); + + it("rounds seconds to integers", () => { + expect(formatDuration(59.4)).toBe("59s"); + expect(formatDuration(59.9)).toBe("1m00s"); + }); + + it("returns '?' for non-finite and negative values", () => { + expect(formatDuration(NaN)).toBe("?"); + expect(formatDuration(Infinity)).toBe("?"); + expect(formatDuration(-1)).toBe("?"); + }); +}); + +describe("renderProgressLine", () => { + it("renders a known-size progress bar with percent, size, speed, and ETA", () => { + const line = renderProgressLine(1024, 2048, 512); + expect(line).toContain("50.0%"); + expect(line).toContain("/"); + expect(line).toContain("/s"); + expect(line).toContain("eta"); + }); + + it("hides the percent / ETA fields when total size is unknown (total=0)", () => { + const line = renderProgressLine(1024, 0, 512); + expect(line).not.toContain("%"); + expect(line).not.toContain("eta"); + expect(line).toContain("/s"); + }); + + it("clamps percent at 100 if downloaded overshoots total (rounding)", () => { + const line = renderProgressLine(2050, 2048, 100); + expect(line).toContain("100.0%"); + }); + + it("handles bytesPerSec = 0 by suppressing ETA", () => { + const line = renderProgressLine(512, 2048, 0); + expect(line).not.toContain("eta"); + }); +}); + +describe("envPort", () => { + const SAVED = process.env.__TEST_PORT; + beforeEach(() => { + delete process.env.__TEST_PORT; + }); + afterEach(() => { + if (SAVED === undefined) delete process.env.__TEST_PORT; + else process.env.__TEST_PORT = SAVED; 
+ }); + + it("returns the fallback when the env var is not set", () => { + expect(envPort("__TEST_PORT", 1234)).toBe(1234); + }); + + it("parses a valid integer value", () => { + process.env.__TEST_PORT = "9876"; + expect(envPort("__TEST_PORT", 1234)).toBe(9876); + }); + + it("rejects zero and negative values", () => { + process.env.__TEST_PORT = "0"; + expect(() => envPort("__TEST_PORT", 1234)).toThrow(/Invalid __TEST_PORT/); + process.env.__TEST_PORT = "-5"; + expect(() => envPort("__TEST_PORT", 1234)).toThrow(/Invalid __TEST_PORT/); + }); + + it("rejects non-integer and non-numeric values", () => { + process.env.__TEST_PORT = "3.14"; + expect(() => envPort("__TEST_PORT", 1234)).toThrow(/Invalid __TEST_PORT/); + process.env.__TEST_PORT = "not-a-port"; + expect(() => envPort("__TEST_PORT", 1234)).toThrow(/Invalid __TEST_PORT/); + }); + + it("treats empty string as not set (returns fallback)", () => { + // Regression target: earlier versions sometimes parsed "" as 0 and threw. + process.env.__TEST_PORT = ""; + expect(envPort("__TEST_PORT", 1234)).toBe(1234); + }); +}); + +describe("resolveArch", () => { + it("accepts explicit arm64 / amd64", () => { + expect(resolveArch("arm64")).toBe("arm64"); + expect(resolveArch("amd64")).toBe("amd64"); + }); + + it("throws on unsupported explicit arch", () => { + expect(() => resolveArch("mips")).toThrow(/Invalid architecture/); + expect(() => resolveArch("x86")).toThrow(/Invalid architecture/); + }); + + it("maps the current process arch when raw is undefined", () => { + const expected = process.arch === "arm64" ? "arm64" : process.arch === "x64" ? 
"amd64" : null; + if (expected === null) { + expect(() => resolveArch()).toThrow(/Invalid architecture/); + } else { + expect(resolveArch()).toBe(expected); + } + }); +}); + +describe("platformInstallHint", () => { + it("uses brew on darwin and apt on linux", () => { + const spy = vi.spyOn(process, "platform", "get"); + try { + spy.mockReturnValue("darwin"); + expect(platformInstallHint("foo-linux", "foo-mac")).toContain("brew install foo-mac"); + spy.mockReturnValue("linux"); + expect(platformInstallHint("foo-linux", "foo-mac")).toContain("apt install foo-linux"); + spy.mockReturnValue("win32"); + expect(platformInstallHint("foo-linux", "foo-mac")).toContain("install foo-mac"); + } finally { + spy.mockRestore(); + } + }); +}); diff --git a/packages/stack-cli/src/commands/emulator.ts b/packages/stack-cli/src/commands/emulator.ts index d52463b15b..3a67d26c07 100644 --- a/packages/stack-cli/src/commands/emulator.ts +++ b/packages/stack-cli/src/commands/emulator.ts @@ -1,23 +1,43 @@ import { Command } from "commander"; import { execFileSync, spawn } from "child_process"; -import { existsSync, mkdirSync, readFileSync, renameSync, unlinkSync } from "fs"; +import extract from "extract-zip"; +import { chmodSync, createWriteStream, existsSync, mkdirSync, readFileSync, renameSync, unlinkSync } from "fs"; import { homedir } from "os"; import { dirname, join, resolve } from "path"; +import { Readable } from "stream"; +import { pipeline } from "stream/promises"; import { fileURLToPath } from "url"; import { CliError } from "../lib/errors.js"; +import { writeIso } from "../lib/iso.js"; const DEFAULT_EMULATOR_BACKEND_PORT = 26701; - -function emulatorBackendPort(): number { - const raw = process.env.EMULATOR_BACKEND_PORT; - if (!raw) return DEFAULT_EMULATOR_BACKEND_PORT; +const DEFAULT_EMULATOR_DASHBOARD_PORT = 26700; +const DEFAULT_EMULATOR_MINIO_PORT = 26702; +const DEFAULT_EMULATOR_INBUCKET_PORT = 26703; +const DEFAULT_PORT_PREFIX = "81"; +const GITHUB_API = 
"https://api.github.com"; +const DEFAULT_REPO = "stack-auth/stack-auth"; +const AARCH64_FIRMWARE_PATHS = [ + "/opt/homebrew/share/qemu/edk2-aarch64-code.fd", + "/usr/share/qemu/edk2-aarch64-code.fd", + "/usr/share/AAVMF/AAVMF_CODE.fd", + "/usr/share/qemu-efi-aarch64/QEMU_EFI.fd", +]; + +export function envPort(name: string, fallback: number): number { + const raw = process.env[name]; + if (!raw) return fallback; const parsed = Number(raw); if (!Number.isInteger(parsed) || parsed <= 0) { - throw new CliError(`Invalid EMULATOR_BACKEND_PORT: ${raw}`); + throw new CliError(`Invalid ${name}: ${raw}`); } return parsed; } +function emulatorBackendPort(): number { + return envPort("EMULATOR_BACKEND_PORT", DEFAULT_EMULATOR_BACKEND_PORT); +} + function emulatorHome(): string { return process.env.STACK_EMULATOR_HOME ?? join(homedir(), ".stack", "emulator"); } @@ -37,11 +57,13 @@ function internalPckPath(): string { async function readInternalPck(timeoutMs = 60_000): Promise { const path = internalPckPath(); const deadline = Date.now() + timeoutMs; - let delay = 250; + let delay = 50; while (Date.now() < deadline) { - if (existsSync(path)) { + try { const contents = readFileSync(path, "utf-8").trim(); if (contents) return contents; + } catch (e) { + if ((e as NodeJS.ErrnoException).code !== "ENOENT") throw e; } await new Promise((r) => setTimeout(r, delay)); delay = Math.min(delay * 2, 2000); @@ -82,26 +104,71 @@ async function fetchEmulatorCredentials(pck: string, backendPort: number, config }; } -function gh(args: string[]): string { +// Resolve a GitHub auth token. We try GITHUB_TOKEN first so users can pin a +// PAT, then fall back to `gh auth token` if the gh CLI is installed and +// signed in. If neither works we return undefined — public release downloads +// still work (anonymous, lower rate limit) but artifact downloads fail with a +// clear error at the call site. 
+function githubToken(): string | undefined { + if (process.env.GITHUB_TOKEN) return process.env.GITHUB_TOKEN; try { - return execFileSync("gh", args, { encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"] }).trim(); - } catch (err: unknown) { - if (err instanceof Error && "stderr" in err && typeof err.stderr === "string") { - throw new CliError(`GitHub CLI error: ${err.stderr}`); - } - throw new CliError("GitHub CLI (gh) is required. Install: https://cli.github.com/"); + const out = execFileSync("gh", ["auth", "token"], { + encoding: "utf-8", + stdio: ["pipe", "pipe", "pipe"], + }).trim(); + return out || undefined; + } catch { + return undefined; } } +async function ghApi(path: string): Promise { + const token = githubToken(); + const headers: Record = { + Accept: "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28", + }; + if (token) headers.Authorization = `Bearer ${token}`; + const res = await fetch(`${GITHUB_API}${path}`, { headers }); + if (!res.ok) { + const body = await res.text().catch(() => ""); + const hint = res.status === 401 || res.status === 403 + ? " (set GITHUB_TOKEN or run `gh auth login` for higher rate limits / private access)" + : ""; + throw new CliError(`GitHub API ${res.status} ${res.statusText} for ${path}${hint}${body ? 
`: ${body.slice(0, 300)}` : ""}`); + } + return await (res.json() as Promise); +} + function emulatorScriptsDir(): string { const here = dirname(fileURLToPath(import.meta.url)); const bundled = join(here, "emulator"); - if (existsSync(join(bundled, "run-emulator.sh"))) return bundled; + if (existsSync(join(bundled, "run-emulator.sh"))) return ensureExecutable(bundled); const repo = resolve(here, "../../../docker/local-emulator/qemu"); - if (existsSync(join(repo, "run-emulator.sh"))) return repo; + if (existsSync(join(repo, "run-emulator.sh"))) return ensureExecutable(repo); throw new CliError("Emulator scripts not found in CLI bundle."); } +// npm pack strips the execute bit from non-`bin` files, so restore it here. +function ensureExecutable(scriptsDir: string): string { + try { + chmodSync(join(scriptsDir, "run-emulator.sh"), 0o755); + } catch { + // best-effort + } + return scriptsDir; +} + +function baseEnvPath(): string { + // Lives one directory up from the scripts dir in both bundled and repo + // layouts (dist/.env.development vs docker/local-emulator/.env.development). + const path = resolve(emulatorScriptsDir(), "..", ".env.development"); + if (!existsSync(path)) { + throw new CliError(`Emulator base.env not found at ${path}`); + } + return path; +} + function emulatorSpawnEnv(extra?: Record): NodeJS.ProcessEnv { return { ...process.env, @@ -111,6 +178,33 @@ function emulatorSpawnEnv(extra?: Record): NodeJS.ProcessEnv { }; } +// Generate the runtime config ISO that the VM mounts via STACKCFG. Replaces +// the hdiutil/mkisofs/genisoimage host dep — see ../lib/iso.ts. +function prepareRuntimeConfigIso(): void { + const vmDir = join(emulatorRunDir(), "vm"); + mkdirSync(vmDir, { recursive: true }); + const portPrefix = process.env.PORT_PREFIX ?? process.env.NEXT_PUBLIC_STACK_PORT_PREFIX ?? 
DEFAULT_PORT_PREFIX; + const dashboardPort = envPort("EMULATOR_DASHBOARD_PORT", DEFAULT_EMULATOR_DASHBOARD_PORT); + const backendPort = envPort("EMULATOR_BACKEND_PORT", DEFAULT_EMULATOR_BACKEND_PORT); + const minioPort = envPort("EMULATOR_MINIO_PORT", DEFAULT_EMULATOR_MINIO_PORT); + const inbucketPort = envPort("EMULATOR_INBUCKET_PORT", DEFAULT_EMULATOR_INBUCKET_PORT); + + const runtimeEnv = [ + `STACK_EMULATOR_PORT_PREFIX=${portPrefix}`, + `STACK_EMULATOR_DASHBOARD_HOST_PORT=${dashboardPort}`, + `STACK_EMULATOR_BACKEND_HOST_PORT=${backendPort}`, + `STACK_EMULATOR_MINIO_HOST_PORT=${minioPort}`, + `STACK_EMULATOR_INBUCKET_HOST_PORT=${inbucketPort}`, + `STACK_EMULATOR_VM_DIR_HOST=${vmDir}`, + "", + ].join("\n"); + const baseEnv = readFileSync(baseEnvPath()); + writeIso(join(vmDir, "runtime-config.iso"), "STACKCFG", [ + { name: "runtime.env", data: Buffer.from(runtimeEnv, "utf-8") }, + { name: "base.env", data: baseEnv }, + ]); +} + function runEmulator(action: string, env?: Record): Promise { const scriptsDir = emulatorScriptsDir(); mkdirSync(emulatorRunDir(), { recursive: true }); @@ -141,82 +235,327 @@ function isEmulatorRunning(): boolean { } async function startEmulator(arch: "arm64" | "amd64"): Promise { - mkdirSync(emulatorImageDir(), { recursive: true }); const img = join(emulatorImageDir(), `stack-emulator-${arch}.qcow2`); if (!existsSync(img)) { console.log("No emulator image found. Pulling latest..."); - pullRelease(arch); + await pullRelease(arch); + // Capture now so this and all subsequent starts resume fast. Skipping it + // would cold-boot today plus every future start (we never auto-capture). 
+ await captureLocalSnapshot(arch); } - await runEmulator("start", { EMULATOR_ARCH: arch }); + prepareRuntimeConfigIso(); + // Signal to run-emulator.sh that runtime-config.iso was written by the CLI + // via lib/iso.ts; the shell's ensure_runtime_config_iso should trust it and + // skip its own regeneration (which would otherwise require the + // hdiutil/mkisofs/genisoimage host dep the TS writer replaces). + await runEmulator("start", { EMULATOR_ARCH: arch, STACK_EMULATOR_CLI_WROTE_ISO: "1" }); } -function resolveArch(raw?: string): "arm64" | "amd64" { +export function resolveArch(raw?: string): "arm64" | "amd64" { const arch = raw ?? (process.arch === "arm64" ? "arm64" : process.arch === "x64" ? "amd64" : null); if (arch === "arm64" || arch === "amd64") return arch; throw new CliError(`Invalid architecture: ${raw ?? process.arch}. Expected arm64 or amd64.`); } -function pullRelease(arch: "arm64" | "amd64", opts: { repo?: string; branch?: string; tag?: string } = {}) { - const repo = opts.repo ?? "stack-auth/stack-auth"; +type ReleaseAsset = { name: string, url: string, size: number }; +type ReleaseResponse = { assets: ReleaseAsset[] }; + +async function pullRelease(arch: "arm64" | "amd64", opts: { repo?: string, branch?: string, tag?: string } = {}) { + const repo = opts.repo ?? DEFAULT_REPO; const branch = opts.branch ?? "dev"; const tag = opts.tag ?? `emulator-${branch}-latest`; - const asset = `stack-emulator-${arch}.qcow2`; const imageDir = emulatorImageDir(); mkdirSync(imageDir, { recursive: true }); + + const diskAsset = `stack-emulator-${arch}.qcow2`; + + const release = await ghApi(`/repos/${repo}/releases/tags/${tag}`); + const diskMatch = release.assets.find((a) => a.name === diskAsset); + if (!diskMatch) { + throw new CliError(`Asset ${diskAsset} not found in release ${tag}. 
Run 'stack emulator list-releases' to see available releases.`); + } + const token = githubToken(); + await downloadReleaseAsset(diskMatch, imageDir, diskAsset, token, tag); +} + +// Cold-boot the VM, wait for services, capture a snapshot via QMP, compress, +// stop. Runs once per qcow2 download so subsequent `stack emulator start`s +// resume in ~3-8s. Snapshots are always captured on the user's own machine +// because QEMU migration state isn't portable across accelerators +// (KVM/HVF/TCG) or `-cpu max` feature sets. +async function captureLocalSnapshot(arch: "arm64" | "amd64"): Promise { + preflightForVmStart("pull", arch); + prepareRuntimeConfigIso(); + console.log("Capturing local snapshot (first-time, ~1-3 min cold boot + capture)..."); + await runEmulator("capture", { EMULATOR_ARCH: arch }); +} + +async function downloadReleaseAsset( + match: ReleaseAsset, + imageDir: string, + asset: string, + token: string | undefined, + tag: string, +): Promise { const dest = join(imageDir, asset); const tmpDest = `${dest}.download`; - console.log(`Pulling ${asset} from release ${tag}...`); + const headers: Record = { Accept: "application/octet-stream" }; + if (token) headers.Authorization = `Bearer ${token}`; try { - execFileSync("gh", ["release", "download", tag, "--repo", repo, "--pattern", asset, "--output", tmpDest, "--clobber"], { stdio: "inherit" }); + await downloadWithProgress(match.url, headers, tmpDest, match.size); } catch (err) { if (existsSync(tmpDest)) unlinkSync(tmpDest); - throw new CliError(`Failed to download ${asset} from release ${tag}: ${err instanceof Error ? err.message : err}\nRun 'stack emulator list-releases' to see available releases.`); + if (err instanceof CliError) throw err; + throw new CliError(`Failed to download ${asset} from release ${tag}: ${err instanceof Error ? 
err.message : err}`); } renameSync(tmpDest, dest); console.log(`Downloaded: ${dest}`); } +async function downloadWithProgress(url: string, headers: Record, dest: string, totalBytes?: number): Promise { + const res = await fetch(url, { headers, redirect: "follow" }); + if (!res.ok || !res.body) { + throw new CliError(`Download failed (${res.status} ${res.statusText}): ${url}`); + } + const total = totalBytes ?? (Number(res.headers.get("content-length")) || 0); + const isTty = Boolean(process.stderr.isTTY); + const startedAt = Date.now(); + let downloaded = 0; + let lastRender = 0; + + const render = (final: boolean) => { + const now = Date.now(); + if (!final && now - lastRender < 100) return; + lastRender = now; + const elapsed = Math.max(0.001, (now - startedAt) / 1000); + const speed = downloaded / elapsed; + const line = renderProgressLine(downloaded, total, speed); + if (isTty) { + process.stderr.write(`\r\x1b[2K${line}`); + } else if (final) { + process.stderr.write(`${line}\n`); + } + }; + + const body = Readable.fromWeb(res.body as Parameters[0]); + body.on("data", (chunk: Buffer) => { + downloaded += chunk.byteLength; + render(false); + }); + await pipeline(body, createWriteStream(dest)); + render(true); + if (isTty) process.stderr.write("\n"); +} + +export function renderProgressLine(downloaded: number, total: number, bytesPerSec: number): string { + const barWidth = 30; + const pct = total > 0 ? Math.min(100, (downloaded / total) * 100) : 0; + const filled = total > 0 ? Math.round((downloaded / total) * barWidth) : 0; + const bar = "█".repeat(filled) + "░".repeat(Math.max(0, barWidth - filled)); + const pctStr = total > 0 ? `${pct.toFixed(1).padStart(5)}%` : " ? "; + const sizeStr = total > 0 ? `${formatBytes(downloaded)}/${formatBytes(total)}` : formatBytes(downloaded); + const speedStr = `${formatBytes(bytesPerSec)}/s`; + const etaStr = total > 0 && bytesPerSec > 0 ? 
` eta ${formatDuration((total - downloaded) / bytesPerSec)}` : ""; + return ` [${bar}] ${pctStr} ${sizeStr} ${speedStr}${etaStr}`; +} + +export function formatBytes(bytes: number): string { + if (!Number.isFinite(bytes) || bytes < 0) return "?"; + const units = ["B", "KB", "MB", "GB", "TB"]; + let v = bytes; + let i = 0; + while (v >= 1024 && i < units.length - 1) { + v /= 1024; + i++; + } + return `${v.toFixed(v < 10 && i > 0 ? 1 : 0)} ${units[i]}`; +} + +export function formatDuration(seconds: number): string { + if (!Number.isFinite(seconds) || seconds < 0) return "?"; + const s = Math.round(seconds); + if (s < 60) return `${s}s`; + const m = Math.floor(s / 60); + const rs = s % 60; + if (m < 60) return `${m}m${rs.toString().padStart(2, "0")}s`; + const h = Math.floor(m / 60); + const rm = m % 60; + return `${h}h${rm.toString().padStart(2, "0")}m`; +} + +// --- Dependency preflight --------------------------------------------------- + +type BinarySpec = { name: string, install: string }; + +function commandExists(bin: string): boolean { + try { + execFileSync(process.platform === "win32" ? 
"where" : "which", [bin], { stdio: "pipe" }); + return true; + } catch { + return false; + } +} + +export function platformInstallHint(linuxPkg: string, macPkg: string): string { + switch (process.platform) { + case "darwin": { + return `brew install ${macPkg}`; + } + case "linux": { + return `apt install ${linuxPkg} (or your distro's equivalent)`; + } + default: { + return `install ${macPkg}`; + } + } +} + +function bin(name: string, linuxPkg: string, macPkg: string): BinarySpec { + return { name, install: platformInstallHint(linuxPkg, macPkg) }; +} + +function requireBinaries(commandName: string, bins: BinarySpec[]): void { + const missing = bins.filter((b) => !commandExists(b.name)); + if (missing.length === 0) return; + const lines = missing.map((b) => ` - ${b.name} → ${b.install}`); + throw new CliError( + `\`stack emulator ${commandName}\` requires the following missing binaries:\n${lines.join("\n")}`, + ); +} + +function warnIfMissing(commandName: string, bins: BinarySpec[]): void { + const missing = bins.filter((b) => !commandExists(b.name)); + if (missing.length === 0) return; + for (const b of missing) { + console.warn(`[stack emulator ${commandName}] optional dep '${b.name}' missing — feature degraded. 
Install: ${b.install}`); + } +} + +function aarch64FirmwareAvailable(): boolean { + return AARCH64_FIRMWARE_PATHS.some((p) => existsSync(p)); +} + +function commonVmBins(): BinarySpec[] { + return [ + bin("qemu-img", "qemu-utils", "qemu"), + bin("socat", "socat", "socat"), + bin("curl", "curl", "curl"), + bin("nc", "ncat", "netcat"), + bin("lsof", "lsof", "lsof"), + bin("openssl", "openssl", "openssl"), + ]; +} + +function archSpecificQemuBin(arch: "arm64" | "amd64"): BinarySpec { + if (arch === "arm64") { + return bin("qemu-system-aarch64", "qemu-system-arm", "qemu"); + } + return bin("qemu-system-x86_64", "qemu-system-x86", "qemu"); +} + +function preflightForVmStart(commandName: string, arch: "arm64" | "amd64"): void { + requireBinaries(commandName, [archSpecificQemuBin(arch), ...commonVmBins()]); + warnIfMissing(commandName, [bin("zstd", "zstd", "zstd")]); + if (arch === "arm64" && !aarch64FirmwareAvailable()) { + throw new CliError( + `aarch64 UEFI firmware not found. Looked in:\n${AARCH64_FIRMWARE_PATHS.map((p) => ` - ${p}`).join("\n")}\n` + + `Install: ${platformInstallHint("qemu-efi-aarch64", "qemu")}`, + ); + } +} + +// --- Workflow run / artifact downloads (replaces `gh run download`) --------- + +type WorkflowRunsResponse = { workflow_runs: { id: number }[] }; +type ArtifactsResponse = { artifacts: { id: number, name: string, size_in_bytes: number }[] }; +type PullResponse = { head: { ref: string } }; + +async function downloadArtifactByName(repo: string, runId: string, name: string, destDir: string): Promise { + const token = githubToken(); + if (!token) { + throw new CliError( + "Downloading workflow run artifacts requires authentication. 
Set GITHUB_TOKEN or run `gh auth login`.", + ); + } + const list = await ghApi(`/repos/${repo}/actions/runs/${runId}/artifacts?per_page=100`); + const match = list.artifacts.find((a) => a.name === name); + if (!match) return false; + const zipPath = join(destDir, `${name}.zip`); + console.log(`Downloading artifact '${name}' from run ${runId}...`); + await downloadWithProgress( + `${GITHUB_API}/repos/${repo}/actions/artifacts/${match.id}/zip`, + { Accept: "application/vnd.github+json", Authorization: `Bearer ${token}` }, + zipPath, + match.size_in_bytes, + ); + await extract(zipPath, { dir: destDir }); + unlinkSync(zipPath); + return true; +} + export function registerEmulatorCommand(program: Command) { const emulator = program.command("emulator").description("Manage the QEMU local emulator"); emulator .command("pull") - .description("Download an emulator image from GitHub Releases or a PR build") + .description("Download an emulator image from GitHub Releases or a PR build, then capture a local fast-start snapshot") .option("--arch ", "Target architecture (default: current system arch)") .option("--branch ", "Release branch (default: dev)") .option("--tag ", "Specific release tag (default: latest)") .option("--repo ", "GitHub repository (default: stack-auth/stack-auth)") .option("--pr ", "Pull from a PR's CI artifacts") .option("--run ", "Pull from a specific workflow run's artifacts") - .action(async (opts) => { + .option("--skip-snapshot", "Download only the qcow2; skip the one-time local snapshot capture") + .action(async (opts: { arch?: string, repo?: string, branch?: string, tag?: string, pr?: string, run?: string, skipSnapshot?: boolean }) => { const arch = resolveArch(opts.arch); - const repo = opts.repo ?? "stack-auth/stack-auth"; + const repo = opts.repo ?? 
DEFAULT_REPO; if (opts.run || opts.pr) { - let runId = opts.run as string | undefined; + let runId = opts.run; if (!runId) { console.log(`Finding latest successful build for PR #${opts.pr}...`); - const { headRefName } = JSON.parse(gh(["pr", "view", opts.pr, "--repo", repo, "--json", "headRefName"])); - const runs = JSON.parse(gh(["run", "list", "--repo", repo, "--workflow", "qemu-emulator-build.yaml", "--branch", headRefName, "--status", "success", "--limit", "1", "--json", "databaseId"])); - if (runs.length === 0) throw new CliError(`No successful build found for PR #${opts.pr} (branch: ${headRefName}).`); - runId = String(runs[0].databaseId); + const pr = await ghApi(`/repos/${repo}/pulls/${opts.pr}`); + const headRefName = pr.head.ref; + const runs = await ghApi( + `/repos/${repo}/actions/workflows/qemu-emulator-build.yaml/runs?branch=${encodeURIComponent(headRefName)}&status=success&per_page=1`, + ); + if (runs.workflow_runs.length === 0) { + throw new CliError(`No successful build found for PR #${opts.pr} (branch: ${headRefName}).`); + } + runId = String(runs.workflow_runs[0].id); } const imageDir = emulatorImageDir(); mkdirSync(imageDir, { recursive: true }); const dest = join(imageDir, `stack-emulator-${arch}.qcow2`); + const snapshotDest = join(imageDir, `stack-emulator-${arch}.savevm.zst`); + const snapshotRawDest = join(imageDir, `stack-emulator-${arch}.savevm.raw`); if (existsSync(dest)) unlinkSync(dest); - console.log(`Downloading qemu-emulator-${arch} from workflow run ${runId}...`); - try { - execFileSync("gh", ["run", "download", runId, "--repo", repo, "--name", `qemu-emulator-${arch}`, "--dir", imageDir], { stdio: "inherit" }); - } catch (err) { - throw new CliError(`Failed to download artifact from run ${runId}: ${err instanceof Error ? err.message : err}`); + // Stale snapshots from a previous pull would resume against the new + // qcow2 and crash; wipe them so capture rebuilds cleanly. 
+ if (existsSync(snapshotDest)) unlinkSync(snapshotDest); + if (existsSync(snapshotRawDest)) unlinkSync(snapshotRawDest); + const downloaded = await downloadArtifactByName(repo, runId, `qemu-emulator-${arch}`, imageDir); + if (!downloaded) { + throw new CliError(`Artifact qemu-emulator-${arch} not found in workflow run ${runId}.`); } if (!existsSync(dest)) throw new CliError(`Expected image not found at ${dest} after download.`); console.log(`Downloaded: ${dest}`); } else { - pullRelease(arch, { repo, branch: opts.branch, tag: opts.tag }); + // Same stale-snapshot concern as the PR branch above. + const imageDir = emulatorImageDir(); + const snapshotDest = join(imageDir, `stack-emulator-${arch}.savevm.zst`); + const snapshotRawDest = join(imageDir, `stack-emulator-${arch}.savevm.raw`); + if (existsSync(snapshotDest)) unlinkSync(snapshotDest); + if (existsSync(snapshotRawDest)) unlinkSync(snapshotRawDest); + await pullRelease(arch, { repo, branch: opts.branch, tag: opts.tag }); + } + + if (opts.skipSnapshot) { + console.log("--skip-snapshot: not capturing a local snapshot. 
First `stack emulator start` will cold-boot."); + } else { + await captureLocalSnapshot(arch); } }); @@ -227,6 +566,7 @@ export function registerEmulatorCommand(program: Command) { .option("--config-file ", "Path to a config file; when set, credentials for this project are printed to stdout as JSON") .action(async (opts: { arch?: string, configFile?: string }) => { const arch = resolveArch(opts.arch); + preflightForVmStart("start", arch); let resolvedConfigFile: string | undefined; if (opts.configFile) { @@ -257,6 +597,7 @@ export function registerEmulatorCommand(program: Command) { .option("--config-file ", "Path to a config file; fetches credentials and injects STACK_PROJECT_ID / STACK_PUBLISHABLE_CLIENT_KEY / STACK_SECRET_SERVER_KEY into the child") .action(async (cmd: string, opts: { arch?: string, configFile?: string }) => { const arch = resolveArch(opts.arch); + preflightForVmStart("run", arch); let resolvedConfigFile: string | undefined; if (opts.configFile) { @@ -281,11 +622,17 @@ export function registerEmulatorCommand(program: Command) { const apiUrl = `http://127.0.0.1:${backendPort}`; childEnv.STACK_PROJECT_ID = creds.project_id; childEnv.NEXT_PUBLIC_STACK_PROJECT_ID = creds.project_id; + childEnv.VITE_STACK_PROJECT_ID = creds.project_id; + childEnv.EXPO_PUBLIC_STACK_PROJECT_ID = creds.project_id; childEnv.STACK_PUBLISHABLE_CLIENT_KEY = creds.publishable_client_key; childEnv.NEXT_PUBLIC_STACK_PUBLISHABLE_CLIENT_KEY = creds.publishable_client_key; + childEnv.VITE_STACK_PUBLISHABLE_CLIENT_KEY = creds.publishable_client_key; + childEnv.EXPO_PUBLIC_STACK_PUBLISHABLE_CLIENT_KEY = creds.publishable_client_key; childEnv.STACK_SECRET_SERVER_KEY = creds.secret_server_key; childEnv.STACK_API_URL = apiUrl; childEnv.NEXT_PUBLIC_STACK_API_URL = apiUrl; + childEnv.VITE_STACK_API_URL = apiUrl; + childEnv.EXPO_PUBLIC_STACK_API_URL = apiUrl; } const child = spawn(cmd, { shell: true, stdio: "inherit", env: childEnv }); @@ -304,25 +651,61 @@ export function 
registerEmulatorCommand(program: Command) { process.exit(exitCode); } else { console.log("\nStopping emulator..."); + const warnStopFailed = (e: unknown) => { + const msg = e instanceof Error ? e.message : String(e); + process.stderr.write(`Failed to stop emulator cleanly: ${msg}\n`); + }; runEmulator("stop") - .catch(() => { /* best-effort stop */ }) + .catch(warnStopFailed) .finally(() => process.exit(exitCode)); } }); }); - emulator.command("stop").description("Stop the emulator (data preserved; use 'reset' to clear)").action(() => runEmulator("stop")); - emulator.command("reset").description("Reset emulator state for a fresh boot").action(() => runEmulator("reset")); - emulator.command("status").description("Show emulator and service health").action(() => runEmulator("status")); + emulator + .command("stop") + .description("Stop the emulator (data preserved; use 'reset' to clear)") + .action(() => { + requireBinaries("stop", [bin("socat", "socat", "socat")]); + return runEmulator("stop"); + }); + + emulator + .command("reset") + .description("Reset emulator state for a fresh boot") + .action(() => { + requireBinaries("reset", [bin("socat", "socat", "socat")]); + return runEmulator("reset"); + }); + + emulator + .command("status") + .description("Show emulator and service health") + .action(() => { + requireBinaries("status", [ + bin("curl", "curl", "curl"), + bin("nc", "ncat", "netcat"), + ]); + return runEmulator("status"); + }); emulator .command("list-releases") .description("List available emulator releases") .option("--repo ", "GitHub repository (default: stack-auth/stack-auth)") - .action((opts) => { - const repo = opts.repo ?? "stack-auth/stack-auth"; + .action(async (opts) => { + const repo = opts.repo ?? 
DEFAULT_REPO; console.log(`Available emulator releases from ${repo}:\n`); - const lines = gh(["release", "list", "--repo", repo, "--limit", "20"]).split("\n").filter((l) => l.toLowerCase().includes("emulator")); + type Release = { tag_name: string, name: string | null, published_at: string | null, draft: boolean, prerelease: boolean }; + const releases = await ghApi(`/repos/${repo}/releases?per_page=50`); + const lines = releases + .filter((r) => (r.tag_name + " " + (r.name ?? "")).toLowerCase().includes("emulator")) + .slice(0, 20) + .map((r) => { + const status = r.draft ? "Draft" : r.prerelease ? "Pre-release" : "Latest"; + const date = r.published_at ? r.published_at.slice(0, 10) : ""; + return `${r.tag_name}\t${status}\t${date}`; + }); if (lines.length === 0) console.log("No emulator releases found."); else for (const line of lines) console.log(line); }); diff --git a/packages/stack-cli/src/lib/iso.test.ts b/packages/stack-cli/src/lib/iso.test.ts new file mode 100644 index 0000000000..91f069e7a5 --- /dev/null +++ b/packages/stack-cli/src/lib/iso.test.ts @@ -0,0 +1,259 @@ +import { describe, expect, it } from "vitest"; +import { buildIso, type IsoFile } from "./iso.js"; + +const SECTOR = 2048; + +// --- Test helpers: a minimal ISO 9660 parser, just enough to walk the +// directory records we produce so tests can assert the bytes we emitted really +// are addressable at the offsets claimed in the directory records. 
+ +function readSector(iso: Buffer, sector: number): Buffer { + return iso.subarray(sector * SECTOR, (sector + 1) * SECTOR); +} + +function readVolumeDescriptor(iso: Buffer, sector: number): { type: number, id: string } { + const buf = readSector(iso, sector); + return { type: buf[0], id: buf.toString("ascii", 1, 6) }; +} + +type DirRecord = { + lenDr: number, + extentSector: number, + dataLength: number, + isDir: boolean, + fileId: Buffer, +}; + +function parseDirRecords(sector: Buffer): DirRecord[] { + const records: DirRecord[] = []; + let offset = 0; + while (offset < sector.length) { + const lenDr = sector[offset]; + if (lenDr === 0) break; + const extentSector = sector.readUInt32LE(offset + 2); + const dataLength = sector.readUInt32LE(offset + 10); + const flags = sector[offset + 25]; + const lenFi = sector[offset + 32]; + const fileId = sector.subarray(offset + 33, offset + 33 + lenFi); + records.push({ + lenDr, + extentSector, + dataLength, + isDir: (flags & 0x02) !== 0, + fileId: Buffer.from(fileId), + }); + offset += lenDr; + } + return records; +} + +// Follow PVD → root dir → pull file bytes by ISO-9660 name ("NAME.EXT;1"). +function readIsoFile(iso: Buffer, isoName: string): Buffer | null { + const pvd = readSector(iso, 16); + const rootSector = pvd.readUInt32LE(156 + 2); + const rootRecords = parseDirRecords(readSector(iso, rootSector)); + const match = rootRecords.find((r) => r.fileId.toString("ascii") === isoName); + if (!match) return null; + const start = match.extentSector * SECTOR; + return iso.subarray(start, start + match.dataLength); +} + +// Same, but follow the Joliet SVD (so names are UCS-2 BE). 
+function readJolietFile(iso: Buffer, name: string): Buffer | null { + const svd = readSector(iso, 17); + if (svd[0] !== 2) return null; + const rootSector = svd.readUInt32LE(156 + 2); + const rootRecords = parseDirRecords(readSector(iso, rootSector)); + const expected = Buffer.alloc(name.length * 2); + for (let i = 0; i < name.length; i++) expected.writeUInt16BE(name.charCodeAt(i), i * 2); + const match = rootRecords.find((r) => r.fileId.equals(expected)); + if (!match) return null; + const start = match.extentSector * SECTOR; + return iso.subarray(start, start + match.dataLength); +} + +function sampleFile(name: string, size: number, byte = 0x41): IsoFile { + return { name, data: Buffer.alloc(size, byte) }; +} + +describe("buildIso — structural invariants", () => { + it("emits the ISO 9660 standard identifiers at sectors 16, 17, 18", () => { + const iso = buildIso("STACKCFG", [{ name: "a.txt", data: Buffer.from("hi") }]); + expect(readVolumeDescriptor(iso, 16)).toEqual({ type: 1, id: "CD001" }); + expect(readVolumeDescriptor(iso, 17)).toEqual({ type: 2, id: "CD001" }); + expect(readVolumeDescriptor(iso, 18)).toEqual({ type: 0xff, id: "CD001" }); + }); + + it("stores the volume identifier verbatim in the PVD for blkid discovery", () => { + const iso = buildIso("STACKCFG", [{ name: "a.txt", data: Buffer.from("x") }]); + const pvd = readSector(iso, 16); + expect(pvd.toString("ascii", 40, 40 + 8)).toBe("STACKCFG"); + }); + + it("stores the volume identifier in the Joliet SVD as UCS-2 BE", () => { + const iso = buildIso("STACKCFG", [{ name: "a.txt", data: Buffer.from("x") }]); + const svd = readSector(iso, 17); + const ucs = svd.subarray(40, 40 + 16); + let decoded = ""; + for (let i = 0; i < ucs.length; i += 2) decoded += String.fromCharCode(ucs.readUInt16BE(i)); + expect(decoded).toBe("STACKCFG"); + }); + + it("sets the Joliet escape sequence %/E", () => { + const iso = buildIso("STACKCFG", [{ name: "a.txt", data: Buffer.from("x") }]); + const svd = readSector(iso, 
17); + expect(svd[88]).toBe(0x25); + expect(svd[89]).toBe(0x2f); + expect(svd[90]).toBe(0x45); + }); + + it("declares a volume space size equal to the emitted sector count", () => { + const iso = buildIso("STACKCFG", [{ name: "a.txt", data: Buffer.from("hello world") }]); + const pvd = readSector(iso, 16); + const declared = pvd.readUInt32LE(80); + expect(iso.length).toBe(declared * SECTOR); + }); +}); + +describe("buildIso — file round-trip", () => { + it("makes files readable by ISO 9660 name", () => { + const iso = buildIso("STACKCFG", [ + { name: "runtime.env", data: Buffer.from("KEY=value\n") }, + { name: "base.env", data: Buffer.from("FOO=bar\n") }, + ]); + expect(readIsoFile(iso, "RUNTIME.ENV;1")?.toString()).toBe("KEY=value\n"); + expect(readIsoFile(iso, "BASE.ENV;1")?.toString()).toBe("FOO=bar\n"); + }); + + it("makes files readable by Joliet (lowercase) name", () => { + const iso = buildIso("STACKCFG", [ + { name: "runtime.env", data: Buffer.from("KEY=value\n") }, + { name: "base.env", data: Buffer.from("FOO=bar\n") }, + ]); + expect(readJolietFile(iso, "runtime.env")?.toString()).toBe("KEY=value\n"); + expect(readJolietFile(iso, "base.env")?.toString()).toBe("FOO=bar\n"); + }); + + it("preserves exact file contents byte-for-byte", () => { + const content = Buffer.from([0x00, 0xff, 0x7f, 0x80, 0x41, 0x42, 0x43]); + const iso = buildIso("STACKCFG", [{ name: "bin.dat", data: content }]); + expect(readJolietFile(iso, "bin.dat")?.equals(content)).toBe(true); + }); + + it("handles files whose length is exactly one sector", () => { + const content = Buffer.alloc(SECTOR, 0x37); + const iso = buildIso("STACKCFG", [{ name: "one.bin", data: content }]); + expect(readJolietFile(iso, "one.bin")?.equals(content)).toBe(true); + }); + + it("handles files that span multiple sectors", () => { + const content = Buffer.alloc(SECTOR * 3 + 17, 0x55); + const iso = buildIso("STACKCFG", [{ name: "big.bin", data: content }]); + expect(readJolietFile(iso, 
"big.bin")?.equals(content)).toBe(true); + }); + + it("keeps files byte-exact at the claimed extent sector across multi-file layouts", () => { + // Fingerprint each file so we can tell them apart even if extents shift. + const files: IsoFile[] = [ + { name: "alpha.bin", data: Buffer.alloc(SECTOR + 5, 0xaa) }, + { name: "beta.bin", data: Buffer.alloc(SECTOR * 2, 0xbb) }, + { name: "gamma.bin", data: Buffer.alloc(42, 0xcc) }, + ]; + const iso = buildIso("STACKCFG", files); + for (const f of files) { + expect(readJolietFile(iso, f.name)?.equals(f.data)).toBe(true); + } + }); +}); + +describe("buildIso — edge cases", () => { + it("handles empty files without misaligning subsequent file extents", () => { + // Regression: `padToSector(Buffer.alloc(0))` used to return a 0-byte + // buffer, but the layout reserved 1 sector for the empty file — the next + // file was then read from the empty file's reserved slot. + const files: IsoFile[] = [ + { name: "empty.txt", data: Buffer.alloc(0) }, + { name: "after.txt", data: Buffer.from("marker\n") }, + ]; + const iso = buildIso("STACKCFG", files); + expect(readJolietFile(iso, "empty.txt")?.length).toBe(0); + expect(readJolietFile(iso, "after.txt")?.toString()).toBe("marker\n"); + // And: the declared volume space size must cover every emitted byte. + const pvd = readSector(iso, 16); + expect(iso.length).toBe(pvd.readUInt32LE(80) * SECTOR); + }); + + it("writes the exact file length in the directory record (not padded to sector)", () => { + const content = Buffer.from("abc"); + const iso = buildIso("STACKCFG", [{ name: "tiny.txt", data: content }]); + const svd = readSector(iso, 17); + const rootSector = svd.readUInt32LE(156 + 2); + const records = parseDirRecords(readSector(iso, rootSector)); + const file = records.find((r) => !r.isDir); + expect(file?.dataLength).toBe(3); + }); + + it("places the root directory records for . and .. 
pointing at the root extent", () => {
+    const iso = buildIso("STACKCFG", [{ name: "x.txt", data: Buffer.from("1") }]);
+    const svd = readSector(iso, 17);
+    const rootSector = svd.readUInt32LE(156 + 2);
+    const records = parseDirRecords(readSector(iso, rootSector));
+    expect(records.length).toBeGreaterThanOrEqual(2);
+    expect(records[0].fileId.equals(Buffer.from([0x00]))).toBe(true);
+    expect(records[1].fileId.equals(Buffer.from([0x01]))).toBe(true);
+    expect(records[0].isDir).toBe(true);
+    expect(records[0].extentSector).toBe(rootSector);
+    expect(records[1].extentSector).toBe(rootSector);
+  });
+
+  it("truncates volume identifiers longer than 32 bytes rather than corrupting the PVD", () => {
+    const longId = "A".repeat(64);
+    const iso = buildIso(longId, [{ name: "x.txt", data: Buffer.from("1") }]);
+    const pvd = readSector(iso, 16);
+    expect(pvd.toString("ascii", 40, 40 + 32)).toBe("A".repeat(32));
+    // Sector 17 should still be the Joliet SVD, not clobbered.
+    expect(pvd[881]).toBe(1);
+    expect(readVolumeDescriptor(iso, 17).type).toBe(2);
+  });
+
+  it("rejects an input set whose root directory record overflows one sector", () => {
+    // Each Joliet dir record for an N-char name is 33 + 2N + (2N even ? 1 : 0)
+    // ≈ 2N + 34 bytes. A sector is 2048. Twenty 30-char names → ~1880 bytes
+    // plus "." + ".." (68) → fits. Eighty of them → well over a sector. 
+ const many: IsoFile[] = Array.from({ length: 80 }, (_, i) => ({ + name: `file-${String(i).padStart(3, "0")}-padding-padding.bin`, + data: Buffer.from("x"), + })); + expect(() => buildIso("STACKCFG", many)).toThrow(/Root directory exceeds/); + }); + + it("produces a sector-aligned buffer regardless of file sizes", () => { + for (const size of [0, 1, SECTOR - 1, SECTOR, SECTOR + 1, SECTOR * 5 - 1]) { + const iso = buildIso("STACKCFG", [sampleFile("a.bin", size)]); + expect(iso.length % SECTOR).toBe(0); + } + }); +}); + +describe("buildIso — multiple file sector layout", () => { + it("assigns non-overlapping extents to all files", () => { + const files: IsoFile[] = [ + sampleFile("a.bin", 10, 0x01), + sampleFile("b.bin", SECTOR, 0x02), + sampleFile("c.bin", SECTOR * 2 + 500, 0x03), + sampleFile("d.bin", 1, 0x04), + ]; + const iso = buildIso("STACKCFG", files); + const svd = readSector(iso, 17); + const rootSector = svd.readUInt32LE(156 + 2); + const records = parseDirRecords(readSector(iso, rootSector)).filter((r) => !r.isDir); + + // Extents must be strictly ordered and non-overlapping. + const sorted = [...records].sort((a, b) => a.extentSector - b.extentSector); + for (let i = 1; i < sorted.length; i++) { + const prev = sorted[i - 1]; + const prevEndSector = prev.extentSector + Math.max(1, Math.ceil(prev.dataLength / SECTOR)); + expect(sorted[i].extentSector).toBeGreaterThanOrEqual(prevEndSector); + } + }); +}); diff --git a/packages/stack-cli/src/lib/iso.ts b/packages/stack-cli/src/lib/iso.ts new file mode 100644 index 0000000000..6b8ac1bb12 --- /dev/null +++ b/packages/stack-cli/src/lib/iso.ts @@ -0,0 +1,399 @@ +// Minimal ISO 9660 + Joliet writer used to package the runtime config blob +// that the emulator VM mounts at boot via /dev/disk/by-label/STACKCFG. +// +// Replaces the host-side dependency on hdiutil/mkisofs/genisoimage. 
Only the +// subset of ECMA-119 needed for a single-level root directory of small UTF-8 +// text files is implemented: PVD + Joliet SVD + path tables + root dir + file +// data. Names are emitted in both ISO 9660 ("BASE.ENV;1") and Joliet +// (lower-case UCS-2) form so Linux mounts the Joliet view by default and the +// guest's `source /mnt/stack-runtime/runtime.env` works unchanged. + +import { writeFileSync } from "fs"; + +const SECTOR = 2048; + +function bothEndian32(n: number): Buffer { + const b = Buffer.alloc(8); + b.writeUInt32LE(n, 0); + b.writeUInt32BE(n, 4); + return b; +} + +function bothEndian16(n: number): Buffer { + const b = Buffer.alloc(4); + b.writeUInt16LE(n, 0); + b.writeUInt16BE(n, 2); + return b; +} + +function padString(s: string, len: number, fill = " "): Buffer { + const buf = Buffer.alloc(len, fill.charCodeAt(0)); + buf.write(s.slice(0, len), 0, "ascii"); + return buf; +} + +function ucs2BE(s: string): Buffer { + const buf = Buffer.alloc(s.length * 2); + for (let i = 0; i < s.length; i++) { + buf.writeUInt16BE(s.charCodeAt(i), i * 2); + } + return buf; +} + +function padUcs2BE(s: string, byteLen: number): Buffer { + const buf = Buffer.alloc(byteLen); + const wholeChars = Math.floor(byteLen / 2); + for (let i = 0; i < wholeChars; i++) { + buf.writeUInt16BE(i < s.length ? s.charCodeAt(i) : 0x0020, i * 2); + } + // Odd-length fields (e.g. 37-byte Copyright/Abstract/Bibliographic IDs) get + // a trailing space byte; spec allows either NUL or 0x20 padding. 
+  if (byteLen % 2 === 1) {
+    buf[byteLen - 1] = 0x20;
+  }
+  return buf;
+}
+
+function dirRecordingDate(d: Date): Buffer {
+  const buf = Buffer.alloc(7);
+  buf[0] = d.getUTCFullYear() - 1900;
+  buf[1] = d.getUTCMonth() + 1;
+  buf[2] = d.getUTCDate();
+  buf[3] = d.getUTCHours();
+  buf[4] = d.getUTCMinutes();
+  buf[5] = d.getUTCSeconds();
+  buf[6] = 0;
+  return buf;
+}
+
+function volumeDate(d: Date): Buffer {
+  const pad = (n: number, w: number) => String(n).padStart(w, "0");
+  const s =
+    pad(d.getUTCFullYear(), 4) +
+    pad(d.getUTCMonth() + 1, 2) +
+    pad(d.getUTCDate(), 2) +
+    pad(d.getUTCHours(), 2) +
+    pad(d.getUTCMinutes(), 2) +
+    pad(d.getUTCSeconds(), 2) +
+    "00";
+  const buf = Buffer.alloc(17);
+  buf.write(s, 0, 16, "ascii");
+  buf[16] = 0;
+  return buf;
+}
+
+const UNUSED_VOLUME_DATE = (() => {
+  const buf = Buffer.alloc(17, "0".charCodeAt(0));
+  buf[16] = 0;
+  return buf;
+})();
+
+// Encodes an ISO 9660 file identifier ("FILENAME.EXT;1"): uppercases the name
+// and appends the ";1" version suffix itself, so callers pass the plain name.
+function isoFileIdentifier(name: string): Buffer {
+  const upper = name.toUpperCase();
+  return Buffer.from(`${upper};1`, "ascii");
+}
+
+// Builds a single directory record. `idBytes` is the file identifier bytes
+// (ASCII for ISO, UCS-2 BE for Joliet); for the "." / ".." entries, callers
+// pass a single 0x00 / 0x01 byte as `idBytes` instead.
+function buildDirRecord(
+  extentSector: number,
+  dataLength: number,
+  isDir: boolean,
+  recDate: Buffer,
+  idBytes: Buffer,
+): Buffer {
+  const lenFi = idBytes.length;
+  const pad = lenFi % 2 === 0 ? 1 : 0;
+  const lenDr = 33 + lenFi + pad;
+  const buf = Buffer.alloc(lenDr);
+  buf[0] = lenDr;
+  buf[1] = 0;
+  bothEndian32(extentSector).copy(buf, 2);
+  bothEndian32(dataLength).copy(buf, 10);
+  recDate.copy(buf, 18);
+  buf[25] = isDir ? 
0x02 : 0x00;
+  buf[26] = 0;
+  buf[27] = 0;
+  bothEndian16(1).copy(buf, 28);
+  buf[32] = lenFi;
+  idBytes.copy(buf, 33);
+  return buf;
+}
+
+function buildRootDirEntries(
+  rootSector: number,
+  rootSize: number,
+  recDate: Buffer,
+  files: { idBytes: Buffer, sector: number, size: number }[],
+): Buffer {
+  const records: Buffer[] = [];
+  records.push(buildDirRecord(rootSector, rootSize, true, recDate, Buffer.from([0x00])));
+  records.push(buildDirRecord(rootSector, rootSize, true, recDate, Buffer.from([0x01])));
+  for (const f of files) {
+    records.push(buildDirRecord(f.sector, f.size, false, recDate, f.idBytes));
+  }
+
+  // Records may not span sector boundaries; pack them with sector padding.
+  const sectors: Buffer[] = [];
+  let current = Buffer.alloc(0);
+  for (const r of records) {
+    if (current.length + r.length > SECTOR) {
+      sectors.push(Buffer.concat([current, Buffer.alloc(SECTOR - current.length)]));
+      current = Buffer.alloc(0);
+    }
+    current = Buffer.concat([current, r]);
+  }
+  if (current.length > 0) {
+    sectors.push(Buffer.concat([current, Buffer.alloc(SECTOR - current.length)]));
+  }
+  return Buffer.concat(sectors);
+}
+
+// Single-entry path table for the root directory. Used for both the L and M
+// tables; `byteOrder` ("LE" / "BE") selects the multi-byte field encoding.
+function buildPathTable(rootSector: number, byteOrder: "LE" | "BE"): Buffer {
+  const buf = Buffer.alloc(10);
+  buf[0] = 1; // LEN_DI
+  buf[1] = 0; // EAR length
+  if (byteOrder === "LE") {
+    buf.writeUInt32LE(rootSector, 2);
+    buf.writeUInt16LE(1, 6);
+  } else {
+    buf.writeUInt32BE(rootSector, 2);
+    buf.writeUInt16BE(1, 6);
+  }
+  buf[8] = 0; // root identifier
+  buf[9] = 0; // pad
+  return buf;
+}
+
+function padToSector(buf: Buffer): Buffer {
+  const rem = buf.length % SECTOR;
+  if (rem === 0) return buf;
+  return Buffer.concat([buf, Buffer.alloc(SECTOR - rem)]);
+}
+
+// Build a Volume Descriptor (PVD or Joliet SVD). 
`joliet` switches volume-name +// fields to UCS-2 BE and sets the Joliet escape sequence. +function buildVolumeDescriptor(opts: { + joliet: boolean, + volumeId: string, + volumeSpaceSize: number, + pathTableSize: number, + lPathSector: number, + mPathSector: number, + rootDirRecord: Buffer, + date: Buffer, +}): Buffer { + const buf = Buffer.alloc(SECTOR); + buf[0] = opts.joliet ? 2 : 1; + buf.write("CD001", 1, 5, "ascii"); + buf[6] = 1; + buf[7] = 0; + + // System Identifier (32 bytes) + if (opts.joliet) { + padUcs2BE("", 32).copy(buf, 8); + } else { + padString("", 32).copy(buf, 8); + } + + // Volume Identifier (32 bytes) — must be "STACKCFG" so udev exposes it as + // /dev/disk/by-label/STACKCFG. blkid reads from PVD by default but Joliet + // takes precedence when both are present. + if (opts.joliet) { + padUcs2BE(opts.volumeId, 32).copy(buf, 40); + } else { + padString(opts.volumeId, 32).copy(buf, 40); + } + + bothEndian32(opts.volumeSpaceSize).copy(buf, 80); + + if (opts.joliet) { + // Escape sequence for UCS-2 Level 3 ("%/E") at offset 88 (32 bytes). + buf[88] = 0x25; + buf[89] = 0x2f; + buf[90] = 0x45; + } + + bothEndian16(1).copy(buf, 120); // Volume Set Size + bothEndian16(1).copy(buf, 124); // Volume Sequence Number + bothEndian16(SECTOR).copy(buf, 128); // Logical Block Size + bothEndian32(opts.pathTableSize).copy(buf, 132); + buf.writeUInt32LE(opts.lPathSector, 140); + buf.writeUInt32LE(0, 144); // optional L + buf.writeUInt32BE(opts.mPathSector, 148); + buf.writeUInt32BE(0, 152); // optional M + + opts.rootDirRecord.copy(buf, 156); + + const padFn = opts.joliet + ? 
(s: string, n: number) => padUcs2BE(s, n) + : (s: string, n: number) => padString(s, n); + + padFn("", 128).copy(buf, 190); // Volume Set Identifier + padFn("", 128).copy(buf, 318); // Publisher Identifier + padFn("", 128).copy(buf, 446); // Data Preparer Identifier + padFn("", 128).copy(buf, 574); // Application Identifier + padFn("", 37).copy(buf, 702); // Copyright File Identifier + padFn("", 37).copy(buf, 739); // Abstract File Identifier + padFn("", 37).copy(buf, 776); // Bibliographic File Identifier + + opts.date.copy(buf, 813); // Creation + opts.date.copy(buf, 830); // Modification + UNUSED_VOLUME_DATE.copy(buf, 847); // Expiration + UNUSED_VOLUME_DATE.copy(buf, 864); // Effective + + buf[881] = 1; // File Structure Version + return buf; +} + +function buildVolumeDescriptorTerminator(): Buffer { + const buf = Buffer.alloc(SECTOR); + buf[0] = 0xff; + buf.write("CD001", 1, 5, "ascii"); + buf[6] = 1; + return buf; +} + +export type IsoFile = { name: string, data: Buffer }; + +export function buildIso(volumeId: string, files: IsoFile[]): Buffer { + const date = new Date(); + const recDate = dirRecordingDate(date); + const volDateBuf = volumeDate(date); + + // Compute per-file directory record sizes for both views. + const isoEntries = files.map((f) => ({ + file: f, + idBytes: isoFileIdentifier(f.name), + })); + const jolietEntries = files.map((f) => ({ + file: f, + idBytes: ucs2BE(f.name), + })); + + // We need root sector + size before we know file sectors — but file sectors + // depend only on the root dir size, which depends only on the file count. + // Compute the root dir buffer twice if needed (sizes are stable since they + // depend only on identifier bytes, not on file extents). + const dirRecLen = (lenFi: number) => 33 + lenFi + (lenFi % 2 === 0 ? 
1 : 0); + const isoRootSize = 34 + 34 + isoEntries.reduce((acc, e) => acc + dirRecLen(e.idBytes.length), 0); + const jolietRootSize = 34 + 34 + jolietEntries.reduce((acc, e) => acc + dirRecLen(e.idBytes.length), 0); + if (isoRootSize > SECTOR || jolietRootSize > SECTOR) { + throw new Error(`Root directory exceeds ${SECTOR} bytes; multi-sector root not supported.`); + } + + // Sector layout. + const sysAreaSectors = 16; + const pvdSector = sysAreaSectors; + const svdSector = pvdSector + 1; + const termSector = svdSector + 1; + const isoLPathSector = termSector + 1; + const isoMPathSector = isoLPathSector + 1; + const jolietLPathSector = isoMPathSector + 1; + const jolietMPathSector = jolietLPathSector + 1; + const isoRootSector = jolietMPathSector + 1; + const jolietRootSector = isoRootSector + 1; + let nextSector = jolietRootSector + 1; + + const fileLayout = files.map((f) => { + const sector = nextSector; + const sectors = Math.max(1, Math.ceil(f.data.length / SECTOR)); + nextSector += sectors; + return { file: f, sector, size: f.data.length }; + }); + + const totalSectors = nextSector; + const pathTableSize = 10; + + // Root directory record inside the volume descriptor (BP 157-190 of PVD/SVD): + // same layout as a regular dir record but the identifier is the single byte 0x00. 
+ const rootIdent = Buffer.from([0x00]); + const isoRootDirRecordVD = buildDirRecord(isoRootSector, SECTOR, true, recDate, rootIdent); + const jolietRootDirRecordVD = buildDirRecord(jolietRootSector, SECTOR, true, recDate, rootIdent); + + const pvd = buildVolumeDescriptor({ + joliet: false, + volumeId, + volumeSpaceSize: totalSectors, + pathTableSize, + lPathSector: isoLPathSector, + mPathSector: isoMPathSector, + rootDirRecord: isoRootDirRecordVD, + date: volDateBuf, + }); + + const svd = buildVolumeDescriptor({ + joliet: true, + volumeId, + volumeSpaceSize: totalSectors, + pathTableSize, + lPathSector: jolietLPathSector, + mPathSector: jolietMPathSector, + rootDirRecord: jolietRootDirRecordVD, + date: volDateBuf, + }); + + const term = buildVolumeDescriptorTerminator(); + const isoLPath = padToSector(buildPathTable(isoRootSector, "LE")); + const isoMPath = padToSector(buildPathTable(isoRootSector, "BE")); + const jolietLPath = padToSector(buildPathTable(jolietRootSector, "LE")); + const jolietMPath = padToSector(buildPathTable(jolietRootSector, "BE")); + + const isoRoot = buildRootDirEntries( + isoRootSector, + SECTOR, + recDate, + isoEntries.map((e, i) => ({ + idBytes: e.idBytes, + sector: fileLayout[i].sector, + size: fileLayout[i].size, + })), + ); + const jolietRoot = buildRootDirEntries( + jolietRootSector, + SECTOR, + recDate, + jolietEntries.map((e, i) => ({ + idBytes: e.idBytes, + sector: fileLayout[i].sector, + size: fileLayout[i].size, + })), + ); + + // Each file must occupy the exact number of sectors the layout reserved for + // it. An empty file reserves 1 sector (via Math.max(1, …)) but + // padToSector(Buffer.alloc(0)) returns 0 bytes — that would desync every + // subsequent file's extent. Explicitly pad to the reserved size instead. 
+ const fileBuffers = fileLayout.map((f) => { + const reservedSectors = Math.max(1, Math.ceil(f.file.data.length / SECTOR)); + const reservedBytes = reservedSectors * SECTOR; + if (f.file.data.length === reservedBytes) return f.file.data; + const out = Buffer.alloc(reservedBytes); + f.file.data.copy(out, 0); + return out; + }); + + return Buffer.concat([ + Buffer.alloc(sysAreaSectors * SECTOR), + pvd, + svd, + term, + isoLPath, + isoMPath, + jolietLPath, + jolietMPath, + isoRoot, + jolietRoot, + ...fileBuffers, + ]); +} + +export function writeIso(path: string, volumeId: string, files: IsoFile[]): void { + const buf = buildIso(volumeId, files); + writeFileSync(path, buf); +} diff --git a/packages/stack-cli/vitest.config.ts b/packages/stack-cli/vitest.config.ts new file mode 100644 index 0000000000..0caeccbe44 --- /dev/null +++ b/packages/stack-cli/vitest.config.ts @@ -0,0 +1,19 @@ +import { defineConfig, mergeConfig } from 'vitest/config'; +import sharedConfig from '../../vitest.shared'; + +export default mergeConfig( + sharedConfig, + defineConfig({ + test: { + // Override the shared `maxWorkers: 8` — with it set, tinypool defaults + // minThreads to the host's available parallelism, producing + // "minThreads/maxThreads must not conflict" on machines with >8 cores. 
+ poolOptions: { + threads: { + minThreads: 1, + maxThreads: 4, + }, + }, + }, + }), +); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 57c63c24b1..5ec346963c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -746,7 +746,7 @@ importers: version: 1.166.6(crossws@0.4.4(srvx@0.8.16)) nitro: specifier: ^3.0.0 - version: 3.0.0(@electric-sql/pglite@0.3.2)(chokidar@4.0.3)(lru-cache@11.2.2)(mysql2@3.15.3)(rolldown@1.0.0-rc.3)(vite@7.3.1(@types/node@22.19.0)(jiti@2.6.1)(lightningcss@1.30.1)(terser@5.44.0)(tsx@4.21.0)(yaml@2.8.0))(xml2js@0.6.2) + version: 3.0.0(@electric-sql/pglite@0.3.2)(chokidar@4.0.3)(lru-cache@11.2.2)(mysql2@3.15.3)(vite@7.3.1(@types/node@22.19.0)(jiti@2.6.1)(lightningcss@1.30.1)(terser@5.44.0)(tsx@4.21.0)(yaml@2.8.0))(xml2js@0.6.2) react: specifier: 19.2.1 version: 19.2.1 @@ -2097,6 +2097,9 @@ importers: commander: specifier: ^13.1.0 version: 13.1.0 + extract-zip: + specifier: ^2.0.1 + version: 2.0.1 jiti: specifier: ^2.4.2 version: 2.6.1 @@ -11395,6 +11398,7 @@ packages: basic-ftp@5.2.0: resolution: {integrity: sha512-VoMINM2rqJwJgfdHq6RiUudKt2BV+FY5ZFezP/ypmwayk68+NzzAQy4XXLlqsGD4MCzq3DrmNFD/uUmBJuGoXw==} engines: {node: '>=10.0.0'} + deprecated: Security vulnerability fixed in 5.2.1, please upgrade bcrypt@6.0.0: resolution: {integrity: sha512-cU8v/EGSrnH+HnxV2z0J7/blxH8gq7Xh2JFT6Aroax7UohdmiJJlxApMxtKfuI7z68NvvVcmR78k2LbT6efhRg==} @@ -33362,7 +33366,7 @@ snapshots: debug: 4.4.3 enhanced-resolve: 5.17.1 eslint: 8.57.1 - eslint-module-utils: 2.12.0(@typescript-eslint/parser@6.21.0(eslint@8.57.1)(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.3)(eslint@8.57.1) + eslint-module-utils: 
2.12.0(@typescript-eslint/parser@6.21.0(eslint@8.57.1)(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.3(@typescript-eslint/parser@6.21.0(eslint@8.57.1)(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.31.0(@typescript-eslint/parser@8.56.1(eslint@8.57.1)(typescript@5.9.3))(eslint@8.57.1))(eslint@8.57.1))(eslint@8.57.1) fast-glob: 3.3.3 get-tsconfig: 4.8.1 is-bun-module: 1.2.1 @@ -33405,7 +33409,7 @@ snapshots: transitivePeerDependencies: - supports-color - eslint-module-utils@2.12.0(@typescript-eslint/parser@6.21.0(eslint@8.57.1)(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.3)(eslint@8.57.1): + eslint-module-utils@2.12.0(@typescript-eslint/parser@6.21.0(eslint@8.57.1)(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.3(@typescript-eslint/parser@6.21.0(eslint@8.57.1)(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.31.0(@typescript-eslint/parser@8.56.1(eslint@8.57.1)(typescript@5.9.3))(eslint@8.57.1))(eslint@8.57.1))(eslint@8.57.1): dependencies: debug: 3.2.7 optionalDependencies: @@ -33483,7 +33487,7 @@ snapshots: doctrine: 2.1.0 eslint: 8.57.1 eslint-import-resolver-node: 0.3.9 - eslint-module-utils: 2.12.0(@typescript-eslint/parser@6.21.0(eslint@8.57.1)(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.3)(eslint@8.57.1) + eslint-module-utils: 2.12.0(@typescript-eslint/parser@6.21.0(eslint@8.57.1)(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.3(@typescript-eslint/parser@6.21.0(eslint@8.57.1)(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.31.0(@typescript-eslint/parser@8.56.1(eslint@8.57.1)(typescript@5.9.3))(eslint@8.57.1))(eslint@8.57.1))(eslint@8.57.1) hasown: 2.0.2 is-core-module: 2.15.1 is-glob: 4.0.3 @@ -37347,7 +37351,7 @@ snapshots: jsonpath-plus: 10.4.0 
lodash.topath: 4.5.2 - nitro@3.0.0(@electric-sql/pglite@0.3.2)(chokidar@4.0.3)(lru-cache@11.2.2)(mysql2@3.15.3)(rolldown@1.0.0-rc.3)(vite@7.3.1(@types/node@22.19.0)(jiti@2.6.1)(lightningcss@1.30.1)(terser@5.44.0)(tsx@4.21.0)(yaml@2.8.0))(xml2js@0.6.2): + nitro@3.0.0(@electric-sql/pglite@0.3.2)(chokidar@4.0.3)(lru-cache@11.2.2)(mysql2@3.15.3)(vite@7.3.1(@types/node@22.19.0)(jiti@2.6.1)(lightningcss@1.30.1)(terser@5.44.0)(tsx@4.21.0)(yaml@2.8.0))(xml2js@0.6.2): dependencies: consola: 3.4.2 cookie-es: 2.0.0 @@ -37367,7 +37371,6 @@ snapshots: unenv: 2.0.0-rc.21 unstorage: 2.0.0-alpha.3(chokidar@4.0.3)(db0@0.3.4(@electric-sql/pglite@0.3.2)(mysql2@3.15.3))(lru-cache@11.2.2)(ofetch@1.5.1) optionalDependencies: - rolldown: 1.0.0-rc.3 vite: 7.3.1(@types/node@22.19.0)(jiti@2.6.1)(lightningcss@1.30.1)(terser@5.44.0)(tsx@4.21.0)(yaml@2.8.0) xml2js: 0.6.2 transitivePeerDependencies: