diff --git a/build.sh b/build.sh
index 0c2f679..a4c6f99 100755
--- a/build.sh
+++ b/build.sh
@@ -39,6 +39,14 @@ KERNEL_EXPERIMENT=${KERNEL_EXPERIMENT:-none}
 KERNEL_ORDER_FILE=${KERNEL_ORDER_FILE:-}
 KERNEL_SYSCALL_TABLE=${KERNEL_SYSCALL_TABLE:-}
 KERNEL_CONFIG_FRAGMENT=${KERNEL_CONFIG_FRAGMENT:-}
+# DWARF policy. The default ('none') matches the production image:
+# CONFIG_DEBUG_INFO_NONE=y, addr2line cannot resolve any symbol, and
+# scripts/subsystem-rollup.py exits with the documented "DWARF
+# missing" code. Set KERNEL_DEBUG_INFO=reduced to ship REDUCED DWARF
+# so the diagnostic rollup can attribute symbols to source dirs. The
+# deployed Image is stripped of debug sections before bootwrapper
+# packing, so this knob only changes vmlinux artifacts and link time.
+KERNEL_DEBUG_INFO=${KERNEL_DEBUG_INFO:-none}
 KERNEL_REPORT_DIR=${KERNEL_REPORT_DIR:-${ROOTDIR}/profiles/kernel-pgo}
 PGO_WORKLOAD_FILE=${PGO_WORKLOAD_FILE:-${ROOTDIR}/configs/pgo-workload.txt}
 PGO_BASE_CONFIG_FRAGMENT=${PGO_BASE_CONFIG_FRAGMENT:-${ROOTDIR}/configs/kernel-pgo-prune.config}
@@ -74,6 +82,7 @@ image_fingerprint() {
         printf 'KERNEL_ORDER_FILE=%s\n' "${KERNEL_ORDER_FILE}"
         printf 'KERNEL_SYSCALL_TABLE=%s\n' "${KERNEL_SYSCALL_TABLE}"
         printf 'KERNEL_CONFIG_FRAGMENT=%s\n' "${KERNEL_CONFIG_FRAGMENT}"
+        printf 'KERNEL_DEBUG_INFO=%s\n' "${KERNEL_DEBUG_INFO}"
         if [ -n "${KERNEL_ORDER_FILE}" ] && [ -f "${KERNEL_ORDER_FILE}" ]; then
             sha256sum "${KERNEL_ORDER_FILE}"
         fi
@@ -759,11 +768,35 @@ build_linux() {
     # default contributes ~180KB of static .data via _printk_rb_static_infos.
     sed -i "/^CONFIG_LOG_BUF_SHIFT=/d" .config
     echo "CONFIG_LOG_BUF_SHIFT=12" >>.config
-    # No DWARF in vmlinux: shortens the kernel link and shrinks build
-    # artifacts. CONFIG_DEBUG_INFO is a hidden bool selected by the
-    # DWARF4/5 choice options; once DEBUG_INFO_NONE wins, it disappears
-    # from .config rather than emitting an explicit "not set" line.
-    echo "CONFIG_DEBUG_INFO_NONE=y" >>.config
+    # DWARF policy. Production: CONFIG_DEBUG_INFO_NONE=y (no DWARF in
+    # vmlinux, fastest link, smallest build artifacts). Diagnostic:
+    # KERNEL_DEBUG_INFO=reduced enables CONFIG_DEBUG_INFO_REDUCED=y so
+    # scripts/subsystem-rollup.py can attribute every .text symbol to
+    # a source directory via addr2line. CONFIG_DEBUG_INFO is a hidden
+    # bool selected by the DWARF choice options; we never set it
+    # directly. The deployed Image is stripped of debug sections, so
+    # the diagnostic build does not change shipped image size.
+    case "${KERNEL_DEBUG_INFO}" in
+    none)
+        echo "CONFIG_DEBUG_INFO_NONE=y" >>.config
+        ;;
+    reduced)
+        # The "Debug information" choice block requires exactly one
+        # positive selection. Pick TOOLCHAIN_DEFAULT (which selects
+        # the hidden CONFIG_DEBUG_INFO bool), then layer
+        # DEBUG_INFO_REDUCED on top -- it is a depends-on-DEBUG_INFO
+        # modifier, not a choice member. Stating only DEBUG_INFO_NONE
+        # off would leave the choice under-specified and olddefconfig
+        # would silently fall back to the kconfig default.
+        echo "# CONFIG_DEBUG_INFO_NONE is not set" >>.config
+        echo "CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y" >>.config
+        echo "CONFIG_DEBUG_INFO_REDUCED=y" >>.config
+        ;;
+    *)
+        echo "ERROR: KERNEL_DEBUG_INFO must be 'none' or 'reduced' (got '${KERNEL_DEBUG_INFO}')"
+        exit 1
+        ;;
+    esac
     # Drop the ARM EABI unwind tables (.ARM.exidx/.ARM.extab, ~75KB).
     # Requires patch 0010 to introduce UNWINDER_NONE on Thumb-2.
     echo "# CONFIG_UNWINDER_ARM is not set" >>.config
@@ -804,6 +837,19 @@ build_linux() {
     echo "# CONFIG_SHMEM is not set" >>.config
     echo "# CONFIG_SECURITY is not set" >>.config
 
+    # Initrd decompressor pruning: the embedded initramfs is gzip-compressed
+    # (CONFIG_INITRAMFS_COMPRESSION_GZIP=y). Every other RD_* selector
+    # defaults to y under EXPERT and pulls a full decompressor library into
+    # the image -- olddefconfig silently restores them after defconfig.
+    # Sub-bucket rollup measured RD_ZSTD = 36,942 bytes (lib/zstd),
+    # RD_LZ4 = 10,972 bytes (lib/lz4), RD_XZ = 6,598 bytes (lib/xz) of
+    # dead .text in the production vmlinux. RD_ZSTD also pulls
+    # lib/xxhash.c (~3KB). Keep RD_GZIP=y as the boot-path requirement;
+    # explicitly disable the rest.
+    echo "# CONFIG_RD_ZSTD is not set" >>.config
+    echo "# CONFIG_RD_LZ4 is not set" >>.config
+    echo "# CONFIG_RD_XZ is not set" >>.config
+
     # Serial-only target: drop the VT terminal layer and accessibility
     # console support.  CONFIG_TTY stays on -- the AMBA PL011 console
     # driver depends on it.
@@ -871,7 +917,6 @@ build_linux() {
         "# CONFIG_BLOCK is not set" \
         "CONFIG_SLUB_TINY=y" \
         "CONFIG_LOG_BUF_SHIFT=12" \
-        "CONFIG_DEBUG_INFO_NONE=y" \
         "CONFIG_UNWINDER_NONE=y" \
         "# CONFIG_IO_URING is not set" \
         "# CONFIG_FUTEX is not set" \
@@ -905,7 +950,10 @@ build_linux() {
         "# CONFIG_SECCOMP is not set" \
         "# CONFIG_KEYS is not set" \
         "# CONFIG_STACKPROTECTOR is not set" \
-        "# CONFIG_DEBUG_BUGVERBOSE is not set"; do
+        "# CONFIG_DEBUG_BUGVERBOSE is not set" \
+        "# CONFIG_RD_ZSTD is not set" \
+        "# CONFIG_RD_LZ4 is not set" \
+        "# CONFIG_RD_XZ is not set"; do
         if ! grep -q "^${opt}\$" .config; then
             echo "ERROR: expected '${opt}' in .config after olddefconfig"
             exit 1
@@ -920,6 +968,28 @@ build_linux() {
         exit 1
     fi
 
+    # DWARF policy is mode-conditional: production must keep DEBUG_INFO_NONE=y
+    # and must not keep DEBUG_INFO_REDUCED=y; the diagnostic build must keep
+    # DEBUG_INFO_REDUCED=y, which depends on DEBUG_INFO and so excludes NONE.
+    case "${KERNEL_DEBUG_INFO}" in
+    none)
+        if ! grep -q "^CONFIG_DEBUG_INFO_NONE=y\$" .config; then
+            echo "ERROR: expected 'CONFIG_DEBUG_INFO_NONE=y' in .config (KERNEL_DEBUG_INFO=none)"
+            exit 1
+        fi
+        if grep -q "^CONFIG_DEBUG_INFO_REDUCED=y\$" .config; then
+            echo "ERROR: CONFIG_DEBUG_INFO_REDUCED=y survived olddefconfig despite KERNEL_DEBUG_INFO=none"
+            exit 1
+        fi
+        ;;
+    reduced)
+        if ! grep -q "^CONFIG_DEBUG_INFO_REDUCED=y\$" .config; then
+            echo "ERROR: expected 'CONFIG_DEBUG_INFO_REDUCED=y' in .config (KERNEL_DEBUG_INFO=reduced)"
+            exit 1
+        fi
+        ;;
+    esac
+
     # Negative-guard for symbols whose `# CONFIG_X is not set` line gets
     # stripped by olddefconfig because their `depends on` clause is unmet
     # under the current build state (NOMMU, NET=n, SYSFS=n, MULTIUSER=n,
@@ -942,6 +1012,22 @@ build_linux() {
         fi
     done
 
+    # Decompressor library guard. RD_ZSTD/RD_LZ4/RD_XZ disabled above
+    # must cascade to ZSTD_DECOMPRESS / LZ4_DECOMPRESS / XZ_DEC, the
+    # umbrella DECOMPRESS_* hidden bools, and XXHASH (selected by
+    # ZSTD_DECOMPRESS, also pulled by BCACHE / BTRFS but those need
+    # BLOCK=y which this target lacks). If anything else still
+    # selects them (a future fs/ or net/ enable, e.g. squashfs+zstd),
+    # we must catch that drift loudly so the size win does not
+    # silently regress.
+    for sym in ZSTD_DECOMPRESS ZSTD_COMMON LZ4_DECOMPRESS XZ_DEC \
+               XXHASH DECOMPRESS_ZSTD DECOMPRESS_LZ4 DECOMPRESS_XZ; do
+        if grep -q "^CONFIG_${sym}=y\$" .config; then
+            echo "ERROR: CONFIG_${sym}=y survived olddefconfig (decompressor guard tripped)"
+            exit 1
+        fi
+    done
+
     if [ "${KERNEL_EXPERIMENT}" = "llvm-order-use" ]; then
         run_logged "build" kernel_make -j${MAKE_JOBS} KALLSYMS_EXTRA_PASS=1
     else
diff --git a/configs/subsystem-budget.txt b/configs/subsystem-budget.txt
new file mode 100644
index 0000000..cd3857a
--- /dev/null
+++ b/configs/subsystem-budget.txt
@@ -0,0 +1,34 @@
+# Per-bucket vmlinux .text byte ceilings.
+#
+# Format:  <bucket>  <ceiling-bytes>  [<noise-band-pct>]
+#
+# - Bucket names match scripts/subsystem-rollup.py output. Run a
+#   diagnostic build (KERNEL_DEBUG_INFO=reduced) and inspect
+#   profiles/kernel-pgo/none/subsystem-rollup.txt for the live names.
+# - The noise band absorbs run-to-run variance from GCC LTO
+#   re-deciding what to inline when nothing semantic changed. Default
+#   is 2.0%. Start there, then tighten after observing a week of
+#   clean builds. <icf-merged> tends to be jitterier than real
+#   subsystems -- a wider band there is reasonable.
+# - A breach is "actual > limit * (1 + band/100)". The total-bytes
+#   gate is the coarse safeguard; this layer answers WHICH bucket
+#   regressed.
+#
+# How to populate:
+#   1. KERNEL_DEBUG_INFO=reduced ./build.sh linux bootwrapper
+#   2. Read profiles/kernel-pgo/none/subsystem-rollup.txt for the
+#      observed sizes.
+#   3. Pick ceilings 5-10% above each observed value -- enough room
+#      for legitimate growth without masking regressions.
+#
+# Example values (uncomment and tune to your build):
+#   kernel              260000   2.0
+#   mm                   80000   2.0
+#   fs                   20000   2.0
+#   arch/arm            120000   2.0
+#   drivers/tty          25000   2.0
+#   drivers/clocksource  10000   2.0
+#   lib                  70000   2.0
+#   crypto                5000   2.0
+#   <icf-merged>         30000   5.0
+#   <compiler-partition>  5000   5.0
diff --git a/scripts/check-subsystem-budget.py b/scripts/check-subsystem-budget.py
new file mode 100755
index 0000000..b66c653
--- /dev/null
+++ b/scripts/check-subsystem-budget.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python3
+
+# Diff vmlinux .text subsystem rollup against per-bucket byte budgets.
+#
+# The total-bytes regression gate is the coarse safeguard: it catches
+# the image getting bigger overall. It cannot see a 3% growth in mm/
+# that is cancelled out by a 3% drop in drivers/. This script reads
+# scripts/subsystem-rollup.py's table and compares each bucket against
+# configs/subsystem-budget.txt with a per-bucket noise band -- LTO
+# re-decides what to inline between rebuilds, so identical sources
+# still produce small per-bucket fluctuations. Default band is 2% above
+# the limit (only growth is gated); tighten after observing run-to-run
+# variance over a week of clean builds.
+#
+# Exit codes:
+#   0 -- all buckets within band, OR no budget rules active
+#   1 -- one or more buckets exceed (limit * (1 + band/100))
+#   2 -- missing/unreadable inputs (rollup or budget file)
+
+import argparse
+import pathlib
+import sys
+
+DEFAULT_BAND_PCT = 2.0
+
+
+def read_budget(path):
+    """Parse the budget file at `path` into {bucket: (limit, band_pct)}.
+
+    '#' starts a comment. Rules without a limit field or with a
+    non-integer limit are skipped with a warning; an unparsable band
+    falls back to DEFAULT_BAND_PCT, also with a warning.
+    """
+    def warn(problem, raw):
+        print(f"check-subsystem-budget: {problem} in {path}: {raw!r}",
+              file=sys.stderr)
+    rules = {}
+    for raw in path.read_text().splitlines():
+        parts = raw.split("#", 1)[0].split()
+        if not parts:
+            continue
+        if len(parts) < 2:
+            warn("ignoring malformed rule", raw)
+            continue
+        try:
+            limit = int(parts[1])
+        except ValueError:
+            warn("non-integer limit", raw)
+            continue
+        try:
+            band = float(parts[2]) if len(parts) >= 3 else DEFAULT_BAND_PCT
+        except ValueError:
+            warn(f"non-numeric band, using {DEFAULT_BAND_PCT}%", raw)
+            band = DEFAULT_BAND_PCT
+        rules[parts[0]] = (limit, band)
+    return rules
+
+
+def read_rollup(path):
+    """Parse the tab-delimited rollup at `path` into {bucket: bytes}.
+
+    Blank lines, '#' lines, rows without a tab and rows whose second
+    field is not an integer are skipped without a diagnostic.
+    """
+    sizes = {}
+    for row in path.read_text().splitlines():
+        if row == "" or row[0] == "#":
+            continue
+        fields = row.split("\t")
+        if len(fields) >= 2:
+            try:
+                sizes[fields[0]] = int(fields[1])
+            except ValueError:
+                pass
+    return sizes
+
+
+def main(argv):
+    """Gate each budgeted bucket against the rollup; return the exit code.
+
+    0 = within budget or no active rules, 1 = breach, 2 = missing or
+    unreadable input (read errors are caught here, because an uncaught
+    exception exits with status 1 and would look like a breach).
+    """
+    ap = argparse.ArgumentParser(
+        description="Compare subsystem rollup against per-bucket budgets."
+    )
+    ap.add_argument("--rollup", required=True, type=pathlib.Path)
+    ap.add_argument("--budget", required=True, type=pathlib.Path)
+    ap.add_argument(
+        "--output", required=True, type=pathlib.Path,
+        help="Where to write the human-readable status table.",
+    )
+    args = ap.parse_args(argv)
+
+    for label, path in (("rollup", args.rollup), ("budget", args.budget)):
+        if not path.exists():
+            msg = f"check-subsystem-budget: {label} not found: {path}"
+            print(msg, file=sys.stderr)
+            return 2
+    try:
+        budgets = read_budget(args.budget)
+        rollup = read_rollup(args.rollup)
+    except (OSError, UnicodeDecodeError) as exc:
+        print(f"check-subsystem-budget: {exc}", file=sys.stderr)
+        return 2
+
+    if not budgets:
+        # An empty rule set is deliberate: the gate is staged but no
+        # ceilings are pinned yet. Emit a status note and succeed.
+        args.output.parent.mkdir(parents=True, exist_ok=True)
+        args.output.write_text(
+            "# subsystem budget check\n"
+            "# no active rules in budget file -- nothing to gate\n"
+        )
+        return 0
+
+    breaches = []
+    lines = [
+        f"# subsystem budget check (default band = +/- {DEFAULT_BAND_PCT}%)",
+        f"# rollup: {args.rollup}",
+        f"# budget: {args.budget}",
+        "# bucket\tactual\tlimit\tband_pct\tdelta_vs_limit\tstatus",
+    ]
+    for bucket, (limit, band) in sorted(budgets.items()):
+        actual = rollup.get(bucket, 0)
+        delta = actual - limit
+        # Compare scaled by 100: int(limit * (1 + band / 100)) can land one
+        # below the true ceiling (100 * 1.15 == 114.99999999999999).
+        status = "BREACH" if actual * 100 > limit * (100 + band) else "ok"
+        if status == "BREACH":
+            breaches.append((bucket, actual, limit, band, delta))
+        lines.append(
+            f"{bucket}\t{actual}\t{limit}\t{band:.1f}\t{delta:+d}\t{status}"
+        )
+
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+    args.output.write_text("\n".join(lines) + "\n")
+
+    if breaches:
+        print(
+            f"check-subsystem-budget: {len(breaches)} bucket(s) breach "
+            "the budget (after noise band):",
+            file=sys.stderr,
+        )
+        for bucket, actual, limit, band, delta in breaches:
+            print(
+                f"  {bucket}: {actual} > {limit} "
+                f"({delta:+d} bytes, band {band:.1f}%)",
+                file=sys.stderr,
+            )
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
diff --git a/scripts/collect-kernel-profile.sh b/scripts/collect-kernel-profile.sh
index 7bb8ce7..4311f0d 100755
--- a/scripts/collect-kernel-profile.sh
+++ b/scripts/collect-kernel-profile.sh
@@ -37,7 +37,7 @@ fi
 rm -f "${TRACE_LOG}" "${CONSOLE_LOG}" "${MANIFEST_LOG}" \
     "${PROFILE_PREFIX}_ld_profile.txt" "${PROFILE_PREFIX}_summary.txt" \
     "${PROFILE_PREFIX}_hits.txt" "${PROFILE_PREFIX}_concentration.txt" \
-    "${PROFILE_PREFIX}_syscalls.txt"
+    "${PROFILE_PREFIX}_syscalls.txt" "${PROFILE_PREFIX}_bootcost.txt"
 
 # exec,in_asm: TB execution counts + disassembly for SVC site detection.
 # Add "cpu" (QEMU_LOG=exec,cpu,in_asm) to enable R7-based syscall number
@@ -66,3 +66,4 @@ echo "  ${PROFILE_PREFIX}_hits.txt"
 echo "  ${PROFILE_PREFIX}_concentration.txt"
 echo "  ${PROFILE_PREFIX}_summary.txt"
 echo "  ${PROFILE_PREFIX}_syscalls.txt"
+echo "  ${PROFILE_PREFIX}_bootcost.txt"
diff --git a/scripts/kernel-size-report.sh b/scripts/kernel-size-report.sh
index b32cbdb..1d55020 100755
--- a/scripts/kernel-size-report.sh
+++ b/scripts/kernel-size-report.sh
@@ -11,7 +11,9 @@ MODE=${5:-full}
 TARGET=arm-uclinuxfdpiceabi
 SIZE_TOOL=${ROOTDIR}/toolchain/bin/${TARGET}-size
 NM_TOOL=${ROOTDIR}/toolchain/bin/${TARGET}-nm
-TEXT_ROLLUP=${ROOTDIR}/scripts/text-rollup.py
+SUBSYSTEM_ROLLUP=${ROOTDIR}/scripts/subsystem-rollup.py
+SUBSYSTEM_BUDGET_CHECK=${ROOTDIR}/scripts/check-subsystem-budget.py
+SUBSYSTEM_BUDGET_FILE=${ROOTDIR}/configs/subsystem-budget.txt
 BLOAT_O_METER=${LINUXDIR}/scripts/bloat-o-meter
 
 mkdir -p "${OUTDIR}"
@@ -148,7 +150,7 @@ report_bloat_o_meter() {
     # bloat-o-meter shells out to `${prefix}nm` directly, so it needs the
     # cross toolchain on PATH. build.sh exports PATH already; running this
     # script standalone does not. Run in a subshell so the modified PATH
-    # does not leak into report_text_rollup or anything downstream.
+    # does not leak into report_subsystem_rollup or anything downstream.
     (
         PATH=${ROOTDIR}/toolchain/bin:${PATH}
         export PATH
@@ -171,28 +173,73 @@ report_bloat_o_meter() {
     )
 }
 
-# Subsystem rollup is opt-out only when DWARF is missing or addr2line
-# fails. A missing rollup is acceptable; an empty stale file is not.
-report_text_rollup() {
-    # Use -r, not -x: the script is invoked via `python3 ${TEXT_ROLLUP}`,
-    # so the +x bit is not part of the runtime contract. A checkout that
-    # preserves contents but drops the execute bit (zip extraction, some
-    # rsync flags) would otherwise silently disable the rollup.
-    if [ ! -r "${TEXT_ROLLUP}" ] || [ ! -f "${OUTDIR}/vmlinux.current" ]; then
-        rm -f "${OUTDIR}/text-rollup.txt" "${OUTDIR}/text-rollup.err"
+# LTO-aware subsystem rollup. The Python tool fails with exit code 2
+# when DWARF is missing (production builds, CONFIG_DEBUG_INFO_NONE=y);
+# the shell layer treats that as a documented skip rather than a build
+# failure. The diagnostic build (KERNEL_DEBUG_INFO=reduced) ships
+# enough DWARF for addr2line to attribute symbols, and the rollup
+# emits subsystem-rollup.txt + .svg + .html under OUTDIR.
+report_subsystem_rollup() {
+    # Use -r, not -x: the script runs via `python3`, so the +x bit
+    # is not part of the runtime contract. A checkout that preserves
+    # contents but drops the execute bit (zip extraction, some rsync
+    # flags) would otherwise silently disable the rollup.
+    if [ ! -r "${SUBSYSTEM_ROLLUP}" ] || [ ! -f "${OUTDIR}/vmlinux.current" ]; then
+        rm -f "${OUTDIR}/subsystem-rollup.txt" \
+            "${OUTDIR}/subsystem-rollup.err" \
+            "${OUTDIR}/subsystem-rollup-bars.svg" \
+            "${OUTDIR}/subsystem-rollup-tree.html" \
+            "${OUTDIR}/subsystem-rollup-deep.txt" \
+            "${OUTDIR}/subsystem-rollup-deep.html"
         return 0
     fi
 
-    if python3 "${TEXT_ROLLUP}" \
+    # --deep kernel --deep lib drives the per-bucket source-file
+    # breakdown the methodology refers to. The two big parents account
+    # for ~45% of resident .text after the RD_*-cleanup work, and the
+    # depth-2 view (kernel/sched, lib/zstd, ...) is what surfaces
+    # actionable single-knob disables. Add more --deep flags here when
+    # another parent bucket grows large enough to warrant drilling.
+    if python3 "${SUBSYSTEM_ROLLUP}" \
             --vmlinux "${OUTDIR}/vmlinux.current" \
             --linux-tree "${LINUXDIR}" \
-            --output "${OUTDIR}/text-rollup.txt" 2>"${OUTDIR}/text-rollup.err"; then
-        rm -f "${OUTDIR}/text-rollup.err"
+            --toolchain-bin "${ROOTDIR}/toolchain/bin" \
+            --output "${OUTDIR}/subsystem-rollup.txt" \
+            --deep kernel --deep lib \
+            2>"${OUTDIR}/subsystem-rollup.err"; then
+        rm -f "${OUTDIR}/subsystem-rollup.err"
     else
-        rm -f "${OUTDIR}/text-rollup.txt"
+        # Failure modes:
+        #   exit 1 -- input/tooling problem (loud); leave .err for triage
+        #   exit 2 -- DWARF missing (production build); expected, drop
+        #             the stale outputs but keep .err so the operator
+        #             can confirm the cause if it surprises them.
+        rm -f "${OUTDIR}/subsystem-rollup.txt" \
+            "${OUTDIR}/subsystem-rollup-bars.svg" \
+            "${OUTDIR}/subsystem-rollup-tree.html" \
+            "${OUTDIR}/subsystem-rollup-deep.txt" \
+            "${OUTDIR}/subsystem-rollup-deep.html"
     fi
 }
 
+# Compare the subsystem rollup against configs/subsystem-budget.txt.
+# Warn-only: a breach prints to stderr and writes a status file but
+# does not abort the build. The total-bytes regression gate stays the
+# coarse gate; this layer answers WHICH bucket regressed. The previous
+# status file is removed up front: the checker writes it only at the end
+# of a run, so a skipped or crashed check must not leave a stale verdict.
+report_subsystem_budget() {
+    rm -f "${OUTDIR}/subsystem-budget.txt"
+    if [ ! -r "${SUBSYSTEM_BUDGET_CHECK}" ] || [ ! -r "${SUBSYSTEM_BUDGET_FILE}" ] \
+        || [ ! -f "${OUTDIR}/subsystem-rollup.txt" ]; then
+        return 0
+    fi
+    python3 "${SUBSYSTEM_BUDGET_CHECK}" \
+        --rollup "${OUTDIR}/subsystem-rollup.txt" \
+        --budget "${SUBSYSTEM_BUDGET_FILE}" \
+        --output "${OUTDIR}/subsystem-budget.txt" || true
+}
+
 case "${MODE}" in
 files)
     report_file_sizes
@@ -205,7 +252,8 @@ full)
     snapshot_vmlinux
     report_section_delta
     report_bloat_o_meter
-    report_text_rollup
+    report_subsystem_rollup
+    report_subsystem_budget
     ;;
 *)
     echo "ERROR: unknown mode '${MODE}' (expected: full | files)" >&2
diff --git a/scripts/qemu-trace-to-orderfile.py b/scripts/qemu-trace-to-orderfile.py
index d003bbd..0055fe8 100755
--- a/scripts/qemu-trace-to-orderfile.py
+++ b/scripts/qemu-trace-to-orderfile.py
@@ -20,6 +20,138 @@
 ORDER_MAX_SYMBOLS = 256
 ORDER_TARGET_HIT_RATIO = 0.80
 
+# GCC LTO/IPA clones share a base function but suffix the symbol name;
+# roll them up so work LTO split into a private clone is not under-counted.
+# GCC 9+ names the cold partition "foo.cold" with no ".N", so only that
+# suffix may appear without a number.
+LTO_SUFFIX_RE = re.compile(r"\.(?:(?:lto_priv|constprop|part|isra|local|fold)\.\d+|cold(?:\.\d+)?)$")
+
+# Buckets used by the bootcost roll-up.  Entries are exact symbol names,
+# compared against the LTO-stripped base name (no globbing or regex).
+# Order matters: a name listed twice stays in the first bucket listing it.
+BOOTCOST_BUCKETS = (
+    (
+        "context_switch",
+        (
+            "__switch_to",
+            "cpu_switch_to",
+        ),
+    ),
+    (
+        "scheduler",
+        (
+            "__schedule",
+            "schedule",
+            "schedule_idle",
+            "do_idle",
+            "cpuhp_report_idle_dead",
+            "pick_next_task_fair",
+            "pick_next_task",
+            "dequeue_entities",
+            "dequeue_task_fair",
+            "enqueue_task_fair",
+            "update_curr",
+            "update_load_avg",
+            "__cond_resched",
+            "__sched_setscheduler",
+        ),
+    ),
+    (
+        "syscall_entry",
+        (
+            "vector_swi",
+            "ret_fast_syscall",
+            "syscall_trace_enter",
+            "syscall_trace_exit",
+        ),
+    ),
+    (
+        "exec_path",
+        (
+            "do_execveat_common",
+            "kernel_execve",
+            "bprm_execve",
+            "bprm_execve_security",
+            "load_elf_fdpic_binary",
+            "elf_fdpic_map_file",
+            "elf_fdpic_fetch_phdrs",
+            "setup_arg_pages",
+            "copy_string_kernel",
+            "sys_execve",
+            "sys_execveat",
+            "__se_sys_execve",
+            "__se_sys_execveat",
+        ),
+    ),
+    (
+        "fork_clone",
+        (
+            "kernel_clone",
+            "copy_process",
+            "wake_up_new_task",
+            "sys_clone",
+            "sys_vfork",
+            "__se_sys_clone",
+            "__se_sys_vfork",
+        ),
+    ),
+    (
+        "softirq_irq",
+        (
+            "__do_softirq",
+            "____do_softirq",
+            "irq_enter",
+            "irq_exit",
+            "handle_IRQ",
+            "asm_do_IRQ",
+        ),
+    ),
+)
+
+
+def _strip_lto_suffix(name):
+    """Peel stacked clone suffixes (one per pass) until none is left."""
+    previous = None
+    while previous != name:
+        previous, name = name, LTO_SUFFIX_RE.sub("", name)
+    return name
+
+
+def compute_bootcost(counts, total_hits):
+    """Roll per-symbol hit counts up into the BOOTCOST_BUCKETS groups.
+
+    Returns per-bucket totals and ratios (relative to total_hits), the
+    matched (base, symbol, hits) rows per bucket, and overall figures.
+    """
+    def share(hits):
+        return (hits / total_hits) if total_hits else 0.0
+
+    owner = {}
+    for label, names in BOOTCOST_BUCKETS:
+        for base_name in names:
+            owner.setdefault(base_name, label)  # first listing wins
+    totals = collections.OrderedDict.fromkeys((b[0] for b in BOOTCOST_BUCKETS), 0)
+    rows = collections.defaultdict(list)
+    grand_total = 0
+    for symbol, hits in counts.items():
+        base = _strip_lto_suffix(symbol)
+        label = owner.get(base)
+        if label is not None:
+            totals[label] += hits
+            rows[label].append((base, symbol, hits))
+            grand_total += hits
+    for entries in rows.values():
+        entries.sort(key=lambda row: (-row[2], row[0]))
+
+    return {
+        "bucket_totals": totals,
+        "bucket_ratios": collections.OrderedDict(
+            (label, share(hits)) for label, hits in totals.items()
+        ),
+        "bucket_members": rows,
+        "matched_total": grand_total,
+        "matched_ratio": share(grand_total),
+    }
+
 
 def resolve_nm(vmlinux: pathlib.Path):
     cross_compile = os.environ.get("CROSS_COMPILE", "")
@@ -215,6 +347,35 @@ def write_outputs(prefix: pathlib.Path, counts, first_seen, total, matched, sysc
         )
         handle.write(f"layout_ordering_reason={concentration['layout_ordering_reason']}\n")
 
+    bootcost = compute_bootcost(counts, matched)
+    bootcost_path = prefix.with_name(prefix.name + "_bootcost.txt")
+    with bootcost_path.open("w", encoding="utf-8") as handle:
+        handle.write("# Boot-cost roll-up (TB executions per scheduler/exec/syscall bucket).\n")
+        handle.write("# Counts are translation-block executions, NOT cycles: QEMU's MPS2-AN386\n")
+        handle.write("# model maps the entire DWT block (0xe0001000) to a RAZ/WI default handler\n")
+        handle.write("# (qemu hw/arm/armv7m.c, ppb_default_ops), so DWT_CYCCNT reads as zero and\n")
+        handle.write("# any kernel-side cycle-delta instrumentation would silently produce 0.\n")
+        handle.write(f"matched_kernel_blocks={matched}\n")
+        handle.write(f"bootcost_matched_blocks={bootcost['matched_total']}\n")
+        handle.write(f"bootcost_matched_ratio={bootcost['matched_ratio']:.4f}\n")
+        for bucket_name in bootcost["bucket_totals"]:
+            handle.write(
+                f"{bucket_name}_hits={bootcost['bucket_totals'][bucket_name]}\n"
+            )
+            handle.write(
+                f"{bucket_name}_ratio={bootcost['bucket_ratios'][bucket_name]:.4f}\n"
+            )
+        handle.write("members:\n")
+        for bucket_name, _ in BOOTCOST_BUCKETS:
+            members = bootcost["bucket_members"].get(bucket_name, [])
+            if not members:
+                handle.write(f"  [{bucket_name}] (no hits)\n")
+                continue
+            handle.write(f"  [{bucket_name}]\n")
+            for base, symbol, hits in members:
+                tag = symbol if symbol == base else f"{symbol} -> {base}"
+                handle.write(f"    {hits:8d} {tag}\n")
+
     summary = prefix.with_name(prefix.name + "_summary.txt")
     with summary.open("w", encoding="utf-8") as handle:
         handle.write("profile_source=qemu-system-arm-system-mode\n")
@@ -233,6 +394,11 @@ def write_outputs(prefix: pathlib.Path, counts, first_seen, total, matched, sysc
         )
         handle.write(f"layout_ordering_reason={concentration['layout_ordering_reason']}\n")
         handle.write(f"detected_syscalls={len(syscall_counts)}\n")
+        for bucket_name in bootcost["bucket_totals"]:
+            handle.write(
+                f"bootcost_{bucket_name}_ratio={bootcost['bucket_ratios'][bucket_name]:.4f}\n"
+            )
+        handle.write(f"bootcost_total_ratio={bootcost['matched_ratio']:.4f}\n")
         handle.write("top_symbols:\n")
         for name, count in counts.most_common(80):
             handle.write(f"{count:8d} {name}\n")
diff --git a/scripts/subsystem-rollup.py b/scripts/subsystem-rollup.py
new file mode 100755
index 0000000..2c5e1e3
--- /dev/null
+++ b/scripts/subsystem-rollup.py
@@ -0,0 +1,973 @@
+#!/usr/bin/env python3
+
+# LTO-aware vmlinux .text rollup by source subsystem.
+#
+# Maps every sized text symbol back to a top-level kernel directory
+# (init/, kernel/, mm/, fs/, drivers/, net/, security/, ...). The five
+# normalization rules below are the only way the rollup numbers are
+# trustworthy under GCC LTO; without them the table silently double-
+# counts ICF-merged code or miscredits constprop/isra clones.
+#
+#   1. GCC clone suffixes are stripped before bucket lookup but their
+#      bytes still sum into the parent bucket. A `.constprop.0` clone
+#      is a constant-specialized COPY of the function body, not an
+#      alias; the bytes are real. Suffixes can stack
+#      (foo.constprop.0.isra.0) so stripping iterates.
+#   2. GCC IPA-ICF (ld.bfd has no --icf) preserves merged names as
+#      aliases pointing at the same start address. nm-then-bucket would
+#      double-count. We dedup by start address; multi-name groups land
+#      in <icf-merged> because true pre-fold provenance is not
+#      recoverable from the final vmlinux alone.
+#   3. Cross-TU inlining: the surviving symbol's source attribution is
+#      the caller's file, not the inlinee's. The rollup answers "where
+#      the code lives in the image," not "where it was written."
+#   4. Compiler section partitioning (.text.hot/.text.unlikely/...) is
+#      metadata, not a bucket. Symbols inside still resolve normally.
+#      Bytes the linker emitted that no nm symbol claims (alignment
+#      padding, partition fragments without an owner) fall through to
+#      <compiler-partition>.
+#   5. addr2line -i emits the full inline stack per query. We use -p
+#      mode and the "(inlined by) " continuation marker to delimit
+#      stacks reliably, then pick the OUTERMOST frame for attribution
+#      (matches "where the code lives" framing).
+#
+# Production builds run without DWARF (CONFIG_DEBUG_INFO_NONE=y).
+# addr2line then resolves nothing and the rollup would degrade to
+# ~99.7% <unknown>. This script fails with a non-zero exit when more
+# than half of .text is unresolved, so the caller can skip the rollup
+# rather than emit a misleading file. Diagnostic builds opt in via
+# KERNEL_DEBUG_INFO=reduced (CONFIG_DEBUG_INFO_REDUCED=y).
+#
+# Outputs (written next to --output):
+#   subsystem-rollup.txt        primary table consumed by the gate
+#   subsystem-rollup-bars.svg   horizontal bars, sorted by bytes
+#   subsystem-rollup-tree.html  D3 treemap, hover-to-drill
+#   subsystem-rollup-deep.txt   per-bucket 2nd-level + top-file
+#                               breakdown (only when --deep is set)
+#   subsystem-rollup-deep.html  styled-table version of the deep
+#                               breakdown (only when --deep is set)
+
+import argparse
+import collections
+import json
+import os
+import pathlib
+import posixpath
+import re
+import shutil
+import subprocess
+import sys
+import xml.sax.saxutils
+
+TEXT_TYPES = {"T", "t", "W", "w"}  # nm class letters treated as text symbols
+
+
+def is_resident_text_section(section):
+    """Return True when `section` is ".text" or a ".text.<x>" fragment.
+
+    The usual case is exactly ".text"; the prefix match also admits
+    split-off pieces such as .text.hot, .text.unlikely or
+    .text.<funcname>. Names that merely end in "text" (.init.text,
+    .exit.text, .head.text, .ref.text) fail both checks and are rejected.
+    """
+    return section.startswith(".text.") or section == ".text"
+
+TOP_LEVEL_DIRS = frozenset({  # first path components accepted as kernel dirs
+    "arch", "block", "certs", "crypto", "drivers", "fs", "init",
+    "io_uring", "ipc", "kernel", "lib", "mm", "net", "rust", "samples",
+    "scripts", "security", "sound", "tools", "usr", "virt",
+})
+
+# Top-level dirs whose immediate subdirectories each get their own
+# bucket, so a single driver family does not hide inside the parent.
+SPLIT_ONE_DEEPER = frozenset({"arch", "drivers", "sound", "net"})
+
+# Iterative strip set. Patterns must be anchored to end-of-name and may
+# stack: GCC emits names like `foo.constprop.0.isra.0` after multiple
+# IPA passes. The dot-suffix family is restricted -- do NOT generalize
+# to "trim any trailing dot segment," GCC also produces legitimate
+# non-clone dot-suffixes (e.g. `__cfi_*`).
+CLONE_SUFFIX_RE = re.compile(
+    r"\.(?:lto_priv|constprop|isra|part|cold|localalias)(?:\.\d+)?$"
+)
+
+# addr2line -p -f -i: first frame on its own line, outer frames each
+# prefixed " (inlined by) ". We read until the next un-prefixed line
+# to know the previous query's stack is complete.
+INLINED_BY_PREFIX = " (inlined by) "
+
+# "func at file:line" or "func at file:line (discriminator N)".
+PRETTY_LINE_RE = re.compile(
+    r"^(?P<func>.*?) at (?P<path>.*?):[0-9?]+"
+    r"(?:\s+\(discriminator.*\))?$"
+)
+
+UNKNOWN_LOCATION = "??"  # path value that normalize_path() treats as unresolved
+UNRESOLVED_FAIL_RATIO = 0.5  # NOTE(review): presumably the unresolved share of .text that fails the run
+
+
+def resolve_tool(name, toolchain_bin):
+    """Locate binutils tool `name`; raise FileNotFoundError if absent.
+
+    Lookup order is --toolchain-bin, then $CROSS_COMPILE, then PATH, so
+    a stray CROSS_COMPILE cannot silently pair nm and addr2line from
+    different toolchains.
+    """
+    default_tool = "arm-uclinuxfdpiceabi-" + name
+    env_prefix = os.environ.get("CROSS_COMPILE", "")
+    search = []
+    if toolchain_bin is not None:
+        search.append(str(toolchain_bin / default_tool))
+    search += [env_prefix + name] if env_prefix else []
+    search.append(default_tool)
+    for option in search:
+        absolute = pathlib.Path(option).is_absolute()
+        found = option if absolute else shutil.which(option)
+        if found and pathlib.Path(found).exists():
+            return found
+    raise FileNotFoundError(f"unable to locate arm-uclinuxfdpiceabi-{name}")
+
+
+def collect_text_symbols(nm, vmlinux):
+    """Return (addr, size, name) for every sized resident-text symbol.
+
+    Runs `nm -n -f sysv --defined-only`; the sysv format is used because
+    it exposes the ELF section per symbol, which is needed to drop
+    .init.text and friends. Rows are pipe-delimited:
+      Name | Value | Class | Type | Size | Line | Section
+    Raises subprocess.CalledProcessError if nm exits non-zero.
+    """
+    cmd = [nm, "-n", "-f", "sysv", "--defined-only", str(vmlinux)]
+    proc = subprocess.run(cmd, check=True, text=True, capture_output=True)
+    rows = []
+    for line in proc.stdout.splitlines():
+        # Split from the right: the six trailing columns are fixed, so a
+        # stray "|" inside the name can no longer break the unpack below.
+        fields = [f.strip() for f in line.rsplit("|", 6)]
+        if len(fields) != 7:
+            continue
+        name, value, klass, _type, size_str, _src_line, section = fields
+        if klass not in TEXT_TYPES or not is_resident_text_section(section):
+            continue
+        try:
+            addr = int(value, 16)
+            size = int(size_str, 16)
+        except ValueError:
+            continue
+        if size == 0:
+            continue
+        # Thumb function symbols carry the low bit set; addr2line and
+        # the linker want the even instruction address.
+        rows.append((addr & ~1, size, name))
+    return rows
+
+
+def normalize_clone(name):
+    """Strip stacked GCC clone suffixes; return (base_name, was_stripped)."""
+    base = name
+    hit = CLONE_SUFFIX_RE.search(base)
+    while hit:
+        # Suffixes stack (foo.constprop.0.isra.0), so keep peeling.
+        base = base[: hit.start()]
+        hit = CLONE_SUFFIX_RE.search(base)
+    return base, base != name
+
+
+def parse_pretty_line(line):
+    """Split one `addr2line -p` frame into (func, path), else (None, None)."""
+    found = PRETTY_LINE_RE.match(line)
+    # A frame that does not match the pattern yields a pair of Nones.
+    return found.group("func", "path") if found else (None, None)
+
+
+def resolve_inline_stacks(addr2line, vmlinux, addresses):
+    """Resolve `addresses` to inline stacks, one list per address.
+
+    Feeds every address to one `addr2line -p -f -i` run on stdin. Each
+    returned stack is a list of (func, path) pairs, innermost frame
+    first, in the same order as `addresses`. Raises RuntimeError if the
+    number of parsed stacks differs from the number of addresses.
+    """
+    if not addresses:
+        return []
+    proc = subprocess.run(
+        [addr2line, "-e", str(vmlinux), "-p", "-f", "-i"],
+        input="".join(f"0x{a:x}\n" for a in addresses),
+        check=True, text=True, capture_output=True,
+    )
+    stacks = []
+    for raw in proc.stdout.splitlines():
+        if not raw.startswith(INLINED_BY_PREFIX):
+            # Un-prefixed line: head frame of the next address.
+            stacks.append([parse_pretty_line(raw)])
+        elif stacks:
+            # Continuation: an outer frame of the current stack. One
+            # arriving before any head is dropped as corrupt output.
+            stacks[-1].append(parse_pretty_line(raw[len(INLINED_BY_PREFIX):]))
+    if len(stacks) != len(addresses):
+        raise RuntimeError(
+            f"addr2line returned {len(stacks)} inline stacks for "
+            f"{len(addresses)} addresses; pretty-print parser desynced"
+        )
+    return stacks
+
+
+def normalize_path(path, tree_prefix):
+    """Return `path` relative to the kernel tree, or None if unresolved.
+
+    `.`/`..` segments are collapsed lexically first (no stat, no symlink
+    lookup): DWARF emits paths like `lib/../scripts/dtc/libfdt/fdt_ro.c`
+    for cross-tree includes, which would otherwise bucket as `lib/..`.
+    Absolute paths outside `tree_prefix` are cut at the first component
+    naming a top-level kernel dir; if none matches they are returned
+    unchanged so bucket_for() can route them to <external>.
+    """
+    if path is None or path == "" or path == UNKNOWN_LOCATION:
+        return None
+    clean = posixpath.normpath(path)
+    if tree_prefix and clean == tree_prefix:
+        return ""
+    if tree_prefix and clean.startswith(tree_prefix + "/"):
+        return clean[len(tree_prefix):].lstrip("/")
+    if clean.startswith("/"):
+        segments = clean.split("/")
+        for pos, segment in enumerate(segments):
+            if segment in TOP_LEVEL_DIRS:
+                return "/".join(segments[pos:])
+    return clean
+
+
+def bucket_for(path):
+    """Map a normalized path (or None) to its rollup bucket name."""
+    if path is None:
+        return "<unknown>"
+    if path.startswith("/"):
+        return "<external>"  # still absolute after normalization
+    head, sep, tail = path.partition("/")
+    if head not in TOP_LEVEL_DIRS:
+        return head or "<unknown>"
+    if sep and head in SPLIT_ONE_DEEPER:
+        return head + "/" + tail.split("/", 1)[0]
+    return head
+
+
+def attribute_outermost(stack, tree_prefix):
+    """Return (bucket, path) of the outermost frame whose path resolves."""
+    # stack is innermost-first, so scan it from the last entry backwards.
+    # The generator is lazy: scanning stops at the first frame that resolves.
+    paths = (normalize_path(p, tree_prefix) for _func, p in reversed(stack))
+    normed = next((p for p in paths if p is not None), None)
+    if normed is None:
+        return "<unknown>", None
+    return bucket_for(normed), normed
+
+
+def text_section_total(size_tool, vmlinux):
+    """Sum resident-text section sizes reported by `size -A`.
+
+    Returns None if the tool is missing or fails, or if the sum is 0.
+    """
+    try:
+        report = subprocess.run(
+            [size_tool, "-A", str(vmlinux)],
+            check=True, text=True, capture_output=True,
+        ).stdout
+    except (FileNotFoundError, subprocess.CalledProcessError):
+        return None
+    byte_count = 0
+    for columns in map(str.split, report.splitlines()):
+        if len(columns) >= 2 and is_resident_text_section(columns[0]):
+            try:
+                byte_count += int(columns[1])
+            except ValueError:
+                pass
+    return byte_count or None
+
+
+class BucketAgg:
+    """Running totals for one rollup bucket."""
+
+    __slots__ = ("bytes", "symbols", "icf", "clones", "files", "icf_groups")
+
+    def __init__(self):
+        self.bytes = self.symbols = self.icf = self.clones = 0
+        # Per-source-file byte and symbol tallies, created on first use.
+        self.files = collections.defaultdict(
+            lambda: dict(bytes=0, symbols=0)
+        )
+        self.icf_groups = []  # filled only for the <icf-merged> bucket
+
+
+def build_deduped_entries(rows):
+    """Collapse nm rows into one entry per start address.
+
+    Several names at one address that still differ after clone-suffix
+    stripping are the IPA-ICF signature and set `is_icf`. Each entry is
+    a dict with keys addr, size, aliases, is_icf, is_clone, in ascending
+    address order.
+    """
+    sizes = collections.defaultdict(list)
+    names_at = collections.defaultdict(set)
+    for addr, size, name in rows:
+        sizes[addr].append(size)
+        names_at[addr].add(name)
+
+    ordered = sorted(sizes)
+    entries = []
+    for addr, following in zip(ordered, ordered[1:] + [None]):
+        aliases = sorted(names_at[addr])
+        normalized = [normalize_clone(alias) for alias in aliases]
+        # Same-address symbols should agree on size; take the max to
+        # defend against weak/strong overlays.
+        span = max(sizes[addr])
+        if following is not None:
+            # Clamp by the next start address so ARM entry-point macros
+            # with overlapping ends do not double-count.
+            span = min(span, max(0, following - addr))
+        if span <= 0:
+            continue
+        entries.append({
+            "addr": addr,
+            "size": span,
+            "aliases": aliases,
+            "is_icf": len({base for base, _ in normalized}) > 1,
+            "is_clone": any(flag for _, flag in normalized),
+        })
+    return entries
+
+
+def aggregate(entries, stacks, tree_prefix):
+    """Fold deduped entries into per-bucket totals.
+
+    `entries` and `stacks` are consumed pairwise. Returns
+    (buckets, total_bytes), where buckets maps bucket name -> BucketAgg.
+    """
+    buckets = collections.defaultdict(BucketAgg)
+    total_bytes = 0
+    for entry, stack in zip(entries, stacks):
+        nbytes = entry["size"]
+        if entry["is_icf"]:
+            # Pre-fold provenance cannot be recovered reliably from the
+            # final vmlinux, so the bytes are counted once here.
+            bucket_name, normed = "<icf-merged>", None
+        else:
+            bucket_name, normed = attribute_outermost(stack, tree_prefix)
+        agg = buckets[bucket_name]
+        agg.bytes += nbytes
+        agg.symbols += 1
+        agg.clones += 1 if entry["is_clone"] else 0
+        if entry["is_icf"]:
+            agg.icf += 1
+            agg.icf_groups.append(
+                {k: entry[k] for k in ("addr", "size", "aliases")}
+            )
+        per_file = agg.files[normed if normed else "<unfiled>"]
+        per_file["bytes"] += nbytes
+        per_file["symbols"] += 1
+        total_bytes += nbytes
+    return buckets, total_bytes
+
+
+def write_table(buckets, grand_total, path):
+    """Write the tab-separated bucket table, largest bucket first."""
+    lines = [
+        "# subsystem-rollup -- vmlinux .text by source bucket",
+        "# Attribution answers 'where the code lives in the final image,'",
+        "# not 'where it was originally written.' Under -flto, leaf",
+        "# functions are inlined into callers; the surviving address",
+        "# belongs to the caller's file.",
+        "# Special buckets: <icf-merged> (IPA-ICF folded, pre-fold",
+        "# origin not recoverable), <compiler-partition> (linker-emitted",
+        "# bytes outside any nm symbol), <unknown> (DWARF resolution",
+        "# failed), <external> (path outside the kernel tree).",
+        f"# total bytes: {grand_total}",
+        "#",
+        "# bucket\tbytes\tpercent\tsymbols\ticf_merged\tlto_clones",
+    ]
+    # One row per bucket; percent is the share of grand_total (0 if empty).
+    ranked = sorted(buckets.items(), key=lambda kv: kv[1].bytes, reverse=True)
+    for name, b in ranked:
+        share = (b.bytes * 100.0 / grand_total) if grand_total else 0.0
+        fields = (name, b.bytes, f"{share:.2f}", b.symbols, b.icf, b.clones)
+        lines.append("\t".join(str(f) for f in fields))
+    path.write_text("\n".join(lines) + "\n")
+
+
+def _esc(s):
+    """Escape `s` for HTML/XML text or attribute values.
+
+    Beyond &, < and >, both quote styles are escaped too: the helper
+    is currently used only in tag content, but covering attributes
+    keeps a later caller from silently introducing an XSS surface.
+    """
+    quote_entities = {'"': "&quot;", "'": "&apos;"}
+    return xml.sax.saxutils.escape(str(s), quote_entities)
+
+
+def write_svg(buckets, grand_total, path):
+    """Write a horizontal bar chart of bucket sizes, largest first.
+
+    Bars are scaled against the largest bucket; each label shows bytes
+    and the share of `grand_total`. Buckets whose name starts with "<"
+    get a distinct fill. With no buckets a 10x10 empty SVG is written.
+    """
+    rows = sorted(buckets.items(), key=lambda kv: kv[1].bytes, reverse=True)
+    if not rows:
+        path.write_text(
+            '<svg xmlns="http://www.w3.org/2000/svg" width="10" height="10"/>\n'
+        )
+        return
+
+    label_w, chart_w, text_w = 240, 720, 180
+    bar_h, pad = 22, 4
+    step = bar_h + pad
+    width = label_w + chart_w + text_w
+    height = pad * 2 + len(rows) * step
+    max_bytes = max(b.bytes for _, b in rows) or 1
+    parts = [
+        '<?xml version="1.0" encoding="UTF-8"?>',
+        f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" '
+        f'height="{height}" font-family="ui-monospace, monospace" '
+        'font-size="13">',
+        f'<rect width="{width}" height="{height}" fill="#fafafa"/>',
+        f'<line x1="{label_w}" y1="0" x2="{label_w}" y2="{height}" '
+        'stroke="#ddd"/>',
+    ]
+    for i, (name, b) in enumerate(rows):
+        y = pad + i * step
+        baseline = y + bar_h - 6
+        bar_w = max(1, int(chart_w * b.bytes / max_bytes))
+        pct = (b.bytes * 100.0 / grand_total) if grand_total else 0.0
+        color = "#c84030" if name.startswith("<") else "#4682b4"
+        parts += [
+            f'<text x="{label_w - 8}" y="{baseline}" '
+            f'text-anchor="end" fill="#222">{_esc(name)}</text>',
+            f'<rect x="{label_w}" y="{y}" width="{bar_w}" '
+            f'height="{bar_h - 2}" fill="{color}"/>',
+            f'<text x="{label_w + bar_w + 6}" y="{baseline}" '
+            f'fill="#222">{b.bytes:,} ({pct:.1f}%)</text>',
+        ]
+    parts.append("</svg>")
+    path.write_text("\n".join(parts) + "\n")
+
+
+TREEMAP_HTML = """<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8"/>
+<title>vmlinux .text rollup -- subsystem treemap</title>
+<style>
+body { margin: 0; font-family: ui-sans-serif, system-ui, sans-serif; background: #fafafa; color: #222; }
+header { padding: 12px 20px; border-bottom: 1px solid #ddd; }
+header h1 { margin: 0; font-size: 15px; }
+header p { margin: 4px 0 0; font-size: 12px; color: #555; }
+header code { background: #eee; padding: 1px 4px; border-radius: 3px; }
+#chart { width: 100%; height: calc(100vh - 90px); display: block; }
+.tile rect { stroke: #fff; stroke-width: 1; }
+.tile:hover rect { stroke: #000; stroke-width: 2; }
+.tile text { font-size: 11px; fill: #fff; pointer-events: none; }
+.tooltip {
+  position: absolute; pointer-events: none;
+  background: #222; color: #fff;
+  padding: 6px 10px; font-size: 12px;
+  border-radius: 4px; opacity: 0;
+  max-width: 360px; word-break: break-all;
+}
+</style>
+</head>
+<body>
+<header>
+  <h1>vmlinux .text rollup -- subsystem treemap</h1>
+  <p>Total: __TOTAL__ bytes. Hover for details. Special buckets:
+     <code>&lt;icf-merged&gt;</code>, <code>&lt;unknown&gt;</code>,
+     <code>&lt;compiler-partition&gt;</code>, <code>&lt;external&gt;</code>.</p>
+</header>
+<svg id="chart"></svg>
+<div class="tooltip" id="tip"></div>
+<script src="https://d3js.org/d3.v7.min.js"></script>
+<script>
+const data = __DATA__;
+const svg = d3.select("#chart");
+const tip = d3.select("#tip");
+const palette = d3.scaleOrdinal(d3.schemeTableau10);
+const esc = s => String(s).replace(/[&<>]/g, c => `&#${c.charCodeAt(0)};`); // names are text, not markup
+function render() {
+  const w = svg.node().clientWidth;
+  const h = svg.node().clientHeight;
+  svg.selectAll("*").remove();
+
+  const root = d3.hierarchy(data)
+    .sum(d => d.value || 0)
+    .sort((a, b) => b.value - a.value);
+  d3.treemap().size([w, h]).paddingInner(1).round(true)(root);
+
+  const tile = svg.selectAll("g")
+    .data(root.leaves())
+    .join("g")
+    .attr("class", "tile")
+    .attr("transform", d => `translate(${d.x0},${d.y0})`);
+
+  tile.append("rect")
+    .attr("width", d => Math.max(0, d.x1 - d.x0))
+    .attr("height", d => Math.max(0, d.y1 - d.y0))
+    .attr("fill", d => palette(d.parent ? d.parent.data.name : "root"));
+
+  tile.append("text")
+    .attr("x", 4).attr("y", 12)
+    .text(d => {
+      const W = d.x1 - d.x0;
+      const H = d.y1 - d.y0;
+      if (W < 60 || H < 16) return "";
+      return `${d.parent.data.name}/${d.data.name}`;
+    });
+
+  tile.on("mousemove", function(event, d) {
+    tip.style("opacity", 1)
+       .style("left", (event.pageX + 14) + "px")
+       .style("top", (event.pageY + 14) + "px")
+       .html(
+         `<b>${esc(d.parent.data.name)}/${esc(d.data.name)}</b><br/>` +
+         `${d.value.toLocaleString()} bytes<br/>` +
+         `${(d.data.symbols || 0)} symbols`
+       );
+  }).on("mouseleave", () => tip.style("opacity", 0));
+}
+
+render();
+window.addEventListener("resize", render);
+</script>
+</body>
+</html>
+"""
+
+
+def write_deep_table(buckets, deep_buckets, path, top_files=20):
+    """Write the plain-text deep breakdown for the requested buckets.
+
+    For each name in `deep_buckets` the bucket's per-file tallies are
+    rolled up two ways: by 2nd-level subdirectory
+    (`kernel/sched/fair.c` -> `kernel/sched`) and as a flat list of the
+    `top_files` largest files. The `<unfiled>` pseudo-key (entries with
+    no attributed path) is left out of both tables. Percentages are
+    relative to the whole bucket's byte count.
+    """
+    lines = [
+        "# subsystem-rollup-deep -- per-bucket source breakdown",
+        "# Same attribution rules as subsystem-rollup.txt; this file",
+        "# drills one level deeper for the buckets named via --deep.",
+        f"# requested buckets: {', '.join(deep_buckets)}",
+        "",
+    ]
+
+    # Sort key shared by both tables: the entry's byte count.
+    def by_bytes(pair):
+        return pair[1]["bytes"]
+
+    for bname in deep_buckets:
+        b = buckets.get(bname)
+        if b is None or not b.files:
+            lines += [f"## {bname} -- not present in this rollup", ""]
+            continue
+
+        bucket_total = b.bytes
+        attributed = [kv for kv in b.files.items() if kv[0] != "<unfiled>"]
+
+        # Shares use the whole bucket as denominator, <unfiled> included.
+        def percent(nbytes):
+            return nbytes * 100.0 / bucket_total if bucket_total else 0.0
+
+        lines += [
+            f"## {bname} -- {bucket_total} bytes across {len(attributed)} "
+            f"source files",
+            "",
+        ]
+
+        # Two leading path components form the key: "kernel/sched/fair.c"
+        # collapses to "kernel/sched"; "kernel/workqueue.c" stays as is.
+        depth2 = {}
+        for fname, finfo in attributed:
+            key = "/".join(fname.split("/")[:2])
+            d = depth2.setdefault(
+                key, {"bytes": 0, "symbols": 0, "files": set()}
+            )
+            d["bytes"] += finfo["bytes"]
+            d["symbols"] += finfo["symbols"]
+            d["files"].add(fname)
+
+        lines.append(f"# {bname} by 2nd-level subdirectory")
+        lines.append("# subdirectory\tbytes\tpercent\tsymbols\tfiles")
+        for key, d in sorted(depth2.items(), key=by_bytes, reverse=True):
+            lines.append(
+                f"{key}\t{d['bytes']}\t{percent(d['bytes']):.2f}\t"
+                f"{d['symbols']}\t{len(d['files'])}"
+            )
+        lines.append("")
+
+        lines.append(f"# {bname} top {top_files} source files")
+        lines.append("# file\tbytes\tpercent\tsymbols")
+        ranked = sorted(attributed, key=by_bytes, reverse=True)
+        for fname, finfo in ranked[:top_files]:
+            lines.append(
+                f"{fname}\t{finfo['bytes']}\t{percent(finfo['bytes']):.2f}"
+                f"\t{finfo['symbols']}"
+            )
+        lines.append("")
+
+    path.write_text("\n".join(lines) + "\n")
+
+
+DEEP_HTML = """<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8"/>
+<title>vmlinux .text rollup -- deep breakdown</title>
+<style>
+body { margin: 0; font-family: ui-sans-serif, system-ui, sans-serif;
+       background: #fafafa; color: #222; }
+header { padding: 12px 20px; border-bottom: 1px solid #ddd;
+         background: #fff; position: sticky; top: 0; z-index: 10; }
+header h1 { margin: 0; font-size: 15px; }
+header p { margin: 4px 0 0; font-size: 12px; color: #555; }
+header code { background: #eee; padding: 1px 4px; border-radius: 3px; }
+nav { margin-top: 6px; font-size: 12px; }
+nav a { margin-right: 12px; color: #2a4d8f; text-decoration: none; }
+nav a:hover { text-decoration: underline; }
+section { padding: 18px 20px; border-bottom: 1px solid #eee; }
+section h2 { margin: 0 0 4px; font-size: 14px; }
+section h3 { margin: 16px 0 6px; font-size: 12px; color: #555;
+             text-transform: uppercase; letter-spacing: 0.04em; }
+section .meta { font-size: 12px; color: #666; }
+table { border-collapse: collapse; width: 100%; max-width: 920px;
+        font: 12px ui-monospace, monospace; margin-top: 6px; }
+th, td { border-bottom: 1px solid #eee; padding: 4px 8px; text-align: left;
+         vertical-align: top; white-space: nowrap; }
+th { font-weight: 600; color: #444; background: #f3f3f3; position: sticky;
+     top: 78px; }
+td.num { text-align: right; font-variant-numeric: tabular-nums; }
+.bar { position: relative; height: 14px; background: #eaeaea;
+       border-radius: 2px; overflow: hidden; min-width: 120px; }
+.bar > span { position: absolute; left: 0; top: 0; bottom: 0;
+              background: #4682b4; }
+tr.special td:first-child { color: #c84030; font-style: italic; }
+footer { padding: 10px 20px; font-size: 11px; color: #999; }
+</style>
+</head>
+<body>
+<header>
+  <h1>vmlinux .text rollup -- deep breakdown</h1>
+  <p>Per-bucket 2nd-level subdirectory rollup and top source files.
+     Same attribution rules as <code>subsystem-rollup.txt</code>;
+     hover bars to read exact byte counts.</p>
+  <nav>__NAV__</nav>
+</header>
+__BODY__
+<footer>Generated by <code>scripts/subsystem-rollup.py --deep</code>.</footer>
+</body>
+</html>
+"""  # write_deep_html() substitutes the __NAV__ and __BODY__ placeholders
+
+
+def write_deep_html(buckets, deep_buckets, path, top_files=20):
+    """Write the styled-HTML version of the deep breakdown.
+
+    Emits one <section> per name in `deep_buckets` (2nd-level
+    subdirectory table plus the `top_files` largest files) and fills the
+    __NAV__/__BODY__ slots of DEEP_HTML. Every name goes through _esc().
+    """
+    sections = []
+    nav = []
+    for bname in deep_buckets:
+        anchor = "b-" + re.sub(r"[^A-Za-z0-9]+", "-", bname).strip("-").lower()
+        nav.append(f'<a href="#{anchor}">{_esc(bname)}</a>')
+
+        b = buckets.get(bname)
+        if b is None or not b.files:
+            sections.append(
+                f'<section id="{anchor}"><h2>{_esc(bname)}</h2>'
+                f'<p class="meta">not present in this rollup</p></section>'
+            )
+            continue
+
+        bucket_total = b.bytes
+        attributed = [kv for kv in b.files.items() if kv[0] != "<unfiled>"]
+        n_files = len(attributed)
+
+        depth2 = collections.defaultdict(
+            lambda: {"bytes": 0, "symbols": 0, "files": set()}
+        )
+        for fname, finfo in attributed:
+            key = "/".join(fname.split("/")[:2])
+            d = depth2[key]
+            d["bytes"] += finfo["bytes"]
+            d["symbols"] += finfo["symbols"]
+            d["files"].add(fname)
+
+        depth2_rows = sorted(
+            depth2.items(), key=lambda kv: kv[1]["bytes"], reverse=True
+        )
+        max_d2 = max((d["bytes"] for _, d in depth2_rows), default=1) or 1
+        depth2_html = ['<h3>2nd-level subdirectories</h3>',
+                       '<table><thead><tr>'
+                       '<th>subdirectory</th><th class="num">bytes</th>'
+                       '<th class="num">%</th><th class="num">symbols</th>'
+                       '<th class="num">files</th><th>share</th>'
+                       '</tr></thead><tbody>']
+        for key, d in depth2_rows:
+            pct = d["bytes"] * 100.0 / bucket_total if bucket_total else 0.0
+            bar_pct = d["bytes"] * 100.0 / max_d2
+            depth2_html.append(
+                f'<tr><td>{_esc(key)}</td>'
+                f'<td class="num">{d["bytes"]:,}</td>'
+                f'<td class="num">{pct:.2f}</td>'
+                f'<td class="num">{d["symbols"]}</td>'
+                f'<td class="num">{len(d["files"])}</td>'
+                f'<td><div class="bar" title="{d["bytes"]:,} bytes">'
+                f'<span style="width:{bar_pct:.1f}%"></span></div></td>'
+                '</tr>'
+            )
+        depth2_html.append('</tbody></table>')
+
+        files_rows = sorted(
+            attributed, key=lambda kv: kv[1]["bytes"], reverse=True
+        )[:top_files]
+        max_f = max((f["bytes"] for _, f in files_rows), default=1) or 1
+        files_html = [
+            f'<h3>top {top_files} source files</h3>',
+            '<table><thead><tr>'
+            '<th>file</th><th class="num">bytes</th>'
+            '<th class="num">%</th><th class="num">symbols</th>'
+            '<th>share</th>'
+            '</tr></thead><tbody>'
+        ]
+        for fname, finfo in files_rows:
+            pct = finfo["bytes"] * 100.0 / bucket_total if bucket_total else 0.0
+            bar_pct = finfo["bytes"] * 100.0 / max_f
+            files_html.append(
+                f'<tr><td>{_esc(fname)}</td>'
+                f'<td class="num">{finfo["bytes"]:,}</td>'
+                f'<td class="num">{pct:.2f}</td>'
+                f'<td class="num">{finfo["symbols"]}</td>'
+                f'<td><div class="bar" title="{finfo["bytes"]:,} bytes">'
+                f'<span style="width:{bar_pct:.1f}%"></span></div></td>'
+                '</tr>'
+            )
+        files_html.append('</tbody></table>')
+
+        sections.append(
+            f'<section id="{anchor}">'
+            f'<h2>{_esc(bname)}</h2>'
+            f'<p class="meta">{bucket_total:,} bytes across {n_files} '
+            f'source files</p>'
+            + "\n".join(depth2_html)
+            + "\n".join(files_html)
+            + '</section>'
+        )
+
+    html = (
+        DEEP_HTML
+        .replace("__NAV__", " ".join(nav) if nav else "(no buckets)")
+        .replace("__BODY__", "\n".join(sections))
+    )
+    path.write_text(html)
+
+
+def write_treemap(buckets, grand_total, path):
+    """Write the D3 treemap page: buckets as parents, files as leaves.
+
+    The hierarchy is embedded as JSON inside a <script> element, so "<"
+    is emitted as the escape \\u003c: no name can close the script early.
+    """
+    def by_bytes(kv):
+        return kv[1]["bytes"]
+
+    children = []
+    for bname, b in sorted(
+        buckets.items(), key=lambda kv: kv[1].bytes, reverse=True
+    ):
+        files = [
+            {"name": f, "value": i["bytes"], "symbols": i["symbols"]}
+            for f, i in sorted(b.files.items(), key=by_bytes, reverse=True)
+        ]
+        if not files:
+            files = [{"name": bname, "value": b.bytes, "symbols": b.symbols}]
+        children.append({
+            "name": bname,
+            "icf": b.icf,
+            "clones": b.clones,
+            "children": files,
+        })
+    root = {"name": "vmlinux .text", "children": children}
+    data = json.dumps(root, separators=(",", ":")).replace("<", "\\u003c")
+    # Fill __TOTAL__ first so text inside the JSON is never substituted.
+    html = TREEMAP_HTML.replace("__TOTAL__", f"{grand_total:,}")
+    path.write_text(html.replace("__DATA__", data))
+
+
+def main(argv):
+    """Build the per-subsystem .text rollup for a vmlinux and write it.
+
+    Writes the table to --output plus the bar-chart SVG and treemap
+    HTML siblings, and the deep table/HTML pair when --deep is given.
+
+    Returns the exit status: 0 on success, 1 on a usage or input
+    error, 2 when the <unknown> share exceeds UNRESOLVED_FAIL_RATIO.
+    """
+    parser = argparse.ArgumentParser(
+        description="LTO-aware vmlinux .text rollup by source subsystem."
+    )
+    parser.add_argument("--vmlinux", required=True, type=pathlib.Path)
+    parser.add_argument(
+        "--linux-tree", type=pathlib.Path,
+        help="Kernel source tree (used to strip absolute paths).",
+    )
+    parser.add_argument(
+        "--toolchain-bin", type=pathlib.Path,
+        help="Toolchain bin/ directory (overrides PATH lookup).",
+    )
+    parser.add_argument(
+        "--output", required=True, type=pathlib.Path,
+        help="Path to subsystem-rollup.txt; the SVG and HTML siblings "
+        "are written next to it.",
+    )
+    parser.add_argument(
+        "--batch-size", type=int, default=4096,
+        help="Addresses per addr2line invocation (default: 4096).",
+    )
+    parser.add_argument(
+        "--deep", action="append", default=[], metavar="BUCKET",
+        help="Bucket name to break down further into 2nd-level "
+        "subdirs and top source files. Repeatable. When set, "
+        "writes subsystem-rollup-deep.txt next to --output.",
+    )
+    parser.add_argument(
+        "--deep-output", type=pathlib.Path,
+        help="Override path for the deep-rollup output (defaults to "
+        "subsystem-rollup-deep.txt next to --output).",
+    )
+    args = parser.parse_args(argv)
+
+    if args.batch_size < 1:
+        print(
+            f"subsystem-rollup: --batch-size must be >= 1 "
+            f"(got {args.batch_size})",
+            file=sys.stderr,
+        )
+        return 1
+
+    # The deep emitter writes <path>.txt and derives the HTML sibling
+    # via path.with_suffix('.html'). If the operator passes a
+    # .html-suffixed --deep-output, the derived sibling collides with
+    # the .txt write that just happened and silently overwrites it.
+    # Catch that explicitly instead of producing a half-truncated pair.
+    if (
+        args.deep_output is not None
+        and args.deep_output.suffix.lower() == ".html"
+    ):
+        print(
+            "subsystem-rollup: --deep-output must be the .txt path "
+            "(the .html sibling is derived from its stem); refusing "
+            f"to alias the two files (got {args.deep_output})",
+            file=sys.stderr,
+        )
+        return 1
+
+    if not args.vmlinux.exists():
+        print(
+            f"subsystem-rollup: missing vmlinux: {args.vmlinux}",
+            file=sys.stderr,
+        )
+        return 1
+
+    toolchain_bin = args.toolchain_bin
+    if toolchain_bin is None and args.linux_tree is not None:
+        toolchain_bin = args.linux_tree.resolve().parent / "toolchain" / "bin"
+
+    # nm and addr2line are required: report a missing tool as a clean
+    # failure instead of an uncaught traceback. `size` stays optional.
+    try:
+        nm = resolve_tool("nm", toolchain_bin)
+        addr2line = resolve_tool("addr2line", toolchain_bin)
+    except FileNotFoundError as exc:
+        print(f"subsystem-rollup: {exc}", file=sys.stderr)
+        return 1
+    try:
+        size_tool = resolve_tool("size", toolchain_bin)
+    except FileNotFoundError:
+        size_tool = None
+
+    rows = collect_text_symbols(nm, args.vmlinux)
+    if not rows:
+        print(
+            "subsystem-rollup: nm returned no sized text symbols",
+            file=sys.stderr,
+        )
+        return 1
+
+    tree_prefix = (
+        str(args.linux_tree.resolve()) if args.linux_tree else ""
+    )
+
+    entries = build_deduped_entries(rows)
+    if not entries:
+        print(
+            "subsystem-rollup: no entries after dedup",
+            file=sys.stderr,
+        )
+        return 1
+
+    addresses = [e["addr"] for e in entries]
+    stacks = []
+    for start in range(0, len(addresses), args.batch_size):
+        chunk = addresses[start: start + args.batch_size]
+        stacks.extend(resolve_inline_stacks(addr2line, args.vmlinux, chunk))
+
+    buckets, accounted = aggregate(entries, stacks, tree_prefix)
+
+    unknown_bytes = (
+        buckets["<unknown>"].bytes if "<unknown>" in buckets else 0
+    )
+    if accounted and unknown_bytes / accounted > UNRESOLVED_FAIL_RATIO:
+        print(
+            f"subsystem-rollup: {unknown_bytes / accounted * 100:.1f}% "
+            "of resolved .text is <unknown> -- vmlinux likely lacks "
+            "DWARF. Re-build with KERNEL_DEBUG_INFO=reduced "
+            "(CONFIG_DEBUG_INFO_REDUCED=y) for diagnostic attribution.",
+            file=sys.stderr,
+        )
+        return 2
+
+    if size_tool is not None:
+        section_total = text_section_total(size_tool, args.vmlinux)
+        if section_total and section_total > accounted:
+            residual = section_total - accounted
+            buckets["<compiler-partition>"].bytes += residual
+            # No symbols claim these bytes; record the count as zero
+            # rather than fabricating one. The bytes column is enough
+            # to flag the residual when it grows.
+    grand_total = sum(b.bytes for b in buckets.values())
+
+    out = args.output
+    out.parent.mkdir(parents=True, exist_ok=True)
+    write_table(buckets, grand_total, out)
+    write_svg(
+        buckets, grand_total,
+        out.with_name("subsystem-rollup-bars.svg"),
+    )
+    write_treemap(
+        buckets, grand_total,
+        out.with_name("subsystem-rollup-tree.html"),
+    )
+    if args.deep:
+        deep_path = args.deep_output
+        if deep_path is None:
+            deep_path = out.with_name("subsystem-rollup-deep.txt")
+        # --deep-output may point outside --output's directory, which
+        # is the only one created above; make sure its parent exists.
+        deep_path.parent.mkdir(parents=True, exist_ok=True)
+        write_deep_table(buckets, args.deep, deep_path)
+        write_deep_html(buckets, args.deep, deep_path.with_suffix(".html"))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv[1:]))
diff --git a/scripts/text-rollup.py b/scripts/text-rollup.py
deleted file mode 100755
index c50e451..0000000
--- a/scripts/text-rollup.py
+++ /dev/null
@@ -1,253 +0,0 @@
-#!/usr/bin/env python3
-
-# Roll up vmlinux .text bytes by source subsystem.
-#
-# Reads `nm -n -S` to get text symbols (address, size, name), feeds the
-# addresses through one batched `addr2line -e vmlinux -f` invocation, and
-# attributes each symbol to a top-level kernel directory. drivers/<subdir>/
-# and arch/<arch>/<subdir>/ are split out so a single driver family does not
-# hide inside the parent bucket.
-#
-# Symbols whose source location addr2line cannot resolve (no DWARF, inlined
-# without debug info, etc.) land in the "<unknown>" bucket so the report
-# always sums to the visible .text mass.
-
-import argparse
-import os
-import pathlib
-import re
-import shutil
-import subprocess
-import sys
-from typing import Optional
-
-
-TEXT_TYPES = {"T", "t", "W", "w"}
-
-# Kernel top-level directories that should each become their own bucket.
-# Anything outside this set rolls up under the literal first path component
-# (or "<external>" if the path is absolute and unrecognized).
-TOP_LEVEL_DIRS = frozenset({
-    "arch", "block", "certs", "crypto", "drivers", "fs", "init", "io_uring",
-    "ipc", "kernel", "lib", "mm", "net", "rust", "samples", "scripts",
-    "security", "sound", "tools", "usr", "virt",
-})
-
-# Subdirectories that are themselves large enough that splitting one level
-# deeper is the right granularity, even though the parent already gets a
-# bucket.
-SPLIT_ONE_DEEPER = frozenset({"arch", "drivers", "sound", "net"})
-
-
-def resolve_tool(name: str, toolchain_bin: Optional[pathlib.Path]) -> str:
-    # Precedence: explicit --toolchain-bin > CROSS_COMPILE > PATH lookup.
-    # An explicit flag must beat ambient env, otherwise a stray
-    # CROSS_COMPILE pointing at a different toolchain silently produces
-    # mismatched nm/addr2line and corrupt attribution.
-    candidates = []
-    if toolchain_bin is not None:
-        candidates.append(str(toolchain_bin / ("arm-uclinuxfdpiceabi-" + name)))
-    cross_compile = os.environ.get("CROSS_COMPILE", "")
-    if cross_compile:
-        candidates.append(cross_compile + name)
-    candidates.append("arm-uclinuxfdpiceabi-" + name)
-
-    for candidate in candidates:
-        resolved = shutil.which(candidate) if not pathlib.Path(candidate).is_absolute() else candidate
-        if resolved and pathlib.Path(resolved).exists():
-            return resolved
-
-    raise FileNotFoundError(f"unable to locate arm-uclinuxfdpiceabi-{name}")
-
-
-def collect_text_symbols(nm: str, vmlinux: pathlib.Path):
-    cmd = [nm, "-n", "-S", "--defined-only", str(vmlinux)]
-    proc = subprocess.run(cmd, check=True, text=True, capture_output=True)
-    rows = []
-    for line in proc.stdout.splitlines():
-        fields = line.split(None, 3)
-        if len(fields) != 4:
-            continue
-        addr_hex, size_hex, sym_type, name = fields
-        # Defensive: a name that contains whitespace would push a non-hex
-        # token into size_hex / sym_type. Cheap to guard, avoids a hard
-        # crash on a single malformed line.
-        if len(sym_type) != 1 or sym_type not in TEXT_TYPES:
-            continue
-        try:
-            size = int(size_hex, 16)
-            addr = int(addr_hex, 16)
-        except ValueError:
-            continue
-        if size == 0:
-            continue
-        # Thumb function symbols carry the low bit set; addr2line wants the
-        # even instruction address.
-        rows.append((addr & ~1, size, name))
-    return rows
-
-
-def resolve_addresses(addr2line: str, vmlinux: pathlib.Path, addresses):
-    # addr2line -f without -i emits exactly two lines per input: function
-    # name, then file:line. Feeding a long list as stdin keeps this to one
-    # subprocess instead of the per-symbol fork/exec storm that
-    # scripts/faddr2line incurs.
-    cmd = [addr2line, "-e", str(vmlinux), "-f"]
-    payload = "\n".join(f"0x{a:x}" for a in addresses) + "\n"
-    proc = subprocess.run(cmd, input=payload, check=True, text=True, capture_output=True)
-    lines = proc.stdout.splitlines()
-    expected = len(addresses) * 2
-    if len(lines) != expected:
-        raise RuntimeError(
-            f"addr2line returned {len(lines)} lines, expected {expected}"
-        )
-    out = []
-    for i in range(0, len(lines), 2):
-        # function name on lines[i] is unused; file:line is what we bucket on.
-        location = lines[i + 1]
-        out.append(location)
-    return out
-
-
-# Match "/abs/.../source/file.c:123 (discriminator N)" or "??:0".
-LINE_RE = re.compile(r"^(?P<path>.*?):[0-9?]+(?:\s+\(discriminator.*\))?$")
-
-
-def normalize_path(location: str, tree_prefix: str):
-    m = LINE_RE.match(location)
-    if not m:
-        return None
-    path = m.group("path")
-    if path in ("", "??"):
-        return None
-
-    # addr2line returns absolute paths from the build's working directory.
-    # Strip the kernel source prefix so paths become relative to the kernel
-    # tree root. The boundary check ("/" suffix or exact match) prevents a
-    # sibling directory like `/home/u/linux-build/...` from being treated
-    # as a child of `/home/u/linux/`.
-    if tree_prefix and (path == tree_prefix or path.startswith(tree_prefix + "/")):
-        path = path[len(tree_prefix):].lstrip("/")
-    elif path.startswith("/"):
-        # Fall back to the first known top-level directory in the path.
-        parts = path.split("/")
-        for idx, part in enumerate(parts):
-            if part in TOP_LEVEL_DIRS:
-                path = "/".join(parts[idx:])
-                break
-        else:
-            return path  # absolute, unrecognized -- leave for "<external>"
-    return path
-
-
-def bucket_for(path):
-    if path is None:
-        return "<unknown>"
-    if path.startswith("/"):
-        return "<external>"
-
-    parts = path.split("/")
-    head = parts[0]
-    if head not in TOP_LEVEL_DIRS:
-        return head or "<unknown>"
-    if head in SPLIT_ONE_DEEPER and len(parts) > 1:
-        return f"{head}/{parts[1]}"
-    return head
-
-
-def render_report(buckets, total_text):
-    lines = []
-    lines.append(f"# .text rollup -- total resolved bytes: {total_text}")
-    lines.append("# bucket\tbytes\tpercent")
-    rows = sorted(buckets.items(), key=lambda kv: kv[1], reverse=True)
-    for name, size in rows:
-        pct = (size * 100.0 / total_text) if total_text else 0.0
-        lines.append(f"{name}\t{size}\t{pct:.2f}")
-    return "\n".join(lines) + "\n"
-
-
-def main(argv):
-    parser = argparse.ArgumentParser(
-        description="Attribute vmlinux .text bytes to source subsystems."
-    )
-    parser.add_argument("--vmlinux", required=True, type=pathlib.Path)
-    parser.add_argument(
-        "--linux-tree",
-        type=pathlib.Path,
-        help="Path to the kernel source tree (used to strip absolute paths).",
-    )
-    parser.add_argument(
-        "--toolchain-bin",
-        type=pathlib.Path,
-        help="Toolchain bin/ directory (overrides PATH lookup).",
-    )
-    parser.add_argument("--output", required=True, type=pathlib.Path)
-    parser.add_argument(
-        "--batch-size",
-        type=int,
-        default=4096,
-        help="Addresses per addr2line invocation (default: 4096).",
-    )
-    args = parser.parse_args(argv)
-
-    if args.batch_size < 1:
-        print(
-            f"text-rollup: --batch-size must be >= 1 (got {args.batch_size})",
-            file=sys.stderr,
-        )
-        return 1
-
-    vmlinux = args.vmlinux
-    if not vmlinux.exists():
-        print(f"text-rollup: missing vmlinux: {vmlinux}", file=sys.stderr)
-        return 1
-
-    toolchain_bin = args.toolchain_bin
-    if toolchain_bin is None and args.linux_tree is not None:
-        toolchain_bin = args.linux_tree.resolve().parent / "toolchain" / "bin"
-
-    nm = resolve_tool("nm", toolchain_bin)
-    addr2line = resolve_tool("addr2line", toolchain_bin)
-
-    rows = collect_text_symbols(nm, vmlinux)
-    if not rows:
-        print("text-rollup: no text symbols with size found", file=sys.stderr)
-        return 1
-
-    tree_prefix = ""
-    if args.linux_tree:
-        tree_prefix = str(args.linux_tree.resolve())
-
-    # nm -nS already address-sorts; re-sort defensively so the overlap
-    # clamp below cannot be defeated by a symbol whose address ties an
-    # earlier one. Kernel ARM entry-point macros emit multiple T symbols
-    # covering the same code blob (e.g. `ret_to_user` at +0 and
-    # `ret_to_user_from_irq` at +4 with the same end address); summing
-    # their reported sizes would double-count the shared region.
-    rows.sort(key=lambda r: r[0])
-    starts = [r[0] for r in rows]
-
-    buckets = {}
-    total = 0
-    for start in range(0, len(rows), args.batch_size):
-        chunk = rows[start:start + args.batch_size]
-        addresses = [addr for addr, _, _ in chunk]
-        locations = resolve_addresses(addr2line, vmlinux, addresses)
-        for idx, ((addr, size, _), loc) in enumerate(zip(chunk, locations)):
-            row_idx = start + idx
-            next_addr = starts[row_idx + 1] if row_idx + 1 < len(starts) else addr + size
-            effective = min(size, max(0, next_addr - addr))
-            if effective <= 0:
-                continue
-            path = normalize_path(loc, tree_prefix)
-            key = bucket_for(path)
-            buckets[key] = buckets.get(key, 0) + effective
-            total += effective
-
-    args.output.parent.mkdir(parents=True, exist_ok=True)
-    args.output.write_text(render_report(buckets, total))
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main(sys.argv[1:]))