In [1]:
# test_stage_tree.py
import importlib, stage_runner  # ensures latest after edits
importlib.reload(stage_runner)

from stage_runner import StageTreeRunner, StaticCfg, Stage, Trial

# Settings that stay fixed across every stage of every trial.
static_kwargs = dict(
    model_path="mlx-community/Mistral-7B-Instruct-v0.2-4bit",
    lora_layers=16,
    val_batches=-1,
    steps_per_eval=10,
    dataset_id="youtube-comments-v1",
    tokenizer_id="mistral-tokenizer-v0.2",
)
static = StaticCfg(**static_kwargs)

# Where the runner keeps cached stage artifacts, which training script it
# launches, and where the per-stage metrics are logged.
runner_options = dict(
    cache_dir="./stage_cache",
    lora_script="./scripts/lora.py",
    log_file="stage_log.csv",
    fastcdc_bin=None,  # set to e.g. "/path/to/fastcdc" to also record chunk counts
)
runner = StageTreeRunner(static, **runner_options)

# Both trials open with the same first stage (50 iters at lr=1e-5) and only
# differ in the second stage's learning rate (HiPPO-style prefix reuse).
trial_A = Trial(
    "A_lr1e-5_then_5e-5",
    [Stage(50, 1e-5), Stage(50, 5e-5)],
)
trial_B = Trial(
    "B_lr1e-5_then_3e-5",
    [Stage(50, 1e-5), Stage(50, 3e-5)],
)

# Run A first, then B, in that order.
for trial in (trial_A, trial_B):
    runner.run_trial(trial)
print("\nAll done. Metrics logged to stage_log.csv")


[build] A_lr1e-5_then_5e-5 stage 1/2 (iters=50, lr=1e-05)
>> python /Users/sanjeeb/Coding/HSSL/qlora-mlx/stage-tree-deltaDNN/scripts/lora.py --model mlx-community/Mistral-7B-Instruct-v0.2-4bit --train --iters 50 --steps-per-eval 10 --val-batches -1 --learning-rate 1e-05 --lora-layers 16 --adapter-file ./stage_cache/233ac6dad23f2688/adapters.npz
Loading pretrained model

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]
Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 137840.98it/s]
Total parameters 1243.189M
Trainable parameters 0.852M
Loading datasets
Training
Iter 1: Val loss 4.243, Val took 16.840s
Iter 10: Train loss 4.008, It/sec 0.108, Tokens/sec 89.558
Iter 10: Val loss 3.044, Val took 14.053s
Iter 20: Train loss 2.690, It/sec 0.095, Tokens/sec 76.872
Iter 20: Val loss 2.181, Val took 14.856s
Iter 30: Train loss 1.690, It/sec 0.094, Tokens/sec 74.547
Iter 30: Val loss 1.609, Val took 15.858s
Iter 40: Train loss 1.352, It/sec 0.099, Tokens/sec 80.312
Iter 40: Val loss 1

In [2]:
# analyze_results.py
import pandas as pd
from utils import fmt_bytes

def _pretty_size(n_bytes):
    """Format a byte count via fmt_bytes; missing values render as an empty string."""
    return fmt_bytes(int(n_bytes)) if pd.notna(n_bytes) else ""


# Load the per-stage metrics log.
df = pd.read_csv("stage_log.csv")

# Coerce whichever metric columns are present to numeric dtypes;
# cells that cannot be parsed become NaN instead of raising.
numeric_cols = (
    "stage_idx", "iters", "runtime_sec", "cache_hit", "cache_miss",
    "size_full_bytes", "size_delta_bytes", "compression_ratio_full_over_delta", "lr",
    "fastcdc_chunks",
)
for col in numeric_cols:
    if col not in df.columns:
        continue
    df[col] = pd.to_numeric(df[col], errors="coerce")

# 1) One row per (trial, stage), with byte counts shown in human-readable form.
print("\n=== Per-trial stages ===")
stage_cols = [
    "trial", "stage_idx", "iters", "lr", "runtime_sec", "cache_hit",
    "size_full_bytes", "size_delta_bytes", "compression_ratio_full_over_delta",
]
show = (
    df[stage_cols]
    .assign(
        size_full=lambda t: t["size_full_bytes"].apply(_pretty_size),
        size_delta=lambda t: t["size_delta_bytes"].apply(_pretty_size),
    )
    .drop(columns=["size_full_bytes", "size_delta_bytes"])
)
print(show.to_string(index=False))

# 2) Compute reuse: iterations actually trained (cache misses) versus the
#    total that would have been trained had nothing been reused.
hit_flag = df["cache_hit"]
iters_built = df.loc[hit_flag.eq(0), "iters"].sum()
iters_reused = df.loc[hit_flag.eq(1), "iters"].sum()
iters_no_reuse = iters_built + iters_reused
iters_with_reuse = iters_built
saved_iters = iters_no_reuse - iters_with_reuse

# Guard both ratios against an empty log / zero denominators.
if iters_no_reuse > 0:
    pct_saved = saved_iters / iters_no_reuse * 100.0
else:
    pct_saved = 0.0
if iters_with_reuse > 0:
    speedup = iters_no_reuse / iters_with_reuse
else:
    speedup = 1.0

print("\n".join([
    "\n=== Compute reuse summary ===",
    f"Iterations (no reuse baseline): {iters_no_reuse}",
    f"Iterations (with reuse):        {iters_with_reuse}",
    f"Saved iterations:               {saved_iters}  ({pct_saved:.2f}%)",
    f"Speedup:                        {speedup:.2f}×",
]))

# 3) Storage: full adapter size versus delta size, restricted to stages
#    after the first (only rows with stage_idx > 1 are aggregated).
print("\n=== Storage savings (DeltaDNN-style) ===")
st2 = df.loc[df["stage_idx"].gt(1)].copy()
if len(st2) == 0:
    print("No stages >1 found; nothing to compress.")
else:
    total_full = st2["size_full_bytes"].sum()
    total_delta = st2["size_delta_bytes"].sum()
    if total_delta > 0:
        ratio = total_full / total_delta
    else:
        # A zero total delta makes the ratio unbounded.
        ratio = float("inf")
    print(f"Total full size (stages>1):  {fmt_bytes(int(total_full))}")
    print(f"Total delta size (stages>1): {fmt_bytes(int(total_delta))}")
    print(f"Aggregate compression ratio: {ratio:.2f}×")

# 4) Optional: FastCDC chunk counts, shown only when the column exists and
#    holds at least one non-missing value.
has_chunk_counts = "fastcdc_chunks" in df.columns and df["fastcdc_chunks"].notna().any()
if has_chunk_counts:
    print("\n=== FastCDC chunk counts (optional) ===")
    chunk_cols = ["trial", "stage_idx", "fastcdc_chunks", "adapter_path"]
    print(df[chunk_cols].to_string(index=False))



=== Per-trial stages ===
             trial  stage_idx  iters      lr  runtime_sec  cache_hit  compression_ratio_full_over_delta size_full size_delta
A_lr1e-5_then_5e-5          1     50 0.00001      650.219          0                           1.000000    3.3 MB      0.0 B
A_lr1e-5_then_5e-5          2     50 0.00005      687.578          0                           1.986844    3.3 MB     1.6 MB
B_lr1e-5_then_3e-5          1     50 0.00001        0.000          1                           1.000000    3.3 MB      0.0 B
B_lr1e-5_then_3e-5          2     50 0.00003      632.073          0                           1.986844    3.3 MB     1.6 MB

=== Compute reuse summary ===
Iterations (no reuse baseline): 200
Iterations (with reuse):        150
Saved iterations:               50  (25.00%)
Speedup:                        1.33×

=== Storage savings (DeltaDNN-style) ===
Total full size (stages>1):  6.5 MB
Total delta size (stages>1): 3.3 MB
Aggregate compression ratio: 1.99×


# Per-trial stage rows

Each row is one “stage” of a trial. It records the stage index, iterations, LR, how long that stage took, whether a cached prefix was *reused* (“cache_hit”), and two storage numbers inspired by DeltaDNN: the size of storing the entire adapter (“size_full”) vs. storing only the *delta* from the previous stage (“size_delta”). The derived metric is `compression_ratio_full_over_delta = size_full / size_delta`.

- **A, stage 1**
    `cache_hit=0` and `size_delta=0.0 B` → The first stage is the *base*, so there is nothing to compute a delta against. The full adapter must be stored once (≈3.3 MB). That’s why the row shows `compression_ratio = 1.0`: with `size_delta = 0` the ratio full/delta is undefined, so it is pinned to 1.0 for the base stage.

- **A, stage 2**
    `cache_hit=0` (built new) and `size_full≈3.3 MB`, `size_delta≈1.6 MB`.
    `compression_ratio ≈ 3.3 / 1.66 ≈ 1.99×`.
    Meaning: if you stored the raw adapter artifact for stage 2, it would be ~3.3 MB, but storing only the *difference* from stage 1 costs ~1.6 MB. That’s ~50% storage saved for this stage.

- **B, stage 1**
    `cache_hit=1` with `runtime_sec = 0.000` and `size_delta=0.0 B`.
    This stage *reused* the cached prefix from trial A (same LR@50 iterations), so no retraining time and no new storage. That’s your stage-tree/HiPPO-style **compute reuse** in action.

- **B, stage 2**
    Built new from the reused stage-1 checkpoint (`cache_hit=0` for this stage), with the same storage pattern as A’s stage 2: full ≈3.3 MB vs delta ≈1.6 MB → ~1.99× compression.
    Intuition: although LR differs (3e-5 vs 5e-5), the delta from the shared prefix is still about half the size of the full adapter, which is typical—later stages “nudge” weights rather than rewriting everything.