In [3]:
# ================================================================
# LEAN REBUILD (no giant CSVs)
# - Finds best α-lanes fairly, then writes only small, reviewer-ready CSVs
# - Files written:
#     alpha_best_lanes.csv
#     alpha_best_per_base.csv
#     alpha_lambda_robustness.csv
#     alpha_perturbation.csv
#     alpha_offset_invariance.csv
#     alpha_outlier_score.csv
#     alpha_lambda_identifiability.csv
#     alpha_summary.csv
# ================================================================
from decimal import Decimal as D, getcontext, ROUND_HALF_EVEN
import math, numpy as np, pandas as pd
import itertools, time, os

# ---------- Precision & target ----------
# Prefix lengths N at which every word is truncated and scored.
NS = [12, 27, 81, 243, 729, 2187]
# Working precision: the larger of 2600 digits and (1.2 * longest N + 200).
getcontext().prec = max(2600, int(max(NS)*1.2) + 200)
getcontext().rounding = ROUND_HALF_EVEN
# Reference value every lane is compared against.
ALPHA_INV = D("137.035999084")
def ppm(v: D, ref: D = ALPHA_INV) -> D:
    """Return the relative deviation |v - ref| / ref in parts per million."""
    relative_error = abs(v - ref) / ref
    return relative_error * D(1_000_000)

# ---------- Words ----------
def fib_word(n):
    """Return the first n symbols of the Fibonacci word "0100101001...".

    Blocks grow by the rule next = current + previous, starting from "0" and
    "01", until at least n symbols are available; the result is that prefix.
    """
    previous, current = "0", "01"
    while True:
        if len(current) >= n:
            return current[:n]
        previous, current = current, current + previous
def thue_morse(n):
    """Return the first n symbols of the Thue-Morse word.

    Symbol i is '1' when the binary expansion of i has an odd number of 1 bits.
    """
    symbols = []
    for i in range(n):
        parity = 0
        while i:
            parity ^= i & 1
            i >>= 1
        symbols.append('1' if parity else '0')
    return ''.join(symbols)
def period_doubling(n):
    """Return the first n symbols of the period-doubling word.

    Starting from "0", the substitution 0 -> 01, 1 -> 00 is applied to the
    whole string until it is at least n symbols long.
    """
    substitution = {ord("0"): "01", ord("1"): "00"}
    word = "0"
    while len(word) < n:
        # One simultaneous pass; no placeholder symbols are needed.
        word = word.translate(substitution)
    return word[:n]
def rudin_shapiro(n):
    """Return the first n symbols of the Rudin-Shapiro word as a 0/1 string.

    Symbol i is '1' when the binary expansion of i contains an odd number of
    (possibly overlapping) occurrences of "11".
    """
    symbols = []
    for i in range(n):
        # Bit j of i & (i >> 1) is set exactly when bits j and j+1 of i are both 1.
        adjacent_ones = bin(i & (i >> 1)).count('1')
        symbols.append('1' if adjacent_ones % 2 else '0')
    return ''.join(symbols)
def random_word(n, seed=777):
    """Return a reproducible pseudo-random 0/1 string of length n.

    The bits come from a fresh numpy RandomState seeded with `seed`, so the
    same (n, seed) pair always yields the same word.
    """
    bits = np.random.RandomState(seed).randint(0, 2, size=n)
    return ''.join(map(str, bits.tolist()))
def fib_shuffled(n, seed=888):
    """Return the length-n Fibonacci word with its symbols randomly permuted.

    The symbol counts of fib_word(n) are preserved; only their order changes.
    The shuffle uses a fresh numpy RandomState seeded with `seed`.
    """
    symbols = list(fib_word(n))
    # In-place shuffle of the Python list, driven by the fixed seed.
    np.random.RandomState(seed).shuffle(symbols)
    return ''.join(symbols)

# Registry of word builders, keyed by the short name used in lane keys and CSVs.
# Every builder takes the length n as its first argument and returns a 0/1 string.
# Insertion order is the order in which run_search walks the words.
WORDS = {
    "Fib": fib_word,
    "TM": thue_morse,
    "PD": period_doubling,
    "RS": rudin_shapiro,
    "Rand": random_word,     # deterministic seed (default 777)
    "FibShuf": fib_shuffled, # deterministic seed (default 888)
}

# ---------- Interpret 0/1 as base-B fraction ----------
def word_to_fraction_base(digits_str: str, B: int) -> D:
    """Read a 0/1 string as the base-B fraction 0.d1 d2 d3 ... and return it.

    Position i (1-based) carries weight B**-i; only '1' characters contribute.
    All arithmetic runs in the current decimal context.
    """
    radix = D(B)
    place_value = D(1) / radix
    total = D(0)
    for symbol in digits_str:
        if symbol == '1':
            total += place_value
        # Move to the next digit position (same division order as before,
        # so rounding behaviour is unchanged).
        place_value = place_value / radix
    return total
BASES = [10, 3, 5, 2]   # keep same set, in an intuitive order

# ---------- Ops (rails) ----------
def S1(x):
    """Reflection x -> 1 - x."""
    return D(1) - x
def S2(x):
    """Reciprocal x -> 1/x; returns None at x == 0."""
    if x == 0:
        return None
    return D(1) / x
def S3(x):
    """Map x -> x / (1 - x); returns None at x == 1."""
    if x == 1:
        return None
    return x / (D(1) - x)
def S4(x):
    """Negation x -> -x."""
    return -x

# Forward single-step ops by name.
OPS = {"S1": S1, "S2": S2, "S3": S3, "S4": S4}
# Inverse of each op: S1, S2 and S4 undo themselves; S3 is undone by
# y -> y / (1 + y), which returns None at y == -1.
INV = {
    "S1": S1,
    "S2": S2,
    "S3": (lambda y: None if y == -1 else y / (D(1) + y)),
    "S4": S4,
}

# Two-step rails: each T-code names an ordered pair of S-ops. apply_seq runs a
# pair left to right, so "T1": ("S2","S1") means S2 first, then S1.
# All 16 ordered pairs over {S1,S2,S3,S4} appear exactly once, including the
# repeated pairs ("S1","S1"), ("S2","S2"), ("S3","S3") and ("S4","S4").
T = {
    "T1":("S2","S1"), "T2":("S1","S2"), "T3":("S1","S3"), "T4":("S3","S2"),
    "T5":("S3","S1"), "T6":("S4","S1"), "T7":("S1","S4"), "T8":("S4","S3"),
    "T9":("S3","S4"), "T10":("S2","S4"), "T11":("S4","S2"), "T12":("S2","S3"),
    "T13":("S3","S3"), "T14":("S2","S2"), "T15":("S1","S1"), "T16":("S4","S4"),
}
def apply_seq(x: D, seq) -> D:
    """Apply the ops named in `seq` to x, left to right.

    Each name is looked up in OPS. If any step returns None (an op hit its
    undefined point) the remaining steps are skipped and None is returned.
    An empty `seq` returns x unchanged.
    """
    result = x
    for op_name in seq:
        result = OPS[op_name](result)
        if result is None:
            break
    return result
# Every rail as (kind, code) -> tuple of op names: the single-step "S" rails
# first, followed by the two-step "T" rails, each in its table's own order.
ALL_OPS = {
    **{("S", name): (name,) for name in OPS},
    **{("T", code): pair for code, pair in T.items()},
}

# ---------- Scale grid (fair, symmetric, modest size) ----------
def generate_scales():
    """Return the sorted, duplicate-free grid of scale factors λ as Decimals.

    λ = 2^u * 3^v * 5^w / 10^k with u, w in [-1, 1], v in [-6, 6], k in [-3, 3],
    kept when 0.03 <= λ <= 30; 3^6 / 10^3 = 0.729 is always included.

    The grid is enumerated with exact rational arithmetic. Many (u, w, k)
    triples describe the same rational number; when negative powers of 3 are
    involved, building them directly in rounded Decimal arithmetic can yield
    values that differ only in the last digit, which a set cannot merge.
    Those near-duplicates would be counted as separate lanes. Using Fraction
    makes equal values compare equal, and each survivor is converted to a
    Decimal with a single division in the current context.
    """
    from fractions import Fraction

    lower, upper = Fraction(3, 100), Fraction(30)
    exact = set()
    exponents = itertools.product(
        range(-1, 2),   # u: 2^[-1..1]
        range(-6, 7),   # v: 3^[-6..6]
        range(-1, 2),   # w: 5^[-1..1]
        range(-3, 4),   # k: 10^[-3..3]
    )
    for u, v, w, k in exponents:
        lam = Fraction(2)**u * Fraction(3)**v * Fraction(5)**w / Fraction(10)**k
        if lower <= lam <= upper:
            exact.add(lam)
    exact.add(Fraction(3**6, 10**3))  # 0.729
    return [D(q.numerator) / D(q.denominator) for q in sorted(exact)]
LAMBDAS = generate_scales()

# ---------- Complexity proxy ----------
def complexity_score(word_name, base, op_kind, op_code, lam: D):
    """Return an integer complexity proxy for a lane (smaller = simpler).

    The score is the sum of:
      * the length of λ written as text (fixed-point with trailing zeros and a
        trailing '.' stripped when it has a decimal point, else str(lam)),
      * the op depth (1 for an "S" rail, 2 otherwise),
      * a per-word penalty and a per-base penalty from fixed tables.

    `op_code` is accepted for signature symmetry but does not affect the score.
    Unknown word names or bases raise KeyError.
    """
    fixed_point = format(lam, 'f')
    if '.' in fixed_point:
        lam_text = fixed_point.rstrip('0').rstrip('.')
    else:
        lam_text = str(lam)
    op_depth = 1 if op_kind == "S" else 2
    word_penalty = {"Fib":0,"TM":1,"PD":1,"RS":2,"Rand":3,"FibShuf":2}[word_name]
    base_penalty = {10:0,3:1,5:1,2:2}[base]
    return len(lam_text) + op_depth + word_penalty + base_penalty

# ---------- Streaming search: keep only lane summaries (tiny) ----------
def run_search():
    """Score every (base, word, λ, op) lane and return them ranked.

    For each lane the word prefix at every N in NS is read as a base-`base`
    fraction, multiplied by λ, pushed through the op sequence, and scored in
    ppm against the reference (inf when the op sequence is undefined). Only
    three summary numbers per lane are kept:

      * median_ppm        - median of the per-N ppm values,
      * plateau_adjacent  - True if two consecutive N both score <= 50 ppm,
      * complexity        - complexity_score of the lane.

    Returns a DataFrame with columns base, word, lambda (as str), op_kind,
    op_code, median_ppm, plateau_adjacent, complexity, sorted by median_ppm
    then complexity, with a fresh 0..n-1 index. Progress is printed every
    1000 (base, word, λ) chunks.
    """
    # Rand and FibShuf are always built with these explicit seeds.
    fixed_seed = {"Rand": 777, "FibShuf": 888}
    lane_stats = {}
    started = time.time()
    chunks_done = 0

    for base in BASES:
        for word_name, builder in WORDS.items():
            build_kwargs = {"seed": fixed_seed[word_name]} if word_name in fixed_seed else {}
            # One fraction per N; reused for every λ and op below.
            fraction_by_n = {
                N: word_to_fraction_base(builder(N, **build_kwargs), base)
                for N in NS
            }

            for lam in LAMBDAS:
                lam_key = str(lam)
                for (kind, code), seq in ALL_OPS.items():
                    errors = []
                    for N in NS:
                        value = apply_seq(fraction_by_n[N]*lam, seq)
                        errors.append(float('inf') if value is None else float(ppm(value)))
                    lane_stats[(base, word_name, lam_key, kind, code)] = {
                        "median_ppm": float(np.median(errors)),
                        "plateau_adjacent": any(
                            left <= 50.0 and right <= 50.0
                            for left, right in zip(errors, errors[1:])
                        ),
                        "complexity": complexity_score(word_name, base, kind, code, lam),
                    }
                chunks_done += 1
                if chunks_done % 1000 == 0:
                    elapsed = time.time() - started
                    print(f"…progress lanes: {chunks_done:,} chunks, elapsed {elapsed:.1f}s")

    records = [
        {"base": base, "word": word, "lambda": lam, "op_kind": kind, "op_code": code, **stats}
        for (base, word, lam, kind, code), stats in lane_stats.items()
    ]
    ranked = pd.DataFrame(records).sort_values(["median_ppm","complexity"])
    return ranked.reset_index(drop=True)

# Run the full grid search once. run_search returns the lanes sorted by
# median_ppm, then complexity, so row 0 is the overall best lane.
lanes = run_search()
# NOTE(review): one row per lane is written and the "lambda" column holds the
# full str() of each high-precision Decimal. The run log recorded with this
# cell reports roughly 120 MB for this file, which is at odds with the
# "small files only" goal in the header — consider writing only the top-K
# rows or a rounded λ here.
lanes.to_csv("alpha_best_lanes.csv", index=False)
print("\nSaved: alpha_best_lanes.csv (rows:", len(lanes), ")")

# ---------- Best per base ----------
# `lanes` is already ranked, so the first row of each base group is that
# base's best lane. first() takes the first non-null entry per column;
# presumably no column contains nulls here — verify if that ever changes.
best_per_base = lanes.groupby("base", as_index=False).first()
best_per_base.to_csv("alpha_best_per_base.csv", index=False)
print("Saved: alpha_best_per_base.csv")

# ---------- Extract overall best (for follow-up tests) ----------
# Row 0 of the ranked table is the overall best lane; unpack it into plain
# Python values that the follow-up tests below reuse.
best = lanes.iloc[0]
best_base = int(best["base"])
best_word = best["word"]
best_lambda = D(str(best["lambda"]))
best_okind, best_oc = best["op_kind"], best["op_code"]
# One-row summary of the winning lane.
summary = dict(
    best_base=best_base,
    best_word=best_word,
    best_lambda=str(best_lambda),
    best_op=f"{best_okind}/{best_oc}",
    best_median_ppm=float(best["median_ppm"]),
    best_plateau_adjacent=bool(best["plateau_adjacent"]),
)
pd.DataFrame([summary]).to_csv("alpha_summary.csv", index=False)
print("Saved: alpha_summary.csv")
print("Best lane:", summary)

# ---------- λ-robustness (±20 ppm around best λ) ----------
def robustness_sweep(best_lambda, seq, base, word_name, ns=NS, ppm_window=20):
    """Re-score one lane while nudging λ by a few ppm around `best_lambda`.

    Parameters:
        best_lambda: centre value of λ (Decimal).
        seq: tuple of op names applied via apply_seq. This argument is now
            actually used; previously the function ignored it and read the
            module-level best lane instead.
        base: base in which the word is read as a fraction.
        word_name: key into WORDS.
        ns: prefix lengths to evaluate.
        ppm_window: half-width of the sweep; 41 evenly spaced offsets in
            [-ppm_window, +ppm_window] ppm are applied to λ.

    Returns a DataFrame with one row per (N, offset): columns N,
    d_ppm_on_lambda and ppm (inf where the op sequence is undefined).
    """
    builder = WORDS[word_name]
    # Rand and FibShuf are always built with their fixed seeds.
    seed_kwargs = {"Rand": {"seed": 777}, "FibShuf": {"seed": 888}}.get(word_name, {})
    deltas_ppm = np.linspace(-ppm_window, ppm_window, 41)
    rows = []
    for N in ns:
        x_raw = word_to_fraction_base(builder(N, **seed_kwargs), base)
        for d in deltas_ppm:
            lam = best_lambda * (D(1) + D(float(d))/D(1_000_000))
            y = apply_seq(x_raw * lam, seq)
            rows.append({
                "N": N,
                "d_ppm_on_lambda": float(d),
                "ppm": float(ppm(y)) if y is not None else float('inf'),
            })
    return pd.DataFrame(rows)

rob = robustness_sweep(best_lambda, ALL_OPS[(best_okind,best_oc)], best_base, best_word)
rob.to_csv("alpha_lambda_robustness.csv", index=False)
print("Saved: alpha_lambda_robustness.csv")

# ---------- Perturbation sensitivity (mean±sd) ----------
def flip_bits(s, rate, seed):
    """Return `s` with a fraction `rate` of its symbols flipped.

    round(rate * len(s)) distinct positions are drawn from a numpy
    RandomState seeded with `seed`; at each one '0' becomes '1' and anything
    else becomes '0'. A rate <= 0 returns `s` unchanged without touching
    the RNG.
    """
    if rate <= 0:
        return s
    rng = np.random.RandomState(seed)
    length = len(s)
    flip_count = int(round(rate * length))
    positions = rng.choice(length, size=flip_count, replace=False)
    symbols = list(s)
    # Positions are distinct (replace=False), so each is flipped exactly once.
    for pos in positions:
        symbols[pos] = '1' if symbols[pos] == '0' else '0'
    return ''.join(symbols)
def perturbation_curve(seq, base, word_name, lam, rates=(0,0.001,0.01,0.05,0.1), seeds=(101,202,303), ns=(81,243,729)):
    """Measure how the lane's ppm error reacts to random bit flips in the word.

    For each N in `ns` and each flip rate in `rates`, the word prefix is
    corrupted once per seed in `seeds` (via flip_bits), scored through
    `seq` at scale `lam` in base `base`, and the mean and population standard
    deviation of the ppm values are recorded (inf where the op sequence is
    undefined).

    Returns a DataFrame with columns N, flip_rate, ppm_mean, ppm_std.
    """
    builder = WORDS[word_name]
    # Rand and FibShuf are always built with their fixed seeds.
    seed_kwargs = {"Rand": {"seed": 777}, "FibShuf": {"seed": 888}}.get(word_name, {})

    def lane_ppm(digits):
        value = apply_seq(word_to_fraction_base(digits, base) * lam, seq)
        return float('inf') if value is None else float(ppm(value))

    records = []
    for N in ns:
        clean_digits = builder(N, **seed_kwargs)
        for r in rates:
            scores = [lane_ppm(flip_bits(clean_digits, r, sd)) for sd in seeds]
            records.append({"N":N, "flip_rate":r, "ppm_mean":float(np.mean(scores)), "ppm_std":float(np.std(scores))})
    return pd.DataFrame(records)
pert = perturbation_curve(ALL_OPS[(best_okind,best_oc)], best_base, best_word, best_lambda)
pert.to_csv("alpha_perturbation.csv", index=False)
print("Saved: alpha_perturbation.csv")

# ---------- Offset/window invariance (random start offsets) ----------
def offset_invariance(word_name, lam, base, Ns=(81,243,729,2187), samples=None, seq=("S2",)):
    """Score the lane on length-N windows taken at random start offsets.

    For each N a word of length N + 5000 is built, `samples[N]` distinct start
    offsets are drawn (RandomState seeded with 2025, offsets sorted), and each
    window is read as a base-`base` fraction, scaled by `lam`, pushed through
    `seq` and scored in ppm.

    Parameters:
        word_name: key into WORDS.
        lam: scale factor λ (Decimal).
        base: base in which each window is read as a fraction.
        Ns: window lengths to test.
        samples: mapping N -> number of offsets; defaults to
            {81:50, 243:40, 729:30, 2187:15}. A missing N raises KeyError.
        seq: op names applied via apply_seq. Defaults to ("S2",), which is
            what this function previously hard-coded; pass the lane's own
            sequence to test a lane that uses a different op.

    Returns a DataFrame with columns N, offset, ppm. An undefined op result
    is recorded as inf, matching the other sweeps, instead of raising.
    """
    if samples is None:
        # Built per call so the default is never a shared mutable object.
        samples = {81:50, 243:40, 729:30, 2187:15}
    builder = WORDS[word_name]
    # Rand and FibShuf are always built with their fixed seeds.
    seed_kwargs = {"Rand": {"seed": 777}, "FibShuf": {"seed": 888}}.get(word_name, {})
    rng = np.random.RandomState(2025)
    rows = []
    for N in Ns:
        L = N + 5000
        digits_big = builder(L, **seed_kwargs)
        starts = sorted(rng.choice(L - N, size=samples[N], replace=False))
        for o in starts:
            x0 = word_to_fraction_base(digits_big[o:o+N], base) * lam
            y = apply_seq(x0, seq)
            rows.append({
                "N": N,
                "offset": int(o),
                "ppm": float(ppm(y)) if y is not None else float('inf'),
            })
    return pd.DataFrame(rows)
# Test the same op sequence as the best lane rather than assuming it is S2.
off = offset_invariance(best_word, best_lambda, best_base, seq=ALL_OPS[(best_okind,best_oc)])
off.to_csv("alpha_offset_invariance.csv", index=False)
print("Saved: alpha_offset_invariance.csv")

# ---------- Outlier score (empirical p vs all lanes) ----------
# Where does the best lane sit among all lanes? `rank` counts lanes whose
# median ppm is <= the best one (ties included), and the empirical p-value
# is that count divided by the number of lanes.
all_meds = lanes["median_ppm"].values
best_med = float(best["median_ppm"])
rank = int(np.count_nonzero(all_meds <= best_med))
p_emp = rank / len(all_meds)
# Distance to the runner-up (row 1 of the ranked table); NaN if there is none.
if len(lanes) > 1:
    gap = float(lanes.iloc[1]["median_ppm"] - best_med)
else:
    gap = float("nan")
outlier_row = {
    "best_median_ppm": best_med,
    "rank_among_lanes": rank,
    "num_lanes": int(len(all_meds)),
    "empirical_p_value": float(p_emp),
    "gap_to_2nd_best_ppm": float(gap)
}
pd.DataFrame([outlier_row]).to_csv("alpha_outlier_score.csv", index=False)
print("Saved: alpha_outlier_score.csv")

# ---------- λ identifiability (local quadratic around Δλ=0) ----------
# Least-squares fit ppm ≈ a + b·Δ + c·Δ² over the sweep points with |Δ| <= 4 ppm
# (all N pooled), where Δ is the ppm offset applied to λ.
local = rob[rob["d_ppm_on_lambda"].abs() <= 4].copy()
delta = local["d_ppm_on_lambda"].values
# Design matrix columns: constant, linear and quadratic terms.
X = np.column_stack([np.ones(len(local)), delta, delta**2])
y = local["ppm"].values
beta, *_ = np.linalg.lstsq(X, y, rcond=None)
a,b,c = beta
fit_row = {"ppm_at_min_approx":a, "slope_dppm_dppmLam":b, "curvature":c}
pd.DataFrame([fit_row]).to_csv("alpha_lambda_identifiability.csv", index=False)
print("Saved: alpha_lambda_identifiability.csv")

# Final report: list every CSV written above with its on-disk size in KB.
print("\nDone. Small files only:")
written_files = (
    "alpha_best_lanes.csv",
    "alpha_best_per_base.csv",
    "alpha_lambda_robustness.csv",
    "alpha_perturbation.csv",
    "alpha_offset_invariance.csv",
    "alpha_outlier_score.csv",
    "alpha_lambda_identifiability.csv",
    "alpha_summary.csv",
)
for f in written_files:
    size_kb = os.path.getsize(f)/1024
    print(" -", f, size_kb, "KB")


…progress lanes: 1,000 chunks, elapsed 35.2s
…progress lanes: 2,000 chunks, elapsed 99.4s
…progress lanes: 3,000 chunks, elapsed 144.8s
…progress lanes: 4,000 chunks, elapsed 180.0s

Saved: alpha_best_lanes.csv (rows: 92640 )
Saved: alpha_best_per_base.csv
Saved: alpha_summary.csv
Best lane: {'best_base': 10, 'best_word': 'Fib', 'best_lambda': '0.729', 'best_op': 'S/S2', 'best_median_ppm': 1.4256779972806497, 'best_plateau_adjacent': True}
Saved: alpha_lambda_robustness.csv
Saved: alpha_perturbation.csv
Saved: alpha_offset_invariance.csv
Saved: alpha_outlier_score.csv
Saved: alpha_lambda_identifiability.csv

Done. Small files only:
 - alpha_best_lanes.csv 123098.41796875 KB
 - alpha_best_per_base.csv 5.75390625 KB
 - alpha_lambda_robustness.csv 6.5703125 KB
 - alpha_perturbation.csv 0.6240234375 KB
 - alpha_offset_invariance.csv 3.5146484375 KB
 - alpha_outlier_score.csv 0.14453125 KB
 - alpha_lambda_identifiability.csv 0.103515625 KB
 - alpha_summary.csv 0.1171875 KB
