<a href="https://colab.research.google.com/github/jamessutton600613-png/GC/blob/main/Untitled240.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%writefile gqr14_make_veracity_zip_v4.3_cpu_dualhash.py
#!/usr/bin/env python3
"""
GQR–XIV — Single-Run Veracity ZIP (atomic provenance)
v4.3-CPU_ONLY_DUAL_HASH:
- This version FORCES a CPU-only (numpy) run by disabling the cupy import.
- It also calculates BOTH SHA-256 and SHA-512 for the manifest.
"""
import sys, os, json, time, math, hashlib, argparse, subprocess, io, zipfile, warnings
import numpy as np
import matplotlib.pyplot as plt

# --- Array backend: CPU only ---
# This build deliberately runs on NumPy alone so results can be verified
# without a GPU. A CuPy module that is already loaded is dropped from the
# module cache (as before), and the backend alias `xp` is bound to NumPy
# directly instead of raising and catching an ImportError to get there.
sys.modules.pop("cupy", None)
xp = np  # array namespace used by the simulation code
GPU_ON = False  # recorded in the run parameters; always False in this build

# --- Ensure gemmi available (for CIF) ---
# The CIF helpers below call gemmi directly. If importing it fails for any
# reason, it is installed with pip into the interpreter that is running this
# script (sys.executable) and the import is retried. check=True makes a
# failed install raise CalledProcessError, so the script stops here.
try:
    import gemmi
except Exception:
    # The progress note is written to stderr.
    print("[Info] gemmi not found, attempting to install...", file=sys.stderr)
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "gemmi"], check=True)
    import gemmi

# This is the "truth" of the code. If this changes, the RUN_ID changes.
CODE_VERSION = "GQR14-TDSE-VERACITY-v4.3-CPU_ONLY_DUAL_HASH"

# ===================================================================
# --- GEOMETRY (CIF) HELPERS ---
# (This section is identical to the v4.3 script)
# ===================================================================

def _parse_cif_number(raw: str) -> float:
    """Convert one raw CIF value to ``float``.

    Plain numbers go through ``float()``. Values that ``float()`` rejects are
    retried with ``gemmi.cif.as_number`` so that a trailing standard
    uncertainty such as ``0.1234(5)`` is accepted.

    Raises:
        ValueError: if the value is not numeric (for example ``?`` or ``.``).
    """
    try:
        return float(raw)
    except ValueError:
        value = gemmi.cif.as_number(raw)
        if math.isnan(value):
            # Still not a number: re-raise the original ValueError.
            raise
        return value


def get_first_two_atom_coords(path: str) -> np.ndarray:
    """Return Cartesian coordinates of the first two atom-site rows of a CIF.

    Coordinate columns are looked up in this order: ``_atom_site.Cartn_*``,
    ``_atom_site_fract_*``, ``_atom_site.fract_*``. Fractional coordinates are
    orthogonalized with the unit cell obtained from
    ``gemmi.read_small_structure`` or, failing that, ``gemmi.read_structure``.

    Args:
        path: CIF file to read (``sole_block()`` is used, so a single data
            block is expected).

    Returns:
        A ``(2, 3)`` float64 array, one row per atom.

    Raises:
        RuntimeError: if no coordinate columns are found, the columns differ
            in length, fewer than two rows exist, the unit cell is unusable,
            or either of the first two rows is not numeric.
        Exception: anything raised by gemmi while reading the file. Every
            failure is reported on stderr and then re-raised.
    """
    try:
        block = gemmi.cif.read_file(path).sole_block()

        # (tag template, coordinates are Cartesian) in lookup order.
        tag_families = (
            ("_atom_site.Cartn_{}", True),
            ("_atom_site_fract_{}", False),
            ("_atom_site.fract_{}", False),
        )
        for template, is_cartesian in tag_families:
            xs, ys, zs = (block.find_values(template.format(axis)) for axis in "xyz")
            if xs and ys and zs:
                break
        else:
            raise RuntimeError(f"Could not find matching x, y, z coordinate tags in {path}")

        if not (len(xs) == len(ys) == len(zs)):
            raise RuntimeError(f"Coordinate column mismatch in {path}: len(x)={len(xs)}, len(y)={len(ys)}, len(z)={len(zs)}")
        if len(xs) < 2:
            raise RuntimeError(f"Found coordinate tags, but < 2 atoms in {path}")

        cell = None
        if not is_cartesian:
            # Fractional coordinates need a usable unit cell; try the
            # small-molecule reader first, then the macromolecular one.
            try:
                cell = gemmi.read_small_structure(path).cell
                if not (cell.a and cell.b and cell.c):
                    raise RuntimeError("Cell parameters are incomplete or zero.")
            except Exception as e:
                try:
                    cell = gemmi.read_structure(path).cell
                    if not (cell.a and cell.b and cell.c):
                        raise RuntimeError("Cell parameters are incomplete or zero (from read_structure).")
                except Exception as e2:
                    raise RuntimeError(
                        f"CIF has fractional coords but cell is invalid. small_structure err: {e}; read_structure err: {e2}"
                    ) from e2

        coords = []
        for i in range(2):
            try:
                x = _parse_cif_number(xs[i])
                y = _parse_cif_number(ys[i])
                z = _parse_cif_number(zs[i])
            except ValueError:
                # Only rows 0 and 1 are considered, so a skipped row always
                # ends in the "at least 2 valid numeric atoms" error below.
                print(f"[Warning] Skipping non-numeric coordinate value in {path} at row {i}", file=sys.stderr)
                continue
            if is_cartesian:
                coords.append([x, y, z])
            else:
                pos = cell.orthogonalize(gemmi.Fractional(x, y, z))
                coords.append([pos.x, pos.y, pos.z])

        if len(coords) < 2:
            raise RuntimeError(f"Could not parse at least 2 valid numeric atoms in {path}")
        return np.array(coords, dtype=np.float64)
    except Exception as e:
        print(f"[FATAL] Error parsing CIF {path}: {e}", file=sys.stderr)
        raise

def get_oo_pair(coords: np.ndarray):
    """Center a coordinate set on its centroid and measure the first pair.

    Args:
        coords: array whose rows are atom positions; rows 0 and 1 form the pair.

    Returns:
        ``(centered, distance)`` where ``centered`` is ``coords`` minus the
        mean of its rows and ``distance`` is the Euclidean distance between
        rows 0 and 1 as a Python float.
    """
    first, second = coords[0], coords[1]
    separation = float(np.linalg.norm(first - second))
    centroid = np.mean(coords, axis=0, keepdims=True)
    return coords - centroid, separation

def load_and_align_geometries(paths: list):
    """Load the H, I and J atom pairs and center each on its own centroid.

    Args:
        paths: exactly three CIF paths, in the order H, I, J.

    Returns:
        ``(coords_H, coords_I, coords_J, 2)``: the three centered coordinate
        arrays followed by the number of atoms per geometry.

    Raises:
        ValueError: if ``paths`` does not hold exactly three entries.
    """
    if len(paths) != 3:
        raise ValueError("Must provide exactly three CIF paths for H, I, and J.")

    # Files are read strictly in the given order, one at a time.
    loaded = [get_oo_pair(get_first_two_atom_coords(cif)) for cif in paths]
    (coords_H, dH), (coords_I, dI), (coords_J, dJ) = loaded

    print(f"[Geo] O–O Pair Distances: H={dH:.3f} Å, I={dI:.3f} Å, J={dJ:.3f} Å")

    # A near-zero separation means the first two atoms are not a real pair.
    if any(distance < 0.1 for distance in (dH, dI, dJ)):
        warnings.warn(
            f"O-O distances ({dH:.3f}, {dI:.3f}, {dJ:.3f}) are physically unrealistic. "
            "Ensure your CIF files (8F4H, 8F4I, 8F4J) have the correct O-O pair as the first two atoms.",
            UserWarning
        )
    return coords_H, coords_I, coords_J, 2

# ===================================================================
# --- SIMULATION & PLOTTING ---
# (This section is identical to the v4.3 script)
# ===================================================================

def sigmoid(t, tau_fs, center_fs):
    """Logistic function of ``(t - center_fs) / tau_fs``.

    The scaled argument is clamped to [-100, 100] before exponentiation.
    """
    scaled = np.clip((t - center_fs) / tau_fs, -100, 100)
    denominator = 1.0 + np.exp(-scaled)
    return 1.0 / denominator

def mix_coords(X_A, X_B, s):
    """Linearly blend ``X_A`` and ``X_B``: ``s = 0`` gives ``X_A``, ``s = 1`` gives ``X_B``."""
    weight_b = float(s)
    weight_a = 1.0 - weight_b
    return weight_a * X_A + weight_b * X_B

def calculate_J_metric(s_total, dOO, temp_noise, is_ablation=False):
    """Evaluate the J metric for one simulation step.

    The signal is a Gaussian in ``s_total`` centred at 1.0 (width 0.5),
    scaled by a distance factor that falls off linearly with
    ``|dOO - 1.46|`` and is clamped to [0.1, 1.0]. A noise floor of
    ``temp_noise * 1e-2 + 1e-3`` is always added.

    Args:
        s_total: morph coordinate.
        dOO: pair distance.
        temp_noise: noise sample for this step.
        is_ablation: when true, only the noise floor is returned.

    Returns:
        The metric as a Python float.
    """
    morph = float(s_total)
    distance = float(dOO)
    width = 0.5

    noise_floor = temp_noise * 1e-2 + 1e-3
    gaussian = np.exp(-0.5 * ((morph - 1.0) / width) ** 2)
    distance_factor = np.clip(1.0 - 2.0 * np.abs(distance - 1.46), 0.1, 1.0)

    # Both terms are evaluated in either mode; ablation only changes the sum.
    value = noise_floor if is_ablation else (gaussian * distance_factor) + noise_floor
    return float(value)

def run_tdse_like(geoms, T_K: float, dt_fs=0.5, steps=40000,
                  tau_fs=3500.0, center_fs=7000.0,
                  temp_noise_scale=0.1, rng_seed=42,
                  is_ablation=False):
    """Morph H -> I -> J along a sigmoid schedule and record the J metric.

    For each of ``steps + 1`` time points the morph coordinate
    ``s_total = 2 * sigmoid(t)`` picks a blend of two geometries (H/I while
    ``s_total <= 1``, I/J afterwards), the distance between rows 0 and 1 of
    the blended geometry is measured, and the metric is evaluated with one
    uniform noise draw scaled by ``temp_noise_scale * T_K / 300``.

    Args:
        geoms: three coordinate arrays (H, I, J).
        T_K: temperature; also added to ``rng_seed`` to seed the generator.
        dt_fs: time step.
        steps: number of steps (``steps + 1`` samples are produced).
        tau_fs, center_fs: width and centre of the sigmoid schedule.
        temp_noise_scale: multiplier on the noise term.
        rng_seed: base seed.
        is_ablation: forwarded to ``calculate_J_metric``.

    Returns:
        Four NumPy arrays ``(t, s_total, J, d)``.
    """
    _h, _i, _j = geoms  # fails fast unless exactly three geometries are given
    noise_rng = np.random.default_rng(int(rng_seed + T_K))
    thermal_ratio = T_K / 300.0

    times, morphs, metrics, distances = [], [], [], []
    for index in range(steps + 1):
        now_fs = index * dt_fs
        morph = 2.0 * sigmoid(now_fs, tau_fs, center_fs)

        # First half of the schedule blends H->I, second half I->J.
        if morph <= 1.0:
            frame = mix_coords(geoms[0], geoms[1], morph)
        else:
            frame = mix_coords(geoms[1], geoms[2], morph - 1.0)

        separation = xp.linalg.norm(frame[0] - frame[1])
        if GPU_ON:
            separation = separation.get()
        separation = float(separation)

        draw = noise_rng.random(1, dtype=np.float64)[0]
        jitter = temp_noise_scale * thermal_ratio * (draw - 0.5)
        metric = calculate_J_metric(morph, separation, jitter, is_ablation=is_ablation)

        times.append(now_fs)
        morphs.append(morph)
        metrics.append(metric)
        distances.append(separation)

    return (np.array(times), np.array(morphs),
            np.array(metrics), np.array(distances))

def cdf_series(x: np.ndarray):
    """Empirical CDF of the non-NaN entries of ``x``.

    Returns:
        ``(sorted_values, cdf)`` where ``cdf[i] = (i + 1) / n``; two empty
        arrays when ``x`` has no non-NaN entries.
    """
    valid = x[np.logical_not(np.isnan(x))]
    count = valid.size
    if count == 0:
        return np.array([]), np.array([])
    ordered = np.sort(valid)
    ranks = np.arange(1, count + 1)
    return ordered, ranks / count

def plot_timeseries(t, J, s, d, title, path):
    """Save a dual-axis time-series figure of J, s_total and d(O-O).

    Args:
        t: x-values (axis labelled "t (fs)").
        J: series drawn on the left axis as "J (metric)".
        s: series drawn dashed on the left axis as "s_total (morph)".
        d: series drawn on the right (twin) axis as "d(O-O) [Å]".
        title: figure title.
        path: output file passed to ``plt.savefig`` (200 dpi).
    """
    fig, ax1 = plt.subplots(figsize=(10, 5))
    ax1.set_xlabel("t (fs)")
    ax1.set_ylabel("J / s")
    ax1.plot(t, J, label="J (metric)", lw=2)
    ax1.plot(t, s, label="s_total (morph)", ls="--", color="tab:orange")
    ax1.legend(loc="upper left")
    # Second y-axis sharing the same x-axis, used for the distance trace.
    ax2 = ax1.twinx()
    ax2.set_ylabel("d(O-O) [Å]")
    ax2.plot(t, d, label="d(O-O) [Å]", color="tab:green", lw=2)
    # Each axis gets its own legend, placed at different positions.
    ax2.legend(loc="upper center")
    # pyplot-level calls: they act on pyplot's current axes/figure.
    plt.title(title)
    fig.tight_layout()
    plt.savefig(path, dpi=200)
    # Close the figure explicitly; this function is called once per temperature.
    plt.close(fig)

def plot_arrhenius(df, run_id, path):
    """Plot ln(J) against 1/T with a least-squares line and save the figure.

    Args:
        df: DataFrame with ``invT`` and ``ln_J`` columns; rows with NaN in
            either column are ignored.
        run_id: identifier shown in the plot title.
        path: output image path (written at 200 dpi).

    The straight-line fit is drawn only when at least two usable rows
    exist. For ``ln J = ln A - Ea / (k T)`` the fitted slope equals
    ``-Ea/k``, so the legend reports ``Ea/k = -slope``.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        finite = df.dropna(subset=["invT", "ln_J"])
        if len(finite) >= 2:
            x = finite["invT"].values
            y = finite["ln_J"].values
            # Solve y = slope * x + intercept in the least-squares sense.
            A = np.vstack([x, np.ones_like(x)]).T
            slope, intercept = np.linalg.lstsq(A, y, rcond=None)[0]
            yfit = slope * x + intercept
            Ea_over_k = -slope
            ax.plot(x, y, "o", ms=8, label="Data (mean of tail)")
            ax.plot(x, yfit, "-", lw=2, label=f"Fit: Ea/k = {Ea_over_k:.2f} K")
            # Only build a legend when there is something to describe.
            ax.legend()
        ax.set_xlabel("1/T (K⁻¹)")
        ax.set_ylabel("ln (J_metric) (a.u.)")
        ax.set_title(f"Arrhenius Plot — RUN_ID={run_id}")
        ax.grid(True, alpha=0.3)
        fig.tight_layout()
        fig.savefig(path, dpi=200)
    finally:
        # Release the figure even if fitting or saving fails.
        plt.close(fig)

# ===================================================================
# --- FILE MANAGEMENT & HASHING ---
# (This section is identical to the v4.3 script)
# ===================================================================

def get_hashes_of(path: str) -> dict:
    """Hash a file with SHA-256 and SHA-512 in a single pass.

    The file is read in 1 MiB chunks and every chunk is fed to both digests.

    Returns:
        ``{"sha256": <hex>, "sha512": <hex>}``.
    """
    digests = {"sha256": hashlib.sha256(), "sha512": hashlib.sha512()}
    chunk_size = 1 << 20
    with open(path, "rb") as stream:
        while True:
            block = stream.read(chunk_size)
            if not block:
                break
            for digest in digests.values():
                digest.update(block)
    return {name: digest.hexdigest() for name, digest in digests.items()}

def get_hashes_of_bytes(data: bytes) -> dict:
    """Return the SHA-256 and SHA-512 hex digests of an in-memory byte string."""
    return {
        algorithm: hashlib.new(algorithm, data).hexdigest()
        for algorithm in ("sha256", "sha512")
    }

def compute_run_id(cif_paths, params_dict) -> str:
    """Derive the 16-hex-character run identifier.

    The identifier is the first 16 hex digits of the SHA-256 over, in order:
    ``CODE_VERSION``, then for each CIF its base name and the hex SHA-256 of
    its contents, then ``params_dict`` serialized as JSON with sorted keys.
    """
    pieces = [CODE_VERSION.encode()]
    for cif in cif_paths:
        pieces.append(os.path.basename(cif).encode())
        pieces.append(get_hashes_of(cif)["sha256"].encode())
    pieces.append(json.dumps(params_dict, sort_keys=True).encode())
    # Hashing the concatenation equals feeding the pieces one by one.
    return hashlib.sha256(b"".join(pieces)).hexdigest()[:16]

def write_csv(path, t, s, J, d, header_comment):
    """Write the per-step time series as CSV.

    The file starts with ``header_comment`` on its own line, then the column
    header ``t_fs,s_total,J_metric,d_OO``, then one row per sample (``t``
    with one decimal, the other columns with six).

    The file is always written as UTF-8 with ``\\n`` line endings so that its
    bytes, and therefore its manifest hashes, do not depend on the platform.

    Raises:
        ValueError: if the four series differ in length.
    """
    if not (len(t) == len(s) == len(J) == len(d)):
        raise ValueError("t, s, J and d must all have the same length")
    with open(path, "w", encoding="utf-8", newline="\n") as f:
        f.write(header_comment + "\n")
        f.write("t_fs,s_total,J_metric,d_OO\n")
        f.writelines(
            f"{t_i:.1f},{s_i:.6f},{J_i:.6f},{d_i:.6f}\n"
            for t_i, s_i, J_i, d_i in zip(t, s, J, d)
        )

def write_cdf_csv(path, x, cdf):
    """Write an empirical CDF as a two-column CSV (``value,cdf``).

    Both columns are written with six decimals. The file is always UTF-8
    with ``\\n`` line endings so its bytes, and therefore its manifest
    hashes, do not depend on the platform.

    Raises:
        ValueError: if ``x`` and ``cdf`` differ in length.
    """
    if len(x) != len(cdf):
        raise ValueError("x and cdf must have the same length")
    with open(path, "w", encoding="utf-8", newline="\n") as f:
        f.write("value,cdf\n")
        f.writelines(f"{value:.6f},{fraction:.6f}\n" for value, fraction in zip(x, cdf))

# ===================================================================
# --- MAIN EXECUTION ---
# ===================================================================

def main():
    """Run the simulation for every requested temperature and bundle the outputs.

    Steps, in order:
      1. Parse the command line (unknown arguments are ignored).
      2. Resolve the three CIF paths (``--cifs`` or the local defaults) and
         load their geometries.
      3. Derive RUN_ID from the code version, CIF contents and parameters.
      4. For each temperature write a time-series CSV, a time-series plot
         and two CDF CSVs, and collect the mean of J over the last 20% of
         steps for the Arrhenius table.
      5. If pandas is importable, write the Arrhenius table and plot.
      6. Write a manifest with SHA-256/SHA-512 hashes and zip everything as
         ``<prefix>_<RUN_ID>.zip``.

    Exits with status 1 when no CIFs are available or geometry loading fails.
    """
    default_cifs = ["8F4H.cif", "8F4I.cif", "8F4J.cif"]

    ap = argparse.ArgumentParser(description=f"GQR-XIV Veracity Run ({CODE_VERSION})")
    ap.add_argument("--cifs", nargs=3, default=None, help="Three CIFs for H, I, J states")
    ap.add_argument("--temps", nargs="+", type=int, default=[285, 295, 305, 315, 325], help="List of temperatures (K) to run")
    ap.add_argument("--steps", type=int, default=40000, help="Number of simulation steps")
    ap.add_argument("--dtfs", type=float, default=0.5, help="Timestep (fs)")
    ap.add_argument("--seed", type=int, default=42, help="Base RNG seed")
    ap.add_argument("--tau_fs", type=float, default=3500.0, help="Sigmoid tau (width) in fs")
    ap.add_argument("--center_fs", type=float, default=7000.0, help="Sigmoid center in fs")
    ap.add_argument("--temp_noise_scale", type=float, default=0.1, help="Scalar for temperature noise effect")

    # --- New Ablation Flag ---
    ap.add_argument("--run-ablation", action="store_true", help="Run ablation (null hypothesis) test")

    # Use parse_known_args to ignore Colab/Jupyter-specific args
    args, _unknown = ap.parse_known_args()

    # --- Handle CIFs ---
    if args.cifs is None:
        print("[Info] No CIFs provided. Checking for 8F4H, 8F4I, 8F4J.cif locally...", file=sys.stderr)
        if all(os.path.exists(p) for p in default_cifs):
            args.cifs = default_cifs
            print("[Info] Found local CIFs. Running with defaults.", file=sys.stderr)
        else:
            print("[FATAL] Default CIFs not found. Please provide paths using --cifs.", file=sys.stderr)
            ap.print_help()
            sys.exit(1)

    # --- Load Geometries ---
    try:
        XH, XI, XJ, atoms_unified = load_and_align_geometries(args.cifs)
        # Geoms will be numpy arrays, xp is numpy
        geoms = (xp.asarray(XH, dtype=xp.float64),
                 xp.asarray(XI, dtype=xp.float64),
                 xp.asarray(XJ, dtype=xp.float64))
    except Exception as e:
        print(f"[FATAL] Error loading geometries: {e}", file=sys.stderr)
        sys.exit(1)

    # --- Define Run ID and Parameters ---
    # Everything in this dict feeds the RUN_ID hash and is stored in the manifest.
    params = dict(
        cifs=[os.path.basename(p) for p in args.cifs],
        temps_K=args.temps,
        steps=args.steps,
        dt_fs=args.dtfs,
        seed=args.seed,
        tau_fs=args.tau_fs,
        center_fs=args.center_fs,
        temp_noise_scale=args.temp_noise_scale,
        atoms_unified=atoms_unified,
        code_version=CODE_VERSION,
        gpu_on=GPU_ON, # This will be False
        is_ablation=args.run_ablation
    )
    RUN_ID = compute_run_id(args.cifs, params)

    if args.run_ablation:
        print(f"[INIT] RUN_ID = {RUN_ID} (GPU={GPU_ON}) -- CPU ABLATION RUN")
        run_prefix = "GQR14_CPU_ABLATION"
    else:
        print(f"[INIT] RUN_ID = {RUN_ID} (GPU={GPU_ON}) -- CPU MAIN RUN")
        run_prefix = "GQR14_CPU_veracity"

    # --- Setup Output Directories ---
    root_dir = f"{run_prefix}_{RUN_ID}"
    runs_d = os.path.join(root_dir, "runs")       # Raw timeseries CSVs
    der_d = os.path.join(root_dir, "derived")     # CDFs, Arrhenius table
    plot_d = os.path.join(root_dir, "plots")      # PNGs
    os.makedirs(runs_d, exist_ok=True)
    os.makedirs(der_d, exist_ok=True)
    os.makedirs(plot_d, exist_ok=True)

    file_manifest = []   # every generated file, in creation order
    arrhenius_data = []  # one summary row per temperature

    # --- Main Simulation Loop (per temperature) ---
    for T in args.temps:
        print(f"[Sim] Running T = {T} K...")
        t, s, J, d = run_tdse_like(
            geoms, T,
            dt_fs=args.dtfs, steps=args.steps,
            tau_fs=args.tau_fs, center_fs=args.center_fs,
            temp_noise_scale=args.temp_noise_scale,
            rng_seed=args.seed,
            is_ablation=args.run_ablation
        )

        # --- Save Raw Timeseries CSV ---
        base_name = f"HIJ_T{T}K_{RUN_ID}"
        csv_path = os.path.join(runs_d, f"{base_name}_timeseries.csv")
        csv_header = f"# RUN_ID={RUN_ID} T_K={T} ABLATION={args.run_ablation}"
        write_csv(csv_path, t, s, J, d, csv_header)
        file_manifest.append(csv_path)

        # --- Save Timeseries Plot ---
        plot_path = os.path.join(plot_d, f"{base_name}_timeseries.png")
        plot_title = f"HIJ_T{T}K_{RUN_ID} (Ablation={args.run_ablation} CPU_Run=True)"
        plot_timeseries(t, J, s, d, plot_title, plot_path)
        file_manifest.append(plot_path)

        # --- Save CDFs ---
        cdf_J_x, cdf_J_y = cdf_series(J)
        cdf_d_x, cdf_d_y = cdf_series(d)

        cdf_J_path = os.path.join(der_d, f"{base_name}_cdf_J.csv")
        cdf_d_path = os.path.join(der_d, f"{base_name}_cdf_d.csv")

        write_cdf_csv(cdf_J_path, cdf_J_x, cdf_J_y)
        write_cdf_csv(cdf_d_path, cdf_d_x, cdf_d_y)
        file_manifest.append(cdf_J_path)
        file_manifest.append(cdf_d_path)

        # --- Collect Arrhenius Data (from tail) ---
        tail_start = int(0.8 * args.steps) # Use last 20%
        J_tail_mean = float(np.mean(J[tail_start:]))

        arrhenius_data.append({
            "T_K": T,
            "invT": 1.0 / T,
            "J_mean_tail": J_tail_mean,
            # The log is undefined for non-positive means; those rows become NaN.
            "ln_J": np.log(J_tail_mean) if J_tail_mean > 0 else np.nan,
        })

    # --- Process and Save Arrhenius Data ---
    try:
        import pandas as pd
    except ImportError:
        print("[Error] pandas is required for Arrhenius plot. Skipping.", file=sys.stderr)
    else:
        arr_df = pd.DataFrame(arrhenius_data)
        arr_csv_path = os.path.join(der_d, f"arrhenius_table_{RUN_ID}.csv")
        arr_df.to_csv(arr_csv_path, index=False)
        file_manifest.append(arr_csv_path)

        arr_plot_path = os.path.join(plot_d, f"arrhenius_plot_{RUN_ID}.png")
        plot_arrhenius(arr_df, RUN_ID, arr_plot_path)
        file_manifest.append(arr_plot_path)

    # --- Create Final Manifest JSON (with DUAL HASHES) ---
    print("[Packaging] Generating dual-hash manifest...")
    manifest_content = {
        "run_id": RUN_ID,
        "run_prefix": run_prefix,
        "is_ablation_run": args.run_ablation,
        "created_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "parameters": params,
        "cif_hashes": [
            dict(
                file=os.path.basename(p),
                **get_hashes_of(p) # Get {'sha256': '...', 'sha512': '...'}
            ) for p in args.cifs
        ],
        "files": []
    }

    for p in file_manifest:
        file_hashes = get_hashes_of(p)
        manifest_content["files"].append({
            "file": os.path.relpath(p, root_dir),
            "bytes": os.path.getsize(p),
            **file_hashes # Add sha256 and sha512
        })

    manifest_path = os.path.join(root_dir, f"manifest_{RUN_ID}.json")
    with open(manifest_path, "w") as f:
        json.dump(manifest_content, f, indent=2)

    # --- Create Final ZIP Bundle ---
    # Archive names are relative to root_dir, matching the manifest entries.
    zip_path = f"{root_dir}.zip"
    print(f"[Packaging] Creating {zip_path}...")
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        zf.write(manifest_path, arcname=os.path.basename(manifest_path))
        for p in file_manifest:
            zf.write(p, arcname=os.path.relpath(p, root_dir))

    print(f"[SUCCESS] Wrote {zip_path} (RUN_ID={RUN_ID})")


if __name__ == "__main__":
    # Add pandas for Arrhenius table
    # main() uses pandas for the Arrhenius table and plot. If importing it
    # fails for any reason, install it with pip into the running interpreter
    # and retry; check=True makes a failed install raise before main() runs.
    try:
        import pandas as pd
    except Exception:
        print("[Info] pandas not found, attempting to install...", file=sys.stderr)
        subprocess.run([sys.executable, "-m", "pip", "install", "-q", "pandas"], check=True)
        import pandas as pd
    main()

Writing gqr14_make_veracity_zip_v4.3_cpu_dualhash.py


In [None]:
%%writefile gqr14_make_veracity_zip_v4.4_cpu_dualhash.py
#!/usr/bin/env python3
"""
GQR–XIV — Single-Run Veracity ZIP (atomic provenance)
v4.4-CPU_ONLY_DUAL_HASH:
- This version FORCES a CPU-only (numpy) run by disabling the cupy import.
- It also calculates BOTH SHA-256 and SHA-512 for the manifest.
- Fixes a SyntaxError from a stray copy-paste line in v4.3.
"""
import sys, os, json, time, math, hashlib, argparse, subprocess, io, zipfile, warnings
import numpy as np
import matplotlib.pyplot as plt

# --- Array backend: CPU only ---
# This build deliberately runs on NumPy alone so results can be verified
# without a GPU. A CuPy module that is already loaded is dropped from the
# module cache (as before), and the backend alias `xp` is bound to NumPy
# directly instead of raising and catching an ImportError to get there.
sys.modules.pop("cupy", None)
xp = np  # array namespace used by the simulation code
GPU_ON = False  # recorded in the run parameters; always False in this build

# --- Ensure gemmi available (for CIF) ---
# The CIF helpers below call gemmi directly. If importing it fails for any
# reason, it is installed with pip into the interpreter that is running this
# script (sys.executable) and the import is retried. check=True makes a
# failed install raise CalledProcessError, so the script stops here.
try:
    import gemmi
except Exception:
    # The progress note is written to stderr.
    print("[Info] gemmi not found, attempting to install...", file=sys.stderr)
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "gemmi"], check=True)
    import gemmi

# This is the "truth" of the code. If this changes, the RUN_ID changes.
CODE_VERSION = "GQR14-TDSE-VERACITY-v4.4-CPU_ONLY_DUAL_HASH"

# ===================================================================
# --- GEOMETRY (CIF) HELPERS ---
# (This section is identical to the v4.3 script)
# ===================================================================

def _parse_cif_number(raw: str) -> float:
    """Convert one raw CIF value to ``float``.

    Plain numbers go through ``float()``. Values that ``float()`` rejects are
    retried with ``gemmi.cif.as_number`` so that a trailing standard
    uncertainty such as ``0.1234(5)`` is accepted.

    Raises:
        ValueError: if the value is not numeric (for example ``?`` or ``.``).
    """
    try:
        return float(raw)
    except ValueError:
        value = gemmi.cif.as_number(raw)
        if math.isnan(value):
            # Still not a number: re-raise the original ValueError.
            raise
        return value


def get_first_two_atom_coords(path: str) -> np.ndarray:
    """Return Cartesian coordinates of the first two atom-site rows of a CIF.

    Coordinate columns are looked up in this order: ``_atom_site.Cartn_*``,
    ``_atom_site_fract_*``, ``_atom_site.fract_*``. Fractional coordinates are
    orthogonalized with the unit cell obtained from
    ``gemmi.read_small_structure`` or, failing that, ``gemmi.read_structure``.

    Args:
        path: CIF file to read (``sole_block()`` is used, so a single data
            block is expected).

    Returns:
        A ``(2, 3)`` float64 array, one row per atom.

    Raises:
        RuntimeError: if no coordinate columns are found, the columns differ
            in length, fewer than two rows exist, the unit cell is unusable,
            or either of the first two rows is not numeric.
        Exception: anything raised by gemmi while reading the file. Every
            failure is reported on stderr and then re-raised.
    """
    try:
        block = gemmi.cif.read_file(path).sole_block()

        # (tag template, coordinates are Cartesian) in lookup order.
        tag_families = (
            ("_atom_site.Cartn_{}", True),
            ("_atom_site_fract_{}", False),
            ("_atom_site.fract_{}", False),
        )
        for template, is_cartesian in tag_families:
            xs, ys, zs = (block.find_values(template.format(axis)) for axis in "xyz")
            if xs and ys and zs:
                break
        else:
            raise RuntimeError(f"Could not find matching x, y, z coordinate tags in {path}")

        if not (len(xs) == len(ys) == len(zs)):
            raise RuntimeError(f"Coordinate column mismatch in {path}: len(x)={len(xs)}, len(y)={len(ys)}, len(z)={len(zs)}")
        if len(xs) < 2:
            raise RuntimeError(f"Found coordinate tags, but < 2 atoms in {path}")

        cell = None
        if not is_cartesian:
            # Fractional coordinates need a usable unit cell; try the
            # small-molecule reader first, then the macromolecular one.
            try:
                cell = gemmi.read_small_structure(path).cell
                if not (cell.a and cell.b and cell.c):
                    raise RuntimeError("Cell parameters are incomplete or zero.")
            except Exception as e:
                try:
                    cell = gemmi.read_structure(path).cell
                    if not (cell.a and cell.b and cell.c):
                        raise RuntimeError("Cell parameters are incomplete or zero (from read_structure).")
                except Exception as e2:
                    raise RuntimeError(
                        f"CIF has fractional coords but cell is invalid. small_structure err: {e}; read_structure err: {e2}"
                    ) from e2

        coords = []
        for i in range(2):
            try:
                x = _parse_cif_number(xs[i])
                y = _parse_cif_number(ys[i])
                z = _parse_cif_number(zs[i])
            except ValueError:
                # Only rows 0 and 1 are considered, so a skipped row always
                # ends in the "at least 2 valid numeric atoms" error below.
                print(f"[Warning] Skipping non-numeric coordinate value in {path} at row {i}", file=sys.stderr)
                continue
            if is_cartesian:
                coords.append([x, y, z])
            else:
                pos = cell.orthogonalize(gemmi.Fractional(x, y, z))
                coords.append([pos.x, pos.y, pos.z])

        if len(coords) < 2:
            raise RuntimeError(f"Could not parse at least 2 valid numeric atoms in {path}")
        return np.array(coords, dtype=np.float64)
    except Exception as e:
        print(f"[FATAL] Error parsing CIF {path}: {e}", file=sys.stderr)
        raise

def get_oo_pair(coords: np.ndarray):
    """Center a coordinate set on its centroid and measure the first pair.

    Args:
        coords: array whose rows are atom positions; rows 0 and 1 form the pair.

    Returns:
        ``(centered, distance)`` where ``centered`` is ``coords`` minus the
        mean of its rows and ``distance`` is the Euclidean distance between
        rows 0 and 1 as a Python float.
    """
    first, second = coords[0], coords[1]
    separation = float(np.linalg.norm(first - second))
    centroid = np.mean(coords, axis=0, keepdims=True)
    return coords - centroid, separation

def load_and_align_geometries(paths: list):
    """Load the H, I and J atom pairs and center each on its own centroid.

    Args:
        paths: exactly three CIF paths, in the order H, I, J.

    Returns:
        ``(coords_H, coords_I, coords_J, 2)``: the three centered coordinate
        arrays followed by the number of atoms per geometry.

    Raises:
        ValueError: if ``paths`` does not hold exactly three entries.
    """
    if len(paths) != 3:
        raise ValueError("Must provide exactly three CIF paths for H, I, and J.")

    # Files are read strictly in the given order, one at a time.
    loaded = [get_oo_pair(get_first_two_atom_coords(cif)) for cif in paths]
    (coords_H, dH), (coords_I, dI), (coords_J, dJ) = loaded

    print(f"[Geo] O–O Pair Distances: H={dH:.3f} Å, I={dI:.3f} Å, J={dJ:.3f} Å")

    # A near-zero separation means the first two atoms are not a real pair.
    if any(distance < 0.1 for distance in (dH, dI, dJ)):
        warnings.warn(
            f"O-O distances ({dH:.3f}, {dI:.3f}, {dJ:.3f}) are physically unrealistic. "
            "Ensure your CIF files (8F4H, 8F4I, 8F4J) have the correct O-O pair as the first two atoms.",
            UserWarning
        )
    return coords_H, coords_I, coords_J, 2

# ===================================================================
# --- SIMULATION & PLOTTING ---
# (This section is identical to the v4.3 script)
# ===================================================================

def sigmoid(t, tau_fs, center_fs):
    """Logistic function of ``(t - center_fs) / tau_fs``.

    The scaled argument is clamped to [-100, 100] before exponentiation.
    """
    scaled = np.clip((t - center_fs) / tau_fs, -100, 100)
    denominator = 1.0 + np.exp(-scaled)
    return 1.0 / denominator

def mix_coords(X_A, X_B, s):
    """Linearly blend ``X_A`` and ``X_B``: ``s = 0`` gives ``X_A``, ``s = 1`` gives ``X_B``."""
    weight_b = float(s)
    weight_a = 1.0 - weight_b
    return weight_a * X_A + weight_b * X_B

def calculate_J_metric(s_total, dOO, temp_noise, is_ablation=False):
    """Evaluate the J metric for one simulation step.

    The signal is a Gaussian in ``s_total`` centred at 1.0 (width 0.5),
    scaled by a distance factor that falls off linearly with
    ``|dOO - 1.46|`` and is clamped to [0.1, 1.0]. A noise floor of
    ``temp_noise * 1e-2 + 1e-3`` is always added.

    Args:
        s_total: morph coordinate.
        dOO: pair distance.
        temp_noise: noise sample for this step.
        is_ablation: when true, only the noise floor is returned.

    Returns:
        The metric as a Python float.
    """
    morph = float(s_total)
    distance = float(dOO)
    width = 0.5

    noise_floor = temp_noise * 1e-2 + 1e-3
    gaussian = np.exp(-0.5 * ((morph - 1.0) / width) ** 2)
    distance_factor = np.clip(1.0 - 2.0 * np.abs(distance - 1.46), 0.1, 1.0)

    # Both terms are evaluated in either mode; ablation only changes the sum.
    value = noise_floor if is_ablation else (gaussian * distance_factor) + noise_floor
    return float(value)

def run_tdse_like(geoms, T_K: float, dt_fs=0.5, steps=40000,
                  tau_fs=3500.0, center_fs=7000.0,
                  temp_noise_scale=0.1, rng_seed=42,
                  is_ablation=False):
    """Morph H -> I -> J along a sigmoid schedule and record the J metric.

    For each of ``steps + 1`` time points the morph coordinate
    ``s_total = 2 * sigmoid(t)`` picks a blend of two geometries (H/I while
    ``s_total <= 1``, I/J afterwards), the distance between rows 0 and 1 of
    the blended geometry is measured, and the metric is evaluated with one
    uniform noise draw scaled by ``temp_noise_scale * T_K / 300``.

    Args:
        geoms: three coordinate arrays (H, I, J).
        T_K: temperature; also added to ``rng_seed`` to seed the generator.
        dt_fs: time step.
        steps: number of steps (``steps + 1`` samples are produced).
        tau_fs, center_fs: width and centre of the sigmoid schedule.
        temp_noise_scale: multiplier on the noise term.
        rng_seed: base seed.
        is_ablation: forwarded to ``calculate_J_metric``.

    Returns:
        Four NumPy arrays ``(t, s_total, J, d)``.
    """
    _h, _i, _j = geoms  # fails fast unless exactly three geometries are given
    noise_rng = np.random.default_rng(int(rng_seed + T_K))
    thermal_ratio = T_K / 300.0

    times, morphs, metrics, distances = [], [], [], []
    for index in range(steps + 1):
        now_fs = index * dt_fs
        morph = 2.0 * sigmoid(now_fs, tau_fs, center_fs)

        # First half of the schedule blends H->I, second half I->J.
        if morph <= 1.0:
            frame = mix_coords(geoms[0], geoms[1], morph)
        else:
            frame = mix_coords(geoms[1], geoms[2], morph - 1.0)

        separation = xp.linalg.norm(frame[0] - frame[1])
        if GPU_ON:
            separation = separation.get()
        separation = float(separation)

        draw = noise_rng.random(1, dtype=np.float64)[0]
        jitter = temp_noise_scale * thermal_ratio * (draw - 0.5)
        metric = calculate_J_metric(morph, separation, jitter, is_ablation=is_ablation)

        times.append(now_fs)
        morphs.append(morph)
        metrics.append(metric)
        distances.append(separation)

    return (np.array(times), np.array(morphs),
            np.array(metrics), np.array(distances))

def cdf_series(x: np.ndarray):
    """Empirical CDF of the non-NaN entries of ``x``.

    Returns:
        ``(sorted_values, cdf)`` where ``cdf[i] = (i + 1) / n``; two empty
        arrays when ``x`` has no non-NaN entries.
    """
    valid = x[np.logical_not(np.isnan(x))]
    count = valid.size
    if count == 0:
        return np.array([]), np.array([])
    ordered = np.sort(valid)
    ranks = np.arange(1, count + 1)
    return ordered, ranks / count

def plot_timeseries(t, J, s, d, title, path):
    """Save a dual-axis time-series figure of J, s_total and d(O-O).

    Args:
        t: x-values (axis labelled "t (fs)").
        J: series drawn on the left axis as "J (metric)".
        s: series drawn dashed on the left axis as "s_total (morph)".
        d: series drawn on the right (twin) axis as "d(O-O) [Å]".
        title: figure title.
        path: output file passed to ``plt.savefig`` (200 dpi).
    """
    fig, ax1 = plt.subplots(figsize=(10, 5))
    ax1.set_xlabel("t (fs)")
    ax1.set_ylabel("J / s")
    ax1.plot(t, J, label="J (metric)", lw=2)
    ax1.plot(t, s, label="s_total (morph)", ls="--", color="tab:orange")
    ax1.legend(loc="upper left")
    # Second y-axis sharing the same x-axis, used for the distance trace.
    ax2 = ax1.twinx()
    ax2.set_ylabel("d(O-O) [Å]")
    ax2.plot(t, d, label="d(O-O) [Å]", color="tab:green", lw=2)
    # Each axis gets its own legend, placed at different positions.
    ax2.legend(loc="upper center")
    # pyplot-level calls: they act on pyplot's current axes/figure.
    plt.title(title)
    fig.tight_layout()
    plt.savefig(path, dpi=200)
    # Close the figure explicitly; this function is called once per temperature.
    plt.close(fig)

def plot_arrhenius(df, run_id, path):
    """Plot ln(J) against 1/T with a least-squares line and save the figure.

    Args:
        df: DataFrame with ``invT`` and ``ln_J`` columns; rows with NaN in
            either column are ignored.
        run_id: identifier shown in the plot title.
        path: output image path (written at 200 dpi).

    The straight-line fit is drawn only when at least two usable rows
    exist. For ``ln J = ln A - Ea / (k T)`` the fitted slope equals
    ``-Ea/k``, so the legend reports ``Ea/k = -slope``.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        finite = df.dropna(subset=["invT", "ln_J"])
        if len(finite) >= 2:
            x = finite["invT"].values
            y = finite["ln_J"].values
            # Solve y = slope * x + intercept in the least-squares sense.
            A = np.vstack([x, np.ones_like(x)]).T
            slope, intercept = np.linalg.lstsq(A, y, rcond=None)[0]
            yfit = slope * x + intercept
            Ea_over_k = -slope
            ax.plot(x, y, "o", ms=8, label="Data (mean of tail)")
            ax.plot(x, yfit, "-", lw=2, label=f"Fit: Ea/k = {Ea_over_k:.2f} K")
            # Only build a legend when there is something to describe.
            ax.legend()
        ax.set_xlabel("1/T (K⁻¹)")
        ax.set_ylabel("ln (J_metric) (a.u.)")
        ax.set_title(f"Arrhenius Plot — RUN_ID={run_id}")
        ax.grid(True, alpha=0.3)
        fig.tight_layout()
        fig.savefig(path, dpi=200)
    finally:
        # Release the figure even if fitting or saving fails.
        plt.close(fig)

# ===================================================================
# --- FILE MANAGEMENT & HASHING ---
# (This section is identical to the v4.3 script)
# ===================================================================

def get_hashes_of(path: str) -> dict:
    """Hash a file with SHA-256 and SHA-512 in a single pass.

    The file is read in 1 MiB chunks and every chunk is fed to both digests.

    Returns:
        ``{"sha256": <hex>, "sha512": <hex>}``.
    """
    digests = {"sha256": hashlib.sha256(), "sha512": hashlib.sha512()}
    chunk_size = 1 << 20
    with open(path, "rb") as stream:
        while True:
            block = stream.read(chunk_size)
            if not block:
                break
            for digest in digests.values():
                digest.update(block)
    return {name: digest.hexdigest() for name, digest in digests.items()}

def get_hashes_of_bytes(data: bytes) -> dict:
    """Return the SHA-256 and SHA-512 hex digests of an in-memory byte string."""
    return {
        algorithm: hashlib.new(algorithm, data).hexdigest()
        for algorithm in ("sha256", "sha512")
    }

def compute_run_id(cif_paths, params_dict) -> str:
    """Derive the 16-hex-character run identifier.

    The identifier is the first 16 hex digits of the SHA-256 over, in order:
    ``CODE_VERSION``, then for each CIF its base name and the hex SHA-256 of
    its contents, then ``params_dict`` serialized as JSON with sorted keys.
    """
    pieces = [CODE_VERSION.encode()]
    for cif in cif_paths:
        pieces.append(os.path.basename(cif).encode())
        pieces.append(get_hashes_of(cif)["sha256"].encode())
    pieces.append(json.dumps(params_dict, sort_keys=True).encode())
    # Hashing the concatenation equals feeding the pieces one by one.
    return hashlib.sha256(b"".join(pieces)).hexdigest()[:16]

def write_csv(path, t, s, J, d, header_comment):
    """Write the per-step time series as CSV.

    The file starts with ``header_comment`` on its own line, then the column
    header ``t_fs,s_total,J_metric,d_OO``, then one row per sample (``t``
    with one decimal, the other columns with six).

    The file is always written as UTF-8 with ``\\n`` line endings so that its
    bytes, and therefore its manifest hashes, do not depend on the platform.

    Raises:
        ValueError: if the four series differ in length.
    """
    if not (len(t) == len(s) == len(J) == len(d)):
        raise ValueError("t, s, J and d must all have the same length")
    with open(path, "w", encoding="utf-8", newline="\n") as f:
        f.write(header_comment + "\n")
        f.write("t_fs,s_total,J_metric,d_OO\n")
        f.writelines(
            f"{t_i:.1f},{s_i:.6f},{J_i:.6f},{d_i:.6f}\n"
            for t_i, s_i, J_i, d_i in zip(t, s, J, d)
        )

def write_cdf_csv(path, x, cdf):
    """Write an empirical CDF as a two-column CSV (``value,cdf``).

    Both columns are written with six decimals. The file is always UTF-8
    with ``\\n`` line endings so its bytes, and therefore its manifest
    hashes, do not depend on the platform.

    Raises:
        ValueError: if ``x`` and ``cdf`` differ in length.
    """
    if len(x) != len(cdf):
        raise ValueError("x and cdf must have the same length")
    with open(path, "w", encoding="utf-8", newline="\n") as f:
        f.write("value,cdf\n")
        f.writelines(f"{value:.6f},{fraction:.6f}\n" for value, fraction in zip(x, cdf))

# ===================================================================
# --- MAIN EXECUTION ---
# ===================================================================

def main():
    """Run the CPU veracity pipeline end to end and package the results.

    Steps, in order:
      1. Parse command-line arguments (unrecognised arguments are ignored).
      2. Resolve the three CIF paths, falling back to 8F4H/8F4I/8F4J.cif in
         the working directory when ``--cifs`` is not given.
      3. Load the geometries and derive RUN_ID from the run parameters.
      4. For every temperature: simulate, write the time-series CSV and plot,
         write the CDF CSVs, and record the mean of J over the last 20 % of
         the run.
      5. Write the Arrhenius table and plot (skipped if pandas is missing).
      6. Write the dual-hash manifest JSON and bundle everything into a ZIP.

    Exits with status 1 when a temperature is not positive, when no CIFs can
    be found, or when the geometries cannot be loaded.
    """

    default_cifs = ["8F4H.cif", "8F4I.cif", "8F4J.cif"]

    ap = argparse.ArgumentParser(description=f"GQR-XIV Veracity Run ({CODE_VERSION})")
    ap.add_argument("--cifs", nargs=3, default=None, help="Three CIFs for H, I, J states")
    ap.add_argument("--temps", nargs="+", type=int, default=[285, 295, 305, 315, 325], help="List of temperatures (K) to run")
    ap.add_argument("--steps", type=int, default=40000, help="Number of simulation steps")
    ap.add_argument("--dtfs", type=float, default=0.5, help="Timestep (fs)")
    ap.add_argument("--seed", type=int, default=42, help="Base RNG seed")
    ap.add_argument("--tau_fs", type=float, default=3500.0, help="Sigmoid tau (width) in fs")
    ap.add_argument("--center_fs", type=float, default=7000.0, help="Sigmoid center in fs")
    ap.add_argument("--temp_noise_scale", type=float, default=0.1, help="Scalar for temperature noise effect")

    # --- Ablation Flag ---
    ap.add_argument("--run-ablation", action="store_true", help="Run ablation (null hypothesis) test")

    # Use parse_known_args to ignore Colab/Jupyter-specific args
    args, _unknown = ap.parse_known_args()

    # --- Validate temperatures up front ---
    # 1/T is computed for the Arrhenius table after each simulation; rejecting
    # non-positive values here avoids a ZeroDivisionError once output files
    # have already been written.
    bad_temps = [T for T in args.temps if T <= 0]
    if bad_temps:
        print(f"[FATAL] Temperatures must be positive (K); got {bad_temps}.", file=sys.stderr)
        sys.exit(1)

    # --- Handle CIFs ---
    if args.cifs is None:
        print("[Info] No CIFs provided. Checking for 8F4H, 8F4I, 8F4J.cif locally...", file=sys.stderr)
        if all(os.path.exists(p) for p in default_cifs):
            args.cifs = default_cifs
            print("[Info] Found local CIFs. Running with defaults.", file=sys.stderr)
        else:
            print("[FATAL] Default CIFs not found. Please provide paths using --cifs.", file=sys.stderr)
            ap.print_help()
            sys.exit(1)

    # --- Load Geometries ---
    try:
        XH, XI, XJ, atoms_unified = load_and_align_geometries(args.cifs)
        # Convert to the active array module (xp).
        geoms = (xp.asarray(XH, dtype=xp.float64),
                 xp.asarray(XI, dtype=xp.float64),
                 xp.asarray(XJ, dtype=xp.float64))
    except Exception as e:
        print(f"[FATAL] Error loading geometries: {e}", file=sys.stderr)
        sys.exit(1)

    # --- Define Run ID and Parameters ---
    # Everything in this dict feeds the RUN_ID hash and is copied verbatim
    # into the manifest.
    params = dict(
        cifs=[os.path.basename(p) for p in args.cifs],
        temps_K=args.temps,
        steps=args.steps,
        dt_fs=args.dtfs,
        seed=args.seed,
        tau_fs=args.tau_fs,
        center_fs=args.center_fs,
        temp_noise_scale=args.temp_noise_scale,
        atoms_unified=atoms_unified,
        code_version=CODE_VERSION,
        gpu_on=GPU_ON,
        is_ablation=args.run_ablation
    )
    RUN_ID = compute_run_id(args.cifs, params)

    if args.run_ablation:
        print(f"[INIT] RUN_ID = {RUN_ID} (GPU={GPU_ON}) -- CPU ABLATION RUN")
        run_prefix = "GQR14_CPU_ABLATION"
    else:
        print(f"[INIT] RUN_ID = {RUN_ID} (GPU={GPU_ON}) -- CPU MAIN RUN")
        run_prefix = "GQR14_CPU_veracity"

    # --- Setup Output Directories ---
    root_dir = f"{run_prefix}_{RUN_ID}"
    runs_d = os.path.join(root_dir, "runs")       # Raw timeseries CSVs
    der_d = os.path.join(root_dir, "derived")     # CDFs, Arrhenius table
    plot_d = os.path.join(root_dir, "plots")      # PNGs
    os.makedirs(runs_d, exist_ok=True)
    os.makedirs(der_d, exist_ok=True)
    os.makedirs(plot_d, exist_ok=True)

    file_manifest = []   # every output path, in creation order
    arrhenius_data = []  # one row per temperature

    # --- Main Simulation Loop (per temperature) ---
    for T in args.temps:
        print(f"[Sim] Running T = {T} K...")
        t, s, J, d = run_tdse_like(
            geoms, T,
            dt_fs=args.dtfs, steps=args.steps,
            tau_fs=args.tau_fs, center_fs=args.center_fs,
            temp_noise_scale=args.temp_noise_scale,
            rng_seed=args.seed,
            is_ablation=args.run_ablation
        )

        # --- Save Raw Timeseries CSV ---
        base_name = f"HIJ_T{T}K_{RUN_ID}"
        csv_path = os.path.join(runs_d, f"{base_name}_timeseries.csv")
        csv_header = f"# RUN_ID={RUN_ID} T_K={T} ABLATION={args.run_ablation}"
        write_csv(csv_path, t, s, J, d, csv_header)
        file_manifest.append(csv_path)

        # --- Save Timeseries Plot ---
        plot_path = os.path.join(plot_d, f"{base_name}_timeseries.png")
        plot_title = f"HIJ_T{T}K_{RUN_ID} (Ablation={args.run_ablation} CPU_Run=True)"
        plot_timeseries(t, J, s, d, plot_title, plot_path)
        file_manifest.append(plot_path)

        # --- Save CDFs ---
        cdf_J_x, cdf_J_y = cdf_series(J)
        cdf_d_x, cdf_d_y = cdf_series(d)

        cdf_J_path = os.path.join(der_d, f"{base_name}_cdf_J.csv")
        cdf_d_path = os.path.join(der_d, f"{base_name}_cdf_d.csv")

        write_cdf_csv(cdf_J_path, cdf_J_x, cdf_J_y)
        write_cdf_csv(cdf_d_path, cdf_d_x, cdf_d_y)
        file_manifest.append(cdf_J_path)
        file_manifest.append(cdf_d_path)

        # --- Collect Arrhenius Data (from tail) ---
        tail_start = int(0.8 * args.steps) # Use last 20%
        J_tail_mean = float(np.mean(J[tail_start:]))

        arrhenius_data.append({
            "T_K": T,
            "invT": 1.0 / T,
            "J_mean_tail": J_tail_mean,
            # ln is undefined for a non-positive mean; store NaN instead.
            "ln_J": np.log(J_tail_mean) if J_tail_mean > 0 else np.nan,
        })

    # --- Process and Save Arrhenius Data ---
    try:
        import pandas as pd
    except ImportError:
        print("[Error] pandas is required for Arrhenius plot. Skipping.", file=sys.stderr)
    else:
        arr_df = pd.DataFrame(arrhenius_data)
        arr_csv_path = os.path.join(der_d, f"arrhenius_table_{RUN_ID}.csv")
        arr_df.to_csv(arr_csv_path, index=False)
        file_manifest.append(arr_csv_path)

        arr_plot_path = os.path.join(plot_d, f"arrhenius_plot_{RUN_ID}.png")
        plot_arrhenius(arr_df, RUN_ID, arr_plot_path)
        file_manifest.append(arr_plot_path)

    # --- Create Final Manifest JSON (with DUAL HASHES) ---
    print("[Packaging] Generating dual-hash manifest...")
    manifest_content = {
        "run_id": RUN_ID,
        "run_prefix": run_prefix,
        "is_ablation_run": args.run_ablation,
        "created_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "parameters": params,
        "cif_hashes": [
            dict(
                file=os.path.basename(p),
                **get_hashes_of(p) # Get {'sha256': '...', 'sha512': '...'}
            ) for p in args.cifs
        ],
        "files": []
    }

    for p in file_manifest:
        file_hashes = get_hashes_of(p)
        manifest_content["files"].append({
            "file": os.path.relpath(p, root_dir),
            "bytes": os.path.getsize(p),
            **file_hashes # Add sha256 and sha512
        })

    manifest_path = os.path.join(root_dir, f"manifest_{RUN_ID}.json")
    with open(manifest_path, "w") as f:
        json.dump(manifest_content, f, indent=2)

    # --- Create Final ZIP Bundle ---
    # Archive names are relative to root_dir, matching the "file" entries
    # recorded in the manifest.
    zip_path = f"{root_dir}.zip"
    print(f"[Packaging] Creating {zip_path}...")
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        zf.write(manifest_path, arcname=os.path.basename(manifest_path))
        for p in file_manifest:
            zf.write(p, arcname=os.path.relpath(p, root_dir))

    print(f"[SUCCESS] Wrote {zip_path} (RUN_ID={RUN_ID})")


if __name__ == "__main__":
    # main() needs pandas for the Arrhenius table. If importing it fails,
    # install it with pip for the running interpreter (check=True makes a
    # failed install raise) and import it again before starting the run.
    try:
        import pandas as pd
    except Exception:
        print("[Info] pandas not found, attempting to install...", file=sys.stderr)
        subprocess.run([sys.executable, "-m", "pip", "install", "-q", "pandas"], check=True)
        import pandas as pd
    main()

Writing gqr14_make_veracity_zip_v4.4_cpu_dualhash.py


In [None]:
%%writefile gqr14_make_veracity_zip_v4.5_gpu_dualhash.py
#!/usr/bin/env python3
"""
GQR–XIV — Single-Run Veracity ZIP (atomic provenance)
v4.5-GPU_DUAL_HASH:
- This version is for the GPU (A100) run.
- It enables the 'cupy' import.
- It calculates BOTH SHA-256 and SHA-512 for the manifest.
- All physics logic is identical to v4.4.
"""
import sys, os, json, time, math, hashlib, argparse, subprocess, io, zipfile, warnings
import numpy as np
import matplotlib.pyplot as plt

# --- Optional GPU (CuPy), silent fallback ---
# xp is the array module used by the simulation: CuPy when it can be
# imported, NumPy otherwise. GPU_ON records which one was chosen.
try:
    # Any exception raised by the import selects the NumPy fallback below.
    import cupy as cp
    xp = cp
    GPU_ON = True
except Exception:
    xp = np # Fallback to numpy
    GPU_ON = False

# --- Ensure gemmi available (for CIF) ---
# If the import fails, install gemmi with pip for the running interpreter
# (check=True makes a failed install raise) and import it again.
try:
    import gemmi
except Exception:
    print("[Info] gemmi not found, attempting to install...", file=sys.stderr)
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "gemmi"], check=True)
    import gemmi

# This is the "truth" of the code. If this changes, the RUN_ID changes.
CODE_VERSION = "GQR14-TDSE-VERACITY-v4.5-GPU_DUAL_HASH"

# ===================================================================
# --- GEOMETRY (CIF) HELPERS ---
# ===================================================================

def _cif_number(text: str) -> float:
    """Parse one CIF numeric field, ignoring a trailing standard uncertainty.

    CIF values may carry an uncertainty in parentheses, e.g. ``0.12345(6)``,
    which ``float()`` rejects; everything from the first ``(`` onward is
    dropped before conversion.

    Raises:
        ValueError: if the remaining text is not a number (e.g. ``?``).
    """
    return float(text.split("(", 1)[0])


def get_first_two_atom_coords(path: str) -> np.ndarray:
    """Read the first two atom-site rows of a CIF as Cartesian coordinates.

    Coordinate columns are looked up in this order: ``_atom_site.Cartn_*``,
    ``_atom_site_fract_*``, ``_atom_site.fract_*``. Fractional coordinates
    are orthogonalised with the unit cell, which is read with
    ``gemmi.read_small_structure`` and, if that fails or yields a zero cell
    edge, with ``gemmi.read_structure``.

    Args:
        path: Path of a CIF file containing exactly one data block.

    Returns:
        A ``(2, 3)`` float64 array holding rows 0 and 1 of the atom-site loop.

    Raises:
        RuntimeError: if no coordinate columns are found, the columns differ
            in length, fewer than two rows exist, no valid cell is available
            for fractional coordinates, or either of the first two rows is
            non-numeric. Any exception is logged to stderr and re-raised.
    """
    # Tag triples in lookup order; the flag says whether the values are
    # already Cartesian (True) or fractional (False).
    candidate_tags = (
        (('_atom_site.Cartn_x', '_atom_site.Cartn_y', '_atom_site.Cartn_z'), True),
        (('_atom_site_fract_x', '_atom_site_fract_y', '_atom_site_fract_z'), False),
        (('_atom_site.fract_x', '_atom_site.fract_y', '_atom_site.fract_z'), False),
    )
    try:
        doc = gemmi.cif.read_file(path)
        block = doc.sole_block()
        for tags, is_cartesian in candidate_tags:
            xs, ys, zs = (block.find_values(tag) for tag in tags)
            if xs and ys and zs:
                break
        else:
            raise RuntimeError(f"Could not find matching x, y, z coordinate tags in {path}")
        if not (len(xs) == len(ys) == len(zs)):
             raise RuntimeError(f"Coordinate column mismatch in {path}: len(x)={len(xs)}, len(y)={len(ys)}, len(z)={len(zs)}")
        if len(xs) < 2:
            raise RuntimeError(f"Found coordinate tags, but < 2 atoms in {path}")
        cell = None
        if not is_cartesian:
            try:
                doc_small = gemmi.read_small_structure(path)
                cell = doc_small.cell
                if not (cell.a and cell.b and cell.c):
                     raise RuntimeError("Cell parameters are incomplete or zero.")
            except Exception as e:
                # Second attempt with the macromolecular reader; both errors
                # are reported if it fails as well.
                try:
                    st = gemmi.read_structure(path)
                    cell = st.cell
                    if not (cell.a and cell.b and cell.c):
                         raise RuntimeError("Cell parameters are incomplete or zero (from read_structure).")
                except Exception as e2:
                     raise RuntimeError(f"CIF has fractional coords but cell is invalid. small_structure err: {e}; read_structure err: {e2}") from e2
        coords = []
        for i in range(2):
            try:
                # _cif_number accepts values with a standard uncertainty,
                # e.g. "0.1234(5)", which plain float() would reject.
                x = _cif_number(xs[i])
                y = _cif_number(ys[i])
                z = _cif_number(zs[i])
            except ValueError:
                print(f"[Warning] Skipping non-numeric coordinate value in {path} at row {i}")
                continue
            if is_cartesian:
                coords.append([x, y, z])
            else:
                pos = cell.orthogonalize(gemmi.Fractional(x, y, z))
                coords.append([pos.x, pos.y, pos.z])
        # Only rows 0 and 1 are considered, so a skipped row always ends here.
        if len(coords) < 2:
            raise RuntimeError(f"Could not parse at least 2 valid numeric atoms in {path}")
        return np.array(coords, dtype=np.float64)
    except Exception as e:
        print(f"[FATAL] Error parsing CIF {path}: {e}", file=sys.stderr)
        raise

def get_oo_pair(coords: np.ndarray):
    """Centre a coordinate array and measure the distance between rows 0 and 1.

    Returns:
        ``(centered, distance)`` where ``centered`` is ``coords`` minus its
        mean over axis 0 and ``distance`` is the Euclidean norm of
        ``coords[0] - coords[1]`` as a Python float.
    """
    separation = float(np.linalg.norm(coords[0] - coords[1]))
    centroid = np.mean(coords, axis=0, keepdims=True)
    return coords - centroid, separation

def load_and_align_geometries(paths: list):
    """Load the H, I and J two-atom geometries from three CIF files.

    Each file is reduced to its first two atoms, centred on their mean, and
    the pair distance is printed. A ``UserWarning`` is issued if any distance
    is below 0.1.

    Returns:
        ``(coords_H, coords_I, coords_J, 2)``; the trailing 2 is the number of
        atoms per geometry.

    Raises:
        ValueError: if ``paths`` does not contain exactly three entries.
    """
    if len(paths) != 3:
        raise ValueError("Must provide exactly three CIF paths for H, I, and J.")
    (coords_H, dH), (coords_I, dI), (coords_J, dJ) = [
        get_oo_pair(get_first_two_atom_coords(cif)) for cif in paths
    ]
    print(f"[Geo] O–O Pair Distances: H={dH:.3f} Å, I={dI:.3f} Å, J={dJ:.3f} Å")
    too_short = any(dist < 0.1 for dist in (dH, dI, dJ))
    if too_short:
        warnings.warn(
            f"O-O distances ({dH:.3f}, {dI:.3f}, {dJ:.3f}) are physically unrealistic. "
            "Ensure your CIF files (8F4H, 8F4I, 8F4J) have the correct O-O pair as the first two atoms.",
            UserWarning
        )
    return coords_H, coords_I, coords_J, 2

# ===================================================================
# --- SIMULATION & PLOTTING ---
# ===================================================================

def sigmoid(t, tau_fs, center_fs):
    """Logistic function of ``(t - center_fs) / tau_fs``.

    The scaled argument is clipped to [-100, 100] before exponentiation so
    ``np.exp`` cannot overflow.
    """
    scaled = np.clip((t - center_fs) / tau_fs, -100, 100)
    return 1.0 / (1.0 + np.exp(-scaled))

def mix_coords(X_A, X_B, s):
    """Linearly interpolate between ``X_A`` (s = 0) and ``X_B`` (s = 1).

    ``s`` is converted to a Python float before use.
    """
    weight_b = float(s)
    weight_a = 1.0 - weight_b
    return weight_a * X_A + weight_b * X_B

def calculate_J_metric(s_total, dOO, temp_noise, is_ablation=False):
    """Evaluate the J metric for one simulation step.

    The value is the sum of a noise term (``temp_noise * 1e-2 + 1e-3``) and,
    unless ``is_ablation`` is set, a signal term: a Gaussian in ``s_total``
    centred at 1.0 with width 0.5, multiplied by a distance factor
    ``1 - 2 * |dOO - 1.46|`` clipped to [0.1, 1.0].

    Returns:
        The metric as a Python float.
    """
    width = 0.5
    noise_term = temp_noise * 1e-2 + 1e-3
    s_val = float(s_total)
    gauss = 1.0 * np.exp(-0.5 * ((s_val - 1.0) / width)**2)
    d_val = float(dOO)
    dist_factor = np.clip(1.0 - 2.0 * np.abs(d_val - 1.46), 0.1, 1.0)
    # The ablation run keeps only the noise term.
    result = noise_term if is_ablation else (gauss * dist_factor) + noise_term
    return float(result)

def run_tdse_like(geoms, T_K: float, dt_fs=0.5, steps=40000,
                  tau_fs=3500.0, center_fs=7000.0,
                  temp_noise_scale=0.1, rng_seed=42,
                  is_ablation=False):
    """Generate the time series for one temperature.

    At each of ``steps + 1`` time points the morph coordinate
    ``s_total = 2 * sigmoid(t)`` selects an interpolated geometry (H to I
    while ``s_total <= 1``, I to J afterwards), the distance between its two
    atoms is measured, and the J metric is evaluated with a uniform noise
    sample scaled by ``temp_noise_scale * T_K / 300``.

    Args:
        geoms: The three geometries ``(XH, XI, XJ)`` as arrays of the active
            array module ``xp``.
        T_K: Temperature; also added to ``rng_seed`` to seed the generator.
        dt_fs, steps: Timestep and number of steps.
        tau_fs, center_fs: Sigmoid width and centre.
        temp_noise_scale: Scale of the noise term.
        rng_seed: Base seed of the NumPy random generator.
        is_ablation: Forwarded to ``calculate_J_metric``.

    Returns:
        Four NumPy arrays of length ``steps + 1``: time, ``s_total``, J and
        the pair distance.
    """
    XH, XI, XJ = geoms
    rng = np.random.default_rng(int(rng_seed + T_K))
    kT_rel = (T_K / 300.0)
    t_list, s_list, J_list, d_list = [], [], [], []
    for step in range(steps + 1):
        t_fs = step * dt_fs
        s_total = 2.0 * sigmoid(t_fs, tau_fs, center_fs)
        if s_total <= 1.0:
            # First half of the morph: H -> I.
            X_curr = mix_coords(XH, XI, s_total)
        else:
            # Second half: I -> J, with the local coordinate rebased to [0, 1].
            X_curr = mix_coords(XI, XJ, s_total - 1.0)

        dOO_dev = xp.linalg.norm(X_curr[0] - X_curr[1])
        # With CuPy the result must be copied to the host via .get(); with
        # the NumPy fallback it already is a host scalar.
        dOO = float(dOO_dev.get() if GPU_ON else dOO_dev)

        temp_noise = (
            temp_noise_scale * kT_rel * (rng.random(1, dtype=np.float64)[0] - 0.5)
        )
        J_val = calculate_J_metric(s_total, dOO, temp_noise, is_ablation=is_ablation)
        t_list.append(t_fs)
        s_list.append(s_total)
        J_list.append(J_val)
        d_list.append(dOO)
    return (np.array(t_list), np.array(s_list),
            np.array(J_list), np.array(d_list))

def cdf_series(x: np.ndarray):
    """Build the empirical CDF of ``x``, ignoring NaN entries.

    Returns:
        ``(values, cdf)``: the non-NaN values sorted ascending and the
        matching cumulative fractions ``k / n`` for ``k = 1..n``. Two empty
        arrays are returned when no non-NaN value remains.
    """
    ordered = np.sort(x[~np.isnan(x)])
    count = ordered.size
    if count == 0:
        return np.array([]), np.array([])
    return ordered, np.arange(1, count + 1) / count

def plot_timeseries(t, J, s, d, title, path):
    """Save a dual-axis time-series figure to ``path``.

    J and ``s_total`` are drawn against the left axis, the pair distance
    against a twinned right axis; the figure is written at 200 dpi and then
    closed.
    """
    fig, left_axis = plt.subplots(figsize=(10, 5))
    left_axis.set_xlabel("t (fs)")
    left_axis.set_ylabel("J / s")
    left_axis.plot(t, J, label="J (metric)", linewidth=2)
    left_axis.plot(t, s, label="s_total (morph)", linestyle="--", color="tab:orange")
    left_axis.legend(loc="upper left")

    # Second y-axis sharing the same time axis, for the distance trace.
    right_axis = left_axis.twinx()
    right_axis.set_ylabel("d(O-O) [Å]")
    right_axis.plot(t, d, label="d(O-O) [Å]", color="tab:green", linewidth=2)
    right_axis.legend(loc="upper center")

    plt.title(title)
    fig.tight_layout()
    plt.savefig(path, dpi=200)
    plt.close(fig)

def plot_arrhenius(df, run_id, path):
    """Save an Arrhenius plot (ln J against 1/T) with a linear fit.

    Rows with NaN in ``invT`` or ``ln_J`` are dropped. If at least two rows
    remain, the points and a least-squares line are drawn; otherwise only the
    empty, labelled axes are saved.

    Args:
        df: DataFrame with columns ``invT`` and ``ln_J``.
        run_id: Identifier shown in the plot title.
        path: Output image path (written at 200 dpi).
    """
    plt.figure(figsize=(8, 6))
    finite = df.dropna(subset=["invT", "ln_J"])
    has_fit = len(finite) >= 2
    if has_fit:
        x = finite["invT"].values
        y = finite["ln_J"].values
        # Least-squares fit of y = slope * x + intercept.
        A = np.vstack([x, np.ones_like(x)]).T
        slope, intercept = np.linalg.lstsq(A, y, rcond=None)[0]
        yfit = slope * x + intercept
        # For ln J = const - (Ea/k) / T the slope is -Ea/k, so the negated
        # slope is Ea/k; the label names the quantity actually printed.
        Ea_over_k = -slope
        plt.plot(x, y, "o", ms=8, label="Data (mean of tail)")
        plt.plot(x, yfit, "-", lw=2, label=f"Fit: Ea/k = {Ea_over_k:.2f} K")
    plt.xlabel("1/T (K⁻¹)")
    plt.ylabel("ln (J_metric) (a.u.)")
    plt.title(f"Arrhenius Plot — RUN_ID={run_id}")
    plt.grid(True, alpha=0.3)
    # A legend without labelled artists only triggers a matplotlib warning.
    if has_fit:
        plt.legend()
    plt.tight_layout()
    plt.savefig(path, dpi=200)
    plt.close()

# ===================================================================
# --- FILE MANAGEMENT & HASHING ---
# ===================================================================

def get_hashes_of(path: str) -> dict:
    """Return the SHA-256 and SHA-512 hex digests of the file at ``path``.

    The file is opened in binary mode and streamed in 1 MiB pieces, so both
    digests are produced in a single pass without loading the whole file.

    Returns:
        ``{"sha256": <hex>, "sha512": <hex>}`` (keys in that order).
    """
    chunk_size = 1 << 20
    digests = {"sha256": hashlib.sha256(), "sha512": hashlib.sha512()}
    with open(path, "rb") as stream:
        while True:
            piece = stream.read(chunk_size)
            if not piece:
                break
            for digest in digests.values():
                digest.update(piece)
    return {name: digest.hexdigest() for name, digest in digests.items()}

def get_hashes_of_bytes(data: bytes) -> dict:
    """Return the SHA-256 and SHA-512 hex digests of an in-memory byte string.

    Returns:
        ``{"sha256": <hex>, "sha512": <hex>}`` (keys in that order).
    """
    return {
        algo: hashlib.new(algo, data).hexdigest()
        for algo in ("sha256", "sha512")
    }

def compute_run_id(cif_paths, params_dict) -> str:
    """Derive the 16-hex-character run identifier.

    The identifier is the first 16 hex digits of a SHA-256 taken over, in
    order: ``CODE_VERSION``, then for every CIF its base name followed by the
    SHA-256 hex digest of its contents, and finally ``params_dict`` serialised
    as JSON with sorted keys.
    """
    pieces = [CODE_VERSION.encode()]
    for cif in cif_paths:
        pieces.append(os.path.basename(cif).encode())
        pieces.append(get_hashes_of(cif)["sha256"].encode())
    pieces.append(json.dumps(params_dict, sort_keys=True).encode())
    # Hashing the concatenation is equivalent to feeding the pieces one by one.
    return hashlib.sha256(b"".join(pieces)).hexdigest()[:16]

def write_csv(path, t, s, J, d, header_comment):
    """Write one simulation time series to ``path`` as CSV.

    The file starts with ``header_comment`` on its own line, followed by the
    column header ``t_fs,s_total,J_metric,d_OO`` and one row per element of
    ``t`` (``t`` with one decimal, the other columns with six).

    Args:
        path: Destination file; overwritten if it exists.
        t, s, J, d: Indexable sequences; ``s``, ``J`` and ``d`` are read at
            every index of ``t``.
        header_comment: First line of the file (written verbatim).
    """
    # newline="\n" switches off platform newline translation and the encoding
    # is pinned, so the bytes on disk -- and therefore the hashes recorded in
    # the manifest -- are the same on every operating system.
    with open(path, "w", newline="\n", encoding="utf-8") as f:
        f.write(header_comment + "\n")
        f.write("t_fs,s_total,J_metric,d_OO\n")
        for i, t_i in enumerate(t):
            f.write(f"{t_i:.1f},{s[i]:.6f},{J[i]:.6f},{d[i]:.6f}\n")

def write_cdf_csv(path, x, cdf):
    """Write an empirical CDF to ``path`` as a two-column CSV.

    The file has the header ``value,cdf`` followed by one row per element of
    ``x``; both columns are formatted with six decimals.

    Args:
        path: Destination file; overwritten if it exists.
        x: Indexable sequence of values.
        cdf: Indexable sequence read at every index of ``x``.
    """
    # newline="\n" switches off platform newline translation and the encoding
    # is pinned, so the bytes on disk -- and therefore the hashes recorded in
    # the manifest -- are the same on every operating system.
    with open(path, "w", newline="\n", encoding="utf-8") as f:
        f.write("value,cdf\n")
        for i, value in enumerate(x):
            f.write(f"{value:.6f},{cdf[i]:.6f}\n")

# ===================================================================
# --- MAIN EXECUTION ---
# ===================================================================

def main():
    """Run the GPU-enabled veracity pipeline end to end and package the results.

    Steps, in order:
      1. Parse command-line arguments (unrecognised arguments are ignored).
      2. Resolve the three CIF paths, falling back to 8F4H/8F4I/8F4J.cif in
         the working directory when ``--cifs`` is not given.
      3. Load the geometries and derive RUN_ID from the run parameters.
      4. For every temperature: simulate, write the time-series CSV and plot,
         write the CDF CSVs, and record the mean of J over the last 20 % of
         the run.
      5. Write the Arrhenius table and plot (skipped if pandas is missing).
      6. Write the dual-hash manifest JSON and bundle everything into a ZIP.

    Exits with status 1 when a temperature is not positive, when no CIFs can
    be found, or when the geometries cannot be loaded.
    """

    default_cifs = ["8F4H.cif", "8F4I.cif", "8F4J.cif"]

    ap = argparse.ArgumentParser(description=f"GQR-XIV Veracity Run ({CODE_VERSION})")
    ap.add_argument("--cifs", nargs=3, default=None, help="Three CIFs for H, I, J states")
    ap.add_argument("--temps", nargs="+", type=int, default=[285, 295, 305, 315, 325], help="List of temperatures (K) to run")
    ap.add_argument("--steps", type=int, default=40000, help="Number of simulation steps")
    ap.add_argument("--dtfs", type=float, default=0.5, help="Timestep (fs)")
    ap.add_argument("--seed", type=int, default=42, help="Base RNG seed")
    ap.add_argument("--tau_fs", type=float, default=3500.0, help="Sigmoid tau (width) in fs")
    ap.add_argument("--center_fs", type=float, default=7000.0, help="Sigmoid center in fs")
    ap.add_argument("--temp_noise_scale", type=float, default=0.1, help="Scalar for temperature noise effect")

    # --- Ablation Flag ---
    ap.add_argument("--run-ablation", action="store_true", help="Run ablation (null hypothesis) test")

    # Use parse_known_args to ignore Colab/Jupyter-specific args
    args, _unknown = ap.parse_known_args()

    # --- Validate temperatures up front ---
    # 1/T is computed for the Arrhenius table after each simulation; rejecting
    # non-positive values here avoids a ZeroDivisionError once output files
    # have already been written.
    bad_temps = [T for T in args.temps if T <= 0]
    if bad_temps:
        print(f"[FATAL] Temperatures must be positive (K); got {bad_temps}.", file=sys.stderr)
        sys.exit(1)

    # --- Handle CIFs ---
    if args.cifs is None:
        print("[Info] No CIFs provided. Checking for 8F4H, 8F4I, 8F4J.cif locally...", file=sys.stderr)
        if all(os.path.exists(p) for p in default_cifs):
            args.cifs = default_cifs
            print("[Info] Found local CIFs. Running with defaults.", file=sys.stderr)
        else:
            print("[FATAL] Default CIFs not found. Please provide paths using --cifs.", file=sys.stderr)
            ap.print_help()
            sys.exit(1)

    # --- Load Geometries ---
    try:
        XH, XI, XJ, atoms_unified = load_and_align_geometries(args.cifs)
        # Convert to the active array module: xp is CuPy when GPU_ON is True
        # and NumPy after a fallback.
        geoms = (xp.asarray(XH, dtype=xp.float64),
                 xp.asarray(XI, dtype=xp.float64),
                 xp.asarray(XJ, dtype=xp.float64))
    except Exception as e:
        print(f"[FATAL] Error loading geometries: {e}", file=sys.stderr)
        sys.exit(1)

    # --- Define Run ID and Parameters ---
    # Everything in this dict feeds the RUN_ID hash and is copied verbatim
    # into the manifest.
    params = dict(
        cifs=[os.path.basename(p) for p in args.cifs],
        temps_K=args.temps,
        steps=args.steps,
        dt_fs=args.dtfs,
        seed=args.seed,
        tau_fs=args.tau_fs,
        center_fs=args.center_fs,
        temp_noise_scale=args.temp_noise_scale,
        atoms_unified=atoms_unified,
        code_version=CODE_VERSION,
        gpu_on=GPU_ON, # True only if CuPy was imported successfully
        is_ablation=args.run_ablation
    )
    RUN_ID = compute_run_id(args.cifs, params)

    # The prefix names the script variant; whether the GPU was really used is
    # reported by GPU_ON in the log line and in the manifest parameters.
    if args.run_ablation:
        print(f"[INIT] RUN_ID = {RUN_ID} (GPU={GPU_ON}) -- GPU ABLATION RUN")
        run_prefix = "GQR14_GPU_ABLATION"
    else:
        print(f"[INIT] RUN_ID = {RUN_ID} (GPU={GPU_ON}) -- GPU MAIN RUN")
        run_prefix = "GQR14_GPU_veracity"

    # --- Setup Output Directories ---
    root_dir = f"{run_prefix}_{RUN_ID}"
    runs_d = os.path.join(root_dir, "runs")       # Raw timeseries CSVs
    der_d = os.path.join(root_dir, "derived")     # CDFs, Arrhenius table
    plot_d = os.path.join(root_dir, "plots")      # PNGs
    os.makedirs(runs_d, exist_ok=True)
    os.makedirs(der_d, exist_ok=True)
    os.makedirs(plot_d, exist_ok=True)

    file_manifest = []   # every output path, in creation order
    arrhenius_data = []  # one row per temperature

    # --- Main Simulation Loop (per temperature) ---
    for T in args.temps:
        print(f"[Sim] Running T = {T} K...")
        t, s, J, d = run_tdse_like(
            geoms, T,
            dt_fs=args.dtfs, steps=args.steps,
            tau_fs=args.tau_fs, center_fs=args.center_fs,
            temp_noise_scale=args.temp_noise_scale,
            rng_seed=args.seed,
            is_ablation=args.run_ablation
        )

        # --- Save Raw Timeseries CSV ---
        base_name = f"HIJ_T{T}K_{RUN_ID}"
        csv_path = os.path.join(runs_d, f"{base_name}_timeseries.csv")
        csv_header = f"# RUN_ID={RUN_ID} T_K={T} ABLATION={args.run_ablation}"
        write_csv(csv_path, t, s, J, d, csv_header)
        file_manifest.append(csv_path)

        # --- Save Timeseries Plot ---
        plot_path = os.path.join(plot_d, f"{base_name}_timeseries.png")
        # Report the actual backend: GPU_ON is False after a CuPy fallback,
        # so the title must not claim a GPU run unconditionally.
        plot_title = f"HIJ_T{T}K_{RUN_ID} (Ablation={args.run_ablation} GPU_Run={GPU_ON})"
        plot_timeseries(t, J, s, d, plot_title, plot_path)
        file_manifest.append(plot_path)

        # --- Save CDFs ---
        cdf_J_x, cdf_J_y = cdf_series(J)
        cdf_d_x, cdf_d_y = cdf_series(d)

        cdf_J_path = os.path.join(der_d, f"{base_name}_cdf_J.csv")
        cdf_d_path = os.path.join(der_d, f"{base_name}_cdf_d.csv")

        write_cdf_csv(cdf_J_path, cdf_J_x, cdf_J_y)
        write_cdf_csv(cdf_d_path, cdf_d_x, cdf_d_y)
        file_manifest.append(cdf_J_path)
        file_manifest.append(cdf_d_path)

        # --- Collect Arrhenius Data (from tail) ---
        tail_start = int(0.8 * args.steps) # Use last 20%
        J_tail_mean = float(np.mean(J[tail_start:]))

        arrhenius_data.append({
            "T_K": T,
            "invT": 1.0 / T,
            "J_mean_tail": J_tail_mean,
            # ln is undefined for a non-positive mean; store NaN instead.
            "ln_J": np.log(J_tail_mean) if J_tail_mean > 0 else np.nan,
        })

    # --- Process and Save Arrhenius Data ---
    try:
        import pandas as pd
    except ImportError:
        print("[Error] pandas is required for Arrhenius plot. Skipping.", file=sys.stderr)
    else:
        arr_df = pd.DataFrame(arrhenius_data)
        arr_csv_path = os.path.join(der_d, f"arrhenius_table_{RUN_ID}.csv")
        arr_df.to_csv(arr_csv_path, index=False)
        file_manifest.append(arr_csv_path)

        arr_plot_path = os.path.join(plot_d, f"arrhenius_plot_{RUN_ID}.png")
        plot_arrhenius(arr_df, RUN_ID, arr_plot_path)
        file_manifest.append(arr_plot_path)

    # --- Create Final Manifest JSON (with DUAL HASHES) ---
    print("[Packaging] Generating dual-hash manifest...")
    manifest_content = {
        "run_id": RUN_ID,
        "run_prefix": run_prefix,
        "is_ablation_run": args.run_ablation,
        "created_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "parameters": params,
        "cif_hashes": [
            dict(
                file=os.path.basename(p),
                **get_hashes_of(p) # Get {'sha256': '...', 'sha512': '...'}
            ) for p in args.cifs
        ],
        "files": []
    }

    for p in file_manifest:
        file_hashes = get_hashes_of(p)
        manifest_content["files"].append({
            "file": os.path.relpath(p, root_dir),
            "bytes": os.path.getsize(p),
            **file_hashes # Add sha256 and sha512
        })

    manifest_path = os.path.join(root_dir, f"manifest_{RUN_ID}.json")
    with open(manifest_path, "w") as f:
        json.dump(manifest_content, f, indent=2)

    # --- Create Final ZIP Bundle ---
    # Archive names are relative to root_dir, matching the "file" entries
    # recorded in the manifest.
    zip_path = f"{root_dir}.zip"
    print(f"[Packaging] Creating {zip_path}...")
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        zf.write(manifest_path, arcname=os.path.basename(manifest_path))
        for p in file_manifest:
            zf.write(p, arcname=os.path.relpath(p, root_dir))

    print(f"[SUCCESS] Wrote {zip_path} (RUN_ID={RUN_ID})")


if __name__ == "__main__":
    # main() needs pandas for the Arrhenius table. If importing it fails,
    # install it with pip for the running interpreter (check=True makes a
    # failed install raise) and import it again before starting the run.
    try:
        import pandas as pd
    except Exception:
        print("[Info] pandas not found, attempting to install...", file=sys.stderr)
        subprocess.run([sys.executable, "-m", "pip", "install", "-q", "pandas"], check=True)
        import pandas as pd
    main()

Writing gqr14_make_veracity_zip_v4.5_gpu_dualhash.py


In [None]:
%%writefile gqr14_make_veracity_zip_v4.7_gpu_dualhash.py
#!/usr/bin/env python3
"""
GQR–XIV — Single-Run Veracity ZIP (atomic provenance)
v4.7-GPU_DUAL_HASH_SELF_AWARE:
- This version is "self-aware" and ADDS ITSELF to the zip file and manifest.
- This creates a complete, 100% verifiable package (Inputs + Code + Outputs).
- Enables 'cupy' for GPU run.
- Calculates SHA-256 and SHA-512 hashes.
"""
import sys, os, json, time, math, hashlib, argparse, subprocess, io, zipfile, warnings
import numpy as np
import matplotlib.pyplot as plt

# --- Optional GPU (CuPy) with logged fallback to NumPy ---
# xp is the array module used by the simulation; GPU_ON records whether it
# is CuPy. The outcome is reported on stderr in both branches.
try:
    # --- GPU IS ENABLED ---
    import cupy as cp
    # Create a tiny array as a usability test: if it raises, the except
    # branch below selects NumPy instead.
    _ = cp.array([1])
    # Import and test both succeeded, so CuPy is used.
    xp = cp
    GPU_ON = True
    print("[Info] CuPy import successful. Running on GPU.", file=sys.stderr)
except Exception as e:
    # --- If import OR test fails, fall back to CPU ---
    xp = np # Fallback to numpy
    GPU_ON = False
    print(f"[Info] CuPy test failed ({e}). Falling back to CPU (NumPy).", file=sys.stderr)

# --- Ensure gemmi available (for CIF) ---
# If the import fails, install gemmi with pip for the running interpreter
# (check=True makes a failed install raise) and import it again.
try:
    import gemmi

except Exception:
    print("[Info] gemmi not found, attempting to install...", file=sys.stderr)
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "gemmi"], check=True)
    import gemmi

# This is the "truth" of the code. If this changes, the RUN_ID changes.
CODE_VERSION = "GQR14-TDSE-VERACITY-v4.7-GPU_DUAL_HASH_SELF_AWARE"

# ===================================================================
# --- GEOMETRY (CIF) HELPERS ---
# ===================================================================

def _cif_number(text: str) -> float:
    """Parse one CIF numeric field, ignoring a trailing standard uncertainty.

    CIF values may carry an uncertainty in parentheses, e.g. ``0.12345(6)``,
    which ``float()`` rejects; everything from the first ``(`` onward is
    dropped before conversion.

    Raises:
        ValueError: if the remaining text is not a number (e.g. ``?``).
    """
    return float(text.split("(", 1)[0])


def get_first_two_atom_coords(path: str) -> np.ndarray:
    """Read the first two atom-site rows of a CIF as Cartesian coordinates.

    Coordinate columns are looked up in this order: ``_atom_site.Cartn_*``,
    ``_atom_site_fract_*``, ``_atom_site.fract_*``. Fractional coordinates
    are orthogonalised with the unit cell, which is read with
    ``gemmi.read_small_structure`` and, if that fails or yields a zero cell
    edge, with ``gemmi.read_structure``.

    Args:
        path: Path of a CIF file containing exactly one data block.

    Returns:
        A ``(2, 3)`` float64 array holding rows 0 and 1 of the atom-site loop.

    Raises:
        RuntimeError: if no coordinate columns are found, the columns differ
            in length, fewer than two rows exist, no valid cell is available
            for fractional coordinates, or either of the first two rows is
            non-numeric. Any exception is logged to stderr and re-raised.
    """
    # Tag triples in lookup order; the flag says whether the values are
    # already Cartesian (True) or fractional (False).
    candidate_tags = (
        (('_atom_site.Cartn_x', '_atom_site.Cartn_y', '_atom_site.Cartn_z'), True),
        (('_atom_site_fract_x', '_atom_site_fract_y', '_atom_site_fract_z'), False),
        (('_atom_site.fract_x', '_atom_site.fract_y', '_atom_site.fract_z'), False),
    )
    try:
        doc = gemmi.cif.read_file(path)
        block = doc.sole_block()
        for tags, is_cartesian in candidate_tags:
            xs, ys, zs = (block.find_values(tag) for tag in tags)
            if xs and ys and zs:
                break
        else:
            raise RuntimeError(f"Could not find matching x, y, z coordinate tags in {path}")
        if not (len(xs) == len(ys) == len(zs)):
             raise RuntimeError(f"Coordinate column mismatch in {path}: len(x)={len(xs)}, len(y)={len(ys)}, len(z)={len(zs)}")
        if len(xs) < 2:
            raise RuntimeError(f"Found coordinate tags, but < 2 atoms in {path}")
        cell = None
        if not is_cartesian:
            try:
                doc_small = gemmi.read_small_structure(path)
                cell = doc_small.cell
                if not (cell.a and cell.b and cell.c):
                     raise RuntimeError("Cell parameters are incomplete or zero.")
            except Exception as e:
                # Second attempt with the macromolecular reader; both errors
                # are reported if it fails as well.
                try:
                    st = gemmi.read_structure(path)
                    cell = st.cell
                    if not (cell.a and cell.b and cell.c):
                         raise RuntimeError("Cell parameters are incomplete or zero (from read_structure).")
                except Exception as e2:
                     raise RuntimeError(f"CIF has fractional coords but cell is invalid. small_structure err: {e}; read_structure err: {e2}") from e2
        coords = []
        for i in range(2):
            try:
                # _cif_number accepts values with a standard uncertainty,
                # e.g. "0.1234(5)", which plain float() would reject.
                x = _cif_number(xs[i])
                y = _cif_number(ys[i])
                z = _cif_number(zs[i])
            except ValueError:
                print(f"[Warning] Skipping non-numeric coordinate value in {path} at row {i}")
                continue
            if is_cartesian:
                coords.append([x, y, z])
            else:
                pos = cell.orthogonalize(gemmi.Fractional(x, y, z))
                coords.append([pos.x, pos.y, pos.z])
        # Only rows 0 and 1 are considered, so a skipped row always ends here.
        if len(coords) < 2:
            raise RuntimeError(f"Could not parse at least 2 valid numeric atoms in {path}")
        return np.array(coords, dtype=np.float64)
    except Exception as e:
        print(f"[FATAL] Error parsing CIF {path}: {e}", file=sys.stderr)
        raise

def get_oo_pair(coords: np.ndarray):
    """Centre a coordinate array and measure the distance between rows 0 and 1.

    Returns:
        ``(centered, distance)`` where ``centered`` is ``coords`` minus its
        mean over axis 0 and ``distance`` is the Euclidean norm of
        ``coords[0] - coords[1]`` as a Python float.
    """
    separation = float(np.linalg.norm(coords[0] - coords[1]))
    centroid = np.mean(coords, axis=0, keepdims=True)
    return coords - centroid, separation

def load_and_align_geometries(paths: list):
    """Load the H, I and J two-atom geometries from three CIF files.

    Each file is reduced to its first two atoms, centred on their mean, and
    the pair distance is printed. A ``UserWarning`` is issued if any distance
    is below 0.1.

    Returns:
        ``(coords_H, coords_I, coords_J, 2)``; the trailing 2 is the number of
        atoms per geometry.

    Raises:
        ValueError: if ``paths`` does not contain exactly three entries.
    """
    if len(paths) != 3:
        raise ValueError("Must provide exactly three CIF paths for H, I, and J.")
    (coords_H, dH), (coords_I, dI), (coords_J, dJ) = [
        get_oo_pair(get_first_two_atom_coords(cif)) for cif in paths
    ]
    print(f"[Geo] O–O Pair Distances: H={dH:.3f} Å, I={dI:.3f} Å, J={dJ:.3f} Å")
    too_short = any(dist < 0.1 for dist in (dH, dI, dJ))
    if too_short:
        warnings.warn(
            f"O-O distances ({dH:.3f}, {dI:.3f}, {dJ:.3f}) are physically unrealistic. "
            "Ensure your CIF files (8F4H, 8F4I, 8F4J) have the correct O-O pair as the first two atoms.",
            UserWarning
        )
    return coords_H, coords_I, coords_J, 2

# ===================================================================
# --- SIMULATION & PLOTTING ---
# ===================================================================

def sigmoid(t, tau_fs, center_fs):
    """Logistic function of ``(t - center_fs) / tau_fs``.

    The scaled argument is clipped to [-100, 100] before exponentiation so
    ``np.exp`` cannot overflow.
    """
    scaled = np.clip((t - center_fs) / tau_fs, -100, 100)
    return 1.0 / (1.0 + np.exp(-scaled))

def mix_coords(X_A, X_B, s):
    """Linearly interpolate between ``X_A`` (s = 0) and ``X_B`` (s = 1).

    ``s`` is converted to a Python float before use.
    """
    weight_b = float(s)
    weight_a = 1.0 - weight_b
    return weight_a * X_A + weight_b * X_B

def calculate_J_metric(s_total, dOO, temp_noise, is_ablation=False):
    """Evaluate the scalar J metric for one simulation step.

    The metric is the sum of a noise floor and, unless ablated, a signal
    term:

    * noise floor: ``temp_noise * 1e-2 + 1e-3``
    * resonance: Gaussian in ``s_total`` centered at 1.0 with width 0.5
    * distance penalty: ``1 - 2 * |dOO - 1.46|`` limited to [0.1, 1.0]

    Args:
        s_total: Morph coordinate for this step.
        dOO: Pair distance for this step.
        temp_noise: Noise sample for this step.
        is_ablation: When true, only the noise floor is returned.

    Returns:
        The metric as a Python float.
    """
    noise_floor = temp_noise * 1e-2 + 1e-3

    # Gaussian resonance in the morph coordinate.
    width = 0.5
    morph = float(s_total)
    resonance = 1.0 * np.exp(-0.5 * ((morph - 1.0) / width)**2)

    # Linear penalty on the deviation from 1.46, bounded below and above.
    separation = float(dOO)
    penalty = np.clip(1.0 - 2.0 * np.abs(separation - 1.46), 0.1, 1.0)

    metric = noise_floor if is_ablation else (resonance * penalty) + noise_floor
    return float(metric)

def run_tdse_like(geoms, T_K: float, dt_fs=0.5, steps=40000,
                  tau_fs=3500.0, center_fs=7000.0,
                  temp_noise_scale=0.1, rng_seed=42,
                  is_ablation=False):
    """Morph H -> I -> J along a sigmoid schedule and record the J metric.

    At each step the morph coordinate ``s_total = 2 * sigmoid(t)`` runs from
    0 to 2. For ``s_total <= 1`` the geometry is a blend of H and I; above 1
    it is a blend of I and J. The distance between rows 0 and 1 of the
    blended geometry and a temperature-scaled uniform noise sample are fed
    to ``calculate_J_metric``.

    Args:
        geoms: Sequence of exactly three coordinate arrays (H, I, J), built
            with the module-level array backend ``xp``.
        T_K: Temperature; scales the noise as ``T_K / 300`` and offsets the
            RNG seed.
        dt_fs: Time step.
        steps: Number of steps; ``steps + 1`` samples are produced.
        tau_fs: Sigmoid width.
        center_fs: Sigmoid center.
        temp_noise_scale: Multiplier on the noise amplitude.
        rng_seed: Base RNG seed; the effective seed is
            ``int(rng_seed + T_K)``.
        is_ablation: Forwarded to ``calculate_J_metric``.

    Returns:
        Four NumPy arrays ``(t, s_total, J, d)`` of length ``steps + 1``.
    """
    XH, XI, XJ = geoms
    # The noise stream always comes from NumPy (host side), independent of
    # which array backend holds the geometries.
    rng = np.random.default_rng(int(rng_seed + T_K))
    kT_rel = (T_K / 300.0)
    t_list, s_list, J_list, d_list = [], [], [], []
    for step in range(steps + 1):
        t_fs = step * dt_fs
        s_total = 2.0 * sigmoid(t_fs, tau_fs, center_fs)
        if s_total <= 1.0:
            # First half of the path: H -> I.
            X_curr = mix_coords(XH, XI, s_total)
        else:
            # Second half of the path: I -> J, with the local fraction
            # re-based to [0, 1].
            X_curr = mix_coords(XI, XJ, s_total - 1.0)

        dOO_dev = xp.linalg.norm(X_curr[0] - X_curr[1])
        # On the CuPy backend the scalar must be copied to the host first.
        dOO = float(dOO_dev.get() if GPU_ON else dOO_dev)

        temp_noise = (
            temp_noise_scale * kT_rel * (rng.random(1, dtype=np.float64)[0] - 0.5)
        )
        J_val = calculate_J_metric(s_total, dOO, temp_noise, is_ablation=is_ablation)
        t_list.append(t_fs)
        s_list.append(s_total)
        J_list.append(J_val)
        d_list.append(dOO)
    return (np.array(t_list), np.array(s_list),
            np.array(J_list), np.array(d_list))

def cdf_series(x: np.ndarray):
    """Empirical CDF of the non-NaN entries of ``x``.

    Returns:
        ``(values, cdf)``: the non-NaN entries sorted ascending and, for
        each, its rank divided by the number of retained entries (so the
        last CDF value is 1.0). Two empty arrays if nothing remains after
        dropping NaNs.
    """
    not_nan = np.logical_not(np.isnan(x))
    samples = x[not_nan]
    count = samples.size
    if count == 0:
        return np.array([]), np.array([])
    ordered = np.sort(samples)
    ranks = np.arange(1, count + 1)
    return ordered, ranks / count

def plot_timeseries(t, J, s, d, title, path):
    """Save a dual-axis timeseries figure to ``path``.

    The left axis shows ``J`` and ``s`` against ``t``; a twinned right axis
    shows ``d``. The figure is written at 200 dpi and then closed.
    """
    figure, left_axis = plt.subplots(figsize=(10, 5))

    # Left axis: metric and morph coordinate.
    left_axis.set_xlabel("t (fs)")
    left_axis.set_ylabel("J / s")
    left_axis.plot(t, J, label="J (metric)", lw=2)
    left_axis.plot(t, s, label="s_total (morph)", ls="--", color="tab:orange")
    left_axis.legend(loc="upper left")

    # Right axis: pair distance, sharing the time axis.
    right_axis = left_axis.twinx()
    right_axis.set_ylabel("d(O-O) [Å]")
    right_axis.plot(t, d, label="d(O-O) [Å]", color="tab:green", lw=2)
    right_axis.legend(loc="upper center")

    plt.title(title)
    figure.tight_layout()
    plt.savefig(path, dpi=200)
    plt.close(figure)

def plot_arrhenius(df, run_id, path):
    """Save an Arrhenius plot (ln J against 1/T) with a linear fit.

    Rows with a missing ``invT`` or ``ln_J`` are dropped. When at least two
    rows remain, the points and a least-squares straight line are drawn and
    the fitted ``Ea/k`` is shown in the legend. With fewer rows only the
    empty, labelled axes are saved.

    Args:
        df: pandas DataFrame with ``invT`` and ``ln_J`` columns.
        run_id: Identifier shown in the plot title.
        path: Output image path (written at 200 dpi).
    """
    fig = plt.figure(figsize=(8, 6))
    finite = df.dropna(subset=["invT", "ln_J"])
    has_fit = len(finite) >= 2
    if has_fit:
        x = finite["invT"].values
        y = finite["ln_J"].values
        A = np.vstack([x, np.ones_like(x)]).T
        slope, intercept = np.linalg.lstsq(A, y, rcond=None)[0]
        yfit = slope * x + intercept
        # ln J = const - (Ea/k) * (1/T): the fitted slope is -Ea/k, so the
        # value reported below (the negated slope) is Ea/k itself.
        Ea_over_k = -slope
        plt.plot(x, y, "o", ms=8, label="Data (mean of tail)")
        plt.plot(x, yfit, "-", lw=2, label=f"Fit: Ea/k = {Ea_over_k:.2f} K")
    plt.xlabel("1/T (K⁻¹)")
    plt.ylabel("ln (J_metric) (a.u.)")
    plt.title(f"Arrhenius Plot — RUN_ID={run_id}")
    plt.grid(True, alpha=0.3)
    if has_fit:
        # Only request a legend when labelled artists exist; otherwise
        # matplotlib complains that there is nothing to put in it.
        plt.legend()
    plt.tight_layout()
    plt.savefig(path, dpi=200)
    plt.close(fig)

# ===================================================================
# --- FILE MANAGEMENT & HASHING ---
# ===================================================================

def get_hashes_of(path: str) -> dict:
    """Return the SHA-256 and SHA-512 hex digests of a file.

    The file is read once, in 1 MiB blocks, and every block is fed to both
    digests.

    Returns:
        ``{"sha256": <hex>, "sha512": <hex>}``
    """
    digests = {"sha256": hashlib.sha256(), "sha512": hashlib.sha512()}
    block_size = 1 << 20
    with open(path, "rb") as stream:
        while True:
            block = stream.read(block_size)
            if not block:
                break
            for digest in digests.values():
                digest.update(block)
    return {name: digest.hexdigest() for name, digest in digests.items()}

def get_hashes_of_bytes(data: bytes) -> dict:
    """Return the SHA-256 and SHA-512 hex digests of an in-memory buffer.

    Returns:
        ``{"sha256": <hex>, "sha512": <hex>}``
    """
    return {
        "sha256": hashlib.sha256(data).hexdigest(),
        "sha512": hashlib.sha512(data).hexdigest(),
    }

def compute_run_id(cif_paths, params_dict) -> str:
    """Derive a 16-hex-character run identifier.

    The identifier is the leading 16 characters of the SHA-256 over, in
    order: ``CODE_VERSION``; for each CIF, its base name followed by the
    SHA-256 hex digest of its contents; and the parameters serialized as
    JSON with sorted keys.
    """
    pieces = [CODE_VERSION.encode()]
    for cif in cif_paths:
        pieces.append(os.path.basename(cif).encode())
        pieces.append(get_hashes_of(cif)["sha256"].encode())
    pieces.append(json.dumps(params_dict, sort_keys=True).encode())
    # Hashing the concatenation is equivalent to feeding the pieces one by one.
    return hashlib.sha256(b"".join(pieces)).hexdigest()[:16]

def write_csv(path, t, s, J, d, header_comment):
    """Write a timeseries CSV with a leading comment line.

    Layout: ``header_comment``, then the column header
    ``t_fs,s_total,J_metric,d_OO``, then one row per entry of ``t`` with
    ``t`` at one decimal place and the other columns at six.

    The file is written as UTF-8 with ``\\n`` line endings on every
    platform, so its bytes (and the hashes taken of it later) do not
    depend on the operating system or locale.

    Args:
        path: Output file path.
        t, s, J, d: Equal-length indexable sequences of numbers.
        header_comment: Text for the first line (no trailing newline).
    """
    with open(path, "w", encoding="utf-8", newline="\n") as f:
        f.write(header_comment + "\n")
        f.write("t_fs,s_total,J_metric,d_OO\n")
        # Index the companion columns rather than zip() them so a column
        # shorter than `t` still raises IndexError instead of silently
        # truncating the output.
        for i, t_val in enumerate(t):
            f.write(f"{t_val:.1f},{s[i]:.6f},{J[i]:.6f},{d[i]:.6f}\n")

def write_cdf_csv(path, x, cdf):
    """Write a two-column ``value,cdf`` CSV, six decimal places per field.

    The file is written as UTF-8 with ``\\n`` line endings on every
    platform, so its bytes (and the hashes taken of it later) do not
    depend on the operating system or locale.

    Args:
        path: Output file path.
        x: Indexable sequence of values.
        cdf: Indexable sequence of CDF values, same length as ``x``.
    """
    with open(path, "w", encoding="utf-8", newline="\n") as f:
        f.write("value,cdf\n")
        # Index `cdf` rather than zip() so a short `cdf` still raises
        # IndexError instead of silently truncating the output.
        for i, value in enumerate(x):
            f.write(f"{value:.6f},{cdf[i]:.6f}\n")

# ===================================================================
# --- MAIN EXECUTION ---
# ===================================================================

def main():
    """Run the full veracity pipeline and bundle the results into a ZIP.

    Steps, in order:
      1. Parse command-line options (unrecognized options are ignored).
      2. Load the three CIF geometries and derive RUN_ID from CODE_VERSION,
         the CIF names/hashes and the run parameters.
      3. For every temperature: run the simulation, then write the
         timeseries CSV, the timeseries plot and the J / d(O-O) CDF tables.
      4. Write the Arrhenius table and plot (skipped if pandas is missing).
      5. Write a manifest holding SHA-256 and SHA-512 digests of every
         output file, the CIFs and this script, then zip the manifest, the
         script and the outputs.

    Exits with status 1 when the CIFs cannot be found or loaded, and with
    an argparse usage error when a temperature is not positive.
    """

    # Path of the running script, so it can be hashed and bundled.
    script_path = os.path.abspath(sys.argv[0])

    default_cifs = ["8F4H.cif", "8F4I.cif", "8F4J.cif"]

    ap = argparse.ArgumentParser(description=f"GQR-XIV Veracity Run ({CODE_VERSION})")
    ap.add_argument("--cifs", nargs=3, default=None, help="Three CIFs for H, I, J states")
    ap.add_argument("--temps", nargs="+", type=int, default=[285, 295, 305, 315, 325], help="List of temperatures (K) to run")
    ap.add_argument("--steps", type=int, default=40000, help="Number of simulation steps")
    ap.add_argument("--dtfs", type=float, default=0.5, help="Timestep (fs)")
    ap.add_argument("--seed", type=int, default=42, help="Base RNG seed")
    ap.add_argument("--tau_fs", type=float, default=3500.0, help="Sigmoid tau (width) in fs")
    ap.add_argument("--center_fs", type=float, default=7000.0, help="Sigmoid center in fs")
    ap.add_argument("--temp_noise_scale", type=float, default=0.1, help="Scalar for temperature noise effect")
    ap.add_argument("--run-ablation", action="store_true", help="Run ablation (null hypothesis) test")

    # parse_known_args tolerates extra arguments injected by the host
    # environment; they are deliberately ignored.
    args, _unknown = ap.parse_known_args()

    # Fail early: 1/T is computed for every temperature further down, so a
    # non-positive value would otherwise only blow up after simulating.
    if any(T <= 0 for T in args.temps):
        ap.error("--temps values must be positive temperatures in kelvin.")

    # --- Handle CIFs ---
    if args.cifs is None:
        print("[Info] No CIFs provided. Checking for 8F4H, 8F4I, 8F4J.cif locally...", file=sys.stderr)
        if all(os.path.exists(p) for p in default_cifs):
            args.cifs = default_cifs
            print("[Info] Found local CIFs. Running with defaults.", file=sys.stderr)
        else:
            print("[FATAL] Default CIFs not found. Please provide paths using --cifs.", file=sys.stderr)
            ap.print_help()
            sys.exit(1)

    # --- Load Geometries ---
    try:
        XH, XI, XJ, atoms_unified = load_and_align_geometries(args.cifs)
        # Move the geometries onto the active array backend (NumPy or CuPy).
        geoms = (xp.asarray(XH, dtype=xp.float64),
                 xp.asarray(XI, dtype=xp.float64),
                 xp.asarray(XJ, dtype=xp.float64))
    except Exception as e:
        print(f"[FATAL] Error loading geometries: {e}", file=sys.stderr)
        sys.exit(1)

    # --- Define Run ID and Parameters ---
    params = dict(
        cifs=[os.path.basename(p) for p in args.cifs],
        temps_K=args.temps,
        steps=args.steps,
        dt_fs=args.dtfs,
        seed=args.seed,
        tau_fs=args.tau_fs,
        center_fs=args.center_fs,
        temp_noise_scale=args.temp_noise_scale,
        atoms_unified=atoms_unified,
        code_version=CODE_VERSION,
        gpu_on=GPU_ON,  # actual backend in use; feeds into RUN_ID
        is_ablation=args.run_ablation
    )
    RUN_ID = compute_run_id(args.cifs, params)

    # The "GPU" in the prefixes below is a fixed label of this script
    # variant; the backend really used is the GPU={GPU_ON} value printed.
    if args.run_ablation:
        print(f"[INIT] RUN_ID = {RUN_ID} (GPU={GPU_ON}) -- GPU ABLATION RUN")
        run_prefix = "GQR14_GPU_ABLATION"
    else:
        print(f"[INIT] RUN_ID = {RUN_ID} (GPU={GPU_ON}) -- GPU MAIN RUN")
        run_prefix = "GQR14_GPU_veracity"

    # --- Setup Output Directories ---
    root_dir = f"{run_prefix}_{RUN_ID}"
    runs_d = os.path.join(root_dir, "runs")       # Raw timeseries CSVs
    der_d = os.path.join(root_dir, "derived")     # CDFs, Arrhenius table
    plot_d = os.path.join(root_dir, "plots")      # PNGs
    os.makedirs(runs_d, exist_ok=True)
    os.makedirs(der_d, exist_ok=True)
    os.makedirs(plot_d, exist_ok=True)

    file_manifest = []
    arrhenius_data = []

    # --- Main Simulation Loop (per temperature) ---
    for T in args.temps:
        print(f"[Sim] Running T = {T} K...")
        t, s, J, d = run_tdse_like(
            geoms, T,
            dt_fs=args.dtfs, steps=args.steps,
            tau_fs=args.tau_fs, center_fs=args.center_fs,
            temp_noise_scale=args.temp_noise_scale,
            rng_seed=args.seed,
            is_ablation=args.run_ablation
        )

        # --- Save Raw Timeseries CSV ---
        base_name = f"HIJ_T{T}K_{RUN_ID}"
        csv_path = os.path.join(runs_d, f"{base_name}_timeseries.csv")
        csv_header = f"# RUN_ID={RUN_ID} T_K={T} ABLATION={args.run_ablation}"
        write_csv(csv_path, t, s, J, d, csv_header)
        file_manifest.append(csv_path)

        # --- Save Timeseries Plot ---
        plot_path = os.path.join(plot_d, f"{base_name}_timeseries.png")
        # Report the backend actually used instead of a hard-coded "True",
        # so CPU-fallback runs are not mislabelled in the figure.
        plot_title = f"HIJ_T{T}K_{RUN_ID} (Ablation={args.run_ablation} GPU_Run={GPU_ON})"
        plot_timeseries(t, J, s, d, plot_title, plot_path)
        file_manifest.append(plot_path)

        # --- Save CDFs ---
        cdf_J_x, cdf_J_y = cdf_series(J)
        cdf_d_x, cdf_d_y = cdf_series(d)

        cdf_J_path = os.path.join(der_d, f"{base_name}_cdf_J.csv")
        cdf_d_path = os.path.join(der_d, f"{base_name}_cdf_d.csv")

        write_cdf_csv(cdf_J_path, cdf_J_x, cdf_J_y)
        write_cdf_csv(cdf_d_path, cdf_d_x, cdf_d_y)
        file_manifest.append(cdf_J_path)
        file_manifest.append(cdf_d_path)

        # --- Collect Arrhenius Data (from tail) ---
        tail_start = int(0.8 * args.steps) # Use last 20%
        J_tail_mean = float(np.mean(J[tail_start:]))

        arrhenius_data.append({
            "T_K": T,
            "invT": 1.0 / T,
            "J_mean_tail": J_tail_mean,
            # A non-positive mean has no logarithm; NaN rows are dropped
            # before the Arrhenius fit.
            "ln_J": np.log(J_tail_mean) if J_tail_mean > 0 else np.nan,
        })

    # --- Process and Save Arrhenius Data ---
    try:
        import pandas as pd
    except ImportError:
        print("[Error] pandas is required for Arrhenius plot. Skipping.", file=sys.stderr)
    else:
        arr_df = pd.DataFrame(arrhenius_data)
        arr_csv_path = os.path.join(der_d, f"arrhenius_table_{RUN_ID}.csv")
        arr_df.to_csv(arr_csv_path, index=False)
        file_manifest.append(arr_csv_path)

        arr_plot_path = os.path.join(plot_d, f"arrhenius_plot_{RUN_ID}.png")
        plot_arrhenius(arr_df, RUN_ID, arr_plot_path)
        file_manifest.append(arr_plot_path)

    # --- Create Final Manifest JSON (with DUAL HASHES) ---
    print("[Packaging] Generating dual-hash manifest...")

    # Hash of the script itself, recorded next to the data hashes.
    script_hashes = get_hashes_of(script_path)

    manifest_content = {
        "run_id": RUN_ID,
        "run_prefix": run_prefix,
        "is_ablation_run": args.run_ablation,
        "created_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "parameters": params,
        "run_script": {
            "file": os.path.basename(script_path),
            **script_hashes
        },
        "cif_hashes": [
            dict(
                file=os.path.basename(p),
                **get_hashes_of(p) # Get {'sha256': '...', 'sha512': '...'}
            ) for p in args.cifs
        ],
        "files": []
    }

    for p in file_manifest:
        file_hashes = get_hashes_of(p)
        manifest_content["files"].append({
            "file": os.path.relpath(p, root_dir),
            "bytes": os.path.getsize(p),
            **file_hashes # Add sha256 and sha512
        })

    manifest_path = os.path.join(root_dir, f"manifest_{RUN_ID}.json")
    with open(manifest_path, "w") as f:
        json.dump(manifest_content, f, indent=2)

    # --- Create Final ZIP Bundle ---
    zip_path = f"{root_dir}.zip"
    print(f"[Packaging] Creating {zip_path}...")
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        # Add the manifest
        zf.write(manifest_path, arcname=os.path.basename(manifest_path))
        # Add the script itself to the zip
        zf.write(script_path, arcname=os.path.basename(script_path))
        # Add all the data files, using paths relative to the run directory
        for p in file_manifest:
            zf.write(p, arcname=os.path.relpath(p, root_dir))

    print(f"[SUCCESS] Wrote {zip_path} (RUN_ID={RUN_ID})")


if __name__ == "__main__":
    # Script entry point. Make sure pandas is importable before main() runs:
    # main() uses it to build the Arrhenius table. If the import fails for
    # any reason, attempt a quiet `pip install` with the current interpreter
    # and import again; check=True makes a failed install raise instead of
    # continuing silently.
    try:
        import pandas as pd
    except Exception:
        print("[Info] pandas not found, attempting to install...", file=sys.stderr)
        subprocess.run([sys.executable, "-m", "pip", "install", "-q", "pandas"], check=True)
        import pandas as pd
    main()

Writing gqr14_make_veracity_zip_v4.7_gpu_dualhash.py


In [None]:
!python gqr14_make_veracity_zip_v4.7_gpu_dualhash.py --cifs 8F4H.cif 8F4I.cif 8F4J.cif  --run-ablation


[Info] CuPy test failed (cudaErrorInsufficientDriver: CUDA driver version is insufficient for CUDA runtime version). Falling back to CPU (NumPy).
[Info] gemmi not found, attempting to install...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25h[Geo] O–O Pair Distances: H=1.459 Å, I=1.458 Å, J=1.460 Å
[INIT] RUN_ID = e40dcc8cb7e28a97 (GPU=False) -- GPU ABLATION RUN
[Sim] Running T = 285 K...
[Sim] Running T = 295 K...
[Sim] Running T = 305 K...
[Sim] Running T = 315 K...
[Sim] Running T = 325 K...
[Packaging] Generating dual-hash manifest...
[Packaging] Creating GQR14_GPU_ABLATION_e40dcc8cb7e28a97.zip...
[SUCCESS] Wrote GQR14_GPU_ABLATION_e40dcc8cb7e28a97.zip (RUN_ID=e40dcc8cb7e28a97)


In [None]:
!python gqr14_make_veracity_zip_v4.7_gpu_dualhash.py --cifs 8F4H.cif 8F4I.cif 8F4J.cif


[Info] CuPy test failed (cudaErrorInsufficientDriver: CUDA driver version is insufficient for CUDA runtime version). Falling back to CPU (NumPy).
[Geo] O–O Pair Distances: H=1.459 Å, I=1.458 Å, J=1.460 Å
[INIT] RUN_ID = 7429b34f1de22e04 (GPU=False) -- GPU MAIN RUN
[Sim] Running T = 285 K...
[Sim] Running T = 295 K...
[Sim] Running T = 305 K...
[Sim] Running T = 315 K...
[Sim] Running T = 325 K...
[Packaging] Generating dual-hash manifest...
[Packaging] Creating GQR14_GPU_veracity_7429b34f1de22e04.zip...
[SUCCESS] Wrote GQR14_GPU_veracity_7429b34f1de22e04.zip (RUN_ID=7429b34f1de22e04)


In [None]:
!python gqr14_make_veracity_zip_v4.7_gpu_dualhash.py --cifs 8F4H.cif 8F4I.cif 8F4J.cif  --run-ablation


[Info] CuPy import successful. Running on GPU.
[Info] gemmi not found, attempting to install...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m96.7 MB/s[0m eta [36m0:00:00[0m
[?25h[Geo] O–O Pair Distances: H=1.459 Å, I=1.458 Å, J=1.460 Å
[INIT] RUN_ID = 0b400e870e9517ff (GPU=True) -- GPU ABLATION RUN
[Sim] Running T = 285 K...
[Sim] Running T = 295 K...
[Sim] Running T = 305 K...
[Sim] Running T = 315 K...
[Sim] Running T = 325 K...
[Packaging] Generating dual-hash manifest...
[Packaging] Creating GQR14_GPU_ABLATION_0b400e870e9517ff.zip...
[SUCCESS] Wrote GQR14_GPU_ABLATION_0b400e870e9517ff.zip (RUN_ID=0b400e870e9517ff)


In [None]:
!python gqr14_make_veracity_zip_v4.7_gpu_dualhash.py --cifs 8F4H.cif 8F4I.cif 8F4J.cif


[Info] CuPy import successful. Running on GPU.
[Geo] O–O Pair Distances: H=1.459 Å, I=1.458 Å, J=1.460 Å
[INIT] RUN_ID = 44b579d8b4e8d83a (GPU=True) -- GPU MAIN RUN
[Sim] Running T = 285 K...
[Sim] Running T = 295 K...
[Sim] Running T = 305 K...
[Sim] Running T = 315 K...
[Sim] Running T = 325 K...
[Packaging] Generating dual-hash manifest...
[Packaging] Creating GQR14_GPU_veracity_44b579d8b4e8d83a.zip...
[SUCCESS] Wrote GQR14_GPU_veracity_44b579d8b4e8d83a.zip (RUN_ID=44b579d8b4e8d83a)
