In [1]:
import glob
print(len(glob.glob('work_dirs/host_hybrid/*/eq1_final.data')))
sorted(glob.glob('work_dirs/host_hybrid/*/eq1_final.data'))

63


['work_dirs/host_hybrid/102/eq1_final.data',
 'work_dirs/host_hybrid/104/eq1_final.data',
 'work_dirs/host_hybrid/106/eq1_final.data',
 'work_dirs/host_hybrid/1099/eq1_final.data',
 'work_dirs/host_hybrid/11/eq1_final.data',
 'work_dirs/host_hybrid/110/eq1_final.data',
 'work_dirs/host_hybrid/1119/eq1_final.data',
 'work_dirs/host_hybrid/1365/eq1_final.data',
 'work_dirs/host_hybrid/145/eq1_final.data',
 'work_dirs/host_hybrid/1552/eq1_final.data',
 'work_dirs/host_hybrid/1585/eq1_final.data',
 'work_dirs/host_hybrid/1661/eq1_final.data',
 'work_dirs/host_hybrid/191/eq1_final.data',
 'work_dirs/host_hybrid/192/eq1_final.data',
 'work_dirs/host_hybrid/199/eq1_final.data',
 'work_dirs/host_hybrid/2066/eq1_final.data',
 'work_dirs/host_hybrid/21/eq1_final.data',
 'work_dirs/host_hybrid/2110/eq1_final.data',
 'work_dirs/host_hybrid/2113/eq1_final.data',
 'work_dirs/host_hybrid/2132/eq1_final.data',
 'work_dirs/host_hybrid/216/eq1_final.data',
 'work_dirs/host_hybrid/2192/eq1_final.data',
 

In [2]:
import polars as pl
all_smiles = pl.read_csv('../data/from_host/train.csv')['SMILES'].to_list()
SMILES_INDEX = 1552
all_smiles[SMILES_INDEX]

'*CCCS(*)(=O)=O'

In [3]:
# Python >= 3.11, Ubuntu. Verbose names, single spaces around equals.
import re
import numpy as np
import MDAnalysis as mda
from MDAnalysis.lib import mdamath

def parse_type_to_mass_map_from_lammps_data(data_file_path: str) -> dict[int, float]:
    """
    Parse the 'Masses' section of a LAMMPS DATA file and return {atom_type: mass_amu}.
    Robust to comments; stops when the next section header begins.
    """
    with open(data_file_path, "r") as file_handle:
        all_lines = file_handle.read().splitlines()

    start_index = None
    for index, line_text in enumerate(all_lines):
        if line_text.strip().lower().startswith("masses"):
            start_index = index + 1
            break
    if start_index is None:
        raise ValueError("No 'Masses' section found in DATA file.")

    type_to_mass_map: dict[int, float] = {}
    numeric_line_pattern = re.compile(
        r"^\s*(\d+)\s+([+\-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+\-]?\d+)?)"
    )

    i = start_index
    while i < len(all_lines):
        text = all_lines[i].strip()
        i += 1
        if not text:
            continue
        if text[0].isalpha():  # next section header like "Atoms", "Bonds", etc.
            break
        m = numeric_line_pattern.match(text)
        if m:
            atom_type = int(m.group(1))
            mass_amu = float(m.group(2))
            type_to_mass_map[atom_type] = mass_amu

    if not type_to_mass_map:
        raise ValueError("Failed to parse any masses from the 'Masses' section.")
    return type_to_mass_map

def initialize_universe_with_dump_and_masses(
    lammps_dump_path: str,
    lammps_data_path_for_masses: str,
) -> mda.Universe:
    """
    Load coordinates/box/types from a LAMMPS dump, attach masses from a DATA file's 'Masses' section,
    and return a ready-to-use MDAnalysis Universe.
    """
    # 1) Load the dump (coordinates + periodic box + atom types)
    universe = mda.Universe(lammps_dump_path, format="LAMMPSDUMP")

    # 2) Parse type->mass (amu) from the DATA file
    type_to_mass_amu = parse_type_to_mass_map_from_lammps_data(lammps_data_path_for_masses)

    # 3) Build per-atom masses from 'types' in the dump and attach to Universe
    atom_types_raw = np.asarray(universe.atoms.types)
    try:
        atom_types_int = atom_types_raw.astype(int)
    except Exception:
        atom_types_int = np.array([int(str(t)) for t in atom_types_raw], dtype=int)

    missing_types = sorted(set(atom_types_int) - set(type_to_mass_amu.keys()))
    if missing_types:
        raise KeyError(f"Missing masses for atom types: {missing_types}")

    per_atom_masses_amu = np.array([type_to_mass_amu[t] for t in atom_types_int], dtype=float)
    universe.add_TopologyAttr("masses", per_atom_masses_amu)

    # 4) Optional sanity checks (safe bounds: average atomic mass between ~6 and ~40 amu)
    average_atomic_mass_amu = float(per_atom_masses_amu.mean())
    if not (5.0 <= average_atomic_mass_amu <= 40.0):
        raise RuntimeError(f"Average atomic mass looks off: {average_atomic_mass_amu:.2f} amu")

    # 5) Optional: quick density print for verification (g/cm^3)
    amu_to_g = 1.66053906660e-24
    volume_angstrom3 = mdamath.box_volume(universe.dimensions)  # Å^3
    density_g_per_cm3 = (per_atom_masses_amu.sum() * amu_to_g) / (volume_angstrom3 * 1e-24)
    print(f"[init] n_atoms={len(universe.atoms)}, avg_mass_amu={average_atomic_mass_amu:.3f}, "
          f"density_g_per_cm3={density_g_per_cm3:.6f}")

    return universe

# -------------------------
# Example usage:
# -------------------------
dump_path = f"work_dirs/host_hybrid/{SMILES_INDEX}/eq1_final.dump"
data_path = f"work_dirs/host_hybrid/{SMILES_INDEX}/eq1_final.data"
u = initialize_universe_with_dump_and_masses(
    lammps_dump_path=dump_path,
    lammps_data_path_for_masses=data_path,
)

# topology_path = "work_dirs/host_hybrid/70/eq1_final.data"
# coords_path   = "work_dirs/host_hybrid/70/eq1_final.dump"
# u = mda.Universe(
#     topology_path,
#     coords_path,
#     format=("LAMMPSDATA", "LAMMPSDUMP"),
#     atom_style="full",
#     n_atoms=count_atoms_from_data_file(topology_path),
# )

  from .autonotebook import tqdm as notebook_tqdm


[init] n_atoms=5960, avg_mass_amu=8.777, density_g_per_cm3=0.957941


  ts.data["time"] = step_num * ts.dt


In [4]:
import re
import numpy as np

def parse_bond_atom_ids_from_lammps_data(data_file_path: str) -> np.ndarray:
    """
    Returns an (N, 2) array of LAMMPS atom IDs (1-based) from the DATA file's 'Bonds' section.
    Assumes canonical 'Bonds' lines:  id  type  atom_i  atom_j  [# comment]
    """
    with open(data_file_path, "r") as fh:
        lines = fh.read().splitlines()

    # locate 'Bonds' header
    start = None
    for i, line in enumerate(lines):
        if line.strip().lower().startswith("bonds"):
            start = i + 1
            break
    if start is None:
        raise ValueError("No 'Bonds' section found in DATA file.")

    bonds = []
    patt = re.compile(r"^\s*(\d+)\s+(\d+)\s+(\d+)\s+(\d+)")
    i = start
    while i < len(lines):
        s = lines[i].strip()
        i += 1
        if not s:
            continue
        if s[0].isalpha():  # next section header
            break
        m = patt.match(s)
        if m:
            ai_id = int(m.group(3))  # LAMMPS atom IDs (1-based)
            aj_id = int(m.group(4))
            bonds.append((ai_id, aj_id))
    if not bonds:
        raise ValueError("Parsed zero bonds from 'Bonds' section.")
    return np.asarray(bonds, dtype=int)

def attach_bonds_by_atom_ids(universe, bonds_atom_ids_1based: np.ndarray) -> None:
    """
    Map (atom_id_i, atom_id_j) to 0-based indices in the current Universe and attach as 'bonds'.
    """
    # Build ID -> index map from the dump (your dump includes 'id', so MDAnalysis populated .ids)
    id_to_index = {int(atom_id): idx for idx, atom_id in enumerate(universe.atoms.ids)}
    pairs = []
    for ai_id, aj_id in bonds_atom_ids_1based:
        if ai_id in id_to_index and aj_id in id_to_index:
            ai = id_to_index[ai_id]
            aj = id_to_index[aj_id]
            if ai != aj:
                pairs.append((ai, aj))
    if not pairs:
        raise RuntimeError("No bonds matched current atom IDs; check that DATA and dump refer to the same system.")
    bonds_indices = np.asarray(pairs, dtype=int)
    # If bonds already exist, skip; otherwise attach
    if getattr(universe, "bonds", None) is None or len(universe.bonds) == 0:
        universe.add_TopologyAttr("bonds", bonds_indices)

# ---- call this once after you create `u` from the dump/xtc ----
bonds_ids = parse_bond_atom_ids_from_lammps_data(data_path)
attach_bonds_by_atom_ids(u, bonds_ids)

print(f"Attached bonds: {len(u.bonds)}")

Attached bonds: 5950


In [5]:
import numpy as np
from MDAnalysis.lib import mdamath
import freud

# --- positions & box for freud ---
positions_angstrom = u.atoms.positions.astype(np.float64)
box_matrix_angstrom = mdamath.triclinic_vectors(u.dimensions)  # 3x3
freud_box = freud.Box.from_matrix(box_matrix_angstrom)

# --- mass density (g/cm^3) ---
amu_to_g = 1.66053906660e-24
volume_ang3 = mdamath.box_volume(u.dimensions)          # Å^3
rho_g_cm3 = (u.atoms.masses.sum() * amu_to_g) / (volume_ang3 * 1e-24)

# --- RDF first peak ---
rdf = freud.density.RDF(bins=200, r_max=10.0)
rdf.compute(system=(freud_box, positions_angstrom))
first_peak_idx = int(np.argmax(rdf.rdf))
first_peak_distance_angstrom = float(rdf.bin_centers[first_peak_idx])

# --- Steinhardt Q6 ---
q6 = freud.order.Steinhardt(l=6)
q6.compute((freud_box, positions_angstrom), neighbors={'num_neighbors': 12, 'exclude_ii': True})
average_q6 = float(q6.order)

# --- Voronoi cell volumes (Å^3) ---
voro = freud.locality.Voronoi()
voro.compute(system=(freud_box, positions_angstrom))
voronoi_volumes_ang3 = voro.volumes  # per-atom cell volumes

print(f"rho = {rho_g_cm3:.3f}, Q6 = {average_q6:.4f}, gr_peak = {first_peak_distance_angstrom:.2f} Å")
print(f"Voronoi volumes: mean = {np.mean(voronoi_volumes_ang3):.2f} Å^3")

rho = 0.958, Q6 = 0.0085, gr_peak = 1.08 Å
Voronoi volumes: mean = 15.21 Å^3


In [6]:
# --- Build Voronoi & volumes (you already have freud_box and positions_angstrom) ---
voronoi_calculator = freud.locality.Voronoi()
voronoi_calculator.compute(system=(freud_box, positions_angstrom))
voronoi_volumes_ang3 = np.asarray(voronoi_calculator.volumes, dtype=float)  # per-atom

# --- Choose van der Waals radii per atom (Å) ---
# If you have element names: map element->radius; otherwise fallback per-type or a single value.
bondi_vdw_radius_by_element_angstrom = {
    "H": 1.20, "C": 1.70, "N": 1.55, "O": 1.52, "F": 1.47,
    "P": 1.80, "S": 1.80, "Cl": 1.75, "Br": 1.85, "Si": 2.10
}

def build_per_atom_vdw_radii_angstrom(universe):
    # Try to use elements if available; else names’ leading letters; else default heavy-atom radius.
    try:
        element_symbols = [el if el is not None else "" for el in getattr(universe.atoms, "elements", [None]*len(universe.atoms))]
    except Exception:
        element_symbols = ["" for _ in range(len(universe.atoms))]

    per_atom_radii = []
    for atom, element_symbol in zip(universe.atoms, element_symbols):
        radius = None
        key = element_symbol if element_symbol in bondi_vdw_radius_by_element_angstrom else str(getattr(atom, "name", "")).rstrip("0123456789")
        if key in bondi_vdw_radius_by_element_angstrom:
            radius = bondi_vdw_radius_by_element_angstrom[key]
        elif key and key[0] in bondi_vdw_radius_by_element_angstrom:
            radius = bondi_vdw_radius_by_element_angstrom[key[0]]
        else:
            # Fallback: treat as generic heavy atom
            radius = 1.70
        per_atom_radii.append(radius)
    return np.asarray(per_atom_radii, dtype=float)

def estimate_ffv_monte_carlo(
    freud_box,
    atom_positions_angstrom: np.ndarray,
    per_atom_vdw_radii_angstrom: np.ndarray,
    probe_radius_angstrom: float = 0.0,
    grid_points_per_axis: int = 48,
    rng_seed: int = 0
) -> float:
    """
    Returns an estimate of the probe-accessible FFV using uniform grid sampling
    with PBC. Increase grid_points_per_axis for accuracy (cost ~ N^3).
    """
    rng = np.random.default_rng(rng_seed)

    # --- build Cartesian grid in fractional space, then map to box ---
    fractional_lin = (np.arange(grid_points_per_axis, dtype=float) + 0.5) / grid_points_per_axis
    fx, fy, fz = np.meshgrid(fractional_lin, fractional_lin, fractional_lin, indexing="ij")
    sample_fracs = np.column_stack([fx.ravel(), fy.ravel(), fz.ravel()])  # (M,3)

    box_matrix_angstrom = freud_box.to_matrix().astype(float)  # 3x3
    sample_points_angstrom = sample_fracs @ box_matrix_angstrom.T         # (M,3)

    # --- neighbor query with a single conservative cutoff (max radius) ---
    max_effective_radius = float(np.max(per_atom_vdw_radii_angstrom) + probe_radius_angstrom)
    neighbor_query = freud.locality.AABBQuery(freud_box, atom_positions_angstrom.astype(float))
    neighbor_list = neighbor_query.query(sample_points_angstrom, {"r_max": max_effective_radius}).toNeighborList()

    # --- mark any sample point that has at least one neighbor within that conservative cutoff as "potentially occupied" ---
    # This is conservative; to be stricter, filter by per-atom specific radius below (optional refinement).
    occupied_flags = np.zeros(sample_points_angstrom.shape[0], dtype=bool)
    occupied_flags[np.asarray(neighbor_list.query_point_indices, dtype=np.int64)] = True

    # Optional refinement: shrink back by checking per-atom specific radii
    # (kept lightweight; comment out if the neighbor list exposes distances differently in your freud)
    try:
        # If your freud exposes neighbor vectors/distances, validate per-pair
        neighbor_vectors = np.asarray(neighbor_list.separations, dtype=float)  # shape (K,3) in some freud builds
        neighbor_distances = np.linalg.norm(neighbor_vectors, axis=1)
        neighbor_atom_indices = np.asarray(neighbor_list.point_indices, dtype=np.int64)
        neighbor_query_indices = np.asarray(neighbor_list.query_point_indices, dtype=np.int64)
        effective_radii_per_pair = per_atom_vdw_radii_angstrom[neighbor_atom_indices] + probe_radius_angstrom
        within_true_radius = neighbor_distances <= effective_radii_per_pair + 1e-9

        # Reset & re-mark with the refined criterion
        occupied_flags[:] = False
        occupied_flags[neighbor_query_indices[within_true_radius]] = True
    except Exception:
        # If separations/distances are not exposed in your freud version,
        # we keep the conservative occupied_flags computed above.
        pass

    ffv_estimate = float((~occupied_flags).mean())
    return ffv_estimate

per_atom_vdw_radii_angstrom = build_per_atom_vdw_radii_angstrom(u)

# Example call (uses same radii array built above):
ffv_accessible = estimate_ffv_monte_carlo(
    freud_box=freud_box,
    atom_positions_angstrom=positions_angstrom,
    per_atom_vdw_radii_angstrom=per_atom_vdw_radii_angstrom,
    # probe_radius_angstrom=1.20,   # e.g., helium-like probe; set 0.0 for geometric void
    probe_radius_angstrom=0,
    grid_points_per_axis=48,      # ↑ to 64–96 for tighter estimates
    rng_seed=0
)

print('FFV (monte-carlo):', ffv_accessible)

FFV (monte-carlo): 0.3735080295138889


# Part 2

In [8]:
from MDAnalysis.transformations import unwrap, wrap

# 1) Make molecules whole and re-wrap into the primary cell for every frame
# u.trajectory.add_transformations(
#     unwrap(u.atoms),                    # join molecules split by PBC
#     wrap(u.atoms, compound='fragments') # move each molecule back into box as a unit
# )

# 2) Compute Rg per chain (heavy atoms only), then summarize
rg_values_per_chain = []
for fragment in u.atoms.fragments:
    heavy_mask = (fragment.masses > 1.2)
    # heavy_mask = (fragment.masses > 0)
    if np.count_nonzero(heavy_mask) < 10:
        continue
    chain_rg = float(fragment[heavy_mask].radius_of_gyration(wrap=True))
    rg_values_per_chain.append(chain_rg)

if not rg_values_per_chain:
    raise RuntimeError("No chain had enough heavy atoms for an Rg estimate.")

print(rg_values_per_chain)
rg_values_per_chain = np.array(rg_values_per_chain, dtype=float)
print(
    f"Rg per chain (Å): median={np.median(rg_values_per_chain):.2f}, "
    f"mean={np.mean(rg_values_per_chain):.2f}, "
    f"std={np.std(rg_values_per_chain):.2f}, "
    f"p10={np.percentile(rg_values_per_chain,10):.2f}, "
    f"p90={np.percentile(rg_values_per_chain,90):.2f}, "
    f"n_chains={len(rg_values_per_chain)}"
)

[22.41651024139426, 15.065481516990053, 18.313351487763217, 24.215045194228377, 18.47992588829563, 21.166790719862608, 19.24184774307696, 16.599726512837695, 15.046631781066885, 19.62132465197348]
Rg per chain (Å): median=18.86, mean=19.02, std=2.86, p10=15.06, p90=22.60, n_chains=10


In [9]:
import numpy as np
from collections import deque
import MDAnalysis as mda
from MDAnalysis.transformations import unwrap, wrap, center_in_box
from MDAnalysis.analysis import msd, polymer

# ------------------------------------------------------------
# 0) Transformations: make molecules whole & keep them in the box
# ------------------------------------------------------------
u.trajectory.add_transformations(
    unwrap(u.atoms),
    wrap(u.atoms, compound='fragments'),
    center_in_box(u.atoms, wrap=True),
)
# Activate transforms by stepping the trajectory
_ = u.trajectory[0]
_ = u.trajectory[-1]

# ------------------------------------------------------------
# 1) Diffusivity via Einstein MSD (3D): D = slope / 6
#    Pass time_per_frame_ps if the time axis isn't present in the file.
# ------------------------------------------------------------
def compute_diffusivity(
    universe: mda.Universe,
    selection: str = "all",
    fit_points: int = 20,
    time_per_frame_ps: float | None = None
) -> tuple[float, float]:
    msd_result = msd.EinsteinMSD(universe, select=selection).run()
    msd_values_A2 = np.asarray(msd_result.results.timeseries, dtype=float)
    n_frames = len(msd_values_A2)
    if n_frames < 2:
        raise RuntimeError("Not enough frames to fit MSD slope (need >= 2 frames).")

    time_values_ps = np.asarray(msd_result.times, dtype=float)
    if time_values_ps.size != n_frames or not np.all(np.isfinite(time_values_ps)):
        dt_ps = time_per_frame_ps if time_per_frame_ps is not None else getattr(universe.trajectory, "dt", None)
        if dt_ps is None:
            raise RuntimeError("No time axis; provide time_per_frame_ps (e.g., 0.2 for 2 fs step dumped every 100).")
        time_values_ps = np.arange(n_frames, dtype=float) * float(dt_ps)

    k = max(2, min(int(fit_points), n_frames))
    slope_A2_per_ps = float(np.polyfit(time_values_ps[:k], msd_values_A2[:k], 1)[0])
    diffusivity_A2_per_ps = slope_A2_per_ps / 6.0
    diffusivity_cm2_per_s = diffusivity_A2_per_ps * 1e-4
    return diffusivity_A2_per_ps, diffusivity_cm2_per_s

# ------------------------------------------------------------
# 2) Mass-weighted shape metrics (global and per-chain), PBC-aware
#    Asphericity    b = λ1 - 0.5*(λ2 + λ3)           (Å²)
#    Acylindricity  c = λ2 - λ3                      (Å²)
#    κ² (anisotropy)   = 1 - 3*(Σ λiλj)/(Σ λi)²      (dimensionless)
#    We build the mass-weighted gyration tensor manually for compatibility.
# ------------------------------------------------------------
def mass_weighted_gyration_eigenvalues(atomgroup: mda.core.groups.AtomGroup) -> np.ndarray:
    """
    Return eigenvalues (λ1 >= λ2 >= λ3) of the mass-weighted gyration tensor (Å²).
    Compatible with MDAnalysis versions lacking AtomGroup.gyration_tensor().
    """
    positions_angstrom = atomgroup.positions.astype(float)            # shape (N,3)
    masses_amu = atomgroup.masses.astype(float)                       # shape (N,)
    if positions_angstrom.size == 0:
        raise ValueError("AtomGroup is empty.")
    if np.all(masses_amu == 0):
        raise ValueError("Masses are all zero; attach masses before computing shape metrics.")

    # Mass-weighted center-of-mass in PBC (uses masses internally)
    center_of_mass_angstrom = atomgroup.center_of_mass(wrap=True).astype(float)
    centered_positions = positions_angstrom - center_of_mass_angstrom

    # Mass-weighted gyration tensor G = (1/M) Σ m_i r_i r_i^T
    total_mass = float(np.sum(masses_amu))
    weighted_positions = centered_positions * masses_amu[:, None]
    gyration_tensor_A2 = (weighted_positions.T @ centered_positions) / total_mass  # (3,3)

    # Eigenvalues sorted descending
    eigenvalues = np.linalg.eigvalsh(gyration_tensor_A2)
    return eigenvalues[::-1].astype(float)

def shape_metrics_from_eigs(eigvals_desc: np.ndarray) -> dict:
    lam1, lam2, lam3 = map(float, eigvals_desc)
    trace = lam1 + lam2 + lam3
    asphericity_A2 = lam1 - 0.5 * (lam2 + lam3)
    acylindricity_A2 = lam2 - lam3
    kappa2 = 1.0 - 3.0 * ((lam1*lam2 + lam2*lam3 + lam3*lam1) / (trace*trace + 1e-30))
    return {
        "asphericity_A2": asphericity_A2,
        "acylindricity_A2": acylindricity_A2,
        "kappa2": kappa2,
        "trace_A2": trace,
        "lambda1_A2": lam1,
        "lambda2_A2": lam2,
        "lambda3_A2": lam3,
    }

def compute_shape_metrics(universe: mda.Universe) -> tuple[dict, dict]:
    # Global (all atoms, last frame)
    eigvals_global = mass_weighted_gyration_eigenvalues(universe.atoms)
    global_metrics = shape_metrics_from_eigs(eigvals_global)

    # Per-chain over fragments (filter tiny ones)
    per_chain = []
    for fragment in universe.atoms.fragments:
        if fragment.n_atoms < 12:
            continue
        eigvals = mass_weighted_gyration_eigenvalues(fragment)
        per_chain.append(shape_metrics_from_eigs(eigvals))

    summary = {}
    if per_chain:
        keys = per_chain[0].keys()
        for key in keys:
            values = np.array([d[key] for d in per_chain], dtype=float)
            summary[key] = {
                "median": float(np.median(values)),
                "mean": float(np.mean(values)),
                "std": float(np.std(values)),
                "p10": float(np.percentile(values, 10)),
                "p90": float(np.percentile(values, 90)),
                "n_chains": int(len(values)),
            }
    return global_metrics, summary

# ------------------------------------------------------------
# 3) Persistence length from heavy-atom backbone paths (uses bonds)
#    Same graph-based backbone finder you used earlier.
# ------------------------------------------------------------
def _build_adjacency_from_bonds(n_atoms: int, bonds_0based: np.ndarray) -> list[list[int]]:
    adjacency = [[] for _ in range(n_atoms)]
    for ai, aj in bonds_0based:
        adjacency[ai].append(aj)
        adjacency[aj].append(ai)
    return adjacency

def _bfs_path(adjacency: list[list[int]], start: int, goal: int) -> list[int]:
    queue = deque([start]); parent = {start: None}
    while queue:
        v = queue.popleft()
        if v == goal:
            break
        for w in adjacency[v]:
            if w not in parent:
                parent[w] = v
                queue.append(w)
    if goal not in parent:
        return []
    path = []
    cur = goal
    while cur is not None:
        path.append(cur); cur = parent[cur]
    return path[::-1]

def _double_bfs_longest_path(adjacency: list[list[int]], nodes: list[int]) -> list[int]:
    node_set = set(nodes)
    def farthest(x: int) -> tuple[int, dict[int, int | None]]:
        q = deque([x]); parent = {x: None}; last = x
        while q:
            v = q.popleft(); last = v
            for w in adjacency[v]:
                if w not in parent and w in node_set:
                    parent[w] = v; q.append(w)
        return last, parent
    a = nodes[0]
    a, _ = farthest(a)
    b, parent = farthest(a)
    path = []
    cur = b
    while cur is not None:
        path.append(cur); cur = parent[cur]
    path.reverse()
    return [v for v in path if v in node_set]

def build_backbone_paths_heavy_atoms(universe: mda.Universe, min_atoms_per_chain: int = 12) -> list[mda.core.groups.AtomGroup]:
    assert getattr(universe, "bonds", None) is not None and len(universe.bonds) > 0, "Universe has no bonds; attach them first."
    heavy_mask = (universe.atoms.masses > 1.2)
    heavy_indices = set(np.nonzero(heavy_mask)[0].tolist())
    adjacency = _build_adjacency_from_bonds(len(universe.atoms), universe.bonds.to_indices())

    # connected components within heavy subgraph
    unvisited = set(heavy_indices)
    chains = []
    while unvisited:
        seed = next(iter(unvisited))
        component = []
        q = deque([seed]); unvisited.remove(seed)
        while q:
            v = q.popleft(); component.append(v)
            for w in adjacency[v]:
                if w in unvisited:
                    unvisited.remove(w); q.append(w)

        # endpoints in heavy subgraph
        degree = {v: sum((nbr in component) for nbr in adjacency[v]) for v in component}
        endpoints = [v for v in component if degree[v] == 1]
        if len(endpoints) >= 2:
            best = []
            for i in range(len(endpoints)):
                for j in range(i + 1, len(endpoints)):
                    path = _bfs_path(adjacency, endpoints[i], endpoints[j])
                    path = [p for p in path if p in set(component)]
                    if len(path) > len(best):
                        best = path
            path_indices = best
        else:
            path_indices = _double_bfs_longest_path(adjacency, component)

        if len(path_indices) >= min_atoms_per_chain:
            chains.append(universe.atoms[path_indices])
    return chains

def compute_persistence_length_stats(universe: mda.Universe, backbone_chains: list[mda.core.groups.AtomGroup]) -> dict:
    per_chain_lp = []
    for chain_ag in backbone_chains:
        try:
            pl_calc = polymer.PersistenceLength([chain_ag]).run()
            lp_value = float(np.nanmean(pl_calc.results.lp))
            if np.isfinite(lp_value):
                per_chain_lp.append(lp_value)
        except Exception:
            continue
    if not per_chain_lp:
        raise RuntimeError("Persistence length failed for all chains.")
    arr = np.array(per_chain_lp, dtype=float)
    return {
        "median": float(np.median(arr)),
        "mean": float(np.mean(arr)),
        "std": float(np.std(arr)),
        "p10": float(np.percentile(arr, 10)),
        "p90": float(np.percentile(arr, 90)),
        "n_chains": int(len(arr)),
    }


# D_A2ps, D_cm2s = compute_diffusivity(u, selection="all", fit_points=20, time_per_frame_ps=0.2)
backbone_chains = build_backbone_paths_heavy_atoms(u, min_atoms_per_chain=12)
pl_stats = compute_persistence_length_stats(u, backbone_chains)
print(f"Persistence length (Å): median={pl_stats['median']:.2f}, mean={pl_stats['mean']:.2f}, n={pl_stats['n_chains']}")

global_shape, per_chain_shape = compute_shape_metrics(u)
print("Global shape:", global_shape)
if per_chain_shape:
    print("Asphericity per-chain (Å²) median:", per_chain_shape["asphericity_A2"]["median"])

Persistence length (Å): median=3.13, mean=3.16, n=10
Global shape: {'asphericity_A2': 129.65323381917, 'acylindricity_A2': 75.45504106462423, 'kappa2': 0.052417042508096534, 'trace_A2': 634.1611733535154, 'lambda1_A2': 297.82254699728514, 'lambda2_A2': 205.89683371042725, 'lambda3_A2': 130.44179264580302}
Asphericity per-chain (Å²) median: 253.17436876138657
