<a href="https://colab.research.google.com/github/jamessutton600613-png/GC/blob/main/Untitled269.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fe–S curvature-trap pipeline for E. coli IspH (3ZGL, 3ZGN)
# Clean, minimal notebook: STO-3G, PBE, 4Fe-4S cluster only.

!pip install -q gemmi pyscf

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.3/51.3 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.3/51.3 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from pathlib import Path
import requests
import numpy as np
import matplotlib.pyplot as plt
import gemmi

from pyscf import gto, dft

# Base directories
# NOTE: the mkdir() calls below do not pass parents=True, so BASE_DIR itself
# must already exist when this cell runs.
BASE_DIR    = Path("/content")
STRUCT_DIR  = BASE_DIR / "ispH_structures"
OUT_DIR     = BASE_DIR / "ispH_npz"
COMB_DIR    = BASE_DIR / "ispH_combs"   # for kappa_base/mode comb plots

STRUCT_DIR.mkdir(exist_ok=True)
OUT_DIR.mkdir(exist_ok=True)
COMB_DIR.mkdir(exist_ok=True)

# PDB IDs for IspH + inhibitors
# Keys are PDB IDs (used to build download URLs and file names); values are
# ligand tags that are appended to the PDB ID to form output labels.
PDB_IDS = {
    "3ZGL": "TMBPP",   # very potent thiol inhibitor
    "3ZGN": "AMBPP",   # amino analogue
}

# Grid for Laplacian
# Both values are passed to compute_abs_laplacian as L= and N=.
L_BOX  = 3.0    # Å half-length of cube
N_GRID = 32     # 32^3 ≈ 3.3×10^4 points

# Mode B amplitude
# Default displacement used by apply_mode_b.
MODE_B_SHIFT = 0.20   # Å

# Canonical NBINS list (your STO-3G Fe–S grid)
# Histogram bin counts scanned by build_teeth_rows; the list is irregular on
# purpose, so keep the literal values as they are.
NBINS_LIST = [
    1,3,5,7,9,10,11,12,13,14,15,16,17,18,19,20,
    21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,
    36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,
    51,52,53,54,55,56,57,58,59,60,
    65,70,71,73,75,77,78,79,80,81,82,83,84,85,
    87,88,89,90,91,92,93,94,95,96,97,98,99,100,
    102,103,105,106,107,109,110,111,112,113,114,
    115,116,117,118,119,120,121,122,123,124,125
]
L_EFF = 3.0    # Å for Δr = L_EFF / bins

In [None]:
def download_cifs(pdb_map, dest_dir: Path, timeout: float = 30.0):
    """Download the mmCIF file of every PDB ID in ``pdb_map`` from RCSB.

    Parameters
    ----------
    pdb_map
        Mapping (or any iterable) whose keys are PDB IDs. IDs are upper-cased
        before being used in the URL and in the output file name.
    dest_dir
        Directory receiving ``<PDBID>.cif``. It is created, including any
        missing parent directories, if it does not exist yet.
    timeout
        Seconds passed to ``requests.get`` so a stalled connection cannot
        hang the notebook indefinitely.

    Files that already exist are skipped. HTTP errors and connection-level
    failures are both reported with a ``[FAIL]`` line and the loop moves on to
    the next ID; nothing is raised for a failed download.
    """
    base_url = "https://files.rcsb.org/download/{}.cif"
    dest_dir.mkdir(parents=True, exist_ok=True)
    for pid in pdb_map:
        pid_up = pid.upper()
        out_path = dest_dir / f"{pid_up}.cif"
        if out_path.exists():
            print(f"[SKIP] {pid_up} already present")
            continue

        url = base_url.format(pid_up)
        print(f"[GET ] {pid_up} ← {url}")
        try:
            r = requests.get(url, timeout=timeout)
        except requests.RequestException as err:
            # Treat network failures like HTTP failures: report and continue.
            print(f"[FAIL] {pid_up}: {err}")
            continue

        if r.status_code == 200:
            out_path.write_bytes(r.content)
            print(f"[OK  ] saved → {out_path}")
        else:
            print(f"[FAIL] {pid_up}: HTTP {r.status_code}")

# Run the downloader
# Fetches any structure in PDB_IDS that is not yet present in STRUCT_DIR.
download_cifs(PDB_IDS, STRUCT_DIR)

# List every .cif file now in the structure directory as a quick sanity check.
print("CIFs in", STRUCT_DIR)
for p in STRUCT_DIR.glob("*.cif"):
    print("  ", p.name)

[GET ] 3ZGL ← https://files.rcsb.org/download/3ZGL.cif
[OK  ] saved → /content/ispH_structures/3ZGL.cif
[GET ] 3ZGN ← https://files.rcsb.org/download/3ZGN.cif
[OK  ] saved → /content/ispH_structures/3ZGN.cif
CIFs in /content/ispH_structures
   3ZGL.cif
   3ZGN.cif


In [None]:
def read_structure_any(path: Path):
    """Log the path being read and return the structure parsed by gemmi."""
    location = str(path)
    print(f"  Reading structure from {path}")
    return gemmi.read_structure(location)

def get_fe_s_clusters(struct, cutoff=3.0):
    """Group all Fe and S atoms of ``struct`` into distance-connected clusters.

    Every atom whose element name (case-insensitive) is ``FE`` or ``S`` is
    collected by walking models → chains → residues → atoms. Two atoms are
    linked when their distance is ``<= cutoff``; the connected components of
    that graph are returned as a list of atom lists.

    Raises ``RuntimeError`` when the structure contains no Fe or S atom.
    """
    wanted = ("FE", "S")
    picked = [
        atom
        for model in struct
        for chain in model
        for residue in chain
        for atom in residue
        if atom.element.name.upper() in wanted
    ]
    if len(picked) == 0:
        raise RuntimeError("No Fe or S atoms found")

    xyz = np.array(
        [[float(a.pos.x), float(a.pos.y), float(a.pos.z)] for a in picked],
        dtype=float,
    )
    total = len(picked)

    # Symmetric neighbour lists; each unordered pair is examined once.
    neighbours = [[] for _ in range(total)]
    for a_idx in range(total):
        for b_idx in range(a_idx + 1, total):
            if np.linalg.norm(xyz[a_idx] - xyz[b_idx]) <= cutoff:
                neighbours[a_idx].append(b_idx)
                neighbours[b_idx].append(a_idx)

    # Depth-first flood fill; atoms are marked when pushed, not when popped.
    seen = set()
    groups = []
    for seed in range(total):
        if seed in seen:
            continue
        seen.add(seed)
        pending = [seed]
        members = []
        while pending:
            current = pending.pop()
            members.append(picked[current])
            fresh = [nb for nb in neighbours[current] if nb not in seen]
            seen.update(fresh)
            pending.extend(fresh)
        groups.append(members)

    print(f"  Found {len(groups)} Fe/S connectivity clusters")
    return groups

def select_fe4s4_cluster(clusters):
    """Return the Fe4S4-like cluster nearest to the candidates' pooled centroid.

    A cluster qualifies when it holds exactly four Fe atoms and at least four
    S atoms. The reference point is the mean position of all atoms across all
    qualifying clusters; the cluster whose own centroid is closest to it wins
    (the first one on ties).

    Raises ``RuntimeError`` when no cluster qualifies.
    """
    want_fe, min_s = 4, 4

    def tally(comp, symbol):
        return sum(1 for a in comp if a.element.name.upper() == symbol)

    def positions(atom_list):
        return np.array([[a.pos.x, a.pos.y, a.pos.z] for a in atom_list], float)

    shortlist = [
        comp
        for comp in clusters
        if tally(comp, "FE") == want_fe and tally(comp, "S") >= min_s
    ]
    if not shortlist:
        raise RuntimeError("No Fe4S4-like cluster found")

    pooled = [a for comp in shortlist for a in comp]
    anchor = positions(pooled).mean(axis=0)

    chosen, chosen_dist = None, 1e9
    for comp in shortlist:
        gap = np.linalg.norm(positions(comp).mean(axis=0) - anchor)
        if gap < chosen_dist:
            chosen, chosen_dist = comp, gap

    print(f"  Selected Fe4S4 cluster with {len(chosen)} atoms")
    return chosen

def atoms_to_pyscf(cluster_atoms):
    """Build a PySCF-style atom list with the cluster centroid at the origin.

    Each entry is ``(symbol, (x, y, z))`` where ``symbol`` is the element name
    capitalised (e.g. ``"FE"`` → ``"Fe"``) and the coordinates are plain floats
    taken from ``atom.pos`` minus the mean position of all atoms.
    """
    raw = np.array(
        [[float(a.pos.x), float(a.pos.y), float(a.pos.z)] for a in cluster_atoms],
        dtype=float,
    )
    shifted = raw - raw.mean(axis=0)
    return [
        (atom.element.name.capitalize(), (float(row[0]), float(row[1]), float(row[2])))
        for atom, row in zip(cluster_atoms, shifted)
    ]

In [None]:
def apply_mode_b(pys_atoms, shift=MODE_B_SHIFT):
    """Return a copy of ``pys_atoms`` with an asymmetric Fe–S stretch applied.

    For each of the first two Fe entries, the S atom nearest to it (measured
    in the undistorted geometry) is displaced by ``shift`` along the Fe→S
    direction. The sign of the displacement starts positive and flips after
    every displacement that is actually applied.

    Raises ``RuntimeError`` when fewer than two Fe or two S atoms are present.
    """
    labels = [entry[0] for entry in pys_atoms]
    xyz = np.array([entry[1] for entry in pys_atoms], float)
    iron = [k for k, lab in enumerate(labels) if lab.lower() == "fe"]
    sulfur = [k for k, lab in enumerate(labels) if lab.lower() == "s"]
    if len(iron) < 2 or len(sulfur) < 2:
        raise RuntimeError("Need at least 2 Fe and 2 S for Mode B")

    moved = xyz.copy()
    direction = +1.0
    for fe_k in iron[:2]:
        origin = xyz[fe_k]

        # Linear scan for the closest sulfur; the first one wins on ties.
        nearest, nearest_d = None, 1e9
        for s_k in sulfur:
            gap = np.linalg.norm(xyz[s_k] - origin)
            if gap < nearest_d:
                nearest, nearest_d = s_k, gap
        if nearest is None:
            continue

        bond = xyz[nearest] - origin
        length = np.linalg.norm(bond)
        if length < 1e-6:
            # Coincident atoms: no usable direction, leave untouched.
            continue
        unit = bond / length
        moved[nearest] += direction * shift * unit
        direction *= -1.0

    return [
        (lab, (float(p[0]), float(p[1]), float(p[2])))
        for lab, p in zip(labels, moved)
    ]

In [None]:
def run_scf_sto3g(mol_atoms, charge=0, spin=4):
    """Run UKS PBE / STO-3G on the given atoms, with a Newton fallback.

    Parameters
    ----------
    mol_atoms
        Atom specification handed to ``gto.Mole.build(atom=...)``.
    charge
        Total molecular charge (default 0).
    spin
        Value passed as ``spin`` to ``Mole.build`` (default 4).

    Returns
    -------
    (mol, mf)
        The built molecule and the converged mean-field object: the plain UKS
        object if it converged, otherwise the Newton solver object.

    Raises
    ------
    RuntimeError
        If neither UKS nor the Newton solver converges. When the Newton
        attempt itself raised, that exception is attached as ``__cause__``.
    """
    mol = gto.Mole()
    mol.build(
        atom=mol_atoms,
        basis="sto-3g",
        charge=charge,
        spin=spin,
        verbose=3
    )

    # Use UKS for open-shell
    mf = dft.UKS(mol)
    mf.xc = "PBE"
    mf.conv_tol = 1e-6
    mf.max_cycle = 80         # more iterations than default 50
    mf.diis_space = 12        # more DIIS history
    mf.level_shift = 0.3      # small level shift to stabilise early cycles
    mf.init_guess = "minao"

    print(f"    SCF: starting UKS (spin={spin})...")
    e = mf.kernel()

    if mf.converged:
        print(f"    UKS converged: E = {e:.6f} Ha")
        return mol, mf

    print("    UKS did not converge, trying Newton solver...")
    newton_error = None
    try:
        # NOTE(review): kernel() is called without initial orbitals or a
        # density; confirm whether the Newton solver should instead be seeded
        # from the unconverged UKS result.
        mf_new = mf.newton()
        e2 = mf_new.kernel()
        if mf_new.converged:
            print(f"    Newton-UKS converged: E = {e2:.6f} Ha")
            return mol, mf_new
    except Exception as err:
        print("    Newton-UKS failed:", err)
        # Keep the failure so the final error can report its cause.
        newton_error = err

    raise RuntimeError("SCF did not converge (UKS + Newton)") from newton_error

In [None]:
def count_traps(kappa, nbins, pdf_thresh=1e-9):
    """Count contiguous runs of near-empty bins in the histogram of ``kappa``.

    Values are first capped at their 99.5th percentile, then histogrammed
    into ``nbins`` equal-width bins with ``density=True``. A "trap" is a
    maximal run of consecutive bins whose density is below ``pdf_thresh``;
    the number of such runs is returned as a Python ``int``.
    """
    ceiling = np.percentile(kappa, 99.5)
    density, _ = np.histogram(
        np.clip(kappa, None, ceiling), bins=nbins, density=True
    )
    sparse = density < pdf_thresh

    # A run starts wherever a sparse bin is not preceded by another sparse bin.
    preceded = np.concatenate(([False], sparse[:-1]))
    return int(np.count_nonzero(sparse & ~preceded))

def build_teeth_rows(label: str, k_base, k_mode):
    """Tabulate trap counts for both curvature arrays over ``NBINS_LIST``.

    One dict is produced per bin count, holding the label, the bin count, the
    resolution ``Δr = L_EFF / bins``, the trap counts of ``k_base`` and
    ``k_mode``, and their difference (mode minus base).
    """

    def row_for(nbins):
        base_count = count_traps(k_base, nbins)
        mode_count = count_traps(k_mode, nbins)
        return {
            "file": label,
            "bins": nbins,
            "Δr": L_EFF / nbins,
            "n_teeth_base": base_count,
            "n_teeth_mode": mode_count,
            "Δteeth": mode_count - base_count,
        }

    return [row_for(nbins) for nbins in NBINS_LIST]

def save_teeth_npz(rows, out_path: Path):
    """Write the Δteeth table ``rows`` to an ``.npz`` archive at ``out_path``.

    ``rows`` is a sequence of dicts with the keys ``file``, ``bins``, ``Δr``,
    ``n_teeth_base``, ``n_teeth_mode`` and ``Δteeth``. Each key becomes one
    array in the archive under the same name, so the result can be read back
    with ``np.load(out_path, allow_pickle=True)`` (the ``file`` column is an
    object array and therefore pickled).

    The archive is assembled by hand rather than with ``np.savez`` because
    ``savez`` names its own first parameter ``file``: passing an array as the
    keyword ``file=`` raises ``TypeError`` (multiple values for 'file').
    """
    import zipfile  # only needed by this writer
    from numpy.lib import format as npy_format

    columns = {
        "file": np.array([r["file"] for r in rows], dtype=object),
        "bins": np.array([r["bins"] for r in rows], dtype=int),
        "Δr": np.array([r["Δr"] for r in rows], dtype=float),
        "n_teeth_base": np.array([r["n_teeth_base"] for r in rows], dtype=int),
        "n_teeth_mode": np.array([r["n_teeth_mode"] for r in rows], dtype=int),
        "Δteeth": np.array([r["Δteeth"] for r in rows], dtype=int),
    }

    # Mirror np.savez: make sure the file name carries the .npz extension.
    target = str(out_path)
    if not target.endswith(".npz"):
        target += ".npz"

    # An .npz file is a zip archive holding one "<key>.npy" member per array.
    with zipfile.ZipFile(target, mode="w", compression=zipfile.ZIP_STORED) as archive:
        for key, values in columns.items():
            with archive.open(key + ".npy", "w", force_zip64=True) as member:
                npy_format.write_array(member, values, allow_pickle=True)

    print(f"  Saved Δteeth NPZ → {out_path}")

def save_comb_npz(label: str, k_base, k_mode, out_path: Path):
    """Store both curvature arrays plus their label in one ``.npz`` archive.

    The archive holds ``kappa_base``, ``kappa_mode`` and ``tag`` (the label).
    """
    payload = {"kappa_base": k_base, "kappa_mode": k_mode, "tag": label}
    np.savez(out_path, **payload)
    print(f"  Saved comb NPZ → {out_path}")

In [None]:
# Per-structure driver: for every entry in PDB_IDS, extract the Fe4S4 cluster,
# run the BASE and MODE-B SCF calculations, evaluate the curvature arrays, and
# write one comb NPZ and one Δteeth NPZ per structure.
# all_rows accumulates the Δteeth rows of all structures.
# NOTE(review): compute_abs_laplacian is not defined in any cell visible in
# this notebook export; it has to be defined before this cell runs — confirm.
all_rows = []

for pid, lig_name in PDB_IDS.items():
    # Label used in saved tables and in output file names.
    label = f"{pid}_{lig_name}"
    cif_path = STRUCT_DIR / f"{pid}.cif"

    print("\n========================================")
    print(f"Structure: {pid} ({lig_name}) → {cif_path.name}")

    # Structure → Fe/S connectivity clusters → one Fe4S4-like cluster →
    # centred PySCF atom list.
    st = read_structure_any(cif_path)
    clusters = get_fe_s_clusters(st, cutoff=3.0)
    core = select_fe4s4_cluster(clusters)
    pys_atoms_base = atoms_to_pyscf(core)

    # BASE
    # run_scf_sto3g is called with its defaults (charge=0, spin=4).
    print("  Running SCF (BASE, STO-3G)...")
    mol_b, mf_b = run_scf_sto3g(pys_atoms_base)
    print("  Computing Laplacian (BASE)...")
    k_base = compute_abs_laplacian(mol_b, mf_b, L=L_BOX, N=N_GRID)

    # MODE-B
    # Same cluster with the asymmetric Fe–S displacement applied.
    pys_atoms_mode = apply_mode_b(pys_atoms_base, shift=MODE_B_SHIFT)
    print("  Running SCF (MODE-B, STO-3G)...")
    mol_m, mf_m = run_scf_sto3g(pys_atoms_mode)
    print("  Computing Laplacian (MODE-B)...")
    k_mode = compute_abs_laplacian(mol_m, mf_m, L=L_BOX, N=N_GRID)

    # Save comb NPZ (for visual combs)
    comb_npz = COMB_DIR / f"{label}_comb.npz"
    save_comb_npz(label, k_base, k_mode, comb_npz)

    # Build Δteeth table + save NPZ
    rows = build_teeth_rows(label, k_base, k_mode)
    all_rows.extend(rows)
    teeth_npz = OUT_DIR / f"{label}_teeth.npz"
    save_teeth_npz(rows, teeth_npz)

print("\nDone. All IspH inhibitors processed.")


Structure: 3ZGL (TMBPP) → 3ZGL.cif
  Reading structure from /content/ispH_structures/3ZGL.cif
  Found 16 Fe/S connectivity clusters
  Selected Fe4S4 cluster with 11 atoms
  Running SCF (BASE, STO-3G)...
    SCF: starting UKS (spin=4)...
SCF not converged.
SCF energy = -7686.1586670321 after 80 cycles  <S^2> = 6.1452133  2S+1 = 5.0577518
    UKS did not converge, trying Newton solver...
converged SCF energy = -7754.72394730136  <S^2> = 14.147245  2S+1 = 7.5887404
    Newton-UKS converged: E = -7754.723947 Ha
  Computing Laplacian (BASE)...
    Evaluating AO and density on grid...
    Computing finite-difference Laplacian...
  Running SCF (MODE-B, STO-3G)...
    SCF: starting UKS (spin=4)...
SCF not converged.
SCF energy = -7701.43664657445 after 80 cycles  <S^2> = 6.0777298  2S+1 = 5.0309959
    UKS did not converge, trying Newton solver...

WARN: HOMO -0.08464460564014571 > LUMO -0.08514058542113251 was found in the canonicalized orbitals.

converged SCF energy = -7754.80457609396  <S

In [None]:
import glob
import pandas as pd

# Collect every Δteeth NPZ written to OUT_DIR and overlay Δteeth(Δr) for all
# labels found in them.
npz_paths = sorted(glob.glob(str(OUT_DIR / "*_teeth.npz")))
print("Teeth NPZ files:")
for p in npz_paths:
    print("  ", p)

dfs = []
for p in npz_paths:
    # allow_pickle=True is needed because the "file" column is saved as an
    # object array; only load archives produced by this notebook.
    data = np.load(p, allow_pickle=True)
    # Archives missing any of the required columns are silently ignored.
    if {"file","bins","Δr","Δteeth"}.issubset(set(data.files)):
        df_here = pd.DataFrame({
            "file": data["file"],
            "bins": data["bins"],
            "Δr": data["Δr"],
            "Δteeth": data["Δteeth"],
        })
        dfs.append(df_here)

# NOTE(review): pd.concat raises ValueError when dfs is empty, i.e. when no
# usable *_teeth.npz file was found — consider guarding this case.
df_all = pd.concat(dfs, ignore_index=True)
print("\nFiles in df_all:", df_all["file"].unique())

# Focused overlay: 3ZGL (TMBPP) vs 3ZGN (AMBPP)
plt.figure(figsize=(8,4))

# One curve per label, ordered by increasing Δr.
for name in df_all["file"].unique():
    sub = df_all[df_all["file"] == name].sort_values("Δr")
    plt.plot(sub["Δr"], sub["Δteeth"], "-o", ms=3, label=name)

plt.axhline(0.0, color="k", lw=1)
plt.xscale("log")
# Only the window 0.02–0.1 Å is shown; with Δr = L_EFF / bins and L_EFF = 3.0
# that corresponds to bin counts of 30 and above.
plt.xlim(2e-2, 1e-1)
plt.xlabel("Δr (Å, log scale)")
plt.ylabel("Δteeth (stretched – base)")
plt.title("IspH curvature–trap resonance Δteeth(Δr)\n3ZGL (TMBPP) vs 3ZGN (AMBPP)")
plt.grid(True, which="both", alpha=0.25)
plt.legend(fontsize=8)
plt.tight_layout()
plt.show()

In [None]:
def plot_curvature_comb(npz_file, bins=80, xmax=None):
    """Plot overlaid log-scale histograms of the base and stretched curvature.

    Parameters
    ----------
    npz_file
        Path of an ``.npz`` archive containing ``kappa_base`` and
        ``kappa_mode`` arrays.
    bins
        Number of equal-width histogram bins between 0 and ``xmax``.
    xmax
        Upper histogram edge. Defaults to the largest value found in either
        array; values above ``xmax`` fall outside the histogram.

    The function prints a "Skipping" message and returns without plotting
    when either array is missing from the archive or is empty.
    """
    # The context manager closes the archive's file handle once the arrays
    # have been read into memory.
    with np.load(npz_file) as data:
        if "kappa_base" not in data or "kappa_mode" not in data:
            print("Skipping", npz_file, "(no kappa arrays)")
            return
        k_base = data["kappa_base"]
        k_mode = data["kappa_mode"]

    if k_base.size == 0 or k_mode.size == 0:
        print("Skipping", npz_file, "(empty arrays)")
        return

    k_max = float(max(k_base.max(), k_mode.max()))
    if xmax is None:
        xmax = k_max

    k_min = 0.0
    edges = np.linspace(k_min, xmax, bins + 1)
    centers = 0.5 * (edges[:-1] + edges[1:])
    # Bin width from the edges so that bins=1 (a single centre) also works.
    width = edges[1] - edges[0]

    hist_b, _ = np.histogram(k_base, bins=edges, density=True)
    hist_m, _ = np.histogram(k_mode, bins=edges, density=True)
    # Floor empty bins so the log axis never receives zeros; the floor lies
    # below the plotted y-range.
    hist_b = np.maximum(hist_b, 1e-20)
    hist_m = np.maximum(hist_m, 1e-20)

    plt.figure(figsize=(7,3))
    plt.bar(centers, hist_b, width=width, alpha=0.6, color="tab:blue", label="base")
    plt.bar(centers, hist_m, width=width, alpha=0.6, color="tab:pink", label="stretched")
    plt.yscale("log")
    plt.ylim(1e-15, 1e0)
    plt.xlabel(r"curvature $|\nabla^2 \rho|$")
    plt.ylabel("PDF (log)")
    plt.title(Path(npz_file).name)
    plt.legend()
    plt.tight_layout()
    plt.show()

# Plot one curvature comb per *_comb.npz file in COMB_DIR, in sorted order.
print("Comb NPZs:")
for comb in sorted(COMB_DIR.glob("*_comb.npz")):
    print("  ", comb.name)
    plot_curvature_comb(str(comb), bins=80)