<a href="https://colab.research.google.com/github/jamessutton600613-png/GC/blob/main/Untitled221.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install gemmi

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.6/2.6 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# ================================================================
# OEC cubane from multiple PDB IDs — robust μ-oxo detection + W1–W4 split
# Automated download, processing, and plotting for 4RTI, 5XNL, 5XNM, 4IXQ
# ================================================================
import os
import math
import numpy as np
import matplotlib.pyplot as plt
from itertools import combinations

try:
    import gemmi
except ModuleNotFoundError as e:
    raise ModuleNotFoundError(
        "gemmi is required. In Colab run:\n  !pip -q install gemmi\nthen rerun this cell."
    )

# -------------------- USER SETTINGS --------------------
# The four structures central to the GQR inverse design project
PDB_IDS    = ["4RTI", "5XNL", "5XNM", "4IXQ"]

# Parameters for atom identification (generally robust)
ZOOM_R     = 4.0              # Å; half-width of the zoom plot axes around the metal centroid
W_ANY_MAX  = 3.2              # Å; a water is shortlisted only if within this distance of any selected metal
W_CA_BIND  = 2.60             # Å; shortlisted waters within this distance of Ca become W1/W2
W_CA_FAR   = 3.2              # Å; W3/W4 candidates must be ≥ this far from Ca

# -------------------- HELPERS --------------------
def P(a):
    """Return the position of a gemmi Atom as a float NumPy array (x, y, z)."""
    pos = a.pos
    return np.array((pos.x, pos.y, pos.z), dtype=float)

def d(a,b):
    """Return the Euclidean distance between two gemmi Atoms as a Python float."""
    delta = P(a) - P(b)
    return float(np.linalg.norm(delta))

def set_equal(ax, pts, pad=0.8):
    """Frame a 3D axes as a cube centred on the centroid of *pts* and hide ticks.

    Parameters
    ----------
    ax : 3D axes object exposing ``set_xlim``/``set_ylim``/``set_zlim`` and
        ``set_xticks``/``set_yticks``/``set_zticks``.
    pts : array-like of shape (N, 3)
        Points that should be visible in the frame.
    pad : float
        Extra margin added to the half-width of the cube.
    """
    pts = np.asarray(pts)
    cx, cy, cz = pts.mean(axis=0)
    # Use the largest per-axis extent so all three axes share one scale.
    span = np.max(np.ptp(pts, axis=0))
    r = span * 0.65 + pad
    ax.set_xlim(cx - r, cx + r)
    # Upper bound must be cy + r; the previous (cy - r, cy - r) gave a
    # zero-width y range and a collapsed plot.
    ax.set_ylim(cy - r, cy + r)
    ax.set_zlim(cz - r, cz + r)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_zticks([])

def draw_bond(ax, A, B, color="k", lw=2.0, alpha=0.95):
    """Plot a straight segment from 3D point A to 3D point B on *ax*."""
    xs, ys, zs = ([A[k], B[k]] for k in range(3))
    ax.plot(xs, ys, zs, color=color, lw=lw, alpha=alpha)

# -------------------- CORE PROCESSING FUNCTION --------------------
def process_and_plot_oec(pdb_id):
    """
    Download, analyse, and plot the OEC (Mn4Ca cluster) for a given PDB ID.

    Steps: fetch ``<pdb_id>.cif`` from RCSB, pick one Ca plus four Mn atoms,
    select bridging (mu-oxo) oxygens, label nearby waters W1-W4, print a short
    diagnostic, and save two figures in the working directory
    (``OEC_<pdb_id>_overview.png`` and ``OEC_<pdb_id>_zoom.png``).

    Parameters
    ----------
    pdb_id : str
        PDB identifier used to build the download URL and the output names.

    Returns
    -------
    None
        Every failure is reported with a printed message followed by an
        early return; nothing is raised for the handled cases.
    """
    # Stdlib helpers needed only by the loader below.
    import tempfile
    import urllib.request

    print(f"\n{'='*20} PROCESSING {pdb_id} {'='*20}")

    # 1. LOAD STRUCTURE (fetches from PDB)
    try:
        pdb_url = f"https://files.rcsb.org/download/{pdb_id}.cif"
        # gemmi has no `read_file` attribute (the old call failed for every
        # entry), so download to a temporary file and parse it with
        # gemmi.read_structure. The structure is fully loaded before the
        # temporary directory is removed.
        with tempfile.TemporaryDirectory() as tmp_dir:
            local_cif = os.path.join(tmp_dir, f"{pdb_id}.cif")
            urllib.request.urlretrieve(pdb_url, local_cif)
            st = gemmi.read_structure(local_cif)
        print(f"Successfully downloaded and read {pdb_id}.cif from PURL")
    except Exception as e:
        print(f"‚ùå ERROR: Could not download or read {pdb_id}.cif from PDB. Reason: {e}")
        return

    st.remove_hydrogens()
    model = st[0]

    # 2. FIND ALL POTENTIAL ATOMS
    # Oxygens are split into water oxygens (by residue name) and all others.
    Mn_all, Ca_all, O_all, W_all = [], [], [], []
    for ch in model:
        for res in ch:
            is_w = res.name.upper() in ("HOH","WAT","H2O","DOD","W")
            for a in res:
                el = a.element.name.upper()
                if el == "MN": Mn_all.append(a)
                elif el == "CA": Ca_all.append(a)
                elif el.startswith("O"):
                    (W_all if is_w else O_all).append(a)

    if len(Mn_all) < 4 or len(Ca_all) < 1:
        print("‚ùå ERROR: Could not find at least 4√óMn and 1√óCa in the file.")
        return

    # 3. PICK THE CORRECT Ca + Mn4 CLUSTER (from the same asymmetric unit)
    # Only Ca atoms within 6 Å of some Mn are considered.
    Mn_xyz = np.array([P(a) for a in Mn_all], float)
    Ca_near = [a for a in Ca_all if np.min(np.linalg.norm(Mn_xyz - P(a), axis=1)) < 6.0]
    if not Ca_near:
        print("‚ùå ERROR: No Ca within 6 √Ö of any Mn.")
        return

    def cubane_cost(mset, Ca_atom):
        """Score a 4-Mn set: sum of Mn-Mn distances + 2.5 x centroid-to-Ca distance."""
        pts = np.array([P(m) for m in mset])
        pair_dist = sum(np.linalg.norm(pts[i]-pts[j]) for i in range(4) for j in range(i+1,4))
        cen = pts.mean(axis=0)
        d_ca = np.linalg.norm(cen - P(Ca_atom))
        return pair_dist + 2.5 * d_ca

    # Exhaustive search over every (Ca, 4-Mn combination); lowest cost wins.
    best_combo, best_cost = None, 1e9
    for Ca_atom in Ca_near:
        for combo in combinations(Mn_all, 4):
            cst = cubane_cost(combo, Ca_atom)
            if cst < best_cost:
                best_cost = cst
                best_combo = (Ca_atom, list(combo))

    if not best_combo:
        print("‚ùå ERROR: Could not determine the best Mn4Ca cluster.")
        return

    Ca1, Mn4 = best_combo
    metals = [Ca1] + Mn4  # index 0 is Ca, indices 1-4 are Mn

    # 4. μ-OXO PICKER (prefer ≥3-bridging)
    def pick_mu_oxo(oxy_atoms, metals, thr_mn_o=2.20, thr_ca_o=2.55, widen=0.05, max_mn=2.40, max_ca=2.70):
        """Return (oxygens, t_mn, t_ca): oxygens bridging >= 2 metals and the cutoffs used.

        Cutoffs start at thr_mn_o / thr_ca_o and grow by `widen` per round,
        capped at max_mn / max_ca, until at least 4 candidates are found.
        Candidates bridging >= 3 metals are preferred when any exist.
        """
        def neighbors(o, t_mn, t_ca):
            # (metal, distance) for each metal within its element-specific cutoff
            found = []
            for m in metals:
                el = m.element.name.upper()
                dm = d(o, m)
                if (el == "MN" and dm <= t_mn) or (el == "CA" and dm <= t_ca):
                    found.append((m, dm))
            return found

        t_mn, t_ca = thr_mn_o, thr_ca_o
        while True:
            candidates = []
            for o in oxy_atoms:
                nb = neighbors(o, t_mn, t_ca)
                if len(nb) >= 2: candidates.append((o, len(nb), sum(dm for _, dm in nb)))

            # Rank by bridging degree (descending), then by total metal-O distance.
            deg3 = sorted([c for c in candidates if c[1] >= 3], key=lambda x: (-x[1], x[2]))
            pick_from = deg3 if deg3 else sorted(candidates, key=lambda x: (-x[1], x[2]))

            if len(pick_from) >= 4: return [c[0] for c in pick_from[:5]], t_mn, t_ca
            # Both cutoffs are at their caps: give up and return what was found.
            if t_mn + widen > max_mn and t_ca + widen > max_ca: return [c[0] for c in pick_from], t_mn, t_ca
            t_mn = min(max_mn, t_mn + widen)
            t_ca = min(max_ca, t_ca + widen)

    muO, thr_mn_o, thr_ca_o = pick_mu_oxo(O_all, metals)
    if len(muO) < 5:
        print(f"‚ö†Ô∏è WARNING: Only found {len(muO)} Œº-oxo candidates. Check structure quality.")
        if len(muO) < 4:
            print(f"‚ùå ERROR: Found only {len(muO)} Œº-oxo atoms. Cannot proceed.")
            return

    # Refine to the 5 closest to the metal centroid
    metal_centroid = np.mean([P(m) for m in metals], axis=0)
    muO = sorted(muO, key=lambda o: np.linalg.norm(P(o) - metal_centroid))[:5]

    # 5. WATER CLASSIFICATION (W1/W2 vs W3/W4)
    def classify_waters(waters, metals, Ca_atom, max_any, ca_bind, far_from_ca):
        """Return (ca_w, mn_w): up to two Ca-bound waters and up to two Mn-side waters."""
        close = [w for w in waters if min(d(w, m) for m in metals) <= max_any]
        ca_w  = sorted([w for w in close if d(w, Ca_atom) <= ca_bind], key=lambda w: d(w, Ca_atom))[:2]
        mn_w_candidates = [w for w in close if w not in ca_w and d(w, Ca_atom) >= far_from_ca]

        # Sort by distance to the nearest Mn atom (Mn4 comes from the enclosing scope)
        mn_w_sorted = sorted(mn_w_candidates, key=lambda w: min(d(w, m) for m in Mn4))

        # Pick two that are not too close to each other (> 1.5 Å apart)
        mn_w = []
        if mn_w_sorted:
            mn_w.append(mn_w_sorted[0])
            for w_cand in mn_w_sorted[1:]:
                if all(d(w_cand, w_sel) > 1.5 for w_sel in mn_w):
                    mn_w.append(w_cand)
                    if len(mn_w) == 2:
                        break
        return ca_w, mn_w

    W_ca, W_mn = classify_waters(W_all, metals, Ca1, W_ANY_MAX, W_CA_BIND, W_CA_FAR)
    W = W_ca + W_mn
    # Labels are keyed by object identity: Ca-side waters are W1/W2, Mn-side W3/W4.
    W_labels = {id(w): f"W{i+1}" for i, w in enumerate(W_ca)}
    W_labels.update({id(w): f"W{i+3}" for i, w in enumerate(W_mn)})

    # 6. EDGES
    # Metal-O edge when within the final cutoff; metal-metal edge when two metals share an oxygen.
    M_O_edges = [(i, j) for i, m in enumerate(metals) for j, o in enumerate(muO) if d(m, o) <= (thr_ca_o if m.element.name.upper()=="CA" else thr_mn_o)]
    MM_edges = set((i, j) for i, j in combinations(range(len(metals)), 2) if any(((i,k) in M_O_edges) and ((j,k) in M_O_edges) for k in range(len(muO))))

    # 7. DIAGNOSTIC
    print(f"‚Äî DIAGNOSTIC for {pdb_id} ‚Äî")
    print(f"Metals: {[m.element.name for m in metals]}")
    print(f"Œº-O selected: {len(muO)} (using Mn‚ÄìO‚â§{thr_mn_o:.2f} √Ö, Ca‚ÄìO‚â§{thr_ca_o:.2f} √Ö)")
    print(f"Waters: {[W_labels.get(id(w), '?') for w in W]}")

    # 8. DRAW
    COL = {"MN":"#7D3C98", "CA":"#5DADE2", "O_MU":"#E74C3C", "W_CA":"#FFB6B6", "W_MN":"#FFC9A9", "O_X":"#F39C12"}
    SZ  = {"MN":280, "CA":320, "O_MU":160, "W_CA":110, "W_MN":110, "O_X":180}

    def draw_structure(ax, title):
        """Draw metals, mu-oxo oxygens, bonds and waters on *ax*; return the plotted points."""
        # The title argument used to be ignored; apply it so the two figures are labelled.
        ax.set_title(title)
        all_pts = [P(m) for m in metals]
        # Metals and μ-oxo
        for idx, m in enumerate(metals):
            X, E = P(m), m.element.name.upper()
            ax.scatter(X[0], X[1], X[2], s=SZ[E], c=COL[E], ec="k", lw=0.7)
            ax.text(X[0], X[1], X[2]+0.22, "Ca" if E=="CA" else f"Mn{idx}", fontsize=9)
        for j, o in enumerate(muO, 1):
            X = P(o)
            ax.scatter(X[0], X[1], X[2], s=SZ["O_MU"], c=COL["O_MU"], ec="k", lw=0.6)
            ax.text(X[0], X[1], X[2]-0.28, f"O{j}", fontsize=8, color="#333")
            all_pts.append(X)
        # Bonds
        for i, j in M_O_edges: draw_bond(ax, P(metals[i]), P(muO[j]), "k", 2.2, 0.95)
        for i, j in MM_edges: draw_bond(ax, P(metals[i]), P(metals[j]), "#333", 1.2, 0.35)
        # Waters
        for w in W:
            X = P(w)
            is_ca_side = any(id(w) == id(x) for x in W_ca)
            key = "W_CA" if is_ca_side else "W_MN"
            ax.scatter(X[0], X[1], X[2], s=SZ[key], c=COL[key], ec="k", lw=0.5, alpha=0.9)
            ax.text(X[0], X[1], X[2]+0.22, W_labels.get(id(w), "W"), fontsize=8, color="#a33")
            # Link the water to its nearest metal only when within 2.5 Å.
            m_near = min(metals, key=lambda m: np.linalg.norm(P(m)-X))
            if np.linalg.norm(P(m_near)-X) <= 2.5:
                draw_bond(ax, X, P(m_near), "#666", 1.0, 0.7)
            all_pts.append(X)
        return all_pts

    # Overview Plot
    fig = plt.figure(figsize=(8, 6.5))
    ax = fig.add_subplot(111, projection='3d')
    all_pts = draw_structure(ax, f"OEC Geometry ‚Äî {pdb_id}")
    set_equal(ax, all_pts)
    out_ov = f"OEC_{pdb_id}_overview.png"
    plt.tight_layout()
    plt.savefig(out_ov, dpi=300)
    plt.show()
    print(f"‚úÖ Saved overview plot to {out_ov}")

    # Zoom Plot: fixed cube of half-width ZOOM_R around the metal centroid
    fig2 = plt.figure(figsize=(7, 6.5))
    ax2 = fig2.add_subplot(111, projection='3d')
    draw_structure(ax2, f"OEC Geometry ‚Äî {pdb_id} (Zoom)")
    cen = np.mean([P(m) for m in metals], axis=0)
    ax2.set_xlim(cen[0]-ZOOM_R, cen[0]+ZOOM_R)
    ax2.set_ylim(cen[1]-ZOOM_R, cen[1]+ZOOM_R)
    ax2.set_zlim(cen[2]-ZOOM_R, cen[2]+ZOOM_R)
    out_zoom = f"OEC_{pdb_id}_zoom.png"
    plt.tight_layout()
    plt.savefig(out_zoom, dpi=300)
    plt.show()
    print(f"‚úÖ Saved zoom plot to {out_zoom}")

# -------------------- MAIN EXECUTION LOOP --------------------
# Run the pipeline once per ID in PDB_IDS. Each call is wrapped in its own
# try/except so an unexpected error on one structure is printed and the loop
# continues with the next ID instead of aborting the whole batch.
if __name__ == "__main__":
    for pdb_id in PDB_IDS:
        try:
            process_and_plot_oec(pdb_id)
        except Exception as e:
            print(f"‚ùå An unexpected error occurred while processing {pdb_id}: {e}")
    print("\nüéâ Batch processing complete.")


❌ ERROR: Could not download or read 4RTI.cif from PDB. Reason: module 'gemmi' has no attribute 'read_file'

❌ ERROR: Could not download or read 5XNL.cif from PDB. Reason: module 'gemmi' has no attribute 'read_file'

❌ ERROR: Could not download or read 5XNM.cif from PDB. Reason: module 'gemmi' has no attribute 'read_file'

❌ ERROR: Could not download or read 4IXQ.cif from PDB. Reason: module 'gemmi' has no attribute 'read_file'

🎉 Batch processing complete.


In [None]:
# ================================================================
# OEC cubane from local CIF files — robust μ-oxo detection + W1–W4 split
# Processes 4RTI, 5XNL, 5XNM, 4IXQ after user upload
# ================================================================
import os
import math
import numpy as np
import matplotlib.pyplot as plt
from itertools import combinations

try:
    import gemmi
except ModuleNotFoundError:
    raise ModuleNotFoundError(
        "gemmi is required. Please run '!pip install gemmi' and then rerun this script."
    )

# -------------------- USER SETTINGS --------------------
# The four structures central to the GQR inverse design project
# The script will look for local files named e.g., "4RTI.cif", "5XNL.cif", etc.
PDB_IDS    = ["4RTI", "5XNL", "5XNM", "4IXQ"]

# Parameters for atom identification
ZOOM_R     = 4.0              # Å radius around metal centroid for zoom plot
W_ANY_MAX  = 3.2              # Å water → any metal shortlist
W_CA_BIND  = 2.60             # Å W1/W2 Ca-bound cutoff
W_CA_FAR   = 3.2              # Å W3/W4 must be ≥ this far from Ca

# -------------------- HELPERS --------------------
def P(a):
    """Convert a gemmi Atom's position into a float NumPy vector [x, y, z]."""
    pos = a.pos
    coords = [pos.x, pos.y, pos.z]
    return np.array(coords, dtype=float)

def d(a,b):
    """Distance between two gemmi Atoms (Euclidean norm of the position difference)."""
    start, end = P(a), P(b)
    return float(np.linalg.norm(start - end))

def set_equal(ax, pts, pad=0.8):
    """Give a 3D axes equal-length x/y/z ranges centred on *pts* and hide ticks.

    Parameters
    ----------
    ax : 3D axes object exposing ``set_xlim``/``set_ylim``/``set_zlim`` and
        ``set_xticks``/``set_yticks``/``set_zticks``.
    pts : array-like of shape (N, 3)
        Points that should be visible in the frame.
    pad : float
        Extra margin added to the half-width of the frame.
    """
    pts = np.asarray(pts)
    cx, cy, cz = pts.mean(axis=0)
    # One shared half-width, derived from the largest per-axis extent.
    span = np.max(np.ptp(pts, axis=0))
    r = span * 0.65 + pad
    ax.set_xlim(cx - r, cx + r)
    # Fixed: the upper y limit was cy - r, which produced an empty y range.
    ax.set_ylim(cy - r, cy + r)
    ax.set_zlim(cz - r, cz + r)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_zticks([])

def draw_bond(ax, A, B, color="k", lw=2.0, alpha=0.95):
    """Render one line segment between the 3D points A and B."""
    per_axis = [[A[axis], B[axis]] for axis in (0, 1, 2)]
    ax.plot(*per_axis, color=color, lw=lw, alpha=alpha)

# -------------------- CORE PROCESSING FUNCTION --------------------
def process_and_plot_oec(cif_file, pdb_id):
    """
    Reads a local CIF file, processes, and plots the OEC structure.
    """
    print(f"\n{'='*20} PROCESSING {pdb_id} from {cif_file} {'='*20}")

    # 1. LOAD STRUCTURE from local file
    st = gemmi.read_structure(cif_file)
    st.remove_hydrogens()
    model = st[0]

    # 2. FIND ALL POTENTIAL ATOMS
    Mn_all, Ca_all, O_all, W_all = [], [], [], []
    for ch in model:
        for res in ch:
            is_w = res.name.upper() in ("HOH","WAT","H2O","DOD","W")
            for a in res:
                el = a.element.name.upper()
                if el == "MN": Mn_all.append(a)
                elif el == "CA": Ca_all.append(a)
                elif el.startswith("O"):
                    (W_all if is_w else O_all).append(a)

    if len(Mn_all) < 4 or len(Ca_all) < 1:
        print("‚ùå ERROR: Could not find at least 4√óMn and 1√óCa in the file.")
        return

    # 3. PICK THE CORRECT Ca + Mn4 CLUSTER
    Mn_xyz = np.array([P(a) for a in Mn_all], float)
    Ca_near = [a for a in Ca_all if np.min(np.linalg.norm(Mn_xyz - P(a), axis=1)) < 6.0]
    if not Ca_near:
        print("‚ùå ERROR: No Ca within 6 √Ö of any Mn.")
        return

    def cubane_cost(mset, Ca_atom):
        pts = np.array([P(m) for m in mset])
        pair_dist = sum(np.linalg.norm(pts[i]-pts[j]) for i in range(4) for j in range(i+1,4))
        cen = pts.mean(axis=0)
        d_ca = np.linalg.norm(cen - P(Ca_atom))
        return pair_dist + 2.5 * d_ca

    best_combo, best_cost = None, 1e9
    for Ca_atom in Ca_near:
        for combo in combinations(Mn_all, 4):
            cst = cubane_cost(combo, Ca_atom)
            if cst < best_cost:
                best_cost = cst
                best_combo = (Ca_atom, list(combo))

    if not best_combo:
        print("‚ùå ERROR: Could not determine the best Mn4Ca cluster.")
        return

    Ca1, Mn4 = best_combo
    metals = [Ca1] + Mn4

    # 4. Œº-OXO PICKER
    def pick_mu_oxo(oxy_atoms, metals, thr_mn_o=2.20, thr_ca_o=2.55, widen=0.05, max_mn=2.40, max_ca=2.70):
        def neighbors(o, t_mn, t_ca):
            return [(m, d(o, m)) for m in metals if (m.element.name.upper() == "MN" and d(o, m) <= t_mn) or \
                                                    (m.element.name.upper() == "CA" and d(o, m) <= t_ca)]
        t_mn, t_ca = thr_mn_o, thr_ca_o
        while True:
            candidates = []
            for o in oxy_atoms:
                nb = neighbors(o, t_mn, t_ca)
                if len(nb) >= 2: candidates.append((o, len(nb), sum(dm for _, dm in nb)))

            deg3 = sorted([c for c in candidates if c[1] >= 3], key=lambda x: (-x[1], x[2]))
            pick_from = deg3 if deg3 else sorted(candidates, key=lambda x: (-x[1], x[2]))

            if len(pick_from) >= 5: return [c[0] for c in pick_from[:5]], t_mn, t_ca
            if t_mn + widen > max_mn and t_ca + widen > max_ca: return [c[0] for c in pick_from], t_mn, t_ca
            t_mn = min(max_mn, t_mn + widen)
            t_ca = min(max_ca, t_ca + widen)

    muO_candidates, thr_mn_o, thr_ca_o = pick_mu_oxo(O_all, metals)
    if len(muO_candidates) < 5:
        print(f"‚ö†Ô∏è WARNING: Only found {len(muO_candidates)} Œº-oxo candidates. Check structure quality.")
        if len(muO_candidates) < 4:
            print(f"‚ùå ERROR: Found only {len(muO_candidates)} Œº-oxo atoms. Cannot proceed.")
            return

    metal_centroid = np.mean([P(m) for m in metals], axis=0)
    muO = sorted(muO_candidates, key=lambda o: np.linalg.norm(P(o) - metal_centroid))[:5]

    # 5. WATER CLASSIFICATION
    def classify_waters(waters, metals, Ca_atom, max_any, ca_bind, far_from_ca):


In [None]:
## ============================================================
## Batch PDB → CIF downloader (RCSB PDB)
## ============================================================
import os, requests

# IDs to fetch, whitespace-separated; case does not matter because each ID
# is upper-cased before use.
pdb_ids = """
6W1U 7RFY 6W1V 7RFZ 6DHE 4IXQ 8EZ5 8F4C
8F4D 8F4E 8F4F 8F4G 8F4H 8F4I 8F4J 8F4K
""".split()

# Destination folder (created if missing)
out_dir = "pdb_cif_batch"
os.makedirs(out_dir, exist_ok=True)

# Fetch each entry from RCSB; any HTTP failure or exception is reported
# per ID and the loop moves on to the next one.
for pid in pdb_ids:
    pid = pid.strip().upper()
    if pid:
        url = f"https://files.rcsb.org/download/{pid}.cif"
        out_path = os.path.join(out_dir, f"{pid}.cif")
        try:
            resp = requests.get(url, timeout=20)
            if not resp.ok:
                print(f"‚ö†Ô∏è  Failed {pid}: HTTP {resp.status_code}")
                continue
            with open(out_path, "wb") as fh:
                fh.write(resp.content)
            print(f"‚úÖ Downloaded {pid}")
        except Exception as err:
            print(f"‚ùå Error {pid}: {err}")

print(f"All done. Files saved in '{out_dir}/'")

✅ Downloaded 6W1U
✅ Downloaded 7RFY
✅ Downloaded 6W1V
✅ Downloaded 7RFZ
✅ Downloaded 6DHE
✅ Downloaded 4IXQ
✅ Downloaded 8EZ5
✅ Downloaded 8F4C
✅ Downloaded 8F4D
✅ Downloaded 8F4E
✅ Downloaded 8F4F
✅ Downloaded 8F4G
✅ Downloaded 8F4H
✅ Downloaded 8F4I
✅ Downloaded 8F4J
✅ Downloaded 8F4K
All done. Files saved in 'pdb_cif_batch/'


In [None]:
!ls -lh pdb_cif_batch

total 100M
-rw-r--r-- 1 root root 5.3M Oct 12 09:54 4IXQ.cif
-rw-r--r-- 1 root root  11M Oct 12 09:54 6DHE.cif
-rw-r--r-- 1 root root  12M Oct 12 09:54 6W1U.cif
-rw-r--r-- 1 root root  11M Oct 12 09:54 6W1V.cif
-rw-r--r-- 1 root root 693K Oct 12 09:54 7RFY.cif
-rw-r--r-- 1 root root 807K Oct 12 09:54 7RFZ.cif
-rw-r--r-- 1 root root 6.0M Oct 12 09:54 8EZ5.cif
-rw-r--r-- 1 root root 6.0M Oct 12 09:54 8F4C.cif
-rw-r--r-- 1 root root 5.9M Oct 12 09:54 8F4D.cif
-rw-r--r-- 1 root root 6.2M Oct 12 09:54 8F4E.cif
-rw-r--r-- 1 root root 6.0M Oct 12 09:54 8F4F.cif
-rw-r--r-- 1 root root 6.1M Oct 12 09:54 8F4G.cif
-rw-r--r-- 1 root root 6.1M Oct 12 09:54 8F4H.cif
-rw-r--r-- 1 root root 6.2M Oct 12 09:55 8F4I.cif
-rw-r--r-- 1 root root 6.1M Oct 12 09:55 8F4J.cif
-rw-r--r-- 1 root root 6.0M Oct 12 09:55 8F4K.cif


In [None]:
!zip -r pdb_cif_batch.zip pdb_cif_batch

  adding: pdb_cif_batch/ (stored 0%)
  adding: pdb_cif_batch/8F4I.cif (deflated 78%)
  adding: pdb_cif_batch/8F4E.cif (deflated 78%)
  adding: pdb_cif_batch/6DHE.cif (deflated 77%)
  adding: pdb_cif_batch/8EZ5.cif (deflated 78%)
  adding: pdb_cif_batch/8F4J.cif (deflated 78%)
  adding: pdb_cif_batch/8F4G.cif (deflated 78%)
  adding: pdb_cif_batch/8F4H.cif (deflated 78%)
  adding: pdb_cif_batch/7RFZ.cif (deflated 73%)
  adding: pdb_cif_batch/6W1U.cif (deflated 74%)
  adding: pdb_cif_batch/4IXQ.cif (deflated 76%)
  adding: pdb_cif_batch/6W1V.cif (deflated 77%)
  adding: pdb_cif_batch/8F4F.cif (deflated 78%)
  adding: pdb_cif_batch/7RFY.cif (deflated 77%)
  adding: pdb_cif_batch/8F4D.cif (deflated 78%)
  adding: pdb_cif_batch/8F4C.cif (deflated 78%)
  adding: pdb_cif_batch/8F4K.cif (deflated 78%)


In [None]:
!ls -lh /content

total 23M
drwxr-xr-x 2 root root 4.0K Oct 12 09:55 pdb_cif_batch
-rw-r--r-- 1 root root  23M Oct 12 09:57 pdb_cif_batch.zip
drwxr-xr-x 1 root root 4.0K Oct  9 13:36 sample_data


In [None]:
!python batch_oec_psii.py

In [None]:
!sed 's/^## [0-9]\+ | //' batch_oec_psii.py > run_batch_oec_psii.py
!python run_batch_oec_psii.py

Traceback (most recent call last):
  File "/content/run_batch_oec_psii.py", line 12, in <module>
    import gemmi
ModuleNotFoundError: No module named 'gemmi'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/content/run_batch_oec_psii.py", line 14, in <module>
    raise ModuleNotFoundError("gemmi is required. Install with: pip install gemmi")
ModuleNotFoundError: gemmi is required. Install with: pip install gemmi


In [None]:
%pip -q install gemmi matplotlib numpy && \
sed 's/^## [0-9]\+ | //' batch_oec_psii.py > run_batch_oec_psii.py && \
python run_batch_oec_psii.py

[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/2.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m[90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.3/2.6 MB[0m [31m8.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[90m‚ï∫[0m[90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.0/2.6 MB[0m [31m29.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.6/2.6 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[?25hCIF directory: /content/pdb_cif_batch
Output directory: /content/oec_outputs
[6W1U] done.
[7RFY] analysis failed: Need at least 4×Mn and 1×Ca near OEC.
Traceback (most recent 

In [None]:
# =====================================================================
# PSII OEC Cubane Analyzer (Batch) — Cartesian coords, robust μ-oxo, W3/W4 focus
# CIF folder: ./pdb_cif_batch   Outputs: ./oec_outputs
# =====================================================================

import os, csv, glob, math, traceback, urllib.request
from itertools import combinations
import numpy as np
import matplotlib.pyplot as plt

# -------------------- CONFIG --------------------
# PDB entries to analyse.
TARGET_IDS = [
    "6W1U","6W1V","6DHE","4IXQ","8EZ5","8F4C","8F4D","8F4E","8F4F","8F4G","8F4H","8F4I","8F4J","8F4K"
    # (7RFY, 7RFZ are often polymer-only; we skip for OEC)
]
CIF_DIR   = "pdb_cif_batch"   # folder holding the input .cif files
OUT_DIR   = "oec_outputs"     # folder for generated outputs
# Both folders are created at import time if they do not exist yet.
os.makedirs(CIF_DIR, exist_ok=True)
os.makedirs(OUT_DIR, exist_ok=True)

# Water classification thresholds (tune if needed)
W_ANY_MAX  = 3.2    # Å water → any metal shortlist
W_CA_BIND  = 2.60   # Å W1/W2 Ca-bound cutoff
W_CA_FAR   = 3.2    # Å W3/W4 must be ≥ this far from Ca

# Plotting
ZOOM_R           = 4.0  # Å half-edge for zoom cube
SHOW_CA_WATERS   = False  # if True, draw_panel also plots the Ca-side waters (W1/W2); else only W3/W4
MUTE_CA_WATERS   = True   # if showing Ca waters, draw them faintly (alpha 0.35 instead of 0.95)

# -------------------- Gfx helpers --------------------
def _draw_bond(ax, A, B, color="k", lw=2.0, alpha=0.95):
    ax.plot([A[0],B[0]],[A[1],B[1]],[A[2],B[2]], color=color, lw=lw, alpha=alpha)

def _set_equal(ax, pts, pad=0.8):
    pts = np.asarray(pts); cx,cy,cz = pts.mean(axis=0)
    span = np.max(np.ptp(pts, axis=0))
    r = (span * 0.65 + pad) if np.isfinite(span) and span > 0 else 4.0
    ax.set_xlim(cx-r, cx+r); ax.set_ylim(cy-r, cy+r); ax.set_zlim(cz-r, cz+r)
    ax.set_xticks([]); ax.set_yticks([]); ax.set_zticks([])

# -------------------- Geometry helpers (Å, not fractional!) --------------------
CELL = None  # set per-structure

def _P(a):
    # Cartesian (√Ö) from fractional coord using current unit cell
    # gemmi.UnitCell.orth(Position) ‚Üí Vec3 in √Ö
    p = CELL.orth(a.pos)
    return np.array([p.x, p.y, p.z], float)

def _d(a, b):
    """Return the Euclidean distance between atoms *a* and *b* as a Python float."""
    offset = _P(a) - _P(b)
    return float(np.linalg.norm(offset))

# -------------------- IO helpers --------------------
def download_cif(pdb_id, dest):
    """Fetch ``<pdb_id>.cif`` from RCSB and store it at *dest*.

    Returns ``(True, "")`` on success, or ``(False, message)`` where *message*
    is the string form of whatever exception the download raised.
    """
    source = f"https://files.rcsb.org/download/{pdb_id}.cif"
    try:
        urllib.request.urlretrieve(source, dest)
    except Exception as err:
        return False, str(err)
    return True, ""

# -------------------- Core picking logic --------------------
def pick_cubane_CaMn4(model):
    """Select the Ca atom and the four Mn atoms that best match one compact cluster.

    Parameters
    ----------
    model : iterable of chains, each iterable of residues, each iterable of atoms.

    Returns
    -------
    tuple
        ``(Ca_atom, [Mn, Mn, Mn, Mn])`` with the lowest cost, where cost is the
        sum of the six Mn-Mn distances plus 2.5 x the Mn-centroid-to-Ca distance.

    Raises
    ------
    RuntimeError
        If no Mn and no Ca atoms survive filtering, if fewer than 4 Mn or no
        Ca are found, or if no Ca lies within 6 Å of any Mn.
    """
    # gemmi reports "no alternate location" as the NUL character, so it must be
    # accepted next to '' and 'A'; without it every atom that has no altloc
    # would be filtered out.
    primary_altlocs = ("", "\0", "A")

    Mn_all, Ca_all = [], []
    for ch in model:
        for res in ch:
            for a in res:
                # Keep only the primary conformer and atoms with occupancy >= 0.5
                if getattr(a, "altloc", "") not in primary_altlocs:
                    continue
                if hasattr(a, "occ") and a.occ is not None and a.occ < 0.5:
                    continue
                el = a.element.name.upper()
                if el == "MN": Mn_all.append(a)
                elif el == "CA": Ca_all.append(a)

    if len(Mn_all) == 0 and len(Ca_all) == 0:
        raise RuntimeError("No Mn or Ca atoms present (polymer-only deposition).")
    if len(Mn_all) < 4 or len(Ca_all) < 1:
        raise RuntimeError("Need at least 4√óMn and 1√óCa near OEC.")

    # Only Ca atoms within 6 Å of some Mn can belong to the cluster.
    Mn_xyz = np.array([_P(a) for a in Mn_all], float)
    Ca_near = [a for a in Ca_all if np.min(np.linalg.norm(Mn_xyz - _P(a), axis=1)) < 6.0]
    if not Ca_near:
        raise RuntimeError("No Ca within 6 √Ö of any Mn ‚Äî check symmetry/assembly.")

    def cubane_cost(mset, Ca_atom):
        # Compactness of the four Mn plus a weighted distance from their centroid to Ca
        pts = np.array([_P(m) for m in mset])
        pair = sum(np.linalg.norm(pts[i]-pts[j]) for i in range(4) for j in range(i+1,4))
        cen  = pts.mean(axis=0)
        d_ca = np.linalg.norm(cen - _P(Ca_atom))
        return pair + 2.5*d_ca

    # Exhaustive search over every (Ca, 4-Mn combination); lowest cost wins.
    best_combo, best_cost = None, 1e9
    for Ca_atom in Ca_near:
        for combo in combinations(Mn_all, 4):
            cst = cubane_cost(combo, Ca_atom)
            if cst < best_cost:
                best_cost = cst
                best_combo = (Ca_atom, list(combo))
    return best_combo  # (Ca1, [Mn...])

def pick_mu_oxo(oxy_atoms, metals,
                thr_mn_o=2.20, thr_ca_o=2.55,
                widen=0.05, max_mn=2.40, max_ca=2.70):
    """Pick bridging oxygens around *metals*.

    An oxygen is a candidate when at least two metals lie within the
    element-specific cutoff (Mn-O <= t_mn, Ca-O <= t_ca). Cutoffs start at
    ``thr_mn_o``/``thr_ca_o`` and are widened in steps of ``widen`` up to
    ``max_mn``/``max_ca``; a single wider "rescue" pass follows if fewer than
    four candidates were found. When four or more remain, the four closest to
    the metal centroid are returned.

    Returns ``(oxygen_atoms, t_mn, t_ca)`` with the cutoffs finally in effect.
    """
    def bridged_metals(o, t_mn, t_ca):
        # (metal, distance) for every metal within its element-specific cutoff
        hits = []
        for m in metals:
            dist = _d(o, m)
            el = m.element.name.upper()
            within_mn = el == "MN" and dist <= t_mn
            within_ca = el == "CA" and dist <= t_ca
            if within_mn or within_ca:
                hits.append((m, dist))
        return hits

    def ranked_candidates(t_mn, t_ca):
        # Tuples of (oxygen, bridging degree, summed metal-O distance)
        scored = []
        for o in oxy_atoms:
            hits = bridged_metals(o, t_mn, t_ca)
            if len(hits) < 2:
                continue
            scored.append((o, len(hits), sum(dist for _, dist in hits)))
        # Prefer oxygens bridging three or more metals whenever any exist.
        triple = [c for c in scored if c[1] >= 3]
        pool = triple if triple else scored
        return sorted(pool, key=lambda c: (-c[1], c[2]))

    # Pass 1: widen the cutoffs step by step until four candidates appear
    t_mn, t_ca = thr_mn_o, thr_ca_o
    while True:
        pick_from = ranked_candidates(t_mn, t_ca)
        if len(pick_from) >= 4:
            break
        room_left = (t_mn + widen <= max_mn) or (t_ca + widen <= max_ca)
        if not room_left:
            break
        t_mn = min(max_mn, t_mn + widen)
        t_ca = min(max_ca, t_ca + widen)

    # Pass 2: one gentle rescue with slightly larger cutoffs, kept only if it finds more
    if len(pick_from) < 4:
        t_mn2 = min(2.60, max_mn+0.20)
        t_ca2 = min(2.90, max_ca+0.20)
        rescued = ranked_candidates(t_mn2, t_ca2)
        if len(rescued) > len(pick_from):
            pick_from, t_mn, t_ca = rescued, t_mn2, t_ca2

    # With four or more candidates, keep the four nearest the metal centroid
    if len(pick_from) >= 4:
        cen = np.mean([_P(m) for m in metals], axis=0)
        pick_from = sorted(pick_from, key=lambda tup: np.linalg.norm(_P(tup[0]) - cen))[:4]

    return [c[0] for c in pick_from], t_mn, t_ca

def classify_waters(waters, metals, Ca_atom, max_any, ca_bind, far_from_ca):
    """Split nearby waters into Ca-bound (W1/W2) and Mn-side (W3/W4) sets.

    Parameters
    ----------
    waters : iterable of water oxygen atoms.
    metals : list of the selected metal atoms (Ca and Mn).
    Ca_atom : the selected Ca atom.
    max_any : float
        A water is shortlisted only if within this distance of some metal.
    ca_bind : float
        Shortlisted waters within this distance of Ca are Ca-bound.
    far_from_ca : float
        Mn-side waters must be at least this far from Ca.

    Returns
    -------
    tuple
        ``(ca_w, chosen)``: up to two Ca-bound waters sorted by Ca distance,
        and up to two Mn-side waters sorted by nearest-Mn distance that are
        at least 1.2 Å apart from each other.
    """
    # gemmi reports "no alternate location" as the NUL character, so it must be
    # accepted next to '' and 'A'; without it every water that has no altloc
    # would be filtered out.
    primary_altlocs = ("", "\0", "A")

    close = []
    for w in waters:
        if getattr(w, "altloc", "") not in primary_altlocs:
            continue
        if hasattr(w, "occ") and w.occ is not None and w.occ < 0.5:
            continue
        if min(_d(w, m) for m in metals) <= max_any:
            close.append(w)
    ca_w  = sorted([w for w in close if _d(w, Ca_atom) <= ca_bind], key=lambda w: _d(w, Ca_atom))[:2]
    # Mn-side candidates: not already Ca-bound, far enough from Ca, and near some Mn
    mn_only = []
    for w in close:
        if w in ca_w:
            continue
        if _d(w, Ca_atom) >= far_from_ca and any(m.element.name.upper()=="MN" and _d(w,m) <= max_any for m in metals):
            mn_only.append(w)
    # Greedy pick by nearest-Mn distance, skipping waters within 1.2 Å of one already chosen
    chosen = []
    for w in sorted(mn_only, key=lambda w: min(_d(w,m) for m in metals if m.element.name.upper()=="MN")):
        if all(np.linalg.norm(_P(w)-_P(x)) >= 1.2 for x in chosen):
            chosen.append(w)
        if len(chosen) == 2:
            break
    return ca_w, chosen

# -------------------- Drawing --------------------
# Marker colour (COL) and marker size (SZ) per atom category. Keys: "MN"/"CA"
# match the upper-cased element name of a metal, "O_MU" is used for μ-oxo
# oxygens, and "W_CA"/"W_MN" for Ca-side and Mn-side waters.
COL = {"MN":"#7D3C98", "CA":"#5DADE2", "O_MU":"#E74C3C", "W_CA":"#FFB6B6", "W_MN":"#FFC9A9"}
SZ  = {"MN":280, "CA":320, "O_MU":160, "W_CA":110, "W_MN":110}

def draw_panel(path_png, title, metals, muO, W_ca, W_mn, W_labels, M_O_edges, MM_edges, zoom=None):
    """Render one 3D panel of the cluster and save it to *path_png* at 300 dpi.

    Draws the metals, the μ-oxo oxygens, metal-oxygen and metal-metal edges,
    and the waters (Ca-side waters only when SHOW_CA_WATERS is set).

    Parameters
    ----------
    path_png : output image path.
    title : axes title.
    metals, muO : atoms to draw; edge tuples index into these lists.
    W_ca, W_mn : Ca-side and Mn-side water atoms.
    W_labels : dict mapping ``id(water)`` to its label; "W" is used if missing.
    M_O_edges : iterable of (metal_index, oxygen_index).
    MM_edges : iterable of (metal_index, metal_index).
    zoom : None to frame all plotted points, or ``(cx, cy, cz, r)`` for a
        fixed cube of half-width *r* around the given centre.
    """
    fig = plt.figure(figsize=(7.8, 6.3))
    ax  = fig.add_subplot(111, projection='3d')
    pts = []

    # Metals: Ca is labelled "Ca"; Mn atoms are numbered in list order.
    mn_seen = 0
    for m in metals:
        xyz = _P(m)
        kind = m.element.name.upper()
        ax.scatter([xyz[0]],[xyz[1]],[xyz[2]], s=SZ[kind], c=COL[kind], ec="k", lw=0.7)
        label = "Ca" if kind == "CA" else f"Mn{mn_seen + 1}"
        if kind == "MN":
            mn_seen += 1
        ax.text(xyz[0], xyz[1], xyz[2]+0.22, label, fontsize=9)
        pts.append(xyz)

    # μ-oxo oxygens, numbered from 1
    for number, o in enumerate(muO, 1):
        xyz = _P(o)
        ax.scatter([xyz[0]],[xyz[1]],[xyz[2]], s=SZ["O_MU"], c=COL["O_MU"], ec="k", lw=0.6)
        ax.text(xyz[0], xyz[1], xyz[2]-0.28, f"O{number}", fontsize=8, color="#333")
        pts.append(xyz)

    # Bonds: strong metal-oxygen lines, faint metal-metal lines
    for mi, oi in M_O_edges:
        _draw_bond(ax, _P(metals[mi]), _P(muO[oi]), "k", 2.2, 0.95)
    for mi, mj in MM_edges:
        _draw_bond(ax, _P(metals[mi]), _P(metals[mj]), "#333", 1.2, 0.35)

    # Waters (W3/W4 focus)
    waters_to_plot = (list(W_ca) if SHOW_CA_WATERS else []) + list(W_mn)
    for w in waters_to_plot:
        xyz = _P(w)
        on_ca_side = w in W_ca
        key = "W_CA" if on_ca_side else "W_MN"
        if on_ca_side and MUTE_CA_WATERS:
            alpha = 0.35
        else:
            alpha = 0.95
        ax.scatter([xyz[0]],[xyz[1]],[xyz[2]], s=SZ[key], c=COL[key], ec="k", lw=0.5, alpha=alpha)
        ax.text(xyz[0], xyz[1], xyz[2]+0.22, W_labels.get(id(w), "W"), fontsize=8, color="#a33")
        # Thin line to the nearest metal, drawn only when it is within 2.5 Å
        nearest = min(metals, key=lambda m: np.linalg.norm(_P(m)-xyz))
        if np.linalg.norm(_P(nearest)-xyz) <= 2.5:
            _draw_bond(ax, xyz, _P(nearest), "#666", 1.0, 0.7)
        pts.append(xyz)

    ax.set_title(title)
    if zoom is not None:
        cx, cy, cz, r = zoom
        ax.set_xlim(cx-r, cx+r)
        ax.set_ylim(cy-r, cy+r)
        ax.set_zlim(cz-r, cz+r)
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_zticks([])
    else:
        _set_equal(ax, pts)
    plt.tight_layout()
    fig.savefig(path_png, dpi=300)
    plt.close(fig)

# -------------------- Analyze one file --------------------
def _collect_oxygen_pools(model):
    """Split the oxygen atoms of *model* into non-water and water pools.

    An atom is kept only when it has no alternate-location flag or belongs
    to conformer "A", and (when an occupancy is available) its occupancy is
    at least 0.5.

    Returns:
        ``(O_all, W_all)`` -- two lists of atoms. ``W_all`` holds oxygens
        from residues whose name is one of the known water names; ``O_all``
        holds every other retained oxygen.
    """
    water_names = {"HOH", "WAT", "H2O", "DOD", "W"}
    # gemmi reports "no altloc" as the NUL character rather than an empty
    # string, so "\0" has to be accepted or ordinary atoms get discarded.
    kept_altlocs = ("", "\0", "A")
    O_all, W_all = [], []
    for ch in model:
        for res in ch:
            pool = W_all if res.name.upper() in water_names else O_all
            for a in res:
                if getattr(a, "altloc", "") not in kept_altlocs:
                    continue
                occ = getattr(a, "occ", None)
                if occ is not None and occ < 0.5:
                    continue
                # Exact match: a prefix test would also accept osmium ("OS").
                if a.element.name.upper() == "O":
                    pool.append(a)
    return O_all, W_all


def analyze_one(cif_path, out_prefix):
    """Analyze one structure file and write its distance CSV and two plots.

    Reads the structure at *cif_path*, strips hydrogens, selects the Ca/Mn
    metals via ``pick_cubane_CaMn4``, the bridging oxygens via
    ``pick_mu_oxo`` and the waters via ``classify_waters``, then writes
    ``{out_prefix}_distances.csv``, ``{out_prefix}_overview.png`` and
    ``{out_prefix}_zoom.png``.

    Side effects:
        Rebinds the module-level ``CELL`` to the structure's unit cell
        (presumably read by the coordinate helpers -- verify against
        ``_P``/``_d``).

    Args:
        cif_path: Path of the structure file to read.
        out_prefix: Path prefix used for all three output files.

    Returns:
        dict with keys ``"muO_count"`` (number of bridging oxygens found),
        ``"waters"`` (labels of the classified waters, ``"?"`` when
        unlabeled) and ``"csv"`` (path of the distance CSV).

    Raises:
        RuntimeError: if the median metal-metal distance is below 1.0, or
            if fewer than four bridging oxygens are found. Helpers called
            here may raise as well.
    """
    global CELL
    import gemmi  # import here so pip install in colab works per run
    st = gemmi.read_structure(cif_path)
    st.remove_hydrogens()
    CELL = st.cell
    model = st[0]

    # collect oxygen pool + waters (with altloc/occ filters)
    O_all, W_all = _collect_oxygen_pools(model)

    Ca1, Mn4 = pick_cubane_CaMn4(model)
    metals = [Ca1] + Mn4

    # Fractional sanity check (distances too tiny?)
    mm = [_d(a, b) for a, b in combinations(metals, 2)]
    if np.median(mm) < 1.0:
        raise RuntimeError("Coordinates look fractional; check _P() and CELL assignment.")

    muO, thr_mn_o, thr_ca_o = pick_mu_oxo(O_all, metals)
    if len(muO) < 4:
        raise RuntimeError(f"Only found {len(muO)} Œº-oxo (Mn‚ÄìO‚â§{thr_mn_o:.2f}, Ca‚ÄìO‚â§{thr_ca_o:.2f}).")

    W_ca, W_mn = classify_waters(W_all, metals, Ca1, W_ANY_MAX, W_CA_BIND, W_CA_FAR)
    # Labels are keyed by id() of the atom object; the first two entries of
    # each list get W1/W2 and W3/W4 respectively.
    W_labels = {}
    for tag, w in zip(("W1", "W2"), W_ca):
        W_labels[id(w)] = tag
    for tag, w in zip(("W3", "W4"), W_mn):
        W_labels[id(w)] = tag

    # edges: metal index -> bridging-oxygen index, using the per-element cutoff
    M_O_edges = []
    for i, m in enumerate(metals):
        thr = thr_ca_o if m.element.name.upper() == "CA" else thr_mn_o
        for j, o in enumerate(muO):
            if _d(m, o) <= thr:
                M_O_edges.append((i, j))

    # Two metals are linked when they share at least one bridging oxygen.
    # A set gives O(1) membership tests; the list itself is still what gets
    # passed on for drawing.
    edge_lookup = set(M_O_edges)
    MM_edges = {
        (i, j)
        for i, j in combinations(range(len(metals)), 2)
        if any((i, k) in edge_lookup and (j, k) in edge_lookup for k in range(len(muO)))
    }

    # outputs
    csv_path = f"{out_prefix}_distances.csv"
    with open(csv_path, "w", newline="", encoding="utf-8") as fh:
        wr = csv.writer(fh)
        wr.writerow(["Type","From","To","Distance_A"])
        for i, j in combinations(range(len(metals)), 2):
            wr.writerow(["M-M", metals[i].element.name, metals[j].element.name, f"{_d(metals[i], metals[j]):.3f}"])
        for i, j in M_O_edges:
            wr.writerow(["M-Omu", metals[i].element.name, f"O{j+1}", f"{_d(metals[i], muO[j]):.3f}"])
        for w in (W_mn if not SHOW_CA_WATERS else (W_ca+W_mn)):
            # nearest metal to this water
            nm = min(metals, key=lambda m: np.linalg.norm(_P(m)-_P(w)))
            wr.writerow(["W-Mnear", W_labels.get(id(w), "W"), nm.element.name, f"{_d(w, nm):.3f}"])

    title = os.path.basename(cif_path).replace(".cif","")
    draw_panel(f"{out_prefix}_overview.png", f"OEC cubane ‚Äî HYDRATED ({title})",
               metals, muO, W_ca, W_mn, W_labels, M_O_edges, MM_edges, zoom=None)
    # zoom box is centred on the mean metal position
    cen = np.mean(np.array([_P(m) for m in metals]), axis=0)
    draw_panel(f"{out_prefix}_zoom.png", "OEC cubane ‚Äî HYDRATED (zoom)",
               metals, muO, W_ca, W_mn, W_labels, M_O_edges, MM_edges,
               zoom=(cen[0], cen[1], cen[2], ZOOM_R))

    return {
        "muO_count": len(muO),
        "waters": [W_labels.get(id(w), "?") for w in (W_ca + W_mn)],
        "csv": csv_path
    }

# -------------------- Batch main --------------------
def main():
    """Process every ID in ``TARGET_IDS`` and write a batch summary CSV.

    For each ID the structure file ``{CIF_DIR}/{ID}.cif`` is downloaded via
    ``download_cif`` when it is not already on disk, then passed to
    ``analyze_one``. A failure for one ID is recorded and reported but does
    not stop the batch. The summary is written to
    ``{OUT_DIR}/batch_oec_summary.csv``; every row has the same six columns
    as the header, with the error text of a failed entry in ``notes``.
    """
    # Make sure both folders exist before anything is written into them.
    for folder in (CIF_DIR, OUT_DIR):
        if folder:
            os.makedirs(folder, exist_ok=True)

    print("CIF directory:", os.path.abspath(CIF_DIR))
    print("Output directory:", os.path.abspath(OUT_DIR))
    summary_csv = os.path.join(OUT_DIR, "batch_oec_summary.csv")

    header = ["PDB_ID","status","dist_csv","overview_png","zoom_png","notes"]
    results = []
    for pid in TARGET_IDS:
        pid = pid.upper()
        cif_path = os.path.join(CIF_DIR, f"{pid}.cif")
        if not os.path.exists(cif_path):
            ok, err = download_cif(pid, cif_path)
            if not ok:
                # Three empty path columns so the message lands under "notes".
                results.append([pid, "download_failed", "", "", "", err])
                print(f"[{pid}] download failed:", err)
                continue
            print(f"[{pid}] downloaded.")

        try:
            out_prefix = os.path.join(OUT_DIR, pid)
            info = analyze_one(cif_path, out_prefix)
            results.append([pid, "ok", info["csv"], f"{out_prefix}_overview.png", f"{out_prefix}_zoom.png", f"ŒºO={info['muO_count']} W={','.join(info['waters'])}"])
            print(f"[{pid}] done.")
        except Exception as e:
            # Batch boundary: record the failure and carry on with the next ID.
            tb = traceback.format_exc(limit=2)
            results.append([pid, "analysis_failed", "", "", "", str(e)])
            print(f"[{pid}] analysis failed:", e, "\n", tb)

    # Explicit encoding: the notes column contains non-ASCII characters.
    with open(summary_csv, "w", newline="", encoding="utf-8") as fh:
        wr = csv.writer(fh)
        wr.writerow(header)
        wr.writerows(results)
    print("Summary saved to:", summary_csv)

if __name__ == "__main__":
    # Script entry point. A fresh Colab runtime may not have gemmi yet, so
    # probe for it and install it with pip for the running interpreter when
    # the import fails, then start the batch.
    try:
        import gemmi  # noqa: F401
    except ModuleNotFoundError:
        import sys, subprocess
        pip_install = [sys.executable, "-m", "pip", "install", "-q", "gemmi"]
        subprocess.check_call(pip_install)

    main()

CIF directory: /content/pdb_cif_batch
Output directory: /content/oec_outputs
[6W1U] analysis failed: No Mn or Ca atoms present (polymer-only deposition). 
 Traceback (most recent call last):
  File "/tmp/ipython-input-345286049.py", line 340, in main
    info = analyze_one(cif_path, out_prefix)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-345286049.py", line 256, in analyze_one
    Ca1, Mn4 = pick_cubane_CaMn4(model)
               ^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: No Mn or Ca atoms present (polymer-only deposition).

[6W1V] analysis failed: No Mn or Ca atoms present (polymer-only deposition). 
 Traceback (most recent call last):
  File "/tmp/ipython-input-345286049.py", line 340, in main
    info = analyze_one(cif_path, out_prefix)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-345286049.py", line 256, in analyze_one
    Ca1, Mn4 = pick_cubane_CaMn4(model)
               ^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: No Mn or Ca atoms p