Passo 0 — Imports e carregar o CSV

In [1]:
import re
import numpy as np
import pandas as pd

PATH_CSV = "CoV-AbDab_080224.csv"  # ajusta o path se necessário

df_raw = pd.read_csv(PATH_CSV)
print(df_raw.shape)
print(df_raw.columns.tolist())
df_raw.head(3)


(12918, 23)
['Name', 'Ab or Nb', 'Binds to', "Doesn't Bind to", 'Neutralising Vs', 'Not Neutralising Vs', 'Protein + Epitope', 'Origin', 'VHorVHH', 'VL', 'Heavy V Gene', 'Heavy J Gene', 'Light V Gene', 'Light J Gene', 'CDRH3', 'CDRL3', 'Structures', 'ABB Homology Model (if no structure)', 'Sources', 'Date Added', 'Last Updated', 'Update Description', 'Notes/Following Up?']


Unnamed: 0,Name,Ab or Nb,Binds to,Doesn't Bind to,Neutralising Vs,Not Neutralising Vs,Protein + Epitope,Origin,VHorVHH,VL,...,Light J Gene,CDRH3,CDRL3,Structures,ABB Homology Model (if no structure),Sources,Date Added,Last Updated,Update Description,Notes/Following Up?
0,Curtis_3548_S-2,Ab,SARS-CoV2_WT;SARS-CoV2_Beta,SARS-CoV2_Omicron-BA1;HKU1,SARS-CoV2_WT (weak),,S; RBD/non-RBD,B-cells; SARS-CoV2 Human Patient,ND,ND,...,ND,ARGSRNDLRDFDY,QSYNSSLSGLVV,ND,,"Nicholas Curtis et al., 2023 (https://www.bior...","Feb 8, 2024","Feb 8, 2024",,Complete
1,Curtis_3548_S-7,Ab,SARS-CoV2_WT;SARS-CoV2_Beta,SARS-CoV2_Omicron-BA1;HKU1,,SARS-CoV2_WT,S; non-RBD,B-cells; SARS-CoV2 Human Patient,ND,ND,...,ND,AREPYSSGMGGRDY,QQYGSSPYT,ND,,"Nicholas Curtis et al., 2023 (https://www.bior...","Feb 8, 2024","Feb 8, 2024",,Complete
2,Curtis_3548_RBD-15,Ab,SARS-CoV2_WT,SARS-CoV2_Beta;SARS-CoV2_Omicron-BA1;HKU1,,SARS-CoV2_WT,S; iso-RBD,B-cells; SARS-CoV2 Human Patient,ND,ND,...,ND,AKGIYSSSSYWFGP,QAWDSSTVV,ND,,"Nicholas Curtis et al., 2023 (https://www.bior...","Feb 8, 2024","Feb 8, 2024",,Binds only non-prefusion stabilised RBD. Complete


Passo 1 — Funções utilitárias (parsing de variantes e PDB IDs)

In [2]:
AA_VALID = set(list("ACDEFGHIKLMNPQRSTVWY"))  # 20 AAs padrão

def split_multi_field(x: str) -> list[str]:
    """Divide campos multi-valor; ignora ND/vazios."""
    if pd.isna(x):
        return []
    s = str(x).strip()
    if s in ("", "ND", "nan"):
        return []
    parts = re.split(r"[;,]\s*", s)
    parts = [p.strip() for p in parts if p.strip() and p.strip() != "ND"]
    return parts

def normalize_variant(v: str) -> str:
    """Remove ruído tipo '(weak)' e normaliza espaços."""
    v = str(v)
    v = v.replace("(weak)", "").replace("weak", "")
    v = re.sub(r"\s+", "", v)
    # casos raros do tipo '(weak)SARS-CoV2_...' ou colados
    v = v.replace(")SARS", "SARS")
    v = v.strip()
    return v

def is_sarscov2(v: str) -> bool:
    """Mantém apenas variantes SARS-CoV-2 (depois de normalizar)."""
    v = normalize_variant(v)
    return v.startswith("SARS-CoV2")

def extract_pdb_ids(structures_field: str) -> list[str]:
    """
    Extrai IDs PDB (4 chars) de:
    - 'ND'
    - URLs do RCSB: https://www.rcsb.org/structure/7YH6;...
    - ou strings que contenham o padrão de PDB id
    """
    if pd.isna(structures_field):
        return []
    s = str(structures_field)
    if s.strip() in ("", "ND"):
        return []
    # encontra padrões 4-char típicos de PDB (ex: 7YH6, 8IX3)
    # PDB IDs: [0-9][A-Za-z0-9]{3}
    ids = re.findall(r"\b[0-9][A-Za-z0-9]{3}\b", s)
    # normaliza para uppercase
    ids = [i.upper() for i in ids]
    # remove duplicados preservando ordem
    seen = set()
    out = []
    for i in ids:
        if i not in seen:
            out.append(i)
            seen.add(i)
    return out

def clean_sequence(seq: str) -> str | None:
    """Remove espaços, troca 'ND' por None e valida se só tem aminoácidos."""
    if pd.isna(seq):
        return None
    s = str(seq).strip().upper()
    if s in ("", "ND", "N/A", "NA"):
        return None
    s = re.sub(r"\s+", "", s)
    # valida caracteres
    if any(ch not in AA_VALID for ch in s):
        return None
    return s


Passo 2 — Selecionar e limpar colunas essenciais

In [19]:
cols_needed = ["VHorVHH", "VL", "Structures", "Neutralising Vs", "Not Neutralising Vs"]
missing = [c for c in cols_needed if c not in df_raw.columns]
if missing:
    raise ValueError(f"Faltam colunas esperadas: {missing}")

df = df_raw[cols_needed].copy()

# limpar sequências
df["chain_heavy"] = df["VHorVHH"].apply(clean_sequence)
df["chain_light"] = df["VL"].apply(clean_sequence)

# extrair PDBs
df["pdb_ids"] = df["Structures"].apply(extract_pdb_ids)

print("Antes filtros:", df.shape)
df.head(3)


Antes filtros: (12918, 8)


Unnamed: 0,VHorVHH,VL,Structures,Neutralising Vs,Not Neutralising Vs,chain_heavy,chain_light,pdb_ids
0,ND,ND,ND,SARS-CoV2_WT (weak),,,,[]
1,ND,ND,ND,,SARS-CoV2_WT,,,[]
2,ND,ND,ND,,SARS-CoV2_WT,,,[]


Passo 3 — Filtrar: apenas entradas com VH + VL + pelo menos 1 PDB

In [4]:
df_f = df.dropna(subset=["chain_heavy", "chain_light"]).copy()
df_f = df_f[df_f["pdb_ids"].map(len) > 0].copy()

print("Depois filtros VH+VL+PDB:", df_f.shape)


Depois filtros VH+VL+PDB: (576, 8)


Passo 4 — Expandir para o formato “1 interação por linha”

Aqui criamos:

linhas positivas (label=1) para cada variante em Neutralising Vs

linhas negativas (label=0) para cada variante em Not Neutralising Vs

E mantemos só SARS-CoV-2

In [5]:
records = []

for _, row in df_f.iterrows():
    vh = row["chain_heavy"]
    vl = row["chain_light"]
    pdb_list = row["pdb_ids"]

    pos_vars = [normalize_variant(v) for v in split_multi_field(row["Neutralising Vs"]) if is_sarscov2(v)]
    neg_vars = [normalize_variant(v) for v in split_multi_field(row["Not Neutralising Vs"]) if is_sarscov2(v)]

    # Se a mesma variante aparecer como pos e neg, é ambíguo -> removemos essa variante
    pos_set = set(pos_vars)
    neg_set = set(neg_vars)
    conflicts = pos_set & neg_set
    if conflicts:
        pos_vars = [v for v in pos_vars if v not in conflicts]
        neg_vars = [v for v in neg_vars if v not in conflicts]

    # Estratégia simples: usar o 1º PDB como representante do anticorpo
    # (Se quiseres, depois fazemos versão "explode PDBs" também.)
    pdb_id = pdb_list[0]

    for v in pos_vars:
        records.append((pdb_id, vh, vl, v, 1))

    for v in neg_vars:
        records.append((pdb_id, vh, vl, v, 0))

df_pairs = pd.DataFrame(records, columns=["pdb_id", "chain_heavy", "chain_light", "variant_target", "label"])
print("Interações (antes dedup):", df_pairs.shape)
df_pairs.head(5)


Interações (antes dedup): (4196, 5)


Unnamed: 0,pdb_id,chain_heavy,chain_light,variant_target,label
0,8J1T,VQLVESGGGLVQPGGSLRLSCAASGLTVSSNYMNWVRQAPGKGLEW...,DIQMTQSPSFLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKL...,SARS-CoV2_WT,1
1,8J1T,VQLVESGGGLVQPGGSLRLSCAASGLTVSSNYMNWVRQAPGKGLEW...,DIQMTQSPSFLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKL...,SARS-CoV2_Alpha,1
2,8J1T,VQLVESGGGLVQPGGSLRLSCAASGLTVSSNYMNWVRQAPGKGLEW...,DIQMTQSPSFLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKL...,SARS-CoV2_Beta,1
3,8J1T,VQLVESGGGLVQPGGSLRLSCAASGLTVSSNYMNWVRQAPGKGLEW...,DIQMTQSPSFLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKL...,SARS-CoV2_Gamma,1
4,8J1T,VQLVESGGGLVQPGGSLRLSCAASGLTVSSNYMNWVRQAPGKGLEW...,DIQMTQSPSFLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKL...,SARS-CoV2_Delta,1


Passo 5 — Remover duplicados e validar labels

In [6]:
df_pairs = df_pairs.drop_duplicates(subset=["pdb_id", "variant_target", "label"]).copy()

# sanity checks
assert set(df_pairs["label"].unique()).issubset({0, 1})
assert df_pairs["pdb_id"].str.len().eq(4).all()

print("Interações (depois dedup):", df_pairs.shape)
print("PDBs únicos:", df_pairs["pdb_id"].nunique())
print(df_pairs["label"].value_counts(normalize=True))


Interações (depois dedup): (3994, 5)
PDBs únicos: 472
label
1    0.639209
0    0.360791
Name: proportion, dtype: float64


Passo 6 — Criar antibody_sequence e features simples (bioquímicas)

estas features são ótimas para:

baseline

EDA

justificar processamento

In [11]:
def aa_freq_features(seq: str, prefix: str) -> dict:
    # frequência relativa de cada AA
    n = len(seq)
    counts = {aa: 0 for aa in AA_VALID}
    for ch in seq:
        counts[ch] += 1
    return {f"{prefix}_freq_{aa}": counts[aa] / n for aa in sorted(AA_VALID)}

def basic_seq_features(seq: str, prefix: str) -> dict:
    return {
        f"{prefix}_len": len(seq),
        f"{prefix}_frac_gly": seq.count("G") / len(seq),
        f"{prefix}_frac_pro": seq.count("P") / len(seq),
        f"{prefix}_frac_aromatic": sum(seq.count(x) for x in ["F", "W", "Y"]) / len(seq),
        f"{prefix}_frac_charged": sum(seq.count(x) for x in ["D","E","K","R","H"]) / len(seq),
    }

df_pairs["antibody_sequence"] = df_pairs["chain_heavy"] + df_pairs["chain_light"]

# construir features numéricas (opcional, mas recomendado)
feat_rows = []
for _, r in df_pairs.iterrows():
    vh, vl, ab = r["chain_heavy"], r["chain_light"], r["antibody_sequence"]
    feats = {}
    feats |= basic_seq_features(vh, "vh")
    feats |= basic_seq_features(vl, "vl")
    feats |= basic_seq_features(ab, "ab")
    # (se quiseres muitas features: aa_freq_features também)
    feat_rows.append(feats)

df_feats = pd.DataFrame(feat_rows)
df_final = pd.concat([df_pairs.reset_index(drop=True), df_feats.reset_index(drop=True)], axis=1)

print(df_final.shape)
df_final.head(3)


(3994, 21)


Unnamed: 0,pdb_id,chain_heavy,chain_light,variant_target,label,antibody_sequence,vh_len,vh_frac_gly,vh_frac_pro,vh_frac_aromatic,...,vl_len,vl_frac_gly,vl_frac_pro,vl_frac_aromatic,vl_frac_charged,ab_len,ab_frac_gly,ab_frac_pro,ab_frac_aromatic,ab_frac_charged
0,8J1T,VQLVESGGGLVQPGGSLRLSCAASGLTVSSNYMNWVRQAPGKGLEW...,DIQMTQSPSFLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKL...,SARS-CoV2_WT,1,VQLVESGGGLVQPGGSLRLSCAASGLTVSSNYMNWVRQAPGKGLEW...,115,0.121739,0.026087,0.104348,...,108,0.083333,0.055556,0.12037,0.12963,223,0.103139,0.040359,0.112108,0.147982
1,8J1T,VQLVESGGGLVQPGGSLRLSCAASGLTVSSNYMNWVRQAPGKGLEW...,DIQMTQSPSFLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKL...,SARS-CoV2_Alpha,1,VQLVESGGGLVQPGGSLRLSCAASGLTVSSNYMNWVRQAPGKGLEW...,115,0.121739,0.026087,0.104348,...,108,0.083333,0.055556,0.12037,0.12963,223,0.103139,0.040359,0.112108,0.147982
2,8J1T,VQLVESGGGLVQPGGSLRLSCAASGLTVSSNYMNWVRQAPGKGLEW...,DIQMTQSPSFLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKL...,SARS-CoV2_Beta,1,VQLVESGGGLVQPGGSLRLSCAASGLTVSSNYMNWVRQAPGKGLEW...,115,0.121739,0.026087,0.104348,...,108,0.083333,0.055556,0.12037,0.12963,223,0.103139,0.040359,0.112108,0.147982


Passo 7 — Preparar anti–data leakage (GroupKFold)

A ideia é: o mesmo pdb_id nunca pode estar em treino e teste.

In [12]:
# isto é o "grupo" para GroupKFold
groups = df_final["pdb_id"].copy()

# quick check: tamanho consistente
assert len(groups) == len(df_final)

# guardar ficheiro auxiliar
df_groups = pd.DataFrame({"pdb_id": df_final["pdb_id"]})
df_groups.to_csv("groups_pdb.csv", index=False)

print(df_groups.head())


  pdb_id
0   8J1T
1   8J1T
2   8J1T
3   8J1T
4   8J1T


Passo 8 — Guardar outputs finais desta fase

In [13]:
df_final.to_csv("dataset_molecular_clean.csv", index=False)

print("Guardado:")
print("- dataset_molecular_clean.csv")
print("- groups_pdb.csv")


Guardado:
- dataset_molecular_clean.csv
- groups_pdb.csv


Passo 9 - Integração com PDBs para GNN

In [20]:
import os
import re
import csv
import zipfile
from pathlib import Path
from typing import Optional, Dict, List

import numpy as np
import pandas as pd

# -------------------------
# Config
# -------------------------
DATASET_CSV = Path("dataset_molecular_clean.csv")
ZIP_PATH = Path("Structures_pdbs.zip")   # <- o teu ficheiro
OUT_DIR = Path("graphs_npz")

# Se tiveres uma lista fixa de PDBs (ex: 472), podes ligar isto depois:
PDB_LIST_CSV: Optional[Path] = None  # ex: Path("pdb_all_472.csv")

CUTOFF = 8.0
RESUME = True
MAX_CA = None  # ex: 2000 se quiseres limitar estruturas muito grandes

PDB_ID_RE = re.compile(r"([0-9][a-z0-9]{3})", re.IGNORECASE)


# -------------------------
# Helpers
# -------------------------
def infer_pdb_id_from_name(name: str) -> Optional[str]:
    base = Path(name).name
    m = PDB_ID_RE.search(base)
    return m.group(1).upper() if m else None



def ensure_dir(p: Path) -> None:
    p.mkdir(parents=True, exist_ok=True)


def load_target_pdb_ids(dataset_csv: Path, pdb_list_csv: Optional[Path] = None) -> List[str]:
    df = pd.read_csv(dataset_csv)
    if "pdb_id" not in df.columns:
        raise ValueError("dataset_molecular_clean.csv tem de ter coluna 'pdb_id'.")

    if pdb_list_csv and pdb_list_csv.exists():
        s = pd.read_csv(pdb_list_csv)["pdb_id"].astype(str).str.upper()
        return sorted(set(s.tolist()))

    return sorted(df["pdb_id"].astype(str).str.upper().unique().tolist())


def parse_pdb_ca_coords_from_lines(lines: List[str]) -> np.ndarray:
    coords = []
    for line in lines:
        if not line.startswith("ATOM"):
            continue
        atom_name = line[12:16].strip()
        if atom_name != "CA":
            continue
        try:
            x = float(line[30:38])
            y = float(line[38:46])
            z = float(line[46:54])
        except ValueError:
            continue
        coords.append((x, y, z))

    if not coords:
        return np.zeros((0, 3), dtype=np.float32)
    return np.asarray(coords, dtype=np.float32)


def build_radius_graph(coords: np.ndarray, cutoff: float = 8.0) -> np.ndarray:
    n = coords.shape[0]
    if n == 0:
        return np.zeros((2, 0), dtype=np.int64)

    diff = coords[:, None, :] - coords[None, :, :]
    d2 = np.einsum("ijk,ijk->ij", diff, diff)
    mask = (d2 <= cutoff * cutoff) & (d2 > 0.0)

    src, dst = np.where(mask)
    return np.stack([src, dst], axis=0).astype(np.int64)


# -------------------------
# ZIP source
# -------------------------
class ZipPDBSource:
    def __init__(self, zip_path: Path):
        self.zip_path = zip_path
        self.zf = zipfile.ZipFile(zip_path, "r")

    def iter_available(self) -> Dict[str, str]:
        out = {}
        for name in self.zf.namelist():
            if name.endswith("/"):
                continue
            if not name.lower().endswith(".pdb"):
                continue

            pid = infer_pdb_id_from_name(Path(name).name) or infer_pdb_id_from_name(name)
            if pid:
                out[pid] = name
        return out

    def read_pdb_lines(self, internal_ref: str) -> List[str]:
        with self.zf.open(internal_ref, "r") as f:
            content = f.read().decode("utf-8", errors="ignore")
        return content.splitlines(True)

    def close(self):
        self.zf.close()


# -------------------------
# Main
# -------------------------
def build_graphs():
    if not DATASET_CSV.exists():
        raise FileNotFoundError(f"Não encontrei {DATASET_CSV} no diretório atual.")

    if not ZIP_PATH.exists():
        raise FileNotFoundError(f"Não encontrei {ZIP_PATH} no diretório atual.")

    ensure_dir(OUT_DIR)

    log_path = OUT_DIR / "pdb_processing_log.txt"
    report_path = OUT_DIR / "pdb_processing_report.csv"

    target_pdbs = load_target_pdb_ids(DATASET_CSV, PDB_LIST_CSV)

    src = ZipPDBSource(ZIP_PATH)
    try:
        available = src.iter_available()
        print(f"PDBs no ZIP: {len(available)}")
        print(f"PDBs alvo (do dataset/lista): {len(target_pdbs)}")

        # cria report se não existir
        if not report_path.exists():
            with open(report_path, "w", newline="", encoding="utf-8") as f:
                w = csv.writer(f)
                w.writerow(["pdb_id", "status", "n_nodes", "n_edges", "message"])

        def already_done(pdb_id: str) -> bool:
            return (OUT_DIR / f"{pdb_id}.npz").exists()

        processed = skipped = missing = failed = 0

        with open(log_path, "a", encoding="utf-8") as logf, open(report_path, "a", newline="", encoding="utf-8") as repf:
            repw = csv.writer(repf)

            for pdb_id in target_pdbs:
                if RESUME and already_done(pdb_id):
                    skipped += 1
                    continue

                if pdb_id not in available:
                    missing += 1
                    repw.writerow([pdb_id, "missing", "", "", "PDB not found in ZIP"])
                    continue

                try:
                    lines = src.read_pdb_lines(available[pdb_id])
                    coords = parse_pdb_ca_coords_from_lines(lines)

                    if coords.shape[0] == 0:
                        repw.writerow([pdb_id, "no_ca", 0, 0, "No CA atoms found"])
                        continue

                    if MAX_CA is not None and coords.shape[0] > MAX_CA:
                        coords = coords[:MAX_CA, :]

                    edge_index = build_radius_graph(coords, cutoff=CUTOFF)

                    np.savez_compressed(
                        OUT_DIR / f"{pdb_id}.npz",
                        coords=coords.astype(np.float32),
                        edge_index=edge_index.astype(np.int64),
                        cutoff=np.array([CUTOFF], dtype=np.float32),
                    )

                    repw.writerow([pdb_id, "ok", coords.shape[0], edge_index.shape[1], ""])
                    processed += 1

                    if processed % 25 == 0:
                        logf.write(f"[OK] processed {processed} graphs so far...\n")
                        logf.flush()

                except Exception as e:
                    failed += 1
                    msg = f"{type(e).__name__}: {e}"
                    repw.writerow([pdb_id, "error", "", "", msg])
                    logf.write(f"[ERROR] {pdb_id} -> {msg}\n")
                    logf.flush()

        print("\nDone ✅")
        print(f"Processed: {processed}")
        print(f"Skipped (already done): {skipped}")
        print(f"Missing in ZIP: {missing}")
        print(f"Failed: {failed}")
        print(f"Outputs in: {OUT_DIR.resolve()}")

    finally:
        src.close()


if __name__ == "__main__":
    build_graphs()


PDBs no ZIP: 988
PDBs alvo (do dataset/lista): 472

Done ✅
Processed: 413
Skipped (already done): 16
Missing in ZIP: 43
Failed: 0
Outputs in: C:\Users\filip\OneDrive\Documents\Universidade\mestrado\ano_2\1_semestre\SIB\Trabalho_grupo\graphs_npz


Passo 10 - Criação de Dataset final com PDBs com grafos

In [23]:
import os
from pathlib import Path
import numpy as np
import pandas as pd

# --- Paths (ajusta se precisares)
DATASET_IN = Path("dataset_molecular_clean.csv")
REPORT_CSV = Path("pdb_processing_report.csv")        # o teu report anexado/gerado
GRAPHS_DIR = Path("graphs_npz")                       # onde estão os .npz
OUT_DATASET = Path("dataset_molecular_gnn_ready.csv")
OUT_GROUPS  = Path("groups_pdb_gnn_ready.csv")

# --- 1) Carregar dataset
df = pd.read_csv(DATASET_IN)
df["pdb_id"] = df["pdb_id"].astype(str).str.upper()

print("Dataset IN:", df.shape, "| PDBs únicos:", df["pdb_id"].nunique())

# --- 2) Ler report e obter PDBs com status ok (se o report existir)
ok_from_report = set()
missing_from_report = set()
if REPORT_CSV.exists():
    df_report = pd.read_csv(REPORT_CSV)
    for _, row in df_report.iterrows():
        pid = str(row["pdb_id"]).upper()
        status = str(row["status"]).lower()
        if status == "ok":
            ok_from_report.add(pid)
        else:
            missing_from_report.add(pid)

    print("OK por report:", len(ok_from_report))
    print("Missing/failed por report:", len(missing_from_report))
    
# --- 3) Confirmar via filesystem (fonte de verdade): PDBs com .npz em graphs_npz
ok_from_npz = set(p.stem.upper() for p in GRAPHS_DIR.glob("*.npz"))
print("OK por .npz:", len(ok_from_npz))

# --- 4) PDBs finais utilizáveis (interseção ajuda a evitar falsos positivos)
# Se tiveres report e npz, usa a interseção; se não, usa npz.
if ok_from_report:
    ok_pdbs = ok_from_report.intersection(ok_from_npz)
else:
    ok_pdbs = ok_from_npz

print("OK finais (usáveis):", len(ok_pdbs))

# --- 5) Filtrar dataset para apenas PDBs com grafo
df_gnn = df[df["pdb_id"].isin(ok_pdbs)].copy()

print("Dataset GNN-ready:", df_gnn.shape, "| PDBs únicos:", df_gnn["pdb_id"].nunique())

# --- 6) Guardar outputs finais
df_gnn.to_csv(OUT_DATASET, index=False)
pd.DataFrame({"pdb_id": df_gnn["pdb_id"]}).to_csv(OUT_GROUPS, index=False)

print("\nGuardado:")
print("-", OUT_DATASET)
print("-", OUT_GROUPS)

# --- 7) (opcional) lista de PDBs que ficaram de fora (útil para documentação)
pdbs_all = set(df["pdb_id"].unique())
pdbs_dropped = sorted(pdbs_all - set(df_gnn["pdb_id"].unique()))
pd.Series(pdbs_dropped, name="pdb_id_dropped").to_csv("pdbs_dropped_no_graph.csv", index=False)
print("- pdbs_dropped_no_graph.csv (para registo)")


Dataset IN: (3994, 21) | PDBs únicos: 472
OK por .npz: 429
OK finais (usáveis): 429
Dataset GNN-ready: (3760, 21) | PDBs únicos: 429

Guardado:
- dataset_molecular_gnn_ready.csv
- groups_pdb_gnn_ready.csv
- pdbs_dropped_no_graph.csv (para registo)
