# 1) Append `.sdf` to filenames in a folder
This cell walks a folder and renames every file that does **not** already end in `.sdf` to add the suffix. Directories are skipped.

In [None]:
# --- Append .sdf to filenames ---
import os

# Set this to the folder that contains your files
folder = "FOLDER NAME"

# Walk the folder once, in sorted order so the log is reproducible.
for filename in sorted(os.listdir(folder)):
    file_path = os.path.join(folder, filename)
    # Skip directories
    if os.path.isdir(file_path):
        continue
    # Files that already carry the suffix (in any letter case) are left alone
    if filename.lower().endswith(".sdf"):
        continue
    new_name = filename + ".sdf"
    new_path = os.path.join(folder, new_name)
    # os.rename silently replaces an existing file on POSIX, so never rename
    # onto a name that is already taken.
    if os.path.exists(new_path):
        print(f"Skipped (target exists): {filename} -> {new_name}")
        continue
    os.rename(file_path, new_path)
    print(f"Renamed: {filename} -> {new_name}")
print("Done.")

# 2) Rename files using a CSV mapping from **PubChem CID** to **Ligand Name**
Loads a CSV with columns (e.g.) `Pubchem ID` and `Drug Ligand  name`, builds a mapping, and renames files in a folder whose base name matches a CID.

In [None]:
# --- Rename by PubChem CID -> Ligand Name mapping ---
import os
import pandas as pd

# Paths and column names
csv_path = "Data.csv"                    # CSV file with mapping
folder_path = "raw_ligand"               # Folder where files are stored
cid_col = "Pubchem ID"                   # Column with PubChem IDs
lig_col = "Drug Ligand  name"            # Column with ligand names


def _cid_key(value):
    """Return a CSV cell as the plain text used for the file's base name.

    pandas loads an integer column that contains blanks as floats, so a CID
    such as 2244 would otherwise become "2244.0" and never match a filename.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value).strip()


# Read mapping from CSV, ignoring rows where either cell is empty
df = pd.read_csv(csv_path)
pairs = df[[cid_col, lig_col]].dropna()
mapping = {
    _cid_key(cid): str(ligand).strip()
    for cid, ligand in zip(pairs[cid_col], pairs[lig_col])
}

# Iterate over files and rename when a match is found
for filename in sorted(os.listdir(folder_path)):
    name, ext = os.path.splitext(filename)
    if name not in mapping:
        print(f"No match for: {filename}")
        continue

    new_name = f"{mapping[name]}{ext}"
    # A ligand name containing a path separator would move the file elsewhere
    # (or fail); refuse it instead of guessing a replacement character.
    if os.sep in new_name or (os.altsep and os.altsep in new_name):
        print(f"Skipped (invalid target name): {filename} -> {new_name}")
        continue

    old_path = os.path.join(folder_path, filename)
    new_path = os.path.join(folder_path, new_name)
    # Two CIDs may share a ligand name; os.rename would silently overwrite
    # the earlier file on POSIX.
    if os.path.exists(new_path):
        print(f"Skipped (target exists): {filename} -> {new_name}")
        continue

    os.rename(old_path, new_path)
    print(f"Renamed: {filename} -> {new_name}")
print("Done.")

# 3) Clean PDB file contents
This cell:
1. Reads every `.pdb` file in `malaria/` and keeps only `ATOM`/`HETATM` records, optionally restricted to selected chains and to one alternate location.
2. Drops hydrogens, waters (configurable), and every hetero residue that is not on the cofactor/ion whitelist.
3. Renumbers atom serials, inserts `TER` between chains, and writes the cleaned structure under the same filename to `malaria_raw/`.

In [None]:
#!/usr/bin/env python3
import os
from pathlib import Path
from typing import Iterable, Optional, Tuple, Set

# Source and destination directories: main() reads the *.pdb files found in
# the first and writes a cleaned file with the same name into the second.
input_folder = Path("malaria")         # folder with original pdb files
output_folder = Path("malaria_raw")    # folder for cleaned pdb files

# --- Cleaning options (edit as needed) ---
# These module-level switches are read by keep_line() and iter_cleaned_lines().
KEEP_CHAINS: Set[str] = set()  # e.g., {"A","B"}; empty set = keep all chains
STRIP_H = True                 # remove existing hydrogens (recommended)
KEEP_WATERS = False            # keep all waters?
# The pocket-water test is consulted only when KEEP_WATERS is False.
KEEP_POCKET_WATERS = False     # keep only waters near center within RADIUS
CENTER: Tuple[float, float, float] = (0.0, 0.0, 0.0)  # used only if KEEP_POCKET_WATERS=True
RADIUS = 4.0                   # Å for pocket waters
# A falsy value here keeps only atoms whose altloc column is blank.
KEEP_ALTLOC = "A"              # keep only altloc 'A' (and blanks)
RESEQUENCE = False             # resequence residues per chain starting from 1

# Whitelist of cofactors/ions to keep (resname)
# HETATM residues that are neither listed here nor waters are discarded.
KEEP_HET = {
    "ZN","MG","CA","MN","FE","FE2","CU","NI","CO",
    "NA","K","CL",
    "HEM","HEC","NAD","NDP","NAP","FAD","FMN","SAM","SAH","COA","PLP",
    "ATP","ADP","AMP","GTP","GDP","GNP","TPP",
}
# Residue names that the HETATM filter treats as water.
WATER_NAMES = {"HOH","WAT","H2O"}

# ---------- PDB helpers ----------
def pad80(line: str) -> str:
    """Return *line* as a newline-terminated record at least 80 columns wide.

    Any trailing line terminator is removed before measuring, so the padding
    is the same whether or not the input ends with a newline. Records longer
    than 80 columns are kept in full.

    Args:
        line: One PDB record, with or without a trailing newline.

    Returns:
        The record right-padded with spaces to 80 columns, followed by "\n".
    """
    # Measure the record without its terminator; counting the "\n" made the
    # previous implementation pad newline-terminated lines to only 79 columns.
    body = line.rstrip("\r\n")
    return body.ljust(80) + "\n"

def rectype(line: str) -> str:
    """Return the record name held in the first six characters, unpadded."""
    record_field = line[0:6]
    return record_field.strip()

def chain_id(line: str) -> str:
    """Return the single character at index 21 (chain identifier column).

    Slicing is used instead of indexing so a short line yields "" rather
    than raising.
    """
    chain_column = slice(21, 22)
    return line[chain_column]

def resname(line: str) -> str:
    """Return the residue name from characters 17-19 with padding removed."""
    name_field = line[17:20]
    return name_field.strip()

def altloc(line: str) -> str:
    """Return the character at index 16 (alternate-location column).

    The value is returned unstripped; a short line yields "".
    """
    altloc_column = slice(16, 17)
    return line[altloc_column]

def atom_name(line: str) -> str:
    """Return the raw four-character atom-name field (indices 12-15).

    Padding is preserved so callers can still see how the name is aligned.
    """
    name_columns = slice(12, 16)
    return line[name_columns]

def element(line: str) -> str:
    """Return the element symbol for an ATOM/HETATM record.

    The explicit element field (indices 76-77) is used when it is filled in.
    Otherwise the symbol is guessed from the atom name (indices 12-15) as its
    first alphabetic character.

    Args:
        line: One coordinate record.

    Returns:
        The element symbol as written in the file, a one-letter guess, or ""
        when neither field contains a letter.

    Note:
        The guess cannot tell two-letter elements from one-letter ones
        (a calcium named "CA" is reported as "C").
    """
    declared = line[76:78].strip()
    if declared:
        return declared
    # Legacy hydrogen names such as "1HB " start with a digit; skipping
    # non-letters keeps the guess from returning "1".
    for char in line[12:16]:
        if char.isalpha():
            return char
    return ""

def is_hydrogen(line: str) -> bool:
    """Tell whether an ATOM/HETATM record describes a hydrogen atom.

    When the element field (indices 76-77) is filled in it is authoritative,
    so atoms such as mercury ("HG") are not mistaken for hydrogen just
    because their name starts with "H". When the field is blank, the atom
    name (indices 12-15) decides: leading digits are ignored, so legacy names
    such as "1HB" are recognised.

    Args:
        line: One coordinate record.

    Returns:
        True if the record is judged to be a hydrogen, otherwise False.
    """
    declared = line[76:78].strip().upper()
    if declared:
        return declared == "H"
    # No element column: fall back to the atom name, ignoring the numeric
    # prefix used by older naming schemes.
    name = line[12:16].strip().upper()
    return name.lstrip("0123456789").startswith("H")

def coords(line: str) -> Tuple[float, float, float]:
    """Parse the three 8-character coordinate fields starting at index 30.

    Raises:
        ValueError: If any of the fields is not a valid float.
    """
    x_field, y_field, z_field = line[30:38], line[38:46], line[46:54]
    return (float(x_field), float(y_field), float(z_field))

def within_radius(line: str, center: Tuple[float,float,float], r: float) -> bool:
    """Return True when the record's atom lies within *r* of *center*.

    The comparison is done on squared distances and is inclusive, so an atom
    exactly *r* away counts as inside.
    """
    px, py, pz = coords(line)
    ox, oy, oz = center
    off_x = px - ox
    off_y = py - oy
    off_z = pz - oz
    dist_sq = off_x * off_x + off_y * off_y + off_z * off_z
    return dist_sq <= r * r

def set_atom_serial(line: str, serial: int) -> str:
    """Return *line* with its atom-serial field (indices 6-10) replaced.

    The field is exactly five characters wide. Serials that do not fit are
    wrapped modulo 100000 so that every later column stays in place; a wider
    number would shift the rest of the record to the right.

    Args:
        line: One coordinate record.
        serial: The new serial number.

    Returns:
        The record with the right-aligned serial written into the field.
        After wrapping, serials are no longer unique.
    """
    wrapped = serial % 100000
    return f"{line[:6]}{wrapped:>5}{line[11:]}"

def set_resseq_and_icode(line: str, resseq: int, icode: str=" ") -> str:
    """Return *line* with new residue number and insertion code.

    The residue number occupies indices 22-25 (right-aligned) and the
    insertion code index 26; everything from index 27 onwards is copied
    unchanged so the coordinate columns keep their positions.

    Args:
        line: One coordinate record.
        resseq: The new residue sequence number.
        icode: The new insertion code; an empty value is written as a blank.

    Returns:
        The rewritten record.
    """
    # Exactly one character must be written, or the later columns would move.
    code = (icode or " ")[:1]
    # Resume at index 27: resuming at 28 dropped one column and pulled the
    # coordinates one position to the left.
    return f"{line[:22]}{resseq:>4}{code}{line[27:]}"

# ---------- Filters ----------
def keep_line(line: str) -> bool:
    """Decide whether one PDB record survives the cleaning filters.

    The checks run in this order: record type, chain whitelist, alternate
    location, hydrogen stripping, and finally the HETATM rules (waters
    according to KEEP_WATERS / KEEP_POCKET_WATERS, everything else according
    to the KEEP_HET whitelist). All settings are read from module globals.
    """
    record = rectype(line)
    if record != "ATOM" and record != "HETATM":
        return False

    # An empty KEEP_CHAINS disables the chain filter.
    chain_filter_active = bool(KEEP_CHAINS)
    if chain_filter_active and chain_id(line) not in KEEP_CHAINS:
        return False

    # A blank altloc always passes. A non-blank one passes only when an
    # altloc is configured and it is exactly that one.
    alt_flag = altloc(line).strip()
    if alt_flag and (not KEEP_ALTLOC or alt_flag != KEEP_ALTLOC):
        return False

    if STRIP_H and is_hydrogen(line):
        return False

    # Polymer atoms that got this far are always kept.
    if record == "ATOM":
        return True

    # HETATM rules
    residue = resname(line)
    if residue not in WATER_NAMES:
        return residue in KEEP_HET

    if KEEP_WATERS:
        return True
    return within_radius(line, CENTER, RADIUS) if KEEP_POCKET_WATERS else False

def iter_cleaned_lines(lines: Iterable[str]) -> Iterable[str]:
    """Yield the cleaned coordinate records for one PDB file.

    Each input line has tabs replaced by spaces and is padded; records other
    than ATOM/HETATM and those rejected by keep_line() are dropped. Surviving
    atoms are renumbered from 1, optionally resequenced per chain (when
    RESEQUENCE is set), and a "TER" line is emitted whenever the chain
    identifier changes. The stream ends with "TER" (only if at least one atom
    was kept) followed by "END".

    Only the first model of a multi-model file is processed: reading stops at
    the first ENDMDL record, because MODEL/ENDMDL records are not written and
    later models would otherwise be merged into the same coordinate set.

    Args:
        lines: The raw lines of a PDB file.

    Yields:
        Newline-terminated output lines.
    """
    current_chain = None
    atom_serial = 0

    # For optional residue resequencing: old (chain, resseq, icode) -> new
    # (resseq, icode), plus the last number handed out in each chain.
    reskey_to_new = {}
    resseq_counter = {}

    def resequence(line: str) -> str:
        """Renumber the residue of *line* within its chain, starting at 1."""
        if not RESEQUENCE:
            return line
        ch = chain_id(line)
        key = (ch, line[22:26], line[26:27])
        if key not in reskey_to_new:
            resseq_counter[ch] = resseq_counter.get(ch, 0) + 1
            reskey_to_new[key] = (resseq_counter[ch], " ")
        new_resseq, new_icode = reskey_to_new[key]
        return set_resseq_and_icode(line, new_resseq, new_icode)

    for raw in lines:
        line = pad80(raw.replace("\t", " "))
        record = rectype(line)
        if record == "ENDMDL":
            # End of the first model: ignore every later model.
            break
        if record not in {"ATOM", "HETATM"}:
            continue
        if not keep_line(line):
            continue

        ch = chain_id(line)
        if current_chain is None:
            current_chain = ch
        elif ch != current_chain:
            yield "TER\n"
            current_chain = ch

        atom_serial += 1
        line = set_atom_serial(line, atom_serial)
        line = resequence(line)
        yield pad80(line)

    # A TER record with no atoms before it would be meaningless.
    if atom_serial:
        yield "TER\n"
    yield "END\n"

# ---------- Driver ----------
def clean_one_file(inp: Path, outp: Path) -> bool:
    """Clean one PDB file and write the result to *outp*.

    The input is processed completely before the output file is opened, so a
    read or parse failure never creates or truncates *outp*.

    Args:
        inp: Path of the PDB file to read.
        outp: Path of the cleaned file to write.

    Returns:
        True on success. False if anything went wrong; the failure is
        reported on stdout rather than raised, so a batch run can continue.
    """
    try:
        with inp.open("r") as fin:
            cleaned = list(iter_cleaned_lines(fin))
        with outp.open("w") as fout:
            fout.writelines(cleaned)
    except Exception as e:
        # Best-effort batch boundary: report the failure and carry on.
        print(f"[fail] {inp.name}: {e}")
        return False
    print(f"[ok] {inp.name} → {outp.name}")
    return True

def main():
    """Clean every .pdb file in input_folder into output_folder.

    The input folder is validated before anything is created, so a wrong
    path no longer leaves an empty output folder behind. Only regular files
    with a .pdb suffix (any letter case) are processed, in sorted order. A
    summary of successes and failures is printed at the end.

    Raises:
        SystemExit: If input_folder is not an existing directory.
    """
    if not input_folder.is_dir():
        raise SystemExit(f"Input folder not found: {input_folder}")

    pdbs = sorted(
        p for p in input_folder.iterdir()
        if p.is_file() and p.suffix.lower() == ".pdb"
    )
    if not pdbs:
        print(f"No .pdb files in {input_folder}")
        return

    output_folder.mkdir(parents=True, exist_ok=True)

    ok = fail = 0
    for p in pdbs:
        dest = output_folder / p.name
        if clean_one_file(p, dest):
            ok += 1
        else:
            fail += 1
    print(f"\nDone. OK={ok} Failed={fail}")

if __name__ == "__main__":
    main()


# 4) Move a filtered subset of PDBQT files using a CSV list
Reads `Data.csv`, cleans each value in the `3D Interaction` column by removing single-character segments between underscores, then moves corresponding `.pdbqt` files from `raw_malaria_pdbqt` to `Malaria_Dataset_only`.

In [None]:
# --- Move selected PDBQT files to a target folder ---
import os
import re
import shutil
import pandas as pd

# Folders used by the move loop below; the destination is created up front.
input_folder = "raw_malaria_pdbqt"    # Source folder with .pdbqt files
out_folder = "Malaria_Dataset_only"   # Destination folder
os.makedirs(out_folder, exist_ok=True)

# Read your CSV
# Its '3D Interaction' column supplies the names of the files to move.
df = pd.read_csv("Data.csv")

def clean_name(name: str) -> str:
    """Remove single-character segments between underscores.

    The extension of *name*, if any, is dropped and ".pdbqt" is appended.
    Consecutive single-character segments are all removed.

    Examples:
        2J50_A_1V0O_A_627 -> 2J50_1V0O_627.pdbqt
        2J50_A_B_627      -> 2J50_627.pdbqt
    """
    base = os.path.splitext(name)[0]
    # The lookahead leaves the closing underscore in place so it can also
    # open the next match; consuming it skipped every second segment in runs
    # such as "_A_B_".
    cleaned = re.sub(r'_[A-Za-z0-9](?=_)', '', base)
    return cleaned + ".pdbqt"

# Blank cells are read as NaN (a float) and would crash the name cleaning,
# so drop them and force the remaining values to text.
for raw_name in df['3D Interaction'].dropna().astype(str):
    raw_name = raw_name.strip()
    if not raw_name:
        continue

    pdbqt_name = clean_name(raw_name)
    src_file = os.path.join(input_folder, pdbqt_name)
    dst_file = os.path.join(out_folder, pdbqt_name)

    if os.path.exists(src_file):
        shutil.move(src_file, dst_file)
        print(f"[moved] {pdbqt_name}")
    else:
        # Also reached for names listed twice: the first hit already moved it.
        print(f"[skip] {pdbqt_name} not found")

# 5) Rename ligand PDBQT files using an external mapping
Reads `ExpData.csv` with columns `pubchem_cid` and `ligand_id`, then renames files in `ligand_pdbqt` to the corresponding `ligand_id` while preserving the extension. Writes results into `ligand_pdbqt_new`.

In [None]:
# --- Rename ligand files based on ExpData mapping ---
import os
import pandas as pd

csv_path = "ExpData.csv"            # CSV with columns: pubchem_cid, ligand_id
input_folder = "ligand_pdbqt"       # Folder of original files (named by cid)
output_folder = "ligand_pdbqt_new"  # Output folder for renamed files
os.makedirs(output_folder, exist_ok=True)


def _label(value):
    """Return a CSV cell as the text used in a filename.

    pandas loads an integer column that contains blanks as floats, so 2244
    would otherwise become "2244.0" and never match a file named "2244".
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value).strip()


# Build mapping {cid -> ligand_id}, skipping rows with an empty cell
df = pd.read_csv(csv_path)
cid_to_ligand = {
    _label(cid): _label(ligand_id)
    for cid, ligand_id in zip(df["pubchem_cid"], df["ligand_id"])
    if not (pd.isna(cid) or pd.isna(ligand_id))
}
print("Mapping size:", len(cid_to_ligand))

# Rename files into output folder
for filename in sorted(os.listdir(input_folder)):
    cid, ext = os.path.splitext(filename)
    if cid not in cid_to_ligand:
        print(f"No match for {filename}, skipped.")
        continue

    new_name = f"{cid_to_ligand[cid]}{ext}"
    src = os.path.join(input_folder, filename)
    dst = os.path.join(output_folder, new_name)
    # Several CIDs can map to one ligand_id; os.rename would silently replace
    # the file written earlier (on POSIX).
    if os.path.exists(dst):
        print(f"Skipped (target exists): {filename} -> {new_name}")
        continue

    os.rename(src, dst)
    print(f"Renamed: {filename} -> {new_name}")

print("Done.")

# 6) Download ligand SDF files from PubChem by CID
For each `Pubchem ID` in `Data.csv`, fetches the 3D SDF from PubChem using the PUG REST API, saves to `raw_ligand/`, and logs failures to `failed_downloads.log`.

In [None]:
# --- Download SDF files from PubChem by CID ---
import os
import pandas as pd
import requests

# Imported here so the cell stays runnable on its own.
import time

# Output locations
save_dir = 'raw_ligand'
log_file = 'failed_downloads.log'

# Pause after each request so the loop stays under PubChem's request-rate
# limit (five requests per second).
REQUEST_PAUSE_S = 0.2

os.makedirs(save_dir, exist_ok=True)

downloaded = []
failed = []

# Load the list of CIDs
df = pd.read_csv("Data.csv")

# Blank cells are dropped. They also turn the whole column into floats, so
# whole-number floats are converted back to int; otherwise the URL and the
# filename would contain "2244.0".
for raw_cid in df['Pubchem ID'].dropna():
    if isinstance(raw_cid, float) and raw_cid.is_integer():
        cid = int(raw_cid)
    else:
        cid = raw_cid

    # Each compound is saved as <CID>.sdf
    sdf_name = f"{cid}.sdf"
    sdf_path = os.path.join(save_dir, sdf_name)

    # Skip if already downloaded
    if os.path.exists(sdf_path):
        print(f"Already downloaded: {sdf_name}")
        downloaded.append(cid)
        continue

    # PubChem PUG REST URL for 3D SDF
    url = (
        f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/CID/{cid}/record/SDF"
        f"?record_type=3d&response_type=save&response_basename=Conformer3D_COMPOUND_CID_{cid}"
    )

    error = None
    try:
        response = requests.get(url, timeout=20)
        if response.status_code != 200:
            error = f"HTTP {response.status_code}"
        elif not response.content.strip():
            error = "HTTP 200 (empty body)"
        else:
            # Write the raw bytes: text mode would re-encode the payload and
            # translate line endings on some platforms.
            with open(sdf_path, 'wb') as f:
                f.write(response.content)
    except (requests.RequestException, OSError) as exc:
        error = exc
        # A half-written file would be reported as "Already downloaded" on
        # the next run, so remove it.
        if os.path.exists(sdf_path):
            os.remove(sdf_path)

    if error is None:
        print(f"Downloaded: {sdf_name}")
        downloaded.append(cid)
    else:
        print(f"Failed: {cid} ({error})")
        failed.append(cid)
        with open(log_file, 'a') as log:
            log.write(f"{cid}\t{error}\n")

    time.sleep(REQUEST_PAUSE_S)

print(f"Finished. {len(downloaded)} downloaded, {len(failed)} failed.")

# 7) Extract only protein chain **M** from PDB files (remove ligands and chain T)
Uses Biopython to parse structures, keep **only** standard residue atoms from chain `M`, and explicitly removes any residue named `D`. Writes results to `raw_malaria/`.

In [None]:
# --- Keep only chain M (no ligands / HETATM / 'D') ---
import os
from Bio.PDB import PDBParser, PDBIO, Select

# Input/output
# main() mirrors each .pdb filename from in_dir into out_dir.
in_dir  = "malaria"      # Folder with .pdb files
out_dir = "raw_malaria"  # Output folder
os.makedirs(out_dir, exist_ok=True)

# Parameters
# Both values are read as globals by the Select subclass and by
# process_one_pdb() defined below.
chain_to_keep = "M"                  # Chain to keep
ligand_resname_to_remove = "D"       # Residue name to remove if encountered

class KeepChainM_NoLigandD(Select):
    """Atom filter for PDBIO.save: standard residues of one chain only.

    An atom is written only if its chain id equals ``chain_to_keep``, its
    residue has a blank hetero flag, and the residue name differs from
    ``ligand_resname_to_remove``.
    """
    def accept_atom(self, atom):
        owner = atom.get_parent()
        owning_chain = owner.get_parent()
        het_marker = owner.id[0]  # ' ' for standard residues, otherwise hetero

        # The conditions short-circuit in this order: chain, hetero flag,
        # residue name.
        wanted = (
            getattr(owning_chain, "id", None) == chain_to_keep
            and het_marker == ' '
            and owner.get_resname().strip() != ligand_resname_to_remove
        )
        return 1 if wanted else 0

def process_one_pdb(in_path, out_path):
    """Write the chain-filtered copy of one PDB file.

    Returns False (after printing a message) when the file cannot be parsed
    or when no atom of the wanted chain would be kept; otherwise saves the
    filtered structure to *out_path* and returns True.
    """
    pdb_reader = PDBParser(QUIET=True)
    writer = PDBIO()
    try:
        structure = pdb_reader.get_structure("struct", in_path)
    except Exception as e:
        print(f"Failed to parse {in_path}: {e}")
        return False

    # Count the atoms of standard residues in the wanted chain, over all
    # models, to decide whether there is anything to write.
    qualifying_residues = (
        residue
        for model in structure
        for chain in model
        if getattr(chain, "id", None) == chain_to_keep
        for residue in chain
        if residue.id[0] == ' '
        and residue.get_resname().strip() != ligand_resname_to_remove
    )
    kept_atoms = sum(len(list(residue.get_atoms())) for residue in qualifying_residues)

    if not kept_atoms:
        print(f"Skipped (no atoms kept for chain '{chain_to_keep}'): {in_path}")
        return False

    writer.set_structure(structure)
    writer.save(out_path, select=KeepChainM_NoLigandD())
    print(f"Wrote: {out_path}  (kept atoms: {kept_atoms})")
    return True

def main():
    """Run process_one_pdb on every .pdb file (any letter case) in in_dir."""
    pdb_names = [
        entry for entry in os.listdir(in_dir)
        if entry.lower().endswith(".pdb")
    ]
    for pdb_name in pdb_names:
        source = os.path.join(in_dir, pdb_name)
        target = os.path.join(out_dir, pdb_name)
        process_one_pdb(source, target)

if __name__ == "__main__":
    main()

# 8) Extract only ligand residue named **D** from PDB files
Uses Biopython to retain only HETATM residues where `resname == 'D'` and writes these to `ligand/`.

In [None]:
# --- Keep only residue 'D' (ligand) ---
import os
from Bio.PDB import PDBParser, PDBIO, Select

# Input/output
# main() mirrors each .pdb filename from in_dir into out_dir.
in_dir  = "malaria"   # Input folder with .pdb files
out_dir = "ligand"    # Output folder
# Compared against each hetero residue's stripped residue name.
ligand_resname = "D"  # Residue name to keep
os.makedirs(out_dir, exist_ok=True)

class KeepLigandByResname(Select):
    """Atom filter for PDBIO.save: hetero residues named ``ligand_resname``."""
    def accept_atom(self, atom):
        owner = atom.get_parent()
        # A blank hetero flag marks a standard residue; those are never kept.
        is_hetero = owner.id[0] != ' '
        matches = is_hetero and owner.get_resname().strip() == ligand_resname
        return 1 if matches else 0

def process_one_pdb(in_path, out_path):
    """Write only the ligand residue(s) of one PDB file.

    Returns False (after printing a message) when the file cannot be parsed
    or contains no hetero residue named ``ligand_resname``; otherwise saves
    the filtered structure to *out_path* and returns True.
    """
    pdb_reader = PDBParser(QUIET=True)
    writer = PDBIO()
    try:
        structure = pdb_reader.get_structure("struct", in_path)
    except Exception as e:
        print(f"Failed to parse {in_path}: {e}")
        return False

    # Count the atoms of matching hetero residues, over all models, so that
    # files without the ligand are skipped.
    ligand_residues = (
        residue
        for model in structure
        for chain in model
        for residue in chain
        if residue.id[0] != ' '
        and residue.get_resname().strip() == ligand_resname
    )
    kept_atoms = sum(len(list(residue.get_atoms())) for residue in ligand_residues)

    if not kept_atoms:
        print(f"Skipped (no ligand '{ligand_resname}' atoms): {in_path}")
        return False

    writer.set_structure(structure)
    writer.save(out_path, select=KeepLigandByResname())
    print(f"Wrote: {out_path}  (kept atoms: {kept_atoms})")
    return True

def main():
    """Run process_one_pdb on every .pdb file (any letter case) in in_dir."""
    pdb_names = [
        entry for entry in os.listdir(in_dir)
        if entry.lower().endswith(".pdb")
    ]
    for pdb_name in pdb_names:
        source = os.path.join(in_dir, pdb_name)
        target = os.path.join(out_dir, pdb_name)
        process_one_pdb(source, target)

if __name__ == "__main__":
    main()