In [None]:
import warnings
from pathlib import Path
import subprocess
from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlretrieve
from urllib.error import URLError
import time
import MDAnalysis as mda
from openbabel import pybel

# Suppress warnings
# NOTE: the "ignore" filter hides every Python warning category for the rest
# of the session, not only warnings raised by the docking libraries.
warnings.filterwarnings("ignore")
# This handler object is created but not referenced again in the visible code.
ob_log_handler = pybel.ob.OBMessageHandler()
# Set the output level of Open Babel's global error log to 0
# (presumably the least verbose level -- confirm against the Open Babel docs).
pybel.ob.obErrorLog.SetOutputLevel(0)

# Define main directory
# NOTE(review): `_dh` is not defined anywhere in this file; presumably it is
# IPython's directory-history list, in which case this line only works inside
# a notebook / IPython session -- confirm before running as a plain script.
HERE = Path(_dh[-1])
# Root directory under which one "data_<pdb_id>" folder per structure is created.
DATA_ROOT = HERE / "data"


# Function to download a PDB file with retry mechanism
def download_pdb(pdb_id, pdb_path, max_retries=3):
    """Download a PDB entry from RCSB to ``pdb_path``, retrying on network errors.

    The data is first written to a temporary sibling file whose name is unique
    to the calling process and thread, and is then moved over ``pdb_path`` with
    a single rename.  ``main`` runs many workers in a thread pool and several
    of them request the same entry at once; writing straight into ``pdb_path``
    would let them overwrite each other's partially written file.

    Args:
        pdb_id: PDB identifier inserted into the RCSB download URL.
        pdb_path: Destination file path (``str`` or ``Path``).
        max_retries: Maximum number of download attempts.

    Raises:
        RuntimeError: If every attempt failed with a ``URLError`` (or if
            ``max_retries`` is not positive).  The last ``URLError`` is
            attached as the cause.
    """
    # Function-scope imports keep this definition self-contained.
    import os
    import threading

    pdb_url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    target = Path(pdb_path)
    partial = target.with_name(
        f"{target.name}.{os.getpid()}.{threading.get_ident()}.part"
    )
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            urlretrieve(pdb_url, str(partial))
            # Atomic swap: readers see either the old file or the complete new one.
            os.replace(partial, target)
        except URLError as e:
            last_error = e
            print(f"下载 PDB {pdb_id} 失败 ({e}), 重试 {attempt}/{max_retries}...")
            # Only wait when another attempt will actually follow.
            if attempt < max_retries:
                time.sleep(2)
        else:
            print(f"已下载 PDB {pdb_id} 到 {pdb_path}")
            return
        finally:
            # Drop any leftover partial download (already gone after a successful move).
            try:
                os.remove(partial)
            except FileNotFoundError:
                pass
    raise RuntimeError(f"无法下载 PDB {pdb_id}，请检查网络连接。") from last_error


# Docking pipeline: receptor retrieval, format conversion, binding-site
# definition, Smina invocation and the per-pair driver.
import threading

# Receptor preparation (download, protein extraction, PDBQT conversion, pocket
# definition) is identical for every SMILES docked against one PDB entry.
# `main` runs the pairs in a thread pool, so the work is done once per entry
# under a lock; otherwise workers would rewrite protein.pdb / protein.pdbqt
# while other workers' Smina processes are reading them.
_RECEPTOR_LOCK = threading.Lock()
_RECEPTOR_CACHE = {}


# Function to retrieve protein structure and best ligand
def retrieve_protein_and_ligand(pdb_id):
    """Download a PDB entry, save its protein atoms and pick a reference ligand.

    The reference ligand is the residue name with the most atoms among the
    residues that are neither matched by MDAnalysis' ``protein`` selection nor
    named ``HOH``.  Protein residues are excluded on purpose: counting them
    made an amino acid (e.g. ``LYS``) win and be reported as the "ligand".
    This remains a heuristic -- a buffer or cofactor present in many copies
    can still outnumber the real ligand.

    Args:
        pdb_id: PDB identifier of the structure to fetch.

    Returns:
        Tuple ``(protein_path, data_dir, chosen_ligand)``: path of the written
        protein-only PDB file, the per-entry data directory, and the chosen
        residue name as ``str``.

    Raises:
        ValueError: If the structure has no non-protein, non-water residue.
    """
    data_dir = DATA_ROOT / f"data_{pdb_id}"
    # parents=True: DATA_ROOT itself may not exist yet on a first run.
    data_dir.mkdir(parents=True, exist_ok=True)
    pdb_path = data_dir / f"{pdb_id}.pdb"

    download_pdb(pdb_id, pdb_path)

    # Load the downloaded structure with MDAnalysis.
    u = mda.Universe(str(pdb_path))

    # Save the protein part on its own; it becomes the docking receptor.
    protein_path = data_dir / "protein.pdb"
    u.select_atoms("protein").write(str(protein_path))

    # Count atoms per residue name among the ligand candidates.
    hetero = u.select_atoms("not protein and not resname HOH")
    atom_counts = {}
    for resname in hetero.resnames:
        key = str(resname)
        atom_counts[key] = atom_counts.get(key, 0) + 1
    if not atom_counts:
        raise ValueError(
            f"No ligand candidate (non-protein, non-water residue) found in PDB {pdb_id}"
        )
    # sorted() makes ties deterministic (alphabetically first name wins).
    chosen_ligand = max(sorted(atom_counts), key=atom_counts.get)

    print(f"Selected ligand {chosen_ligand} for PDB {pdb_id}")
    return protein_path, data_dir, chosen_ligand


# Convert PDB to PDBQT format
def pdb_to_pdbqt(pdb_path, pdbqt_path, pH=7.4):
    """Convert the first molecule of a PDB file to PDBQT with Open Babel.

    Args:
        pdb_path: Input PDB file.
        pdbqt_path: Output PDBQT file; overwritten if it exists.
        pH: Value passed to ``CorrectForPH`` before hydrogens are added.
    """
    molecule = list(pybel.readfile("pdb", str(pdb_path)))[0]
    molecule.OBMol.CorrectForPH(pH)
    molecule.addh()
    # The return value is discarded; presumably the call makes Open Babel
    # assign partial charges before the file is written -- TODO confirm.
    for atom in molecule.atoms:
        atom.OBAtom.GetPartialCharge()
    molecule.write("pdbqt", str(pdbqt_path), overwrite=True)


# Convert SMILES to PDBQT format
def smiles_to_pdbqt(smiles, pdbqt_path, pH=7.4):
    """Build a 3D structure from a SMILES string and write it as PDBQT.

    Args:
        smiles: SMILES string of the molecule.
        pdbqt_path: Output PDBQT file; overwritten if it exists.
        pH: Value passed to ``CorrectForPH`` before hydrogens are added.
    """
    molecule = pybel.readstring("smi", smiles)
    molecule.OBMol.CorrectForPH(pH)
    molecule.addh()
    # Generate 3D coordinates with the MMFF94s force field.
    molecule.make3D(forcefield="mmff94s", steps=10000)
    # See pdb_to_pdbqt: presumably triggers partial-charge assignment.
    for atom in molecule.atoms:
        atom.OBAtom.GetPartialCharge()
    molecule.write("pdbqt", str(pdbqt_path), overwrite=True)


# Define binding site based on the chosen ligand
def define_binding_site(universe, ligand_resname, padding=5):
    """Return the centre and size of a box around all atoms of a residue name.

    The box is the axis-aligned bounding box of every atom whose residue name
    is ``ligand_resname`` (all copies in the structure), grown by ``padding``
    along each axis.  Values are in the coordinate units of ``universe``
    (presumably Angstrom -- confirm).

    Args:
        universe: Object providing ``select_atoms`` (an MDAnalysis Universe).
        ligand_resname: Residue name of the reference ligand.
        padding: Amount added to the box size along each axis.

    Returns:
        Tuple ``(pocket_center, pocket_size)`` of length-3 arrays.

    Raises:
        ValueError: If no atom matches ``ligand_resname``.
    """
    coords = universe.select_atoms(f"resname {ligand_resname}").positions
    if len(coords) == 0:
        raise ValueError(
            f"No atoms with resname {ligand_resname!r} found; cannot define a binding site"
        )
    upper = coords.max(axis=0)
    lower = coords.min(axis=0)
    pocket_center = (upper + lower) / 2
    pocket_size = upper - lower + padding
    return pocket_center, pocket_size


# Run docking with Smina
def run_smina(ligand_path, protein_path, out_path, pocket_center, pocket_size, num_poses=10, exhaustiveness=8):
    """Run the ``smina`` executable and return its standard output as text.

    Args:
        ligand_path: Ligand input file.
        protein_path: Receptor input file.
        out_path: File the docked poses are written to.
        pocket_center: Sequence of three box-centre coordinates (x, y, z).
        pocket_size: Sequence of three box edge lengths (x, y, z).
        num_poses: Value for ``--num_modes``.
        exhaustiveness: Value for ``--exhaustiveness``.

    Raises:
        subprocess.CalledProcessError: If smina exits with a non-zero status.
        FileNotFoundError: If the ``smina`` executable cannot be found.
    """
    command = [
        "smina",
        "--ligand", str(ligand_path),
        "--receptor", str(protein_path),
        "--out", str(out_path),
        "--center_x", str(pocket_center[0]),
        "--center_y", str(pocket_center[1]),
        "--center_z", str(pocket_center[2]),
        "--size_x", str(pocket_size[0]),
        "--size_y", str(pocket_size[1]),
        "--size_z", str(pocket_size[2]),
        "--num_modes", str(num_poses),
        "--exhaustiveness", str(exhaustiveness),
    ]
    return subprocess.check_output(command, universal_newlines=True)


def _smiles_file_tag(smiles):
    """Map a SMILES string to a file-name-safe tag (non-alphanumerics become "_")."""
    # NOTE: distinct SMILES can map to the same tag (e.g. "C=C" and "C#C").
    return "".join(c if c.isalnum() else "_" for c in smiles)


def _prepare_receptor(pdb_id):
    """Prepare receptor files and the docking box once per PDB ID (thread-safe)."""
    # A single lock also serialises preparation of different entries; that is
    # a deliberate simplification, preparation is short compared to docking.
    with _RECEPTOR_LOCK:
        prepared = _RECEPTOR_CACHE.get(pdb_id)
        if prepared is None:
            protein_path, data_dir, ligand_resname = retrieve_protein_and_ligand(pdb_id)
            protein_pdbqt_path = protein_path.with_suffix(".pdbqt")
            pdb_to_pdbqt(protein_path, protein_pdbqt_path)
            # The pocket must come from the full structure: protein.pdb no
            # longer contains the ligand atoms.
            full_structure = mda.Universe(str(data_dir / f"{pdb_id}.pdb"))
            pocket_center, pocket_size = define_binding_site(full_structure, ligand_resname)
            prepared = (protein_pdbqt_path, data_dir, ligand_resname, pocket_center, pocket_size)
            _RECEPTOR_CACHE[pdb_id] = prepared
        return prepared


# Function to perform docking for a single PDB-SMILES pair and save results
def process_docking(pdb_id, smiles):
    """Dock one SMILES against one PDB entry and write the Smina log to a text file.

    Output files are written to the entry's data directory:
    ``ligand_<tag>.pdbqt``, ``docking_<pdb_id>_<tag>.sdf`` and
    ``result_<pdb_id>_<tag>.txt``, where ``<tag>`` is the SMILES with every
    non-alphanumeric character replaced by ``_``.  The ligand file is per
    SMILES because concurrent workers for the same entry would otherwise
    overwrite a shared ``ligand.pdbqt`` before Smina has read it.

    Args:
        pdb_id: PDB identifier of the receptor structure.
        smiles: SMILES string of the molecule to dock.
    """
    protein_pdbqt_path, data_dir, ligand_resname, pocket_center, pocket_size = (
        _prepare_receptor(pdb_id)
    )
    tag = _smiles_file_tag(smiles)

    # Prepare ligand in PDBQT format
    ligand_pdbqt_path = data_dir / f"ligand_{tag}.pdbqt"
    smiles_to_pdbqt(smiles, ligand_pdbqt_path)

    # Run docking
    docking_output_path = data_dir / f"docking_{pdb_id}_{tag}.sdf"
    output_text = run_smina(
        ligand_pdbqt_path,
        protein_pdbqt_path,
        docking_output_path,
        pocket_center,
        pocket_size,
    )

    # Save the Smina log together with the PDB and SMILES it belongs to
    result_path = data_dir / f"result_{pdb_id}_{tag}.txt"
    with open(result_path, "w", encoding="utf-8") as result_file:
        result_file.write(
            f"Docking results for PDB {pdb_id} with ligand {ligand_resname} and SMILES {smiles}:\n{output_text}")
    print(f"Docking completed for PDB {pdb_id} and SMILES {smiles}, results saved to {result_path}")


# Main function to run docking for all PDB and SMILES combinations in a thread pool
def main(pdb_list, smiles_list, max_workers=10):
    """Dock every SMILES against every PDB entry using a thread pool.

    Args:
        pdb_list: Iterable of PDB identifiers.
        smiles_list: Iterable of SMILES strings; it is iterated once per PDB
            entry, so it must be re-iterable (e.g. a list).
        max_workers: Number of worker threads (default 10, the previously
            hard-coded value).  Each task launches its own smina process, so
            lower this if the machine becomes oversubscribed.

    Raises:
        Exception: Whatever the first failing task (in submission order)
            raised.  The executor still waits for the remaining tasks to
            finish before the exception leaves this function.
    """
    # Cartesian product of PDB entries and SMILES strings, PDB-major order.
    tasks = [(pdb_id, smiles) for pdb_id in pdb_list for smiles in smiles_list]
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_docking, pdb_id, smiles) for pdb_id, smiles in tasks]
        # result() blocks until the task is done and re-raises its exception.
        for future in futures:
            future.result()


# Define PDB IDs and SMILES strings
# Every SMILES below is docked against every PDB entry (see `main`).
pdb_list = ["3U7C", "3V5G", "3VBD", "3KS3", "3PYK"]  # PDB entries to dock against; edit to target other structures
smiles_list = [
    "C1CCC(CC1)SC2=CC(=C(C=C2C(=O)NCC3=CC=CC=C3)S(=O)(=O)N)Cl",
    "C1COCCN1C(=O)C2=NC=C(O2)C3=CC=C(C=C3)S(=O)(=O)N",
    "C1CCN(C1)C(=O)C2=NC=C(O2)C3=CC=C(C=C3)S(=O)(=O)N",
    "CC1C(C(C(C(O1)NC(=O)C2=CC=C(C=C2)S(=O)(=O)N)O)O)O",
    "C1=CC(=CC=C1C(=O)NC2C(C(C(C(O2)CO)O)O)O)S(=O)(=O)N",
    "C1C(C(C(C(O1)NC(=O)C2=CC=C(C=C2)S(=O)(=O)N)O)O)O",
    "COC(=O)C1C(C(C(C(O1)O)O)O)(O)S(=O)(=O)NC2=CC=CC=C2C(=O)N",
    "C1=CC(=CC=C1C(=O)NC2C(C(C(O2)CO)O)O)S(=O)(=O)N",
    "CC1=NC=C(O1)C2=CSC(=C2)S(=O)(=O)N",
    "COC1=C(C=C(C=C1)C2=CN=C(O2)C(=O)N3CCCC3)S(=O)(=O)N"
]  # Molecules to dock; edit to screen other compounds

# Run main docking function
# NOTE: this call executes as soon as the cell runs and starts all
# len(pdb_list) * len(smiles_list) docking tasks.
main(pdb_list, smiles_list)


In [None]:
import warnings
from pathlib import Path
import subprocess
from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlretrieve
from urllib.error import URLError
import time
import MDAnalysis as mda
from openbabel import pybel
import pandas as pd
import os

# Suppress warnings
# NOTE: the "ignore" filter hides every Python warning category for the rest
# of the session, not only warnings raised by the docking libraries.
warnings.filterwarnings("ignore")
# This handler object is created but not referenced again in the visible code.
ob_log_handler = pybel.ob.OBMessageHandler()
# Set the output level of Open Babel's global error log to 0
# (presumably the least verbose level -- confirm against the Open Babel docs).
pybel.ob.obErrorLog.SetOutputLevel(0)

# Define main directory
# NOTE(review): `_dh` is not defined anywhere in this file; presumably it is
# IPython's directory-history list, in which case this line only works inside
# a notebook / IPython session -- confirm before running as a plain script.
HERE = Path(_dh[-1])
# Root directory under which one "data_<pdb_id>" folder per structure is created.
DATA_ROOT = HERE / "data"


# Function to download a PDB file with retry mechanism
def download_pdb(pdb_id, pdb_path, max_retries=3):
    """Download a PDB entry from RCSB to ``pdb_path``, retrying on network errors.

    The data is first written to a temporary sibling file whose name is unique
    to the calling process and thread, and is then moved over ``pdb_path`` with
    a single rename.  ``main`` runs many workers in a thread pool and several
    of them request the same entry at once; writing straight into ``pdb_path``
    would let them overwrite each other's partially written file.

    Args:
        pdb_id: PDB identifier inserted into the RCSB download URL.
        pdb_path: Destination file path (``str`` or ``Path``).
        max_retries: Maximum number of download attempts.

    Raises:
        RuntimeError: If every attempt failed with a ``URLError`` (or if
            ``max_retries`` is not positive).  The last ``URLError`` is
            attached as the cause.
    """
    # Function-scope import keeps this definition self-contained.
    import threading

    pdb_url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    target = Path(pdb_path)
    partial = target.with_name(
        f"{target.name}.{os.getpid()}.{threading.get_ident()}.part"
    )
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            urlretrieve(pdb_url, str(partial))
            # Atomic swap: readers see either the old file or the complete new one.
            os.replace(partial, target)
        except URLError as e:
            last_error = e
            print(f"下载 PDB {pdb_id} 失败 ({e}), 重试 {attempt}/{max_retries}...")
            # Only wait when another attempt will actually follow.
            if attempt < max_retries:
                time.sleep(2)
        else:
            print(f"已下载 PDB {pdb_id} 到 {pdb_path}")
            return
        finally:
            # Drop any leftover partial download (already gone after a successful move).
            try:
                os.remove(partial)
            except FileNotFoundError:
                pass
    raise RuntimeError(f"无法下载 PDB {pdb_id}，请检查网络连接。") from last_error


# Docking pipeline: receptor retrieval, format conversion, binding-site
# definition, Smina invocation, result storage and the per-pair driver.
import threading

# Receptor preparation (download, protein extraction, PDBQT conversion, pocket
# definition) is identical for every SMILES docked against one PDB entry.
# `main` runs the pairs in a thread pool, so the work is done once per entry
# under a lock; otherwise workers would rewrite protein.pdb / protein.pdbqt
# while other workers' Smina processes are reading them.
_RECEPTOR_LOCK = threading.Lock()
_RECEPTOR_CACHE = {}

# Guards every read and read-modify-write of the docking_results.csv files,
# which are shared by all workers of one PDB entry.
_RESULTS_LOCK = threading.Lock()
_RESULT_COLUMNS = ["PDB_ID", "SMILES", "Ligand", "Result_Docking_Info"]


# Function to retrieve protein structure and best ligand
def retrieve_protein_and_ligand(pdb_id):
    """Download a PDB entry, save its protein atoms and pick a reference ligand.

    The reference ligand is the residue name with the most atoms among the
    residues that are neither matched by MDAnalysis' ``protein`` selection nor
    named ``HOH``.  Protein residues are excluded on purpose: counting them
    made an amino acid (e.g. ``LYS``) win and be reported as the "ligand".
    This remains a heuristic -- a buffer or cofactor present in many copies
    can still outnumber the real ligand.

    Args:
        pdb_id: PDB identifier of the structure to fetch.

    Returns:
        Tuple ``(protein_path, data_dir, chosen_ligand)``: path of the written
        protein-only PDB file, the per-entry data directory, and the chosen
        residue name as ``str``.

    Raises:
        ValueError: If the structure has no non-protein, non-water residue.
    """
    data_dir = DATA_ROOT / f"data_{pdb_id}"
    data_dir.mkdir(parents=True, exist_ok=True)  # Ensure directory exists
    pdb_path = data_dir / f"{pdb_id}.pdb"

    download_pdb(pdb_id, pdb_path)

    # Load the downloaded structure with MDAnalysis.
    u = mda.Universe(str(pdb_path))

    # Save the protein part on its own; it becomes the docking receptor.
    protein_path = data_dir / "protein.pdb"
    u.select_atoms("protein").write(str(protein_path))

    # Count atoms per residue name among the ligand candidates.
    hetero = u.select_atoms("not protein and not resname HOH")
    atom_counts = {}
    for resname in hetero.resnames:
        key = str(resname)
        atom_counts[key] = atom_counts.get(key, 0) + 1
    if not atom_counts:
        raise ValueError(
            f"No ligand candidate (non-protein, non-water residue) found in PDB {pdb_id}"
        )
    # sorted() makes ties deterministic (alphabetically first name wins).
    chosen_ligand = max(sorted(atom_counts), key=atom_counts.get)

    print(f"Selected ligand {chosen_ligand} for PDB {pdb_id}")
    return protein_path, data_dir, chosen_ligand


# Convert PDB to PDBQT format
def pdb_to_pdbqt(pdb_path, pdbqt_path, pH=7.4):
    """Convert the first molecule of a PDB file to PDBQT with Open Babel.

    Args:
        pdb_path: Input PDB file.
        pdbqt_path: Output PDBQT file; overwritten if it exists.
        pH: Value passed to ``CorrectForPH`` before hydrogens are added.
    """
    molecule = list(pybel.readfile("pdb", str(pdb_path)))[0]
    molecule.OBMol.CorrectForPH(pH)
    molecule.addh()
    # The return value is discarded; presumably the call makes Open Babel
    # assign partial charges before the file is written -- TODO confirm.
    for atom in molecule.atoms:
        atom.OBAtom.GetPartialCharge()
    molecule.write("pdbqt", str(pdbqt_path), overwrite=True)


# Convert SMILES to PDBQT format
def smiles_to_pdbqt(smiles, pdbqt_path, pH=7.4):
    """Build a 3D structure from a SMILES string and write it as PDBQT.

    Args:
        smiles: SMILES string of the molecule.
        pdbqt_path: Output PDBQT file; overwritten if it exists.
        pH: Value passed to ``CorrectForPH`` before hydrogens are added.
    """
    molecule = pybel.readstring("smi", smiles)
    molecule.OBMol.CorrectForPH(pH)
    molecule.addh()
    # Generate 3D coordinates with the MMFF94s force field.
    molecule.make3D(forcefield="mmff94s", steps=10000)
    # See pdb_to_pdbqt: presumably triggers partial-charge assignment.
    for atom in molecule.atoms:
        atom.OBAtom.GetPartialCharge()
    molecule.write("pdbqt", str(pdbqt_path), overwrite=True)


# Define binding site based on the chosen ligand
def define_binding_site(universe, ligand_resname, padding=5):
    """Return the centre and size of a box around all atoms of a residue name.

    The box is the axis-aligned bounding box of every atom whose residue name
    is ``ligand_resname`` (all copies in the structure), grown by ``padding``
    along each axis.  Values are in the coordinate units of ``universe``
    (presumably Angstrom -- confirm).

    Args:
        universe: Object providing ``select_atoms`` (an MDAnalysis Universe).
        ligand_resname: Residue name of the reference ligand.
        padding: Amount added to the box size along each axis.

    Returns:
        Tuple ``(pocket_center, pocket_size)`` of length-3 arrays.

    Raises:
        ValueError: If no atom matches ``ligand_resname``.
    """
    coords = universe.select_atoms(f"resname {ligand_resname}").positions
    if len(coords) == 0:
        raise ValueError(
            f"No atoms with resname {ligand_resname!r} found; cannot define a binding site"
        )
    upper = coords.max(axis=0)
    lower = coords.min(axis=0)
    pocket_center = (upper + lower) / 2
    pocket_size = upper - lower + padding
    return pocket_center, pocket_size


# Run docking with Smina
def run_smina(ligand_path, protein_path, out_path, pocket_center, pocket_size, num_poses=10, exhaustiveness=8):
    """Run the ``smina`` executable and return its standard output as text.

    Args:
        ligand_path: Ligand input file.
        protein_path: Receptor input file.
        out_path: File the docked poses are written to.
        pocket_center: Sequence of three box-centre coordinates (x, y, z).
        pocket_size: Sequence of three box edge lengths (x, y, z).
        num_poses: Value for ``--num_modes``.
        exhaustiveness: Value for ``--exhaustiveness``.

    Raises:
        subprocess.CalledProcessError: If smina exits with a non-zero status.
        FileNotFoundError: If the ``smina`` executable cannot be found.
    """
    command = [
        "smina",
        "--ligand", str(ligand_path),
        "--receptor", str(protein_path),
        "--out", str(out_path),
        "--center_x", str(pocket_center[0]),
        "--center_y", str(pocket_center[1]),
        "--center_z", str(pocket_center[2]),
        "--size_x", str(pocket_size[0]),
        "--size_y", str(pocket_size[1]),
        "--size_z", str(pocket_size[2]),
        "--num_modes", str(num_poses),
        "--exhaustiveness", str(exhaustiveness),
    ]
    return subprocess.check_output(command, universal_newlines=True)


# Sanitize SMILES for file naming
def sanitize_filename(smiles):
    """Replace every non-alphanumeric character of ``smiles`` with ``_``.

    NOTE: distinct SMILES can map to the same name (e.g. "C=C" and "C#C").
    """
    return "".join(c if c.isalnum() else "_" for c in smiles)


# Function to save docking results to CSV
def save_results_to_csv(data_dir, pdb_id, smiles, ligand_resname, output_text):
    """Append one docking result row to ``data_dir / "docking_results.csv"``.

    The file is created on first use.  The read-modify-write cycle runs under
    a lock because worker threads for the same PDB entry share one CSV file;
    without it, concurrent writers would drop each other's rows.

    Args:
        data_dir: Directory (``Path``) holding the CSV file.
        pdb_id: Value for the ``PDB_ID`` column.
        smiles: Value for the ``SMILES`` column.
        ligand_resname: Value for the ``Ligand`` column.
        output_text: Smina output stored in ``Result_Docking_Info``.
    """
    result_path = data_dir / "docking_results.csv"
    new_row = pd.DataFrame(
        [{
            "PDB_ID": pdb_id,
            "SMILES": smiles,
            "Ligand": ligand_resname,
            "Result_Docking_Info": output_text,
        }],
        columns=_RESULT_COLUMNS,
    )
    with _RESULTS_LOCK:
        if result_path.exists():
            # dtype=str keeps identifiers such as "5E27" from being parsed as floats.
            existing = pd.read_csv(result_path, dtype=str)
            # DataFrame.append was removed in pandas 2.0; concat is the replacement.
            table = pd.concat([existing, new_row], ignore_index=True)
        else:
            table = new_row
        table.to_csv(result_path, index=False)


def _already_docked(result_path, pdb_id, smiles):
    """Return True if ``result_path`` already has a row for this PDB/SMILES pair."""
    with _RESULTS_LOCK:
        if not result_path.exists():
            return False
        table = pd.read_csv(result_path, dtype=str)
    return bool(((table["PDB_ID"] == pdb_id) & (table["SMILES"] == smiles)).any())


def _prepare_receptor(pdb_id):
    """Prepare receptor files and the docking box once per PDB ID (thread-safe)."""
    # A single lock also serialises preparation of different entries; that is
    # a deliberate simplification, preparation is short compared to docking.
    with _RECEPTOR_LOCK:
        prepared = _RECEPTOR_CACHE.get(pdb_id)
        if prepared is None:
            protein_path, data_dir, ligand_resname = retrieve_protein_and_ligand(pdb_id)
            protein_pdbqt_path = protein_path.with_suffix(".pdbqt")
            pdb_to_pdbqt(protein_path, protein_pdbqt_path)
            # The pocket must come from the full structure: protein.pdb no
            # longer contains the ligand atoms.
            full_structure = mda.Universe(str(data_dir / f"{pdb_id}.pdb"))
            pocket_center, pocket_size = define_binding_site(full_structure, ligand_resname)
            prepared = (protein_pdbqt_path, data_dir, ligand_resname, pocket_center, pocket_size)
            _RECEPTOR_CACHE[pdb_id] = prepared
        return prepared


# Function to process docking and save results
def process_docking(pdb_id, smiles):
    """Dock one SMILES against one PDB entry and store the Smina log.

    Pairs already present in the entry's ``docking_results.csv`` are skipped.
    Otherwise the result is appended to that CSV and also written to
    ``result__<pdb_id>_<name>.txt``; the docked poses go to
    ``docking_<pdb_id>_<name>.sdf`` and the prepared ligand to
    ``ligand_<name>.pdbqt``, where ``<name>`` is ``sanitize_filename(smiles)``.
    The ligand file is per SMILES because concurrent workers for the same
    entry would otherwise overwrite a shared ``ligand.pdbqt`` before Smina
    has read it.

    Args:
        pdb_id: PDB identifier of the receptor structure.
        smiles: SMILES string of the molecule to dock.
    """
    # Skip pairs that were already processed.  The CSV lives in the per-entry
    # data directory -- the same place save_results_to_csv writes to.
    entry_dir = DATA_ROOT / f"data_{pdb_id}"
    if _already_docked(entry_dir / "docking_results.csv", pdb_id, smiles):
        print(f"已处理 PDB {pdb_id} 和 SMILES {smiles}，跳过.")
        return

    protein_pdbqt_path, data_dir, ligand_resname, pocket_center, pocket_size = (
        _prepare_receptor(pdb_id)
    )
    safe_name = sanitize_filename(smiles)

    # Prepare ligand in PDBQT format
    ligand_pdbqt_path = data_dir / f"ligand_{safe_name}.pdbqt"
    smiles_to_pdbqt(smiles, ligand_pdbqt_path)

    # Run docking
    docking_output_path = data_dir / f"docking_{pdb_id}_{safe_name}.sdf"
    output_text = run_smina(
        ligand_pdbqt_path,
        protein_pdbqt_path,
        docking_output_path,
        pocket_center,
        pocket_size,
    )

    # Save docking result to CSV
    save_results_to_csv(data_dir, pdb_id, smiles, ligand_resname, output_text)

    # Save docking result to text file
    result_path = data_dir / f"result__{pdb_id}_{safe_name}.txt"
    with open(result_path, "w", encoding="utf-8") as result_file:
        result_file.write(
            f"Docking results for PDB {pdb_id} with ligand {ligand_resname} and SMILES {smiles}:\n{output_text}")
    print(f"Docking completed for PDB {pdb_id} and SMILES {smiles}, results saved.")

 

# Main function to run docking for all PDB and SMILES combinations in a thread pool
def main(pdb_list, smiles_list, max_workers=10):
    """Dock every SMILES against every PDB entry using a thread pool.

    Args:
        pdb_list: Iterable of PDB identifiers.
        smiles_list: Iterable of SMILES strings; it is iterated once per PDB
            entry, so it must be re-iterable (e.g. a list).
        max_workers: Number of worker threads (default 10, the previously
            hard-coded value).  Each task launches its own smina process, so
            lower this if the machine becomes oversubscribed.

    Raises:
        Exception: Whatever the first failing task (in submission order)
            raised.  The executor still waits for the remaining tasks to
            finish before the exception leaves this function.
    """
    # Cartesian product of PDB entries and SMILES strings, PDB-major order.
    tasks = [(pdb_id, smiles) for pdb_id in pdb_list for smiles in smiles_list]
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_docking, pdb_id, smiles) for pdb_id, smiles in tasks]
        # result() blocks until the task is done and re-raises its exception.
        for future in futures:
            future.result()

# PDB entries to dock against; every SMILES below is docked against each one.
pdb_list = ["6B00",
            "5EH7",
            "5EH5",
            "5EHV",
            "8WEP",
            "3N3J",
            "6ROF",
            "3TVN",
            "3HKU",
            "5E2S", 
            "3U7C", 
            "3V5G", 
            "3VBD", 
            "3KS3", 
            "3PYK"]
# Molecules to dock, given as SMILES strings.
smiles_list = [
    "C1CCC(CC1)SC2=CC(=C(C=C2C(=O)NCC3=CC=CC=C3)S(=O)(=O)N)Cl",
    "C1COCCN1C(=O)C2=NC=C(O2)C3=CC=C(C=C3)S(=O)(=O)N",
    "C1CCN(C1)C(=O)C2=NC=C(O2)C3=CC=C(C=C3)S(=O)(=O)N",
    "CC1C(C(C(C(O1)NC(=O)C2=CC=C(C=C2)S(=O)(=O)N)O)O)O",
    "C1=CC(=CC=C1C(=O)NC2C(C(C(C(O2)CO)O)O)O)S(=O)(=O)N",
    "C1C(C(C(C(O1)NC(=O)C2=CC=C(C=C2)S(=O)(=O)N)O)O)O",
    "COC(=O)C1C(C(C(C(O1)O)O)O)(O)S(=O)(=O)NC2=CC=CC=C2C(=O)N",
    "C1=CC(=CC=C1C(=O)NC2C(C(C(O2)CO)O)O)S(=O)(=O)N",
    "CC1=NC=C(O1)C2=CSC(=C2)S(=O)(=O)N",
    "COC1=C(C=C(C=C1)C2=CN=C(O2)C(=O)N3CCCC3)S(=O)(=O)N",
    "C1CC(C1)C2=NC=C(O2)C3=CSC(=C3)S(=O)(=O)N",
    "C1CCN(C1)C(=O)C2=NC=C(O2)C3=CC=C(S3)S(=O)(=O)N",
    "C1CC(C1)C2=NC=C(O2)C3=CC=C(S3)S(=O)(=O)N",
    "C1CC1C2=NC=C(O2)C3=CC=C(C=C3)S(=O)(=O)N",
    "CC1=C(C=C(C=C1)C2=CN=C(O2)C(=O)N3CCCC3)S(=O)(=O)N",
    "C1=CC=C(C=C1)CNC2=CC(=C(C=C2C(=O)NCC3=CC=CC=C3)S(=O)(=O)N)Cl",
    "CC1=CC(=C(C(=C1S(=O)(=O)N)C)Cl)N2CC(CC2=O)C(=O)NN3C(=CC(=N3)C)C",
    "CC1=CC(=C(C(=C1S(=O)(=O)N)C)Cl)N2CC(CC2=O)C(=O)N3C(=CC(=N3)C)C",
    "CC1=CC(=C(C(=C1S(=O)(=O)N)C)Cl)N2CC(CC2=O)C(=O)OC",
    "C1=CN=C(N=C1)SCC(=O)C2=CC=C(C=C2)S(=O)(=O)N",
]

# Run main docking function
# NOTE: this call executes as soon as the cell runs and starts all
# len(pdb_list) * len(smiles_list) docking tasks.
main(pdb_list, smiles_list)


已下载 PDB 6B00 到 /Users/wangyang/Desktop/BCBM/16AIDD/06_Protein_ligand_docking/data/data_6B00/6B00.pdb
已下载 PDB 6B00 到 /Users/wangyang/Desktop/BCBM/16AIDD/06_Protein_ligand_docking/data/data_6B00/6B00.pdb
已下载 PDB 6B00 到 /Users/wangyang/Desktop/BCBM/16AIDD/06_Protein_ligand_docking/data/data_6B00/6B00.pdb
Selected ligand LYS for PDB 6B00
已下载 PDB 6B00 到 /Users/wangyang/Desktop/BCBM/16AIDD/06_Protein_ligand_docking/data/data_6B00/6B00.pdb
已下载 PDB 6B00 到 /Users/wangyang/Desktop/BCBM/16AIDD/06_Protein_ligand_docking/data/data_6B00/6B00.pdb
已下载 PDB 6B00 到 /Users/wangyang/Desktop/BCBM/16AIDD/06_Protein_ligand_docking/data/data_6B00/6B00.pdb
已下载 PDB 6B00 到 /Users/wangyang/Desktop/BCBM/16AIDD/06_Protein_ligand_docking/data/data_6B00/6B00.pdb
已下载 PDB 6B00 到 /Users/wangyang/Desktop/BCBM/16AIDD/06_Protein_ligand_docking/data/data_6B00/6B00.pdb
Selected ligand LYS for PDB 6B00
已下载 PDB 6B00 到 /Users/wangyang/Desktop/BCBM/16AIDD/06_Protein_ligand_docking/data/data_6B00/6B00.pdb
已下载 PDB 6B00 到 /Users/wan