# Automatic Protein-Ligand Docking
# 全自动分子对接

In [1]:
import warnings  # 导入警告模块
from pathlib import Path  # 导入路径处理模块
import subprocess  # 导入子进程模块
import numpy as np  # 导入NumPy模块
from MDAnalysis import Universe  # 从MDAnalysis导入Universe类
from openbabel import pybel  # 从Open Babel导入pybel模块
import pandas as pd  # 导入pandas用于数据处理
import os
from urllib.request import urlretrieve  # 用于下载PDB文件

In [2]:
# 设置工作目录
HERE = Path(os.getcwd())
DATA = HERE / 'data'
DATA.mkdir(parents=True, exist_ok=True)
print(f"数据目录: {DATA}")

# 忽略警告信息
warnings.filterwarnings("ignore")

数据目录: /Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data


In [3]:
class Structure(Universe):
    """用于加载结构的核心对象。"""
    
    @classmethod
    def from_string(cls, pdb_path):
        """从本地PDB文件加载结构。"""
        return cls(pdb_path)

def pdb_to_pdbqt(pdb_path, pdbqt_path, pH=7.4):
    """将PDB文件转换为PDBQT文件。"""
    molecule = list(pybel.readfile("pdb", str(pdb_path)))[0]
    molecule.OBMol.CorrectForPH(pH)
    molecule.addh()
    for atom in molecule.atoms:
        atom.OBAtom.GetPartialCharge()
    molecule.write("pdbqt", str(pdbqt_path), overwrite=True)

def find_ligand_resname(structure):
    """自动检测配体残基并返回其名称。"""
    ligand_atoms = structure.select_atoms("not protein and not resname HOH")
    ligand_resnames = set(ligand_atoms.resnames)
    if len(ligand_resnames) == 0:
        raise ValueError("在结构中未找到配体。")
    return list(ligand_resnames)[0]

def smiles_to_pdbqt(smiles, pdbqt_path, pH=7.4):
    """将SMILES字符串转换为PDBQT文件。"""
    molecule = pybel.readstring("smi", smiles)
    molecule.OBMol.CorrectForPH(pH)
    molecule.addh()
    molecule.make3D(forcefield="mmff94s", steps=1000)
    for atom in molecule.atoms:
        atom.OBAtom.GetPartialCharge()
    molecule.write("pdbqt", str(pdbqt_path), overwrite=True)

def run_smina(ligand_path, protein_path, out_path, pocket_center, pocket_size):
    """使用Smina进行分子对接。"""
    output_text = subprocess.check_output([
        "smina",
        "--receptor", str(protein_path),
        "--ligand", str(ligand_path),
        "--out", str(out_path),
        "--center_x", str(pocket_center[0]),
        "--center_y", str(pocket_center[1]),
        "--center_z", str(pocket_center[2]),
        "--size_x", str(pocket_size[0]),
        "--size_y", str(pocket_size[1]),
        "--size_z", str(pocket_size[2]),
        "--log", str(out_path.with_suffix('.log')),
    ])
    return output_text.decode("utf-8")

def split_sdf_file(sdf_path):
    """将SDF文件拆分为单独的分子文件。"""
    sdf_path = Path(sdf_path)
    stem = sdf_path.stem
    parent = sdf_path.parent
    molecules = pybel.readfile("sdf", str(sdf_path))
    for i, molecule in enumerate(molecules, 1):
        molecule.write("sdf", str(parent / f"{stem}_{i}.sdf"), overwrite=True)

def download_pdb(pdb_id, pdb_path):
    """下载PDB文件。"""
    pdb_url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    try:
        urlretrieve(pdb_url, pdb_path)
        print(f"已下载 PDB {pdb_id} 到 {pdb_path}")
    except Exception as e:
        print(f"下载 PDB {pdb_id} 失败: {e}")
        raise

In [4]:
# 读取CSV文件
csv_file = DATA / 'pic50_greater_8.0.csv'
df = pd.read_csv(csv_file)

In [5]:
# PDB ID列表
pdb_ids = [
    '6Q4G',
    '6Q49',
    '6Q4H',
    '6Q48',
    '6Q4J',
    '6Q4K',
    '6Q4E',
    '6Q4D',
    '6Q3B',
    '6Q4I',
]

In [6]:
# 存储对接结果的列表
docking_results = []


In [8]:
# 遍历每个PDB ID
for pdb_id in pdb_ids:
    # 为当前PDB创建目录
    pdb_dir = DATA / pdb_id
    pdb_dir.mkdir(exist_ok=True)
    
    # 检查并下载PDB文件
    pdb_path = pdb_dir / f"{pdb_id}.pdb"
    if not pdb_path.exists():
        download_pdb(pdb_id, pdb_path)
    
    # 加载结构
    structure = Structure.from_string(pdb_path)
    
    # 准备蛋白质文件
    protein_path = pdb_dir / "protein.pdb"
    protein = structure.select_atoms("protein")
    protein.write(str(protein_path))
    
    # 将蛋白质转换为PDBQT格式
    protein_pdbqt_path = pdb_dir / "protein.pdbqt"
    pdb_to_pdbqt(protein_path, protein_pdbqt_path)
    
    # 自动检测配体残基名称
    try:
        ligand_resname = find_ligand_resname(structure)
        print(f"PDB ID {pdb_id}: 使用配体残基名称: {ligand_resname}")
        ligand = structure.select_atoms(f"resname {ligand_resname}")
    except ValueError:
        print(f"PDB ID {pdb_id}: 未找到配体，跳过。")
        continue
    
    # 计算口袋中心和大小
    pocket_center = (ligand.positions.max(axis=0) + ligand.positions.min(axis=0)) / 2
    pocket_size = ligand.positions.max(axis=0) - ligand.positions.min(axis=0) + 5  # 加入缓冲
    
    # 遍历每个SMILES字符串
    for idx, row in df.iterrows():
        molecule_chembl_id = row['molecule_chembl_id']
        smiles = row['smiles']
        pIC50 = row['pIC50']
        
        # 为当前配体创建目录
        ligand_dir = pdb_dir / f"ligand_{molecule_chembl_id}"
        ligand_dir.mkdir(exist_ok=True)
        
        # 将SMILES转换为PDBQT
        ligand_pdbqt_path = ligand_dir / "ligand.pdbqt"
        smiles_to_pdbqt(smiles, ligand_pdbqt_path)
        
        # 执行分子对接
        docking_out_path = ligand_dir / "docking_poses.sdf"
        try:
            docker_info = run_smina(ligand_pdbqt_path, protein_pdbqt_path, docking_out_path, pocket_center, pocket_size)
        except subprocess.CalledProcessError as e:
            print(f"对接失败 PDB {pdb_id}, 配体 {molecule_chembl_id}: {e}")
            docker_info = f"对接失败: {e}"
        
        # 保存对接信息
        with open(ligand_dir / "docker_info.txt", "w") as f:
            f.write(docker_info)
        
        # 如果对接成功，拆分SDF文件
        if docking_out_path.exists():
            split_sdf_file(docking_out_path)
        else:
            print(f"没有对接输出 PDB {pdb_id}, 配体 {molecule_chembl_id}")
        
        # 记录对接信息
        docking_results.append({
            'PDB_ID': pdb_id,
            'molecule_chembl_id': molecule_chembl_id,
            'smiles': smiles,
            'pIC50': pIC50,
            'ligand_resname': ligand_resname,
            'docker_info': docker_info,
        })


  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4G/protein.pdb)



PDB ID 6Q4G: 使用配体残基名称: HJK




Parse error on line 28 in file "/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4G/ligand_CHEMBL3951333/ligand.pdbqt": Unknown or inappropriate tag


对接失败 PDB 6Q4G, 配体 CHEMBL3951333: Command '['smina', '--receptor', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4G/protein.pdbqt', '--ligand', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4G/ligand_CHEMBL3951333/ligand.pdbqt', '--out', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4G/ligand_CHEMBL3951333/docking_poses.sdf', '--center_x', '-1.0255', '--center_y', '-5.8515', '--center_z', '-24.709', '--size_x', '11.7630005', '--size_y', '12.0789995', '--size_z', '13.8220005', '--log', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4G/ligand_CHEMBL3951333/docking_poses.log']' returned non-zero exit status 1.




Parse error on line 28 in file "/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4G/ligand_CHEMBL3977678/ligand.pdbqt": Unknown or inappropriate tag


对接失败 PDB 6Q4G, 配体 CHEMBL3977678: Command '['smina', '--receptor', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4G/protein.pdbqt', '--ligand', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4G/ligand_CHEMBL3977678/ligand.pdbqt', '--out', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4G/ligand_CHEMBL3977678/docking_poses.sdf', '--center_x', '-1.0255', '--center_y', '-5.8515', '--center_z', '-24.709', '--size_x', '11.7630005', '--size_y', '12.0789995', '--size_z', '13.8220005', '--log', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4G/ligand_CHEMBL3977678/docking_poses.log']' returned non-zero exit status 1.




Parse error on line 28 in file "/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4G/ligand_CHEMBL3897985/ligand.pdbqt": Unknown or inappropriate tag


对接失败 PDB 6Q4G, 配体 CHEMBL3897985: Command '['smina', '--receptor', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4G/protein.pdbqt', '--ligand', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4G/ligand_CHEMBL3897985/ligand.pdbqt', '--out', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4G/ligand_CHEMBL3897985/docking_poses.sdf', '--center_x', '-1.0255', '--center_y', '-5.8515', '--center_z', '-24.709', '--size_x', '11.7630005', '--size_y', '12.0789995', '--size_z', '13.8220005', '--log', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4G/ligand_CHEMBL3897985/docking_poses.log']' returned non-zero exit status 1.
已下载 PDB 6Q49 到 /Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q49/6Q49.pdb


  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q49/protein.pdb)



PDB ID 6Q49: 使用配体残基名称: DMS




Parse error on line 28 in file "/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q49/ligand_CHEMBL3951333/ligand.pdbqt": Unknown or inappropriate tag


对接失败 PDB 6Q49, 配体 CHEMBL3951333: Command '['smina', '--receptor', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q49/protein.pdbqt', '--ligand', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q49/ligand_CHEMBL3951333/ligand.pdbqt', '--out', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q49/ligand_CHEMBL3951333/docking_poses.sdf', '--center_x', '6.852', '--center_y', '-11.8995', '--center_z', '9.9935', '--size_x', '7.504', '--size_y', '6.0109997', '--size_z', '7.583', '--log', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q49/ligand_CHEMBL3951333/docking_poses.log']' returned non-zero exit status 1.




Parse error on line 28 in file "/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q49/ligand_CHEMBL3977678/ligand.pdbqt": Unknown or inappropriate tag


对接失败 PDB 6Q49, 配体 CHEMBL3977678: Command '['smina', '--receptor', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q49/protein.pdbqt', '--ligand', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q49/ligand_CHEMBL3977678/ligand.pdbqt', '--out', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q49/ligand_CHEMBL3977678/docking_poses.sdf', '--center_x', '6.852', '--center_y', '-11.8995', '--center_z', '9.9935', '--size_x', '7.504', '--size_y', '6.0109997', '--size_z', '7.583', '--log', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q49/ligand_CHEMBL3977678/docking_poses.log']' returned non-zero exit status 1.




Parse error on line 28 in file "/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q49/ligand_CHEMBL3897985/ligand.pdbqt": Unknown or inappropriate tag


对接失败 PDB 6Q49, 配体 CHEMBL3897985: Command '['smina', '--receptor', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q49/protein.pdbqt', '--ligand', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q49/ligand_CHEMBL3897985/ligand.pdbqt', '--out', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q49/ligand_CHEMBL3897985/docking_poses.sdf', '--center_x', '6.852', '--center_y', '-11.8995', '--center_z', '9.9935', '--size_x', '7.504', '--size_y', '6.0109997', '--size_z', '7.583', '--log', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q49/ligand_CHEMBL3897985/docking_poses.log']' returned non-zero exit status 1.
已下载 PDB 6Q4H 到 /Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4H/6Q4H.pdb


  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4H/protein.pdb)



PDB ID 6Q4H: 使用配体残基名称: HGH




Parse error on line 28 in file "/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4H/ligand_CHEMBL3951333/ligand.pdbqt": Unknown or inappropriate tag


对接失败 PDB 6Q4H, 配体 CHEMBL3951333: Command '['smina', '--receptor', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4H/protein.pdbqt', '--ligand', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4H/ligand_CHEMBL3951333/ligand.pdbqt', '--out', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4H/ligand_CHEMBL3951333/docking_poses.sdf', '--center_x', '-0.528', '--center_y', '-6.2095003', '--center_z', '-24.554', '--size_x', '10.366', '--size_y', '13.289', '--size_z', '13.382', '--log', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4H/ligand_CHEMBL3951333/docking_poses.log']' returned non-zero exit status 1.




Parse error on line 28 in file "/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4H/ligand_CHEMBL3977678/ligand.pdbqt": Unknown or inappropriate tag


对接失败 PDB 6Q4H, 配体 CHEMBL3977678: Command '['smina', '--receptor', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4H/protein.pdbqt', '--ligand', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4H/ligand_CHEMBL3977678/ligand.pdbqt', '--out', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4H/ligand_CHEMBL3977678/docking_poses.sdf', '--center_x', '-0.528', '--center_y', '-6.2095003', '--center_z', '-24.554', '--size_x', '10.366', '--size_y', '13.289', '--size_z', '13.382', '--log', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4H/ligand_CHEMBL3977678/docking_poses.log']' returned non-zero exit status 1.




Parse error on line 28 in file "/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4H/ligand_CHEMBL3897985/ligand.pdbqt": Unknown or inappropriate tag


对接失败 PDB 6Q4H, 配体 CHEMBL3897985: Command '['smina', '--receptor', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4H/protein.pdbqt', '--ligand', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4H/ligand_CHEMBL3897985/ligand.pdbqt', '--out', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4H/ligand_CHEMBL3897985/docking_poses.sdf', '--center_x', '-0.528', '--center_y', '-6.2095003', '--center_z', '-24.554', '--size_x', '10.366', '--size_y', '13.289', '--size_z', '13.382', '--log', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q4H/ligand_CHEMBL3897985/docking_poses.log']' returned non-zero exit status 1.
下载 PDB 6Q48 失败: <urlopen error EOF occurred in violation of protocol (_ssl.c:1131)>


URLError: <urlopen error EOF occurred in violation of protocol (_ssl.c:1131)>

In [9]:
# 将对接结果保存到CSV文件
results_df = pd.DataFrame(docking_results)
results_df.to_csv(DATA / "docking_results.csv", index=False)