# The most intelligent Protein-ligand docking on the earth's surface
# 地表上最聪明的 Protein-ligand 对接

1. **将对接得分为正数的结果（不好的结果）筛选出来，放到一个单独的表格中。**
2. **将对接得分 ≤ -10 的结果放入一个表格，包括 `docker_info` 和模式 1 的亲和力值（affinity）。**
3. **将对接得分在 -10 到 -9 之间的结果放入另一个表格。**
4. **将对接得分 > -9 的结果（即得分高于 -9、结合较弱的结果）放入一个单独的表格。** 
5. **将所有结果汇总到一个表格中，并按 docking 得分排序。**

In [14]:
import warnings  # 导入警告模块
from pathlib import Path  # 导入路径处理模块
import subprocess  # 导入子进程模块
import numpy as np  # 导入NumPy模块
from MDAnalysis import Universe  # 从MDAnalysis导入Universe类
from openbabel import pybel  # 从Open Babel导入pybel模块
import pandas as pd  # 导入pandas用于数据处理
import os
from urllib.request import urlretrieve  # 用于下载PDB文件

In [15]:
# Working-directory layout: all inputs and outputs of this notebook live in a
# "data" folder below the directory the notebook was started from.
HERE = Path.cwd()
DATA = HERE.joinpath('data')
DATA.mkdir(parents=True, exist_ok=True)
print(f"数据目录: {DATA}")

数据目录: /Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data


In [16]:
# Suppress every Python warning from here on so that warning messages do not
# clutter the notebook output.
warnings.filterwarnings("ignore")

In [17]:
class Structure(Universe):
    """Core object for loading structures (a subclass of ``Universe``)."""

    @classmethod
    def from_string(cls, pdb_path):
        """Load a structure from a local PDB file.

        Despite the method name, ``pdb_path`` is a file path; it is passed
        unchanged to the class constructor.
        """
        return cls(pdb_path)

def pdb_to_pdbqt(pdb_path, pdbqt_path, pH=7.4):
    """Convert a PDB file into a PDBQT file using Open Babel (pybel).

    Parameters
    ----------
    pdb_path : str or pathlib.Path
        Input PDB file; only the first molecule read from it is converted.
    pdbqt_path : str or pathlib.Path
        Output PDBQT file; an existing file is overwritten.
    pH : float
        Value handed to ``CorrectForPH`` before hydrogens are added.
    """
    # Read everything the file contains and keep the first molecule only.
    molecule = list(pybel.readfile("pdb", str(pdb_path)))[0]
    # Adjust the molecule for the requested pH, then add hydrogens.
    molecule.OBMol.CorrectForPH(pH)
    molecule.addh()
    # The return value is discarded; presumably the call is made so that
    # partial charges get computed before writing -- TODO confirm.
    for atom in molecule.atoms:
        atom.OBAtom.GetPartialCharge()
    molecule.write("pdbqt", str(pdbqt_path), overwrite=True)

def find_ligand_resname(structure):
    """Detect the ligand residue of a structure and return its residue name.

    All atoms that are neither protein nor ``HOH`` are selected, residues whose
    (whitespace-stripped) name is on the exclusion list below are discarded,
    and the remaining residue with the most atoms is taken to be the ligand.
    If several residues share the largest atom count, the first one wins.

    Parameters
    ----------
    structure
        Object offering ``select_atoms(selection)``; the result must expose
        ``residues``, each with ``resname`` and ``atoms``.

    Returns
    -------
    str
        Residue name of the chosen ligand, stripped of surrounding whitespace.

    Raises
    ------
    ValueError
        If no residue is left after the exclusion list has been applied.
    """
    # Residue names that are never treated as the ligand (e.g. solvent, ions).
    excluded_names = {
        'HOH', 'H2O', 'DMS', 'SO4', 'PO4', 'CL', 'NA', 'K', 'MG', 'ZN',
        'CA', 'MN', 'GOL', 'EDO', 'PEG', 'MPD', 'DTT', 'FMT', 'NO3',
        'ACT', 'ACE', 'EOH', 'IOD', 'IME', 'TRS', 'NH4', 'MES',
        'BME', 'BOG', 'PGE', 'PGR', 'PG4', 'P6G', 'DIO',
        'DOD', 'OLA', 'OLC', 'OLB', 'BTB', 'BTN', 'BCT', 'LDA', 'LMT',
        'NI', 'FE', 'CU', 'CO', 'CD', 'AG', 'AU', 'PT', 'PB', 'SR', 'CS',
        'BA', 'LI', 'F', 'BR', 'I', 'IOD', 'PER', 'OCS', 'SCN', 'CO3',
        'OH', 'CN', 'SUC', 'TAR', 'MAN', 'FUC', 'NAG', 'NDG',
        'BMA', 'GLC', 'GAL', 'MAL', 'MEL', 'TRE',
    }

    # Residues of everything that is neither protein nor water.
    hetero_residues = structure.select_atoms("not protein and not resname HOH").residues

    candidates = []
    for residue in hetero_residues:
        if residue.resname.strip() in excluded_names:
            continue
        candidates.append(residue)

    if len(candidates) == 0:
        raise ValueError("在结构中未找到配体。")

    def atom_count(residue):
        return len(residue.atoms)

    # The largest remaining residue is assumed to be the ligand.
    return max(candidates, key=atom_count).resname.strip()


def smiles_to_pdbqt(smiles, pdbqt_path, pH=7.4):
    """Convert a SMILES string into a PDBQT file using Open Babel (pybel).

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.
    pdbqt_path : str or pathlib.Path
        Output PDBQT file; an existing file is overwritten.
    pH : float
        Value handed to ``CorrectForPH`` before hydrogens are added.
    """
    molecule = pybel.readstring("smi", smiles)
    # Adjust the molecule for the requested pH, then add hydrogens.
    molecule.OBMol.CorrectForPH(pH)
    molecule.addh()
    # Generate 3D coordinates with the MMFF94s force field (1000 steps).
    molecule.make3D(forcefield="mmff94s", steps=1000)
    # The return value is discarded; presumably the call is made so that
    # partial charges get computed before writing -- TODO confirm.
    for atom in molecule.atoms:
        atom.OBAtom.GetPartialCharge()
    molecule.write("pdbqt", str(pdbqt_path), overwrite=True)

def run_smina(ligand_path, protein_path, out_path, pocket_center, pocket_size):
    """Dock a ligand into a receptor by invoking the ``smina`` executable.

    Parameters
    ----------
    ligand_path, protein_path
        Ligand and receptor input files (converted with ``str``).
    out_path : pathlib.Path
        File receiving the docking poses; the log file is written next to it
        with the suffix replaced by ``.log``.
    pocket_center, pocket_size
        Indexable sequences whose items 0, 1 and 2 give the x, y and z values
        of the search box centre and size.

    Returns
    -------
    tuple
        ``(output_text, log_path)``: the UTF-8 decoded standard output of
        smina and the path of its log file.

    Raises
    ------
    subprocess.CalledProcessError
        If smina exits with a non-zero status.
    """
    log_path = out_path.with_suffix('.log')

    command = [
        "smina",
        "--receptor", str(protein_path),
        "--ligand", str(ligand_path),
        "--out", str(out_path),
    ]
    for position, flag in enumerate(("--center_x", "--center_y", "--center_z")):
        command.extend((flag, str(pocket_center[position])))
    for position, flag in enumerate(("--size_x", "--size_y", "--size_z")):
        command.extend((flag, str(pocket_size[position])))
    command.extend(("--log", str(log_path)))

    raw_output = subprocess.check_output(command)
    return raw_output.decode("utf-8"), log_path


In [18]:
# parse_smina_log reads a Smina log file and extracts the score of every mode.
def parse_smina_log(log_path):
    """Extract ``(mode, affinity)`` pairs from the result table of a Smina log.

    Parsing starts after the table separator line and stops at the first
    blank line that follows it. Rows with fewer than two fields, or whose
    first two fields are not an integer and a float, are skipped.

    Parameters
    ----------
    log_path : str or pathlib.Path
        Path of the log file.

    Returns
    -------
    list of tuple
        ``(mode, affinity)`` pairs in file order; empty if no table row was
        found.
    """
    table_rule = "-----+------------+----------+----------"
    scores = []

    with open(log_path, 'r') as handle:
        rows = [raw.strip() for raw in handle.readlines()]

    inside_table = False
    for row in rows:
        if row.startswith(table_rule):
            inside_table = True
            continue
        if not inside_table:
            continue
        if row == '':
            break  # a blank line ends the table
        fields = row.split()
        if len(fields) < 2:
            continue
        try:
            entry = (int(fields[0]), float(fields[1]))
        except ValueError:
            continue
        scores.append(entry)

    return scores


In [19]:
def split_sdf_file(sdf_path):
    """Write each molecule of an SDF file to its own SDF file.

    The output files are placed in the directory of the input file and named
    ``<stem>_<n>.sdf``, with ``n`` counting from 1 in file order. Existing
    files with those names are overwritten.

    Parameters
    ----------
    sdf_path : str or pathlib.Path
        SDF file to split.
    """
    source = Path(sdf_path)
    counter = 0
    for pose in pybel.readfile("sdf", str(source)):
        counter += 1
        target = source.parent / f"{source.stem}_{counter}.sdf"
        pose.write("sdf", str(target), overwrite=True)

In [20]:
def download_pdb(pdb_id, pdb_path):
    """Download a PDB file from RCSB to ``pdb_path``.

    The data is first written to a sibling ``<name>.part`` file and only moved
    to ``pdb_path`` after the download has finished. An interrupted download
    therefore never leaves a truncated file at ``pdb_path`` that a later
    "already downloaded" check would mistake for a complete structure.

    Parameters
    ----------
    pdb_id : str
        PDB identifier, inserted into the RCSB download URL.
    pdb_path : str or pathlib.Path
        Destination file.

    Raises
    ------
    Exception
        Any error raised while downloading or moving the file is reported
        and re-raised unchanged.
    """
    pdb_url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    target = Path(pdb_path)
    partial = target.with_name(target.name + ".part")
    try:
        urlretrieve(pdb_url, str(partial))
        partial.replace(target)
    except Exception as e:
        # Drop whatever was written so far; pdb_path itself is left untouched.
        if partial.exists():
            partial.unlink()
        print(f"下载 PDB {pdb_id} 失败: {e}")
        raise
    else:
        print(f"已下载 PDB {pdb_id} 到 {pdb_path}")

In [21]:
# Load the ligand table from the DATA directory. Replace the file name with
# your own dataset.
csv_file = DATA / 'pic50_greater_8.0.csv'
df = pd.read_csv(csv_file)

In [22]:
# Fail early when the ligand table lacks a required column; the first missing
# column (in the order listed) is reported.
required_columns = ['molecule_chembl_id', 'smiles', 'pIC50']
missing_columns = [name for name in required_columns if name not in df.columns]
if missing_columns:
    raise ValueError(f"缺少必需的列: {missing_columns[0]}")

In [23]:
# PDB IDs of the receptor structures to dock against. They are hard-coded
# here for testing; replace them with your own targets.
# Alternatively, read them from the "pdb_id" column of a CSV file in DATA:
# NOTE(review): the snippet below rebinds `df`, which the docking loop reads
# as the ligand table -- use a different variable name if you enable it.
# csv_file = DATA / 'pdb_ids.csv'  # path to your own file
# df = pd.read_csv(csv_file)
# pdb_ids = df['pdb_id'].tolist()
pdb_ids = [
    # '6Q4G',
    # '6Q49',
    # '6Q4H',
    '6Q48',
    # '6Q4J',
    # '6Q4K',
    # '6Q4E',
    # '6Q4D',
    # '6Q3B',
    # '6Q4I',
]

In [24]:
# Module-level list collecting one result dict per (PDB, ligand) pair; the
# docking loop appends to it directly instead of receiving it as an argument.
docking_results = []

In [26]:
# Dock every ligand of the input table against every receptor in pdb_ids and
# append one result dict per (receptor, ligand) pair to docking_results.
for pdb_id in pdb_ids:
    # One working directory per receptor.
    pdb_dir = DATA / pdb_id
    pdb_dir.mkdir(parents=True, exist_ok=True)

    # Download the structure only if it is not cached locally.
    pdb_path = pdb_dir / f"{pdb_id}.pdb"
    if not pdb_path.exists():
        download_pdb(pdb_id, pdb_path)

    structure = Structure.from_string(pdb_path)

    # Write the protein-only coordinates and convert them to PDBQT.
    protein_path = pdb_dir / "protein.pdb"
    protein = structure.select_atoms("protein")
    protein.write(str(protein_path))

    protein_pdbqt_path = pdb_dir / "protein.pdbqt"
    pdb_to_pdbqt(protein_path, protein_pdbqt_path)

    # Detect the bound ligand automatically; receptors without one are skipped.
    try:
        ligand_resname = find_ligand_resname(structure)
    except ValueError as e:
        print(f"PDB ID {pdb_id}: {e}，跳过。")
        continue
    print(f"PDB ID {pdb_id}: 使用配体残基名称: {ligand_resname}")

    # The residue name may match several copies of the ligand (for example one
    # per chain). A box spanning all of them would not describe any single
    # binding site, so only the largest copy (first one on ties) defines it.
    ligand_copies = structure.select_atoms(f"resname {ligand_resname}").residues
    ligand = max(ligand_copies, key=lambda res: len(res.atoms)).atoms

    # Search box: centre of the ligand's bounding box, and its extent plus a
    # buffer of 5 (coordinate units) along each axis.
    upper_corner = ligand.positions.max(axis=0)
    lower_corner = ligand.positions.min(axis=0)
    pocket_center = (upper_corner + lower_corner) / 2
    pocket_size = upper_corner - lower_corner + 5

    for _, row in df.iterrows():
        molecule_chembl_id = row['molecule_chembl_id']
        smiles = row['smiles']
        PIC50 = row['pIC50']

        # One sub-directory per ligand inside the receptor directory.
        ligand_dir = pdb_dir / f"ligand_{molecule_chembl_id}"
        ligand_dir.mkdir(exist_ok=True)

        ligand_pdbqt_path = ligand_dir / "ligand.pdbqt"
        docking_out_path = ligand_dir / "docking_poses.sdf"

        docking_score = None  # stays None unless a mode 1 affinity is parsed
        docking_succeeded = False
        try:
            smiles_to_pdbqt(smiles, ligand_pdbqt_path)
        except (OSError, ValueError, TypeError) as e:
            # A single unreadable or missing SMILES must not abort the whole
            # batch: record the failure and move on to the next ligand.
            print(f"配体准备失败 PDB {pdb_id}, 配体 {molecule_chembl_id}: {e}")
            docker_info = f"配体准备失败: {e}"
        else:
            try:
                docker_info, log_path = run_smina(
                    ligand_pdbqt_path,
                    protein_pdbqt_path,
                    docking_out_path,
                    pocket_center,
                    pocket_size,
                )
            except subprocess.CalledProcessError as e:
                print(f"对接失败 PDB {pdb_id}, 配体 {molecule_chembl_id}: {e}")
                docker_info = f"对接失败: {e}"
            else:
                docking_succeeded = True
                # The first table row is mode 1; its affinity is the score.
                affinities = parse_smina_log(log_path)
                if affinities:
                    docking_score = affinities[0][1]

        # docker_info can contain non-ASCII text, so the encoding is fixed
        # instead of being left to the platform default.
        with open(ligand_dir / "docker_info.txt", "w", encoding="utf-8") as f:
            f.write(docker_info)

        # Split the poses only after a successful run; an SDF left over from
        # an earlier run must not be mistaken for fresh output.
        if docking_succeeded and docking_out_path.exists():
            split_sdf_file(docking_out_path)
        else:
            print(f"没有对接输出 PDB {pdb_id}, 配体 {molecule_chembl_id}")

        docking_results.append({
            'PDB_ID': pdb_id,
            'molecule_chembl_id': molecule_chembl_id,
            'smiles': smiles,
            'pIC50': PIC50,
            'ligand_resname': ligand_resname,
            'docking_score': docking_score,
            'mode1_affinity': docking_score,  # same value: mode 1 affinity
            'docker_info': docker_info,
        })

已下载 PDB 6Q48 到 /Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q48/6Q48.pdb


  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is /Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q48/protein.pdb)



PDB ID 6Q48: 使用配体残基名称: HHQ




Parse error on line 28 in file "/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q48/ligand_CHEMBL3951333/ligand.pdbqt": Unknown or inappropriate tag


对接失败 PDB 6Q48, 配体 CHEMBL3951333: Command '['smina', '--receptor', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q48/protein.pdbqt', '--ligand', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q48/ligand_CHEMBL3951333/ligand.pdbqt', '--out', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q48/ligand_CHEMBL3951333/docking_poses.sdf', '--center_x', '2.012', '--center_y', '-5.3045', '--center_z', '-27.3255', '--size_x', '8.3', '--size_y', '9.543', '--size_z', '7.5429993', '--log', '/Users/wangyang/Desktop/Breast_cancer_brain_metastasis/wang_Gene/AIDD/15_Protein_ligand_docking/data/6Q48/ligand_CHEMBL3951333/docking_poses.log']' returned non-zero exit status 1.


KeyboardInterrupt: 

In [27]:
# Collect the docking records in a DataFrame. The columns are listed
# explicitly so that the frame still has a 'docking_score' column (and the
# filtering steps keep working) when no docking result was recorded at all.
results_df = pd.DataFrame(
    docking_results,
    columns=[
        'PDB_ID',
        'molecule_chembl_id',
        'smiles',
        'pIC50',
        'ligand_resname',
        'docking_score',
        'mode1_affinity',
        'docker_info',
    ],
)

In [28]:
# Keep only the rows that have a docking score.
results_df = results_df[results_df['docking_score'].notna()]

In [29]:
# Rows with a positive docking score.
positive_scores_df = results_df.loc[results_df['docking_score'].gt(0)]

In [30]:
# Rows with a docking score of -10 or lower.
score_leq_neg10_df = results_df.loc[results_df['docking_score'].le(-10)]

In [31]:
# Rows with a docking score above -10 and at most -9.
above_neg10 = results_df['docking_score'].gt(-10)
at_most_neg9 = results_df['docking_score'].le(-9)
score_between_neg10_neg9_df = results_df.loc[above_neg10 & at_most_neg9]

In [32]:
# Rows with a docking score above -9.
score_gt_neg9_df = results_df.loc[results_df['docking_score'].gt(-9)]

In [33]:
# Write each score bin to its own CSV file in the DATA directory.
positive_scores_df.to_csv(DATA / "docking_positive_scores.csv", index=False)
score_leq_neg10_df.to_csv(DATA / "docking_score_leq_neg10.csv", index=False)
score_between_neg10_neg9_df.to_csv(DATA / "docking_score_between_neg10_neg9.csv", index=False)
score_gt_neg9_df.to_csv(DATA / "docking_score_gt_neg9.csv", index=False)

# Also write the complete table sorted by docking score (most negative first),
# the combined summary announced as item 5 of the notebook introduction.
all_results_sorted_df = results_df.sort_values('docking_score')
all_results_sorted_df.to_csv(DATA / "docking_all_results_sorted.csv", index=False)

In [39]:
import nglview as nv

def visualize_docking_pose(pdb_id, molecule_chembl_id, docking_pose_id):
    """
    Show one docking pose together with its receptor in an NGLView widget.

    Parameters
    ----------
    pdb_id: str
        PDB ID of the protein.
    molecule_chembl_id: str
        ChEMBL ID of the docked molecule.
    docking_pose_id: int
        Number of the pose to display (suffix of the split SDF file).

    Returns
    -------
    nglview.NGLWidget or None
        The widget, or None (after printing a message) when the pose file or
        the protein file does not exist.
    """
    ligand_dir = DATA / pdb_id / f"ligand_{molecule_chembl_id}"
    sdf_file = ligand_dir / f"docking_poses_{docking_pose_id}.sdf"
    protein_file = DATA / pdb_id / "protein.pdb"

    if not sdf_file.exists():
        print(f"SDF 文件不存在: {sdf_file}")
        return None

    if not protein_file.exists():
        print(f"蛋白质 PDB 文件不存在: {protein_file}")
        return None

    view = nv.NGLWidget()

    # Components are numbered in the order they are added: the protein is
    # component 0 and the pose is component 1. Representation and centering
    # calls act on component 0 unless told otherwise, so the index is passed
    # explicitly each time.
    view.add_component(str(protein_file))
    view.add_representation('cartoon', selection='protein', color='blue', component=0)

    view.add_component(str(sdf_file))
    view.add_representation('ball+stick', selection='all', color='orange', component=1)

    # Centre the camera on the docked pose rather than on the protein.
    view.center(selection='all', component=1)

    return view


In [41]:
# Example values -- replace them with a PDB ID and ChEMBL ID that were
# actually docked; otherwise the pose file is reported as missing.
pdb_id = '6Q4G'
molecule_chembl_id = 'CHEMBL3600873'  # replace with a real ChEMBL ID from your results
docking_pose_id = 1  # number of the docking pose to show

view = visualize_docking_pose(pdb_id, molecule_chembl_id, docking_pose_id)

# Jupyter only renders the last top-level expression of a cell; an expression
# nested inside an `if` body is not displayed. When no view could be built,
# the value is None and nothing is shown.
view


NGLWidget()
