# 1. BioEmu 简介

BioEmu 是由 Frank Noé 领导的 Microsoft Research 团队开发的一种基于`深度学习`的生物分子仿真器，旨在高效采样蛋白质构象的`近似平衡分布`。其神经网络模型基于 Distributional Graphormer（DiG）架构，并采用去噪扩散框架进行结构生成。BioEmu 以蛋白质序列为输入，通过 `AlphaFold2` 生成序列的成对表示，并将其作为`扩散模型`的条件输入，从而生成蛋白质`三维结构构象集合`。

# 2. BioEmu 与其他技术对比

与传统仅预测单一静态结构的方法不同，BioEmu 重点刻画蛋白质的`动态构象行为`，通过多次随机采样获得在热力学平衡条件下可能出现的多种构象状态。由此，BioEmu 能够近似描述蛋白质的构象分布、构象柔性及构象转变趋势，为研究蛋白质的构象动力学特征、构象多态性以及功能相关的结构变化提供重要支持。

与传统`分子动力学`（MD）模拟相比，BioEmu 仅需对每条蛋白质序列进行一次编码，并通过 30–50 步去噪过程即可生成三维结构，从而使单个 GPU 在几分钟到数小时内即可生成多达上万个独立构象，大幅`提升了构象采样效率`。它以深度生成模型替代了部分长时间尺度的 MD 模拟过程，在显著降低计算成本的同时，仍能在短时间内构建大规模构象集合。这使其能够高效刻画蛋白质的构象变化趋势、柔性区域特征、构象跃迁行为以及构象熵特征，为蛋白质动态结构研究提供了一条新的技术路径。

# 3. BioEmu 基本流程

## 3.1 模型任务参数设置

脚本在运行模型前，对任务名称和序列进行规范化，并在任务名称后附加由序列生成的短哈希（SHA-1 的前 5 位十六进制字符），以降低文件夹命名冲突的可能；若同名文件夹已存在，则再追加数字后缀。此外，脚本还设置样本数量、模型及筛选参数，并创建输出目录用于存储生成的结果。

In [None]:
# Input sequence handed to the BioEmu sampler as `sequence`.
# The trailing `#@param` comment is a Colab form annotation and must stay on this line.
sequence = "MTVAYIAIGSNLASPLEQVNAALKALGDIPESHILTVSSFYRTPPLGPQDQPDYLNAAVALETSLAPEELLNHTQRIELQQGRVRKAERWGPRTLDLDIMLFGNEVINTERLTVPHYDMKNRGFMLWPLFEIAPELVFPDGEMLRQILHTRAFDKLNKW"  #@param {type:"string"}
# Number of samples requested from the sampler (passed as `num_samples`).
num_samples = 165  
# Human-readable job label; it is sanitised and suffixed with a sequence hash further down.
jobname = "Pyrophosphokinase"  
# Forwarded unchanged to the sampler as `filter_samples`.
filter_samples = True
# Forwarded unchanged to the sampler as `model_name`.
model_name = "bioemu-v1.1" 
import os
import re
import hashlib

def add_hash(x, seq):
    """Return ``x`` followed by ``_`` and the first five hex digits of SHA-1(``seq``)."""
    full_digest = hashlib.sha1(seq.encode()).hexdigest()
    short_tag = full_digest[:5]
    return "_".join((x, short_tag))

def folder_is_free(folder):
    """Tell whether the path ``folder`` is unused, i.e. nothing exists there yet."""
    if os.path.exists(folder):
        return False
    return True

# Remove every non-word character so the job name is safe as a directory name.
jobname_clean = re.sub(r'\W+', '', jobname)
# Drop all whitespace (spaces, tabs, newlines) from the sequence.
sequence = "".join(sequence.split())
jobname = add_hash(jobname_clean, sequence)

# Results are written under /content, so name collisions must be probed there
# rather than relative to whatever the current working directory happens to be.
output_root = "/content"
if not folder_is_free(os.path.join(output_root, jobname)):
    # Append the first numeric suffix that is not taken yet.
    n = 0
    while not folder_is_free(os.path.join(output_root, f"{jobname}_{n}")):
        n += 1
    jobname = f"{jobname}_{n}"

output_dir = os.path.join(output_root, jobname)
os.makedirs(output_dir, exist_ok=True)


## 3.2 环境配置

在 Colab 中安装并配置 BioEmu 所需的 Conda 环境、Python、依赖库和工具，并创建标记文件以防止重复安装。

In [None]:
import os
import sys

# Marker file: its presence means the one-time installation below already completed.
_is_bioemu_setup_file = '/content/.BIOEMU_SETUP'

conda_prefix = '/usr/local/'
miniconda_link = 'https://repo.anaconda.com/miniconda/Miniconda3-py312_25.5.1-1-Linux-x86_64.sh'
miniconda_basename = os.path.basename(miniconda_link)
os.makedirs(conda_prefix, exist_ok=True)

if not os.path.exists(_is_bioemu_setup_file):
  # `uv` must be available before the `uv pip install` call further down.
  os.system('/usr/bin/python3 -m pip install uv')

  # Install Miniconda into conda_prefix and pin Python 3.12 there.
  os.system(f'wget {miniconda_link}')
  os.system(f'chmod +x {miniconda_basename}')
  os.system(f'./{miniconda_basename} -b -f -p {conda_prefix}')
  os.system(f'conda install -q -y --prefix {conda_prefix} python=3.12')
  os.system('uv pip install --prerelease if-necessary-or-explicit bioemu')

  os.system('conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main')
  os.system('conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r')
  os.system('conda install -c conda-forge openmm cuda-version=11 --yes')

  # Foldseek binary used by the clustering step.
  os.system('wget https://mmseqs.com/foldseek/foldseek-linux-avx2.tar.gz; tar xvzf foldseek-linux-avx2.tar.gz')
  os.unlink(miniconda_basename)

  # Write the marker last, so an interrupted installation is retried on the
  # next run instead of being recorded as complete.
  os.system(f"touch {_is_bioemu_setup_file}")

# Process-local configuration. The marker file survives a runtime restart but
# sys.path and os.environ do not, so this part runs on every execution.
_conda_site_packages = os.path.join(conda_prefix, 'lib/python3.12/site-packages/')
if _conda_site_packages not in sys.path:
  sys.path.append(_conda_site_packages)

os.environ['CONDA_PREFIX'] = conda_prefix
os.environ['CONDA_PREFIX_1'] = os.path.join(conda_prefix, 'envs/myenv')
os.environ['CONDA_DEFAULT_ENV'] = 'base'

## 3.3 结构预测

对输入的氨基酸序列使用 BioEmu 模型进行多样化样本生成（采样预测），根据指定的样本数量和模型参数生成该序列的多个候选构象，并将所有生成结果保存到事先创建的输出目录中，以便后续分析和使用。

In [None]:
# `sample` is the sampling entry point exported by bioemu.sample as `main`.
from bioemu.sample import main as sample

# Same location as the job directory created during parameter setup.
output_dir = f'/content/{jobname}'

# Collect the arguments first so the call itself stays short.
sampling_kwargs = dict(
    sequence=sequence,
    num_samples=num_samples,
    model_name=model_name,
    output_dir=output_dir,
    filter_samples=filter_samples,
)
sample(**sampling_kwargs)

# 4. Foldseek 聚类分析

`Foldseek` 是一款高效的蛋白质结构比对与聚类工具，能够在大规模结构集合中快速识别相似蛋白质。与传统的`RMSD`方法不同，Foldseek 直接基于三维结构信息进行比对，结合 `TM-score`、`结构覆盖率`和`序列相似性`等指标，实现精确的结构聚类和相似性分析。其优势在于计算速度快、可处理上万条结构，同时保留高精度的结构关系信息，因此广泛应用于蛋白质结构预测结果的聚类筛选、代表性结构提取以及结构库构建等任务。

在完成 BioEmu 的样本生成后，脚本首先将轨迹文件中的多个采样帧提取为单独的 PDB 文件，便于后续结构分析与聚类处理。随后，利用 Foldseek 对这些结构样本进行聚类，对于每个聚类簇，仅选择一条代表样本生成新的轨迹文件和拓扑文件。

In [None]:
# Number of trajectory frames to export as PDB files; -1 exports every frame.
n_write_samples = -1 
# Passed to foldseek as --tmscore-threshold.
tmscore_threshold = 0.6 
# Passed to foldseek as -c (coverage threshold).
coverage_threshold = 0.7 
# Passed to foldseek as --min-seq-id.
seq_id = 0.95 

import numpy as np
import mdtraj

# One-time py3Dmol installation, guarded by a marker file.
_py3dmol_installed_file = '/content/.py3dmol'
_py3dmol_missing = not os.path.exists(_py3dmol_installed_file)
if _py3dmol_missing:
    # Install first, then drop the marker.
    for _cmd in ('uv pip install py3Dmol', f"touch {_py3dmol_installed_file}"):
        os.system(_cmd)

import py3Dmol

# Directory that receives the per-frame PDB files extracted from the trajectory.
pdb_sample_dir = os.path.join('/content', 'pdb_samples')
os.makedirs(pdb_sample_dir, exist_ok=True)

def write_some_samples(topology_file: str, trajectory_file: str, output_dir: str, n_samples: int) -> None:
    """Write frames of a trajectory as individual PDB files.

    Each selected frame is saved as ``sample_<frame index>.pdb`` inside
    ``output_dir``. This function does not create ``output_dir``.

    Args:
        topology_file: topology passed to ``mdtraj.load`` as ``top``.
        trajectory_file: trajectory file to load.
        output_dir: directory the PDB files are written into.
        n_samples: number of frames to write, drawn at random without
            replacement; ``-1`` writes every frame.

    Raises:
        ValueError: if ``n_samples`` is below -1 or larger than the number of
            frames in the trajectory.
    """
    # Reject nonsensical requests before paying for the trajectory load.
    if n_samples < -1:
        raise ValueError(f"n_samples must be -1 (all frames) or non-negative, got {n_samples}")

    traj = mdtraj.load(trajectory_file, top=topology_file)
    if n_samples > traj.n_frames:
        raise ValueError(
            f"Requested {n_samples} samples but the trajectory only has {traj.n_frames} frames"
        )

    if n_samples == -1:
        sample_indices = np.arange(traj.n_frames)
    else:
        sample_indices = np.random.choice(np.arange(traj.n_frames), size=n_samples, replace=False)
    for idx in sample_indices:
        traj[idx].save_pdb(os.path.join(output_dir, f'sample_{idx}.pdb'))


topology_file = os.path.join(output_dir, "topology.pdb")
trajectory_file = os.path.join(output_dir, "samples.xtc")

write_some_samples(topology_file=topology_file,
                   trajectory_file=trajectory_file,
                   output_dir=pdb_sample_dir,
                   n_samples=n_write_samples)

# Foldseek
import os
import subprocess
import tempfile

import pandas as pd

def parse_foldseek_cluster_results(cluster_table_path: str) -> dict[int, list[str]]:
    """
    Read a foldseek cluster table and group its members by cluster.

    The table is parsed as whitespace-separated text without a header row.
    Rows are grouped by the first column and the second column supplies the
    member names.

    Args:
        cluster_table_path: path of the output cluster table from foldseek

    Returns:
        Dictionary mapping a running cluster index (0, 1, ...) to the sorted
        list of member names of that cluster

    """
    table = pd.read_csv(cluster_table_path, sep=r"\s+", header=None)

    # Iterating the grouped member column yields (group key, members) pairs.
    members_per_cluster = table.groupby(0)[1]

    return {
        position: sorted(members)
        for position, (_group_key, members) in enumerate(members_per_cluster)
    }


def foldseek_cluster(
    input_dir: str,
    out_prefix: str | None = None,
    tmscore_threshold: float = 0.7,
    coverage_threshold: float = 0.9,
    seq_id: float = 0.7,
    coverage_mode: int = 1,
    foldseek_bin: str = "/content/foldseek/bin/foldseek",
) -> dict[int, list[str]]:
    """
    Runs foldseek easy-cluster and returns the resulting clusters

    Args:
        input_dir (str): input directory with .cif or .pdb files
        out_prefix (str | None): the prefix of the output files, if None a temporary directory will be used
        tmscore_threshold (float): the tm-score threshold used for clustering
        coverage_threshold (float): the coverage threshold used for clustering
        seq_id (float): the sequence identity threshold used for clustering
        coverage_mode (int): mode used by mmseqs/foldseek to compute coverage
        foldseek_bin (str): path of the foldseek executable

    Returns:
        Dictionary mapping cluster indices to the sorted list of their members

    Raises:
        RuntimeError: if foldseek cannot be started or exits with a non-zero status
    """

    with tempfile.TemporaryDirectory() as temp_dir, tempfile.TemporaryDirectory() as temp_out_dir:
        if out_prefix is None:
            out_prefix = os.path.join(temp_out_dir, "output")

        # Pass an argument list, not a shell string, so paths containing spaces
        # or shell metacharacters reach foldseek unchanged.
        command = [
            foldseek_bin,
            "easy-cluster",
            input_dir,
            out_prefix,
            temp_dir,
            "-c",
            str(coverage_threshold),
            "--min-seq-id",
            str(seq_id),
            "--tmscore-threshold",
            str(tmscore_threshold),
            "--cov-mode",
            str(coverage_mode),
            "--single-step-clustering",
        ]
        try:
            res = subprocess.run(command)
        except OSError as err:
            raise RuntimeError(f"Something went wrong with foldseek: could not run {foldseek_bin}") from err
        if res.returncode != 0:
            raise RuntimeError(f"Something went wrong with foldseek (exit code {res.returncode})")

        # Parse while the temporary directories still exist: with the default
        # out_prefix the cluster table lives inside temp_out_dir.
        return parse_foldseek_cluster_results(out_prefix + "_cluster.tsv")

# IPython shell escape: make the downloaded foldseek binary executable.
!chmod +x '/content/foldseek/bin/foldseek'

# Get foldseek clusters
# Clusters the per-frame PDB files in pdb_sample_dir with the thresholds set above.
clusters = foldseek_cluster(input_dir=pdb_sample_dir, tmscore_threshold=tmscore_threshold,
                            coverage_threshold=coverage_threshold, seq_id=seq_id)
n_clusters = len(clusters)
print(f'{n_clusters} clusters detected')

# Write foldseek clusters to output dir
import json

# json.dump stores the integer cluster indices as string keys in the file.
with open(os.path.join(output_dir, 'foldseek_clusters.json'), 'w') as json_handle:
    json.dump(clusters, json_handle)


# Write XTC with one sample per cluster only
cluster_trajs = []
for _cluster_idx, samples in clusters.items():
    # Choose the first member of the cluster. The variable is deliberately not
    # called `sample`, which would overwrite the imported sampling function.
    representative = next(iter(samples))
    pdb_file = os.path.join(pdb_sample_dir, f"{representative}.pdb")
    traj = mdtraj.load_pdb(pdb_file)
    cluster_trajs.append(traj)
joint_traj = mdtraj.join(cluster_trajs)
cluster_topology_file = os.path.join(output_dir, "clustered_topology.pdb")
cluster_trajectory_file = os.path.join(output_dir, "clustered_samples.xtc")
# The first frame serves as topology for the clustered trajectory.
joint_traj[0].save_pdb(cluster_topology_file)
joint_traj.save_xtc(cluster_trajectory_file)

# 5. 侧链重建与分子动力学优化

在聚类筛选后的蛋白质代表结构基础上，通过侧链重建恢复缺失的侧链原子，并根据指定的分子力学优化协议（如局部最小化或 NVT 平衡）对结构进行能量优化，以消除不合理的原子碰撞或构象应力，最终生成完整、稳定且可用于下游分析和可视化的高质量蛋白质结构。

In [None]:
# Run side-chain reconstruction at all.
reconstruct_sidechains = True
# Passed to the relaxation entry point as `md_equil`.
run_md = True
# Use the one-structure-per-cluster trajectory instead of all samples.
one_per_cluster = True
# Name of the MDProtocol member to use; converted to the member itself below.
md_protocol = "LOCAL_MINIMIZATION"
import bioemu.sidechain_relax
# os.path.join discards everything before an absolute component, so the second
# argument must be relative for the result to stay under conda_prefix.
# NOTE(review): presumably the hpacker env lives in <conda_prefix>/envs/hpacker; confirm.
bioemu.sidechain_relax.HPACKER_PYTHONBIN = os.path.join(conda_prefix, 'envs/hpacker/bin/python')

from bioemu.sidechain_relax import main as sidechainrelax
from bioemu.sidechain_relax import MDProtocol
md_protocol = MDProtocol[md_protocol]
os.environ['CONDA_PREFIX_1'] = conda_prefix

if one_per_cluster:
    topology_file = cluster_topology_file
    trajectory_file = cluster_trajectory_file

prefix = 'hpacker-openmm'
if reconstruct_sidechains:
    relaxed_dir = os.path.join(output_dir, prefix)
    os.makedirs(relaxed_dir, exist_ok=True)
    sidechainrelax(pdb_path=topology_file, xtc_path=trajectory_file,
                  outpath=relaxed_dir, prefix=prefix, md_protocol=md_protocol,
                  md_equil=run_md)
    if run_md:
        # Marker file recording that the MD step was requested for this output.
        os.system(f'touch {relaxed_dir}/.RELAXED')


# 6. 结果展示

数据已上传至 GitHub 仓库 https://github.com/kinggmars/Pyrophosphokinase-/tree/main/data/predictions/bioemu ，存放在 protein1 和 protein2 两个文件夹中：
- 165pdb.zip：包含所有生成的蛋白质 PDB 样本
- clustered_samples.xtc 与 clustered_topology.pdb：对应聚类后的代表结构，但未经过后续 MD 优化
- hpacker-openmm_sidechain_rec.pdb 和 hpacker-openmm_sidechain_rec.xtc：侧链重建后的结构与轨迹
- hpacker-openmm_md_equil.pdb 和 hpacker-openmm_md_equil.xtc：侧链重建后再进行分子动力学优化的结构与轨迹

具体情况如下：
- protein1 文件夹包含 15 个聚类结构，并进行了后续的 MD 优化
- protein2 文件夹包含 22 个聚类结构，但未进行后续 MD 优化

相关流程可通过 Google Colab 平台运行的 ColabFold 官方 BioEmu Notebook 实现（`https://colab.research.google.com/github/sokrypton/ColabFold/blob/main/BioEmu.ipynb`）。