# Prepare Datasets for Protein Reaction Benchmarking Tasks

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2024-12-14  



## 1. Import packages

In [1]:
import sys,os
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../')
from config import conf as cfg
from tools import filetool as ftool
from tools import bioFunctionLib as bfl
from tools import uniprottool as uptool
from tools import rheatool as rheatool
from tools import ectool as etool
from tqdm import tqdm
from IPython.display import display_markdown
from pandarallel import pandarallel # 导入pandaralle
pandarallel.initialize(progress_bar=False)
from tkinter import _flatten
import json
import shutil
import subprocess
import swifter
import pandas as pd
import numpy as np

# Set this flag to True for the initial run. In this notebook the flag gates the
# cells that build the PDB list, copy structure files, download missing models
# with wget and prepare the ESMFold inputs, so a first run may take a long time
# depending on your file system and internet speed. With False, the cached PDB
# list is simply loaded.
# NOTE(review): the earlier wording mentioned downloading from UniProt and RHEA;
# no such download is visible in this notebook - confirm and drop if obsolete.
FIRST_TIME_RUN = False

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 2. 定义处理函数

In [74]:

def cp_pdb(src, dst):
    """
    Copy a PDB file from source to destination, creating the destination directory if needed.

    Parameters:
    src (str): Path of the file to copy.
    dst (str): Path the file is copied to.

    Returns:
    bool: True if the source exists and was copied, False if the source does not exist.
    """
    if not os.path.exists(src):
        return False

    # A bare file name has an empty dirname and os.makedirs('') raises,
    # so the parent directory is only created when there is one.
    dst_dir = os.path.dirname(dst)
    if dst_dir:
        os.makedirs(dst_dir, exist_ok=True)

    shutil.copy(src, dst)
    return True



def prep_pdb(uniprot_id):
    """
    Copy the AlphaFold v4 model of one protein from the shared database into the project tree.

    Parameters:
    uniprot_id (str): Identifier used to build the source and destination file names.

    Returns:
    tuple: (destination path, flag returned by cp_pdb). The destination path is
    returned even when nothing was copied.
    """
    source_path = f'/hpcfs/fpublic/database/alphafold/predicted_pdbs/AF-{uniprot_id}-F1-model_v4.pdb.gz'
    # Destination files are grouped into sub-directories named after the
    # first two characters of the identifier.
    target_path = f'{cfg.DATA_ROOT}structure/pdb/{str(uniprot_id[0:2])}/AF-{uniprot_id}-F1-model_v4.pdb.gz'
    copied = cp_pdb(source_path, target_path)
    return target_path, copied

def get_pdb_files(directory):
    """
    Recursively collect every file whose name ends with ".pdb" below a directory.

    Parameters:
    directory (str): The root directory to search.

    Returns:
    list: Paths of the matching files, in the order os.walk yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith(".pdb")
    ]


def download_with_wget(url, dst):
    """
    Download a file with the external ``wget`` command.

    Parameters:
    url (str): The URL of the file to download.
    dst (str): The destination path to save the file.

    Returns:
    bool: True if wget exits with status 0, False if it exits with a non-zero
    status or could not be run at all.
    """
    try:
        # Make sure the target directory exists before wget tries to open
        # the output file in it.
        dst_dir = os.path.dirname(dst)
        if dst_dir:
            os.makedirs(dst_dir, exist_ok=True)

        # Capture stdout/stderr as text so a failure reason can be reported.
        result = subprocess.run(
            ["wget", "-O", dst, url],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except Exception as e:
        # Best effort: e.g. wget is not installed or the directory cannot be created.
        print(f"An unexpected error occurred: {e}")
        return False

    if result.returncode == 0:
        print(f"File downloaded successfully to {dst}")
        return True

    # Report the failure instead of returning False silently; the last line
    # of stderr is used as a short reason.
    err_lines = result.stderr.strip().splitlines()
    reason = err_lines[-1] if err_lines else "no error output"
    print(f"wget failed for {url} (exit code {result.returncode}): {reason}")
    return False


def get_pdb_file_from_afdb_web(uniprot_id):
    """
    Download the AlphaFold DB v4 model of one protein into the local temp directory.

    Parameters:
    uniprot_id (str): Identifier used to build the download URL and the file name.

    Returns:
    bool: The success flag reported by download_with_wget.
    """
    src = f'https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v4.pdb'
    dst = f'/hpcfs/fhome/shizhenkun/codebase/RXNRECer/temp/pdb/AF-{uniprot_id}-F1-model_v4.pdb'

    # Propagate the outcome so callers can tell which downloads failed.
    return download_with_wget(src, dst)


def is_file_empty(file_path):
    """
    Tell whether a file has a size of zero bytes.

    Parameters:
    file_path (str): Path of the file to inspect.

    Returns:
    bool: True if the file size is 0, False otherwise.

    Raises:
    FileNotFoundError: If file_path is not an existing regular file.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"文件不存在: {file_path}")
    return os.path.getsize(file_path) == 0

def sup_esm_pdb(uniprot_id):
    """
    Add the ESMFold-predicted structure of one protein to the structure database.

    Parameters:
    uniprot_id (str): Identifier used to build the ESMFold output file name.

    Returns:
    str or None: The destination path if the ESMFold output exists and
    cp_pdb reported success, otherwise None.
    """
    esm_output_dir = f"{cfg.RESULTS_DIR}intermediate/esmfold/output/"
    source_path = f'{esm_output_dir}{uniprot_id}.pdb'
    target_path = f'{cfg.DATA_ROOT}structure/pdb/{str(uniprot_id[0:2])}/ESM-{uniprot_id}.pdb'

    # Nothing to add when ESMFold has not produced a file for this entry.
    if not os.path.exists(source_path):
        return None

    return target_path if cp_pdb(source_path, target_path) else None

## 3. 获取需要计算PDB的所有uniprot_id

In [3]:
# Load the train and test splits and stack them into one table that keeps
# only the identifier and sequence columns.
ds_train = pd.read_feather(cfg.FILE_DS_TRAIN)
ds_test = pd.read_feather(cfg.FILE_DS_TEST)
data_pdb = pd.concat(
    [ds_train, ds_test],
    axis=0,
).reset_index(drop=True)[['uniprot_id', 'seq']]
data_pdb.head(2)

Unnamed: 0,uniprot_id,seq
0,Q6GZX4,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...
1,Q6GZX3,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...


## 4. 从已知库里（AF2-DB-V4）匹配绝大多数

In [8]:
# Build the PDB lookup table only on a first run when no cached copy exists;
# in every other case load the cached table from disk.
if FIRST_TIME_RUN and not os.path.exists(cfg.FILE_DS_PDB_LIST):
    data_pdb[['path_pdb', 'pdb_exist']] = data_pdb.uniprot_id.swifter.apply(
        lambda uid: pd.Series(prep_pdb(uid))
    )
    data_pdb.to_feather(cfg.FILE_DS_PDB_LIST)
else:
    data_pdb = pd.read_feather(cfg.FILE_DS_PDB_LIST)

## 5. 从网络用API补充没有PDB的数据，AF2-DB-ONLINE

In [5]:
if FIRST_TIME_RUN:
    # Directory that holds the structure files downloaded earlier.
    directory = "/hpcfs/fhome/shizhenkun/codebase/RXNRECer/temp/pdb/"
    pdb_file_list = get_pdb_files(directory)

    print(f"Found {len(pdb_file_list)} .pdb files:")

    # Register every downloaded file for entries that have no structure yet.
    for item in tqdm(pdb_file_list):
        # File names are expected to look like AF-<uniprot_id>-F1-model_v4.pdb.
        name_parts = os.path.basename(item).split('-')
        if len(name_parts) < 2:
            print(f'{item}: unexpected file name, skipped')
            continue
        uniprot_id = name_parts[1]

        try:
            # Compute the row selector once and reuse it for the lookup and both updates.
            row_mask = data_pdb['uniprot_id'] == uniprot_id
            if not row_mask.any():
                print(f'{uniprot_id}: not found in data_pdb, skipped')
                continue

            if data_pdb.loc[row_mask, 'pdb_exist'].values[0] == False:
                dst = f'{cfg.DATA_ROOT}structure/pdb/{str(uniprot_id[0:2])}/AF-{uniprot_id}-F1-model_v4.pdb'
                # Only mark the entry as available when the copy really succeeded.
                if cp_pdb(item, dst):
                    data_pdb.loc[row_mask, 'pdb_exist'] = True
                    data_pdb.loc[row_mask, 'path_pdb'] = dst
        except Exception as e:
            # Best effort: report the entry and carry on with the remaining files.
            print(f'{uniprot_id}:{e}')

    # Write the updated table back to disk.
    data_pdb.to_feather(cfg.FILE_DS_PDB_LIST)

## 6. 用ESMFOLD 预测没有的结构

In [None]:
if FIRST_TIME_RUN:
    # Entries for which no structure file has been found so far.
    need_wget = data_pdb[data_pdb.pdb_exist == False].reset_index(drop=True)

    # Try to download the missing structures from the AlphaFold DB v4 website.
    tqdm.pandas()
    need_wget.uniprot_id.progress_apply(get_pdb_file_from_afdb_web)

    # A structure is usable only if its file exists and is not empty.
    # Checking existence first keeps is_file_empty from raising
    # FileNotFoundError for entries whose file was never created.
    data_pdb['pdb_check_ok'] = data_pdb.path_pdb.parallel_apply(
        lambda x: os.path.isfile(x) and not is_file_empty(x)
    )
    need_esm_fold = data_pdb[data_pdb.pdb_check_ok == False].reset_index(drop=True)

    # Write one FASTA input file per GPU for the sequences that still need a structure.
    NUM_GPU = 32
    # Split row positions rather than the DataFrame itself: the chunk sizes are
    # the same, and it avoids handing a DataFrame to np.array_split, which
    # emits a deprecation warning on recent pandas versions.
    chunk_positions = np.array_split(np.arange(len(need_esm_fold)), NUM_GPU)

    for chunk_idx, positions in enumerate(chunk_positions):
        chunk_table = need_esm_fold.iloc[positions][['uniprot_id', 'seq']].reset_index(drop=True)
        bfl.table2fasta(table=chunk_table, file_out=f'{cfg.RESULTS_DIR}intermediate/esmfold/input/chunk{chunk_idx}.fasta')

    # seq to pdb: the ESMFold job is submitted manually outside this cell.
    # !sbatch get_esmfold_pdb.sh

    # Re-check the files and list the entries that still lack a usable structure.
    data_pdb['pdb_check_ok'] = data_pdb.path_pdb.parallel_apply(
        lambda x: os.path.isfile(x) and not is_file_empty(x)
    )
    uniprot_ids_esm_pdb = data_pdb[data_pdb.pdb_check_ok == False].uniprot_id.to_list()

    # Point those entries at the ESMFold prediction when one is available.
    for uid in tqdm(uniprot_ids_esm_pdb):
        res = sup_esm_pdb(uniprot_id=uid)
        if res is not None:
            data_pdb.loc[data_pdb['uniprot_id'] == uid, 'path_pdb'] = res

    # Write the updated table back to disk.
    data_pdb.to_feather(cfg.FILE_DS_PDB_LIST)

In [76]:
# Check that every structure file is usable: it must exist and be non-empty.
# Testing existence first keeps is_file_empty from raising FileNotFoundError,
# so entries without a file are simply flagged as needing ESMFold.
data_pdb['pdb_check_ok'] = data_pdb.path_pdb.parallel_apply(
    lambda x: os.path.isfile(x) and not is_file_empty(x)
)
need_esm_fold = data_pdb[data_pdb.pdb_check_ok == False].reset_index(drop=True)

In [None]:
# Export the sequences that still need a structure to one extra input file
# (presumably FASTA, judging by the helper name and the .fasta extension).
# NOTE: the shell command that follows hard-codes "chunk34.fasta", so `item`
# must stay in sync with that path.
item = 34
bfl.table2fasta(table=need_esm_fold[['uniprot_id', 'seq']].reset_index(drop=True), file_out=f'{cfg.RESULTS_DIR}intermediate/esmfold/input/chunk{item}.fasta')
! CUDA_VISIBLE_DEVICES=7 singularity exec --nv /hpcfs/fpublic/container/singularity/app/esmfold/esmfold.sif bash /esmfold.sh -i /hpcfs/fhome/shizhenkun/codebase/RXNRECer/results/intermediate/esmfold/input/chunk34.fasta -o /hpcfs/fhome/shizhenkun/codebase/RXNRECer/results/intermediate/esmfold/output/

  0%|          | 0/22633 [00:00<?, ?it/s]

100%|██████████| 22633/22633 [11:52<00:00, 31.77it/s] 


In [80]:
need_esm_fold


Unnamed: 0,uniprot_id,seq,path_pdb,pdb_exist,pdb_check_ok
0,Q197E9,MMESPKYKKSTCSVTNLGGTCILPQKGATAPKAKDVSPELLVNKMD...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,False
1,Q6GZV6,MATNYCDEFERNPTRNPRTGRTIKRGGPVFRALERECSDGAARVFP...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,False
2,Q6GZU9,MANFLQDVNCETVSEYDGPDASIPEGVWEGYVGHDHAALWRTWSYI...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,False
3,Q6GZT5,MRVVVNAKALEVPVGMSFTEWTRTLSPGSSPRFLAWNPVRPRTFKD...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,False
4,P05080,MANGNFKLSQLLNVDEMSAEQRSHFFDLMLTKPDCEIGQMMQRVVV...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,False
...,...,...,...,...,...
2701,Q9FCY7,MRLNLHATEKKTTVQNVENPNNSTIPPLQQGSSSSAPQASGGTLAS...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,False
2702,A0A7L9EYL1,MSGQDPVKESGQREPIAVVGSGFRFPGSSNNPSKLWDLLVKPRDLL...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,False
2703,A0A9P5GG56,MPGGVRDLPALWEFLKEQKDVHREFDEPRFSAKGFSHPNPDRPGTA...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,False
2704,B6H999,MTAPWPVKHDPIALVGIGCHMPGGVRDIPALWEFLRKQKDVHREFD...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,False


In [60]:
# Spot-check sup_esm_pdb on one entry whose structure file failed the check.
# NOTE(review): `uniprot_id[1]` looks like a lookup by index label 1 rather than
# by position, so it would fail if row 1 is not among the filtered rows - confirm.
sup_esm_pdb(uniprot_id=data_pdb[data_pdb.pdb_check_ok==False].uniprot_id[1])

'/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/structure/pdb/Q6/ESM-Q6GZX3.pdb'

In [50]:
! ls -lht /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/structure/pdb/Q6/ESM-Q6GZX3.pdb


-rw-r--r-- 1 shizhenkun hpcadmin 192K Dec 16 13:33 /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/structure/pdb/Q6/ESM-Q6GZX3.pdb


In [28]:
512/8

64.0