# Prepare Datasets for Protein Reaction Benchmarking Tasks

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2024-12-14  



## 1. Import packages

In [1]:
import sys,os
# '__file__' is passed as a string literal (a notebook has no real __file__), so
# realpath() resolves it against the current working directory and dirname()
# yields that directory, which is then put first on the import path.
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
# Also search the parent directory; presumably this is where the `config` and
# `tools` packages imported below live -- TODO confirm.
sys.path.insert(1,'../')
from config import conf as cfg
from tools import filetool as ftool
from tools import bioFunctionLib as bfl
from tools import rheatool as rheatool
from tqdm import tqdm
from pandarallel import pandarallel # import pandarallel
# Initialise pandarallel (without progress bars); it provides the
# `parallel_apply` calls used further down in this notebook.
pandarallel.initialize(progress_bar=False)
# NOTE(review): `_flatten` is a private tkinter helper and is not used in the
# visible cells -- confirm whether this import is still needed.
from tkinter import _flatten
import pandas as pd
import numpy as np

# For the initial run, please set this flag to True. This will allow the program
# to download data from UniProt and RHEA, which may take longer depending on
# your internet speed. When False, the cells below only reload the previously
# saved PDB list from cfg.FILE_DS_PDB_LIST.
FIRST_TIME_RUN = False

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 2. Define helper functions

In [2]:
def prep_pdb(uniprot_id):
    """Copy the local AlphaFold v4 model of one protein into the project PDB store.

    Args:
        uniprot_id: UniProt accession. Its first two characters select the
            destination sub-directory under ``{cfg.DATA_ROOT}structure/pdb/``.

    Returns:
        A ``(dst, res)`` tuple: the destination path and the value returned by
        ``ftool.cp_pdb`` for the copy (stored by the caller as ``pdb_exist``).
    """
    source_path = f'/hpcfs/fpublic/database/alphafold/predicted_pdbs/AF-{uniprot_id}-F1-model_v4.pdb.gz'
    target_path = f'{cfg.DATA_ROOT}structure/pdb/{str(uniprot_id[0:2])}/AF-{uniprot_id}-F1-model_v4.pdb.gz'
    copy_result = ftool.cp_pdb(source_path, target_path)
    return target_path, copy_result


def get_pdb_file_from_afdb_web(uniprot_id):
    """Download the AlphaFold DB v4 model of one protein into the temp PDB folder.

    Args:
        uniprot_id: UniProt accession used to build both the download URL and
            the local file name.

    Returns:
        None. The file is fetched via ``ftool.download_with_wget``.
    """
    remote_url = f'https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v4.pdb'
    local_path = f'/hpcfs/fhome/shizhenkun/codebase/RXNRECer/temp/pdb/AF-{uniprot_id}-F1-model_v4.pdb'
    ftool.download_with_wget(remote_url, local_path)


def sup_esm_pdb(uniprot_id):
    """Add the ESM-computed PDB of one protein to the structure database.

    Args:
        uniprot_id: UniProt accession. ``{uniprot_id}.pdb`` is looked up in the
            ESMFold output directory and, if present, copied to
            ``{cfg.DATA_ROOT}structure/pdb/<first two chars>/ESM-{uniprot_id}.pdb``.

    Returns:
        The destination path when the source file exists and ``ftool.cp_pdb``
        returns a truthy value; otherwise ``None``.
    """
    esm_output_dir = f"{cfg.RESULTS_DIR}intermediate/esmfold/output/"
    predicted_path = f'{esm_output_dir}{uniprot_id}.pdb'
    stored_path = f'{cfg.DATA_ROOT}structure/pdb/{str(uniprot_id[0:2])}/ESM-{uniprot_id}.pdb'

    # Nothing to supplement when ESMFold produced no file for this ID.
    if not os.path.exists(predicted_path):
        return None

    return stored_path if ftool.cp_pdb(predicted_path, stored_path) else None

## 3. Collect all UniProt IDs that need a PDB structure

In [3]:
# Load the train and test splits, stack them, and keep only the two columns
# needed for structure preparation.
ds_train = pd.read_feather(cfg.FILE_DS_TRAIN)
ds_test = pd.read_feather(cfg.FILE_DS_TEST)
data_pdb = pd.concat([ds_train, ds_test], axis=0, ignore_index=True)[['uniprot_id', 'seq']]
# Preview the first two rows.
data_pdb.head(2)

Unnamed: 0,uniprot_id,seq
0,Q6GZX4,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...
1,Q6GZX3,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...


## 4. Match the vast majority against the local database (AF2-DB-V4)

In [4]:
# Build the PDB list on the very first run; in every other case reload the
# table that was saved earlier.
if FIRST_TIME_RUN and not os.path.exists(cfg.FILE_DS_PDB_LIST):
    # Use pandarallel (initialised in the import cell) rather than the
    # `.swifter` accessor: this notebook never imports swifter, so that accessor
    # is only registered if some other module happens to import it.
    copy_results = data_pdb['uniprot_id'].parallel_apply(prep_pdb)

    # prep_pdb returns (destination path, copy result); expand the tuples into
    # two columns in one step instead of building a pd.Series for every row.
    data_pdb[['path_pdb', 'pdb_exist']] = pd.DataFrame(
        copy_results.tolist(),
        index=data_pdb.index,
        columns=['path_pdb', 'pdb_exist'],
    )
    data_pdb.to_feather(cfg.FILE_DS_PDB_LIST)
else:
    data_pdb = pd.read_feather(cfg.FILE_DS_PDB_LIST)

## 5. Fill in entries without a PDB via the online API (AF2-DB-ONLINE)

In [5]:
# Register PDB files that were already downloaded from the AlphaFold DB website
# for proteins that still have no structure in `data_pdb`.
if FIRST_TIME_RUN:
    # Folder that get_pdb_file_from_afdb_web() downloads into.
    directory = "/hpcfs/fhome/shizhenkun/codebase/RXNRECer/temp/pdb/"
    pdb_file_list = ftool.get_pdb_files(directory)

    # Report how many files were found.
    print(f"Found {len(pdb_file_list)} .pdb files:")

    # Add the already-downloaded structures.
    for item in tqdm(pdb_file_list):
        # Expected file name: AF-<uniprot_id>-F1-model_v4.pdb
        name_parts = os.path.basename(item).split('-')
        if len(name_parts) < 2:
            print(f'{item}: unexpected file name, skipped')
            continue
        uniprot_id = name_parts[1]

        # Compute the row mask once and reuse it for the check and both updates.
        row_mask = data_pdb['uniprot_id'] == uniprot_id
        if not row_mask.any():
            print(f'{uniprot_id}: not present in data_pdb, skipped')
            continue
        if data_pdb.loc[row_mask, 'pdb_exist'].iloc[0]:
            # A structure is already registered for this protein.
            continue

        dst = f'{cfg.DATA_ROOT}structure/pdb/{str(uniprot_id[0:2])}/AF-{uniprot_id}-F1-model_v4.pdb'
        try:
            copied = ftool.cp_pdb(item, dst)
        except Exception as e:
            # Best effort: report the failure and carry on with the next file.
            print(f'{uniprot_id}:{e}')
            continue

        # Only flag the structure as available when the copy actually succeeded;
        # the copy result used to be ignored here.
        if copied:
            data_pdb.loc[row_mask, 'pdb_exist'] = True
            data_pdb.loc[row_mask, 'path_pdb'] = dst

    # Write the updated table back to disk.
    data_pdb.to_feather(cfg.FILE_DS_PDB_LIST)

## 6. Predict the remaining missing structures with ESMFold

In [6]:
# Download what is still missing from the AlphaFold DB website, prepare FASTA
# inputs for ESMFold for everything without a usable file, and register the
# ESMFold results that are already available.
if FIRST_TIME_RUN:
    # UniProt IDs whose PDB still has to be fetched from the official website.
    need_wget = data_pdb[~data_pdb['pdb_exist'].astype(bool)].reset_index(drop=True)

    # Download the structures from the AlphaFold DB v4 website.
    tqdm.pandas()
    need_wget.uniprot_id.progress_apply(get_pdb_file_from_afdb_web)

    # Check that the registered files are complete (i.e. not empty).
    data_pdb['pdb_check_ok'] = data_pdb.path_pdb.parallel_apply(lambda x: not ftool.is_file_empty(x))
    need_esm_fold = data_pdb[~data_pdb['pdb_check_ok'].astype(bool)].reset_index(drop=True)

    # Write the sequences that need a predicted structure as FASTA chunks.
    # Split positional indices rather than the DataFrame itself:
    # np.array_split on a DataFrame relies on the deprecated DataFrame.swapaxes.
    NUM_GPU = 32
    chunk_positions = np.array_split(np.arange(len(need_esm_fold)), NUM_GPU)
    chunks = [need_esm_fold.iloc[positions] for positions in chunk_positions]

    for chunk_no, chunk in enumerate(chunks):
        bfl.table2fasta(
            table=chunk[['uniprot_id', 'seq']].reset_index(drop=True),
            file_out=f'{cfg.RESULTS_DIR}intermediate/esmfold/input/chunk{chunk_no}.fasta',
        )

    # seq to pdb (run manually on the cluster)
    # !sbatch get_esmfold_pdb.sh

    # Check the files again to find the entries that still lack a usable PDB.
    data_pdb['pdb_check_ok'] = data_pdb.path_pdb.parallel_apply(lambda x: not ftool.is_file_empty(x))
    uniprot_ids_esm_pdb = data_pdb.loc[~data_pdb['pdb_check_ok'].astype(bool), 'uniprot_id'].to_list()

    # Register the PDB files that ESMFold has already produced.
    for uid in tqdm(uniprot_ids_esm_pdb):
        esm_path = sup_esm_pdb(uniprot_id=uid)
        if esm_path is not None:
            data_pdb.loc[data_pdb['uniprot_id'] == uid, 'path_pdb'] = esm_path

    # NOTE(review): `pdb_check_ok` is not refreshed after the ESMFold paths are
    # filled in, so the saved flag for those rows still reflects the previous
    # path -- confirm whether a final re-check is wanted before saving.

    # Write the updated table back to disk.
    data_pdb.to_feather(cfg.FILE_DS_PDB_LIST)

In [7]:
# Display the final table: UniProt ID, sequence, PDB path and the two status flags.
data_pdb

Unnamed: 0,uniprot_id,seq,path_pdb,pdb_exist,pdb_check_ok
0,Q6GZX4,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,True
1,Q6GZX3,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,True
2,Q197F8,MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWK...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,True
3,Q197F7,MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGA...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,True
4,Q6GZX2,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,True
...,...,...,...,...,...
522097,P0DW91,MSGAEEAGGGGPAAGPAGSVPAGVGVGAGAGAGVGVGAGPGAAAGP...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,True
522098,P0DTL6,MSGAEEAGGGGPAAGPAGSVPAGVGVGVGAGPGAAAGQAAAAALGE...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,True
522099,P0DW87,MSGAEEAGGGGPAAGPAGAVPAGVGVGAGPGAAAGPAAAALGEAAG...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,True
522100,P0DW89,MSGAEEAGGGGPAAGPAGAVPAGVGVGVGPGAAAGPAAAALGEAAG...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,True
