# Prepare 3Di for Protein Reaction Benchmarking Tasks

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2024-12-19  



## 1. Import packages

In [6]:
import sys,os
# Extend the import search path before the project imports below.
# NOTE(review): '__file__' is passed as a string literal (not the __file__
# variable), so realpath() resolves it against the current working directory
# and dirname() yields that directory -- presumably intentional for notebooks,
# where __file__ is not defined; confirm.
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../')
from config import conf as cfg
from tools import rheatool as rheatool
from modules.structure.Tdi import Tdi
from tools import  bioFunctionLib as bfl
from pandarallel import pandarallel # import pandarallel
# Initialise pandarallel without a progress bar.
pandarallel.initialize(progress_bar=False)
# NOTE(review): _flatten is a private tkinter helper and is not used in the
# cells visible here -- verify it is still needed.
from tkinter import _flatten
import subprocess
import h5py
import pandas as pd
import numpy as np
import swifter

# For the initial run, set this flag to True. Per the original author's note,
# this allows the program to download data from UniProt and RHEA, which may
# take longer depending on your internet speed. In the cells shown here the
# flag also gates the Foldseek 3Di conversion and the per-part FASTA export.
FIRST_TIME_RUN = False

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 2. Load all data that requires 3Di computation

In [7]:
# Read the table configured as cfg.FILE_DS_3DI_LIST, report how many rows it
# holds, and preview the first two rows.
data = pd.read_feather(cfg.FILE_DS_3DI_LIST)
print(f'Total number of 3DI structures: {data.shape[0]}')
data.iloc[:2]

Total number of 3DI structures: 522102


Unnamed: 0,uniprot_id,seq,path_pdb,pdb_exist,pdb_check_ok,pdb_name,cacl_seq,token_3di
0,Q6GZX4,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,True,ESM-Q6GZX4.pdb,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,DDDDPVNVVVVVVVVLVVVLVVVCVVVVPPCVSVVPPDDDPPPPPP...
1,Q6GZX3,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,True,ESM-Q6GZX3.pdb,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,DPPVPVVPPVPDPPPPPPDDVPPDDPPDDDDPDDDPPDDPDDDDPP...


In [3]:
# Convert every PDB file into its Foldseek 3Di representation.
# Equivalent CLI call, kept for reference:
#   foldseek structureto3didescriptor prot.pdb res_prot.3di --threads 10
if FIRST_TIME_RUN:

    def _describe_structure(pdb_path):
        """Run bfl.get_fold_seek_3di on one PDB path.

        Returns a Series holding, in order, the result's name, seq,
        token_3di and matrix_3di attributes, so that Series.apply expands
        them into four columns.
        """
        result = bfl.get_fold_seek_3di(pdb_path=pdb_path)
        return pd.Series([
            result.name,        # pdb_name
            result.seq,         # cacl_seq
            result.token_3di,   # token_3di
            result.matrix_3di,  # matrix_3di
        ])

    data[['pdb_name', 'cacl_seq', 'token_3di', 'matrix_3di']] = (
        data['path_pdb'].swifter.apply(_describe_structure)
    )

    # Rows whose 3Di token string came back empty (per the original note:
    # extremely short sequences) get the placeholder token 'ACD'.
    data.loc[data.token_3di == '', 'token_3di'] = 'ACD'

## 3. Compute structure embeddings from the 3Di representation with ProstT5

In [18]:
# Embedding takes a very long time; it is recommended to run
# modules/structure/embedProstT5_3di.py as a background process instead.
# NPICE is the number of parts the dataset is split into (one FASTA per part).
NPICE = 32
if FIRST_TIME_RUN:

    # All per-part files live in one directory; create it up front so the
    # first write does not fail on a fresh checkout.
    fasta_dir = os.path.join(cfg.RESULTS_DIR, 'intermediate/foldseek/3diembd')
    os.makedirs(fasta_dir, exist_ok=True)

    # Split row positions (not the DataFrame itself) into NPICE nearly equal
    # chunks: np.array_split on a DataFrame relies on the deprecated
    # DataFrame.swapaxes. Chunk sizes are the same as before.
    row_chunks = np.array_split(np.arange(len(data)), NPICE)

    # Write each chunk to its own FASTA file (uniprot_id -> 3Di token string).
    for part_no, rows in enumerate(row_chunks, start=1):
        fold_fasta_path = os.path.join(fasta_dir, f'3difold{part_no}.fasta')

        # Export the whole part. The previous `.head(300)` limited every file
        # to its first 300 rows, which looks like a debugging leftover.
        part = data.iloc[rows]
        bfl.table2fasta(table=part[['uniprot_id', 'token_3di']], file_out=fold_fasta_path)
        print(f"Saved part {part_no} to {fold_fasta_path}")

    # Next step (run manually): submit the embedding job, e.g.
    #   !sbatch get_t53di_embedding.slurm

In [24]:
# Load the per-part embedding tables (3difold1.feather .. 3difold{NPICE}.feather)
# and stack them into one table with a fresh 0..n-1 index.
# Paths are built with os.path.join, matching the FASTA export cell, so they
# stay valid whether or not cfg.RESULTS_DIR ends with a path separator.
embedding_dir = os.path.join(cfg.RESULTS_DIR, 'intermediate/foldseek/3diembd')
file_3di_embeddings = [
    os.path.join(embedding_dir, f'3difold{part_no}.feather')
    for part_no in range(1, NPICE + 1)
]
list_3di = [pd.read_feather(feather_path) for feather_path in file_3di_embeddings]
tdi_embedding = pd.concat(list_3di, axis=0).reset_index(drop=True)

In [26]:
# Persist the concatenated embedding table as a Feather file at the path
# configured in cfg.FILE_DS_3DI_EMBEDDING.
tdi_embedding.to_feather(cfg.FILE_DS_3DI_EMBEDDING)