# Prepare 3Di for Protein Reaction Benchmarking Tasks

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2024-12-19  



## 1. Import packages

In [1]:
import sys,os
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../')
from config import conf as cfg
from tools import rheatool as rheatool
from modules.structure.Tdi import Tdi
from modules.structure import embedding_3di
from tools import  bioFunctionLib as bfl
from pandarallel import pandarallel # 导入pandaralle
pandarallel.initialize(progress_bar=False)
from tkinter import _flatten
import subprocess
import h5py
import pandas as pd
import numpy as np
import swifter

# Master switch for the expensive, guarded cells in this notebook.
# When True, the cells below recompute the 3Di descriptors from the PDB files
# (bfl.get_fold_seek_3di) and the 3Di embeddings
# (embedding_3di.get_embd_using_3di_batch); when False those cells are skipped
# and the columns already stored in the feather file are used as-is.
# NOTE(review): the previous comment said this flag also triggers downloads from
# UniProt and RHEA; no download is visible in this notebook -- confirm.
FIRST_TIME_RUN = False

# Reload edited project modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 2. Load all data that requires 3Di computation

In [2]:
# Read the table of proteins (and their PDB paths) from the configured feather file.
data = pd.read_feather(path=cfg.FILE_DS_3DI_LIST)

# Preview the first two rows.
data.head(n=2)

Unnamed: 0,uniprot_id,seq,path_pdb,pdb_exist,pdb_check_ok,pdb_name,cacl_seq,token_3di
0,Q6GZX4,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,True,ESM-Q6GZX4.pdb,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,DDDDPVNVVVVVVVVLVVVLVVVCVVVVPPCVSVVPPDDDPPPPPP...
1,Q6GZX3,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,True,ESM-Q6GZX3.pdb,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,DPPVPVVPPVPDPPPPPPDDVPPDDPPDDDDPDDDPPDDPDDDDPP...


In [3]:
# Convert the PDB structure files into 3Di descriptors.
# Equivalent CLI call:
# !foldseek structureto3didescriptor prot.pdb res_prot.3di --threads 10
if FIRST_TIME_RUN:

    def _extract_3di(pdb_path):
        """Run the 3Di extraction for one PDB file.

        Returns a 4-element Series: structure name, sequence, 3Di token
        string and 3Di matrix, in the order of the target columns below.
        """
        result = bfl.get_fold_seek_3di(pdb_path=pdb_path)
        return pd.Series([
            result.name,        # pdb_name
            result.seq,         # cacl_seq
            result.token_3di,   # token_3di
            result.matrix_3di,  # matrix_3di
        ])

    data[['pdb_name', 'cacl_seq', 'token_3di', 'matrix_3di']] = (
        data['path_pdb'].swifter.apply(_extract_3di)
    )

    # Rows whose 3Di string came back empty (per the original note: very short
    # sequences) get the placeholder token 'ACD' so downstream code never sees
    # an empty sequence.
    data.loc[data.token_3di == '', 'token_3di'] = 'ACD'

    # Sanity check. The original bare expression `data[data.token_3di=='']`
    # sat inside this `if` body, so its result was silently discarded.
    assert (data.token_3di != '').all(), 'empty token_3di values remain after padding'

## 3. Obtain structure representations with ProstT5

In [4]:
# Preview the first three rows before computing embeddings.
data.head(n=3)

Unnamed: 0,uniprot_id,seq,path_pdb,pdb_exist,pdb_check_ok,pdb_name,cacl_seq,token_3di
0,Q6GZX4,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,True,ESM-Q6GZX4.pdb,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,DDDDPVNVVVVVVVVLVVVLVVVCVVVVPPCVSVVPPDDDPPPPPP...
1,Q6GZX3,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,True,ESM-Q6GZX3.pdb,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,DPPVPVVPPVPDPPPPPPDDVPPDDPPDDDDPDDDPPDDPDDDDPP...
2,Q197F8,MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWK...,/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data...,True,True,ESM-Q197F8.pdb,MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWK...,DDPPPPPPPPPPPPPPPPCVVCCVVVCVVPPPVVVVDDPPPPPVVP...


In [6]:
# Embedding every 3Di sequence inline takes a very long time; it is recommended
# to run modules/structure/embedding_3di.py as a background process instead.
if FIRST_TIME_RUN:
    res = embedding_3di.get_embd_using_3di_batch(
        sequence_3di=data['token_3di'].tolist(),
        batch_size=20,
    )

In [18]:
# Export the (uniprot_id, token_3di) pairs of the first 30 rows via
# bfl.table2fasta to /tmp/3di.fasta.
bfl.table2fasta(
    table=data[['uniprot_id', 'token_3di']].head(30),
    file_out='/tmp/3di.fasta',
)

In [9]:
# Load the embeddings table from the feather file and display it.
# NOTE(review): `aa` is an uninformative name; it is kept because cells
# outside this section may reference it.
aa = pd.read_feather(path='/tmp/3di_embeddings.feather')
aa

Using device: cuda:0


Unnamed: 0,uniprot_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f1015,f1016,f1017,f1018,f1019,f1020,f1021,f1022,f1023,f1024
0,Q6GZV6,-0.004589,0.085327,-0.01828,-0.019821,0.022125,0.009155,-0.04129,-0.012627,0.025635,...,0.075073,-0.0354,0.023651,-0.064148,0.041748,0.046509,0.0336,-0.028091,-0.009712,0.036285
1,Q197E9,0.024658,0.039215,-0.020691,0.005226,0.029266,-0.032288,-0.007671,0.013977,0.036163,...,0.050751,-0.038788,0.025772,-0.032257,0.074524,0.070007,0.040466,-0.041077,-0.011353,0.014572
2,Q6GZV8,-0.057892,0.049622,-0.013268,-0.027374,0.034943,0.018509,0.011452,-0.019165,-0.003363,...,0.0401,0.047607,0.065247,-0.077881,-0.010887,-0.011993,0.009506,0.011963,-0.02951,-0.007828
3,Q197F8,-0.030609,0.034058,0.005417,0.009895,0.026733,0.024048,0.035889,-0.062866,-0.03418,...,0.058624,-0.014046,0.037323,-0.014854,0.045624,0.084167,0.026169,0.059418,0.023468,0.021332
4,Q197F3,-0.030716,0.04657,0.030365,0.016342,-0.009018,-0.001097,-0.016388,-0.011154,-1.7e-05,...,0.014412,-0.035187,0.046631,-0.019623,0.046234,0.015167,-0.024033,-0.007942,0.008148,0.012703
5,Q6GZX2,0.004292,0.018631,-0.006927,0.014725,-0.015945,0.011353,-0.051056,-0.015373,-0.029373,...,-0.006573,-0.036499,-0.014893,-0.03653,0.030548,-0.005802,-0.029007,-0.02269,-0.023727,-0.016266
6,Q6GZV2,-0.003847,-0.018616,-0.008415,0.01918,-0.029968,0.025452,0.001867,-0.012535,-0.014526,...,0.048584,-0.061554,0.042297,-0.008598,0.032318,0.037231,-0.006336,-0.020264,0.06311,-0.005333
7,Q91G88,-0.026978,-0.063721,0.028732,-0.04306,0.024551,0.011452,-0.003763,-0.00985,-0.029831,...,-0.039764,-0.006283,-0.000441,-0.045197,0.00872,-0.002808,-0.004494,-0.016251,-0.030273,-0.013107
8,Q197F2,0.012062,0.00988,0.055298,-0.022095,0.011925,0.005516,0.066284,-0.046356,-0.005272,...,0.002663,-0.014938,0.001825,-0.022446,-0.005085,0.025818,0.002289,0.059448,0.004559,-0.004059
9,Q6GZW0,-0.011665,0.002026,-0.023544,-0.011459,-0.02681,-0.027908,-0.060516,-0.001924,-0.052155,...,-0.006973,0.02356,-0.009026,-0.031158,0.031052,0.042603,0.017471,-0.042236,0.058563,-0.025192


In [15]:
# NOTE(review): consider moving this import into the imports cell at the top.
import embed

# Embed the 3Di token strings of the first 30 rows. The helper is given a
# two-column frame in which the token column is renamed to 'sequence'.
res = embed.get_embeddings_with_df(
    df_token_with_id=(
        data[['uniprot_id', 'token_3di']]
        .head(30)
        .rename(columns={'token_3di': 'sequence'})
    ),
    per_protein=True,
    half_precision=True,
    is_3Di=True,
)

Loading T5 model and tokenizer from: Rostlab/ProstT5
Using model in half-precision!
Example: Embedded protein Q6GZV6 (Length=851) to embedding of shape torch.Size([1024])
########################################
Total time: 0.43 seconds
Time per protein: 0.0144 seconds
########################################


In [16]:
# Inspect the embedding stored under the key 'Q6GZX1'.
res["Q6GZX1"]

array([ 0.03345 ,  0.01929 ,  0.001092, ...,  0.1324  ,  0.06073 ,
       -0.003748], dtype=float16)

In [8]:
# Build a frame with one column per key of `res`, then transpose it so that
# each key becomes a row.
pd.DataFrame(res).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
Q6GZV6,-0.004589,0.085327,-0.01828,-0.019821,0.022125,0.009155,-0.04129,-0.012627,0.025635,-0.060303,...,0.075073,-0.0354,0.023651,-0.064148,0.041748,0.046509,0.0336,-0.028091,-0.009712,0.036285
Q197E9,0.024658,0.039215,-0.020691,0.005226,0.029266,-0.032288,-0.007671,0.013977,0.036163,-0.023102,...,0.050751,-0.038788,0.025772,-0.032257,0.074524,0.070007,0.040466,-0.041077,-0.011353,0.014572
Q6GZV8,-0.057892,0.049622,-0.013268,-0.027374,0.034943,0.018509,0.011452,-0.019165,-0.003363,-0.003622,...,0.0401,0.047607,0.065247,-0.077881,-0.010887,-0.011993,0.009506,0.011963,-0.02951,-0.007828
Q197F8,-0.030609,0.034058,0.005417,0.009895,0.026733,0.024048,0.035889,-0.062866,-0.03418,-0.068726,...,0.058624,-0.014046,0.037323,-0.014854,0.045624,0.084167,0.026169,0.059418,0.023468,0.021332
Q197F3,-0.030716,0.04657,0.030365,0.016342,-0.009018,-0.001097,-0.016388,-0.011154,-1.7e-05,2.6e-05,...,0.014412,-0.035187,0.046631,-0.019623,0.046234,0.015167,-0.024033,-0.007942,0.008148,0.012703
Q6GZX2,0.004292,0.018631,-0.006927,0.014725,-0.015945,0.011353,-0.051056,-0.015373,-0.029373,0.000665,...,-0.006573,-0.036499,-0.014893,-0.03653,0.030548,-0.005802,-0.029007,-0.02269,-0.023727,-0.016266
Q6GZV2,-0.003847,-0.018616,-0.008415,0.01918,-0.029968,0.025452,0.001867,-0.012535,-0.014526,0.006931,...,0.048584,-0.061554,0.042297,-0.008598,0.032318,0.037231,-0.006336,-0.020264,0.06311,-0.005333
Q91G88,-0.026978,-0.063721,0.028732,-0.04306,0.024551,0.011452,-0.003763,-0.00985,-0.029831,0.036804,...,-0.039764,-0.006283,-0.000441,-0.045197,0.00872,-0.002808,-0.004494,-0.016251,-0.030273,-0.013107
Q197F2,0.012062,0.00988,0.055298,-0.022095,0.011925,0.005516,0.066284,-0.046356,-0.005272,-0.007626,...,0.002663,-0.014938,0.001825,-0.022446,-0.005085,0.025818,0.002289,0.059448,0.004559,-0.004059
Q6GZW0,-0.011665,0.002026,-0.023544,-0.011459,-0.02681,-0.027908,-0.060516,-0.001924,-0.052155,-0.02211,...,-0.006973,0.02356,-0.009026,-0.031158,0.031052,0.042603,0.017471,-0.042236,0.058563,-0.025192


In [92]:
# Show the ID and 3Di token string of the first ten rows.
data[['uniprot_id', 'token_3di']].head(10)

Unnamed: 0,uniprot_id,token_3di
0,Q6GZX4,DDDDPVNVVVVVVVVLVVVLVVVCVVVVPPCVSVVPPDDDPPPPPP...
1,Q6GZX3,DPPVPVVPPVPDPPPPPPDDVPPDDPPDDDDPDDDPPDDPDDDDPP...
2,Q197F8,DDPPPPPPPPPPPPPPPPCVVCCVVVCVVPPPVVVVDDPPPPPVVP...
3,Q197F7,DPPPPPPPPPVVPDDPVVVCVVVCLVVVDDDDPPPDPPPPPCCPLP...
4,Q6GZX2,DPDPPPDDPVVVVVVVVPQDVVNVVVCCVVVCVVCCCVCVVPAVVV...
5,Q6GZX1,DPPPPPVVVVVVVVVVVVVVVVVCVVVVVVVVCCVVVVPPPVPPPP...
6,Q197F5,DVVVVVVVVVVVVVVVVVVPPPPPPPPAADFFCFAPFQVNHHQFTQ...
7,Q6GZX0,DDDDDDPDPDPPPPPPDQPDQDPVLVVLQVQCVPDQLPVVSLVVSL...
8,Q91G88,DVVCVQFFDADLDDQWTWGDLPPDTWIARPVLQWIFLQSLLVVLVH...
9,Q6GZW9,DDWDFDDDPVPDTFTWDQDPDPDDPDPPLPPPSPTRPRPPDPVVPD...
